FreeRDP
Loading...
Searching...
No Matches
prim_templates.h
1/* prim_templates.h
2 * vi:ts=4 sw=4
3 *
4 * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
5 * Licensed under the Apache License, Version 2.0 (the "License"); you may
6 * not use this file except in compliance with the License. You may obtain
7 * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
8 * Unless required by applicable law or agreed to in writing, software
9 * distributed under the License is distributed on an "AS IS" BASIS,
10 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
11 * or implied. See the License for the specific language governing
12 * permissions and limitations under the License. Algorithms used by
13 * this code may be covered by patents by HP, Microsoft, or other parties.
14 */
15
16#pragma once
17
18#include "prim_avxsse.h"
19
20/* These are prototypes for SSE (potentially NEON) routines that do a
21 * simple SSE operation over an array of data. Since so much of this
22 * code is shared except for the operation itself, these prototypes are
23 * used rather than duplicating code. The naming convention depends on
24 * the parameters: S=Source param; C=Constant; D=Destination.
25 * All the macros have parameters for a fallback procedure if the data
26 * is too small and an operation "the slow way" for use at 16-byte edges.
27 */
28
29/* SSE3 note: If someone needs to support an SSE2 version of these without
30 * SSE3 support, an alternative version could be added that merely checks
31 * that 16-byte alignment on both destination and source(s) can be
32 * achieved, rather than use LDDQU for unaligned reads.
33 */
34
/* Note: the compiler is good at turning (16/sizeof(_type_)) into a constant.
 * It can't easily do that if the value is stored in a variable.
 * So don't save it as an intermediate value.
 */
39
/* ----------------------------------------------------------------------------
 * SCD = Source, Constant, Destination
 *
 * SSE3_SCD_ROUTINE defines a static function
 *
 *   pstatus_t _name_(const _type_* pSrc, UINT32 val, _type_* pDst, UINT32 ulen)
 *
 * that runs _op_(register, (_op_type_)val) over ulen elements read from pSrc
 * and writes the results to pDst.
 *
 * Macro parameters:
 *   _name_      name of the generated function
 *   _type_      element type of both arrays; sizeof(_type_) must divide 16
 *   _fallback_  not referenced by this macro body; the parameter is kept so
 *               existing instantiations continue to compile
 *   _op_        called as _op_(__m128i, _op_type_), result is stored as __m128i
 *   _op_type_   type val is cast to before it is handed to _op_
 *   _slowWay_   statement executed once per leftover element. It is expanded
 *               inside the function, so it can use the locals sptr, dptr and
 *               val; this macro does not advance the pointers for it.
 *
 * Generated function:
 *   val == 0   returns PRIMITIVES_SUCCESS immediately, pDst is not written
 *   val >= 16  returns -1 (the limit is the literal 16, whatever _type_ is)
 *   otherwise  returns PRIMITIVES_SUCCESS once all ulen elements are done
 */
#define SSE3_SCD_ROUTINE(_name_, _type_, _fallback_, _op_, _op_type_, _slowWay_) \
	WINPR_ATTR_NODISCARD \
	static pstatus_t _name_(const _type_* WINPR_RESTRICT pSrc, UINT32 val, \
	                        _type_* WINPR_RESTRICT pDst, UINT32 ulen) \
	{ \
		size_t len = ulen; \
		const _type_* sptr = pSrc; \
		_type_* dptr = pDst; \
		if (val == 0) \
			return PRIMITIVES_SUCCESS; \
		if (val >= 16) \
			return -1; \
		/* Use 8 128-bit SSE registers. */ \
		/* Chunk sizes are spelled out inline from sizeof(_type_) so they fold */ \
		/* to constants and always agree with the pointer strides below. */ \
		size_t count = len / (8 * (16 / sizeof(_type_))); \
		len -= count * (8 * (16 / sizeof(_type_))); \
		\
		while (count--) \
		{ \
			/* Load all eight registers first, then operate, then store. */ \
			__m128i xmm0 = LOAD_SI128(sptr); \
			sptr += (16 / sizeof(_type_)); \
			__m128i xmm1 = LOAD_SI128(sptr); \
			sptr += (16 / sizeof(_type_)); \
			__m128i xmm2 = LOAD_SI128(sptr); \
			sptr += (16 / sizeof(_type_)); \
			__m128i xmm3 = LOAD_SI128(sptr); \
			sptr += (16 / sizeof(_type_)); \
			__m128i xmm4 = LOAD_SI128(sptr); \
			sptr += (16 / sizeof(_type_)); \
			__m128i xmm5 = LOAD_SI128(sptr); \
			sptr += (16 / sizeof(_type_)); \
			__m128i xmm6 = LOAD_SI128(sptr); \
			sptr += (16 / sizeof(_type_)); \
			__m128i xmm7 = LOAD_SI128(sptr); \
			sptr += (16 / sizeof(_type_)); \
			xmm0 = _op_(xmm0, (_op_type_)val); \
			xmm1 = _op_(xmm1, (_op_type_)val); \
			xmm2 = _op_(xmm2, (_op_type_)val); \
			xmm3 = _op_(xmm3, (_op_type_)val); \
			xmm4 = _op_(xmm4, (_op_type_)val); \
			xmm5 = _op_(xmm5, (_op_type_)val); \
			xmm6 = _op_(xmm6, (_op_type_)val); \
			xmm7 = _op_(xmm7, (_op_type_)val); \
			STORE_SI128(dptr, xmm0); \
			dptr += (16 / sizeof(_type_)); \
			STORE_SI128(dptr, xmm1); \
			dptr += (16 / sizeof(_type_)); \
			STORE_SI128(dptr, xmm2); \
			dptr += (16 / sizeof(_type_)); \
			STORE_SI128(dptr, xmm3); \
			dptr += (16 / sizeof(_type_)); \
			STORE_SI128(dptr, xmm4); \
			dptr += (16 / sizeof(_type_)); \
			STORE_SI128(dptr, xmm5); \
			dptr += (16 / sizeof(_type_)); \
			STORE_SI128(dptr, xmm6); \
			dptr += (16 / sizeof(_type_)); \
			STORE_SI128(dptr, xmm7); \
			dptr += (16 / sizeof(_type_)); \
		} \
		\
		/* Use a single 128-bit SSE register. */ \
		count = len / (16 / sizeof(_type_)); \
		len -= count * (16 / sizeof(_type_)); \
		while (count--) \
		{ \
			__m128i xmm0 = LOAD_SI128(sptr); \
			sptr += (16 / sizeof(_type_)); \
			xmm0 = _op_(xmm0, (_op_type_)val); \
			STORE_SI128(dptr, xmm0); \
			dptr += (16 / sizeof(_type_)); \
		} \
		/* Finish off the remainder: fewer elements than fill one register. */ \
		while (len--) \
		{ \
			_slowWay_; \
		} \
		return PRIMITIVES_SUCCESS; \
	}
130
/* ----------------------------------------------------------------------------
 * SCD = Source, Constant, Destination
 * PRE = preload xmm0 with the constant.
 *
 * SSE3_SCD_PRE_ROUTINE defines a static function
 *
 *   pstatus_t _name_(const _type_* pSrc, _type_ val, _type_* pDst, INT32 ilen)
 *
 * that runs _op_(register, xmm0) over ilen elements read from pSrc and writes
 * the results to pDst, where xmm0 is built once from val.
 *
 * Macro parameters:
 *   _name_      name of the generated function
 *   _type_      element type of both arrays and of val; sizeof(_type_) must
 *               divide 16
 *   _fallback_  not referenced by this macro body; the parameter is kept so
 *               existing instantiations continue to compile
 *   _op_        called as _op_(__m128i, __m128i), result is stored as __m128i
 *   _slowWay_   statement executed once per leftover element. It is expanded
 *               inside the function, so it can use the locals sptr, dptr and
 *               val; this macro does not advance the pointers for it.
 *
 * Generated function: ilen is converted with WINPR_ASSERTING_INT_CAST, and the
 * function always returns PRIMITIVES_SUCCESS.
 */
#define SSE3_SCD_PRE_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
	WINPR_ATTR_NODISCARD \
	static pstatus_t _name_(const _type_* WINPR_RESTRICT pSrc, _type_ val, \
	                        _type_* WINPR_RESTRICT pDst, INT32 ilen) \
	{ \
		size_t len = WINPR_ASSERTING_INT_CAST(size_t, ilen); \
		const _type_* sptr = pSrc; \
		_type_* dptr = pDst; \
		/* Use 4 128-bit SSE registers. */ \
		/* Chunk sizes are spelled out inline from sizeof(_type_) so they fold */ \
		/* to constants and always agree with the pointer strides below. */ \
		size_t count = len / (4 * (16 / sizeof(_type_))); \
		len -= count * (4 * (16 / sizeof(_type_))); \
		/* Preload the constant once; both vector loops reuse it. */ \
		/* NOTE(review): mm_set1_epu32 is defined outside this file; its name */ \
		/* suggests 32-bit lanes, i.e. a 4-byte _type_ - confirm before */ \
		/* instantiating with another element size. */ \
		__m128i xmm0 = mm_set1_epu32(val); \
		for (size_t x = 0; x < count; x++) \
		{ \
			__m128i xmm1 = LOAD_SI128(sptr); \
			sptr += (16 / sizeof(_type_)); \
			__m128i xmm2 = LOAD_SI128(sptr); \
			sptr += (16 / sizeof(_type_)); \
			__m128i xmm3 = LOAD_SI128(sptr); \
			sptr += (16 / sizeof(_type_)); \
			__m128i xmm4 = LOAD_SI128(sptr); \
			sptr += (16 / sizeof(_type_)); \
			xmm1 = _op_(xmm1, xmm0); \
			xmm2 = _op_(xmm2, xmm0); \
			xmm3 = _op_(xmm3, xmm0); \
			xmm4 = _op_(xmm4, xmm0); \
			STORE_SI128(dptr, xmm1); \
			dptr += (16 / sizeof(_type_)); \
			STORE_SI128(dptr, xmm2); \
			dptr += (16 / sizeof(_type_)); \
			STORE_SI128(dptr, xmm3); \
			dptr += (16 / sizeof(_type_)); \
			STORE_SI128(dptr, xmm4); \
			dptr += (16 / sizeof(_type_)); \
		} \
		/* Use a single 128-bit SSE register. */ \
		count = len / (16 / sizeof(_type_)); \
		len -= count * (16 / sizeof(_type_)); \
		for (size_t x = 0; x < count; x++) \
		{ \
			__m128i xmm1 = LOAD_SI128(sptr); \
			sptr += (16 / sizeof(_type_)); \
			xmm1 = _op_(xmm1, xmm0); \
			STORE_SI128(dptr, xmm1); \
			dptr += (16 / sizeof(_type_)); \
		} \
		/* Finish off the remainder: fewer elements than fill one register. */ \
		for (size_t x = 0; x < len; x++) \
		{ \
			_slowWay_; \
		} \
		return PRIMITIVES_SUCCESS; \
	}
198
/* ----------------------------------------------------------------------------
 * SSD = Source1, Source2, Destination
 *
 * SSE3_SSD_ROUTINE defines a static function
 *
 *   pstatus_t _name_(const _type_* pSrc1, const _type_* pSrc2, _type_* pDst,
 *                    UINT32 ulen)
 *
 * that runs _op_(register from pSrc1, register from pSrc2) over ulen elements
 * and writes the results to pDst.
 *
 * Macro parameters:
 *   _name_      name of the generated function
 *   _type_      element type of all three arrays; sizeof(_type_) must
 *               divide 16
 *   _fallback_  not referenced by this macro body itself; the parameter is
 *               kept so existing instantiations continue to compile
 *   _op_        called as _op_(__m128i, __m128i), result is stored as __m128i
 *   _slowWay_   expression evaluated once per leftover element; its value is
 *               stored in a pstatus_t. It is expanded inside the function, so
 *               it can use the locals sptr1, sptr2 and dptr; this macro does
 *               not advance the pointers for it.
 *
 * Generated function: returns the first _slowWay_ result that differs from
 * PRIMITIVES_SUCCESS, otherwise PRIMITIVES_SUCCESS.
 */
#define SSE3_SSD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
	WINPR_ATTR_NODISCARD \
	static pstatus_t _name_(const _type_* WINPR_RESTRICT pSrc1, \
	                        const _type_* WINPR_RESTRICT pSrc2, _type_* WINPR_RESTRICT pDst, \
	                        UINT32 ulen) \
	{ \
		size_t len = ulen; \
		const _type_* sptr1 = pSrc1; \
		const _type_* sptr2 = pSrc2; \
		_type_* dptr = pDst; \
		/* Use 4 128-bit SSE registers per source. */ \
		/* Chunk sizes are spelled out inline from sizeof(_type_) so they fold */ \
		/* to constants and always agree with the pointer strides below. */ \
		size_t count = len / (4 * (16 / sizeof(_type_))); \
		len -= count * (4 * (16 / sizeof(_type_))); \
		/* NOTE(review): LOAD_SI128/STORE_SI128 are defined outside this file; */ \
		/* whether they require 16-byte alignment cannot be told from here. */ \
		while (count--) \
		{ \
			/* xmm0-xmm3 hold source 1, xmm4-xmm7 hold source 2. */ \
			__m128i xmm0 = LOAD_SI128(sptr1); \
			sptr1 += (16 / sizeof(_type_)); \
			__m128i xmm1 = LOAD_SI128(sptr1); \
			sptr1 += (16 / sizeof(_type_)); \
			__m128i xmm2 = LOAD_SI128(sptr1); \
			sptr1 += (16 / sizeof(_type_)); \
			__m128i xmm3 = LOAD_SI128(sptr1); \
			sptr1 += (16 / sizeof(_type_)); \
			__m128i xmm4 = LOAD_SI128(sptr2); \
			sptr2 += (16 / sizeof(_type_)); \
			__m128i xmm5 = LOAD_SI128(sptr2); \
			sptr2 += (16 / sizeof(_type_)); \
			__m128i xmm6 = LOAD_SI128(sptr2); \
			sptr2 += (16 / sizeof(_type_)); \
			__m128i xmm7 = LOAD_SI128(sptr2); \
			sptr2 += (16 / sizeof(_type_)); \
			xmm0 = _op_(xmm0, xmm4); \
			xmm1 = _op_(xmm1, xmm5); \
			xmm2 = _op_(xmm2, xmm6); \
			xmm3 = _op_(xmm3, xmm7); \
			STORE_SI128(dptr, xmm0); \
			dptr += (16 / sizeof(_type_)); \
			STORE_SI128(dptr, xmm1); \
			dptr += (16 / sizeof(_type_)); \
			STORE_SI128(dptr, xmm2); \
			dptr += (16 / sizeof(_type_)); \
			STORE_SI128(dptr, xmm3); \
			dptr += (16 / sizeof(_type_)); \
		} \
		/* Use a single 128-bit SSE register per source. */ \
		count = len / (16 / sizeof(_type_)); \
		len -= count * (16 / sizeof(_type_)); \
		while (count--) \
		{ \
			__m128i xmm0 = LOAD_SI128(sptr1); \
			sptr1 += (16 / sizeof(_type_)); \
			__m128i xmm1 = LOAD_SI128(sptr2); \
			sptr2 += (16 / sizeof(_type_)); \
			xmm0 = _op_(xmm0, xmm1); \
			STORE_SI128(dptr, xmm0); \
			dptr += (16 / sizeof(_type_)); \
		} \
		/* Finish off the remainder; stop at the first failing element. */ \
		while (len--) \
		{ \
			const pstatus_t rc = _slowWay_; \
			if (rc != PRIMITIVES_SUCCESS) \
				return rc; \
		} \
		return PRIMITIVES_SUCCESS; \
	}