 |
FreeRDP
|
Loading...
Searching...
No Matches
18#include "prim_avxsse.h"
/*
 * SSE3_SCD_ROUTINE - template macro that defines a static function `_name_`
 * with the signature
 *
 *   pstatus_t _name_(const _type_* pSrc, UINT32 val, _type_* pDst, UINT32 ulen)
 *
 * The generated function applies `_op_(reg, (_op_type_)val)` to the source
 * buffer one __m128i register at a time and writes the results to pDst.
 *
 * Visible structure of the generated body:
 *   - `sptr` / `dptr` are running cursors over pSrc / pDst.
 *   - A chain of `sizeof(_type_)` tests (1, 2, 4, 8 bytes) precedes the block
 *     loops; the statements it selects are not part of this listing
 *     (presumably they set `shifts` - TODO confirm against the full source).
 *   - First pass: `count = len >> (8 - shifts)` blocks. Each block loads
 *     eight registers with LOAD_SI128, applies `_op_` to each with the scalar
 *     `val` cast to `_op_type_`, and stores them with STORE_SI128. Both
 *     cursors advance by 16 / sizeof(_type_) elements (16 bytes) per register.
 *   - Second pass: `count = len >> (5 - shifts)` blocks of a single register,
 *     handled the same way.
 *   - `len` is reduced after each pass by the number of elements consumed.
 *   - Returns PRIMITIVES_SUCCESS; an additional early
 *     `return PRIMITIVES_SUCCESS` appears before the size dispatch, but its
 *     guarding condition is not visible here.
 *
 * NOTE(review): `_fallback_` and `_slowWay_` are not referenced in the
 * visible lines, and the declarations of `len` and `shifts`, the loop
 * headers and the braces are missing from this listing - verify against the
 * complete file before relying on this description.
 */
43#define SSE3_SCD_ROUTINE(_name_, _type_, _fallback_, _op_, _op_type_, _slowWay_) \
44 WINPR_ATTR_NODISCARD \
45 static pstatus_t _name_(const _type_* WINPR_RESTRICT pSrc, UINT32 val, \
46 _type_* WINPR_RESTRICT pDst, UINT32 ulen) \
50 const _type_* sptr = pSrc; \
51 _type_* dptr = pDst; \
53 return PRIMITIVES_SUCCESS; \
56 if (sizeof(_type_) == 1) \
58 else if (sizeof(_type_) == 2) \
60 else if (sizeof(_type_) == 4) \
62 else if (sizeof(_type_) == 8) \
65 size_t count = len >> (8 - shifts); \
66 len -= count << (8 - shifts); \
70 __m128i xmm0 = LOAD_SI128(sptr); \
71 sptr += (16 / sizeof(_type_)); \
72 __m128i xmm1 = LOAD_SI128(sptr); \
73 sptr += (16 / sizeof(_type_)); \
74 __m128i xmm2 = LOAD_SI128(sptr); \
75 sptr += (16 / sizeof(_type_)); \
76 __m128i xmm3 = LOAD_SI128(sptr); \
77 sptr += (16 / sizeof(_type_)); \
78 __m128i xmm4 = LOAD_SI128(sptr); \
79 sptr += (16 / sizeof(_type_)); \
80 __m128i xmm5 = LOAD_SI128(sptr); \
81 sptr += (16 / sizeof(_type_)); \
82 __m128i xmm6 = LOAD_SI128(sptr); \
83 sptr += (16 / sizeof(_type_)); \
84 __m128i xmm7 = LOAD_SI128(sptr); \
85 sptr += (16 / sizeof(_type_)); \
86 xmm0 = _op_(xmm0, (_op_type_)val); \
87 xmm1 = _op_(xmm1, (_op_type_)val); \
88 xmm2 = _op_(xmm2, (_op_type_)val); \
89 xmm3 = _op_(xmm3, (_op_type_)val); \
90 xmm4 = _op_(xmm4, (_op_type_)val); \
91 xmm5 = _op_(xmm5, (_op_type_)val); \
92 xmm6 = _op_(xmm6, (_op_type_)val); \
93 xmm7 = _op_(xmm7, (_op_type_)val); \
94 STORE_SI128(dptr, xmm0); \
95 dptr += (16 / sizeof(_type_)); \
96 STORE_SI128(dptr, xmm1); \
97 dptr += (16 / sizeof(_type_)); \
98 STORE_SI128(dptr, xmm2); \
99 dptr += (16 / sizeof(_type_)); \
100 STORE_SI128(dptr, xmm3); \
101 dptr += (16 / sizeof(_type_)); \
102 STORE_SI128(dptr, xmm4); \
103 dptr += (16 / sizeof(_type_)); \
104 STORE_SI128(dptr, xmm5); \
105 dptr += (16 / sizeof(_type_)); \
106 STORE_SI128(dptr, xmm6); \
107 dptr += (16 / sizeof(_type_)); \
108 STORE_SI128(dptr, xmm7); \
109 dptr += (16 / sizeof(_type_)); \
113 count = len >> (5 - shifts); \
114 len -= count << (5 - shifts); \
117 __m128i xmm0 = LOAD_SI128(sptr); \
118 sptr += (16 / sizeof(_type_)); \
119 xmm0 = _op_(xmm0, (_op_type_)val); \
120 STORE_SI128(dptr, xmm0); \
121 dptr += (16 / sizeof(_type_)); \
128 return PRIMITIVES_SUCCESS; \
/*
 * SSE3_SCD_PRE_ROUTINE - template macro that defines a static function
 * `_name_` with the signature
 *
 *   pstatus_t _name_(const _type_* pSrc, _type_ val, _type_* pDst, INT32 ilen)
 *
 * Unlike a per-call scalar operand, the constant `val` is expanded once into
 * the register `xmm0` via mm_set1_epu32(val) and then combined with every
 * loaded source register as `_op_(reg, xmm0)`.
 *
 * Visible structure of the generated body:
 *   - `ilen` is converted to `size_t len` with WINPR_ASSERTING_INT_CAST.
 *   - `sptr` / `dptr` are running cursors over pSrc / pDst.
 *   - A chain of `sizeof(_type_)` tests (1, 2, 4, 8 bytes) precedes the
 *     loops; the statements it selects are not part of this listing
 *     (presumably they set `shifts` - TODO confirm against the full source).
 *   - First loop: `count = len >> (7 - shifts)` iterations, each loading four
 *     registers with LOAD_SI128, applying `_op_` against xmm0, and storing
 *     them with STORE_SI128. Both cursors advance by 16 / sizeof(_type_)
 *     elements (16 bytes) per register.
 *   - Second loop: `count = len >> (5 - shifts)` iterations of one register.
 *   - Final loop: runs over the remaining `len` elements; its body is not
 *     visible here (presumably it uses `_slowWay_` - TODO confirm).
 *   - Returns PRIMITIVES_SUCCESS.
 *
 * NOTE(review): `_fallback_` and `_slowWay_` are not referenced in the
 * visible lines, and the declarations of `shifts` and `xmm0` plus the braces
 * are missing from this listing - verify against the complete file before
 * relying on this description.
 */
135#define SSE3_SCD_PRE_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
136 WINPR_ATTR_NODISCARD \
137 static pstatus_t _name_(const _type_* WINPR_RESTRICT pSrc, _type_ val, \
138 _type_* WINPR_RESTRICT pDst, INT32 ilen) \
140 size_t len = WINPR_ASSERTING_INT_CAST(size_t, ilen); \
142 const _type_* sptr = pSrc; \
143 _type_* dptr = pDst; \
145 if (sizeof(_type_) == 1) \
147 else if (sizeof(_type_) == 2) \
149 else if (sizeof(_type_) == 4) \
151 else if (sizeof(_type_) == 8) \
154 size_t count = len >> (7 - shifts); \
155 len -= count << (7 - shifts); \
156 xmm0 = mm_set1_epu32(val); \
157 for (size_t x = 0; x < count; x++) \
159 __m128i xmm1 = LOAD_SI128(sptr); \
160 sptr += (16 / sizeof(_type_)); \
161 __m128i xmm2 = LOAD_SI128(sptr); \
162 sptr += (16 / sizeof(_type_)); \
163 __m128i xmm3 = LOAD_SI128(sptr); \
164 sptr += (16 / sizeof(_type_)); \
165 __m128i xmm4 = LOAD_SI128(sptr); \
166 sptr += (16 / sizeof(_type_)); \
167 xmm1 = _op_(xmm1, xmm0); \
168 xmm2 = _op_(xmm2, xmm0); \
169 xmm3 = _op_(xmm3, xmm0); \
170 xmm4 = _op_(xmm4, xmm0); \
171 STORE_SI128(dptr, xmm1); \
172 dptr += (16 / sizeof(_type_)); \
173 STORE_SI128(dptr, xmm2); \
174 dptr += (16 / sizeof(_type_)); \
175 STORE_SI128(dptr, xmm3); \
176 dptr += (16 / sizeof(_type_)); \
177 STORE_SI128(dptr, xmm4); \
178 dptr += (16 / sizeof(_type_)); \
181 count = len >> (5 - shifts); \
182 len -= count << (5 - shifts); \
183 for (size_t x = 0; x < count; x++) \
185 __m128i xmm1 = LOAD_SI128(sptr); \
186 sptr += (16 / sizeof(_type_)); \
187 xmm1 = _op_(xmm1, xmm0); \
188 STORE_SI128(dptr, xmm1); \
189 dptr += (16 / sizeof(_type_)); \
192 for (size_t x = 0; x < len; x++) \
196 return PRIMITIVES_SUCCESS; \
202#define SSE3_SSD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
203 WINPR_ATTR_NODISCARD \
204 static pstatus_t _name_(const _type_* WINPR_RESTRICT pSrc1, \
205 const _type_* WINPR_RESTRICT pSrc2, _type_* WINPR_RESTRICT pDst, \
210 const _type_* sptr1 = pSrc1; \
211 const _type_* sptr2 = pSrc2; \
212 _type_* dptr = pDst; \
214 if (sizeof(_type_) == 1) \
216 else if (sizeof(_type_) == 2) \
218 else if (sizeof(_type_) == 4) \
220 else if (sizeof(_type_) == 8) \
223 count = len >> (7 - shifts); \
224 len -= count << (7 - shifts); \
228 __m128i xmm0 = LOAD_SI128(sptr1); \
229 sptr1 += (16 / sizeof(_type_)); \
230 __m128i xmm1 = LOAD_SI128(sptr1); \
231 sptr1 += (16 / sizeof(_type_)); \
232 __m128i xmm2 = LOAD_SI128(sptr1); \
233 sptr1 += (16 / sizeof(_type_)); \
234 __m128i xmm3 = LOAD_SI128(sptr1); \
235 sptr1 += (16 / sizeof(_type_)); \
236 __m128i xmm4 = LOAD_SI128(sptr2); \
237 sptr2 += (16 / sizeof(_type_)); \
238 __m128i xmm5 = LOAD_SI128(sptr2); \
239 sptr2 += (16 / sizeof(_type_)); \
240 __m128i xmm6 = LOAD_SI128(sptr2); \
241 sptr2 += (16 / sizeof(_type_)); \
242 __m128i xmm7 = LOAD_SI128(sptr2); \
243 sptr2 += (16 / sizeof(_type_)); \
244 xmm0 = _op_(xmm0, xmm4); \
245 xmm1 = _op_(xmm1, xmm5); \
246 xmm2 = _op_(xmm2, xmm6); \
247 xmm3 = _op_(xmm3, xmm7); \
248 STORE_SI128(dptr, xmm0); \
249 dptr += (16 / sizeof(_type_)); \
250 STORE_SI128(dptr, xmm1); \
251 dptr += (16 / sizeof(_type_)); \
252 STORE_SI128(dptr, xmm2); \
253 dptr += (16 / sizeof(_type_)); \
254 STORE_SI128(dptr, xmm3); \
255 dptr += (16 / sizeof(_type_)); \
258 count = len >> (5 - shifts); \
259 len -= count << (5 - shifts); \
262 __m128i xmm0 = LOAD_SI128(sptr1); \
263 sptr1 += (16 / sizeof(_type_)); \
264 __m128i xmm1 = LOAD_SI128(sptr2); \
265 sptr2 += (16 / sizeof(_type_)); \
266 xmm0 = _op_(xmm0, xmm1); \
267 STORE_SI128(dptr, xmm0); \
268 dptr += (16 / sizeof(_type_)); \
273 const pstatus_t rc = _slowWay_; \
274 if (rc != PRIMITIVES_SUCCESS) \
277 return PRIMITIVES_SUCCESS; \