FreeRDP
Loading...
Searching...
No Matches
nsc_sse2.c
1
20#include <winpr/assert.h>
21#include <winpr/cast.h>
22#include <winpr/platform.h>
23#include <freerdp/config.h>
24
25#include "../nsc_types.h"
26#include "nsc_sse2.h"
27
28#include "../../core/simd.h"
29#include "../../primitives/sse/prim_avxsse.h"
30
31#if defined(SSE_AVX_INTRINSICS_ENABLED)
32#include <stdio.h>
33#include <stdlib.h>
34#include <string.h>
35
36#include <xmmintrin.h>
37#include <emmintrin.h>
38
39#include <freerdp/codec/color.h>
40#include <winpr/crt.h>
41#include <winpr/sysinfo.h>
42
43static inline size_t nsc_encode_next_bgrx32(const BYTE* src, __m128i* r_val, __m128i* g_val,
44 __m128i* b_val, __m128i* a_val)
45{
46 *b_val = _mm_set_epi16(*(src + 28), *(src + 24), *(src + 20), *(src + 16), *(src + 12),
47 *(src + 8), *(src + 4), *src);
48 *g_val = _mm_set_epi16(*(src + 29), *(src + 25), *(src + 21), *(src + 17), *(src + 13),
49 *(src + 9), *(src + 5), *(src + 1));
50 *r_val = _mm_set_epi16(*(src + 30), *(src + 26), *(src + 22), *(src + 18), *(src + 14),
51 *(src + 10), *(src + 6), *(src + 2));
52 *a_val = _mm_set1_epi16(0xFF);
53 return 32;
54}
55
56static inline size_t nsc_encode_next_bgra32(const BYTE* src, __m128i* r_val, __m128i* g_val,
57 __m128i* b_val, __m128i* a_val)
58{
59 *b_val = _mm_set_epi16(*(src + 28), *(src + 24), *(src + 20), *(src + 16), *(src + 12),
60 *(src + 8), *(src + 4), *src);
61 *g_val = _mm_set_epi16(*(src + 29), *(src + 25), *(src + 21), *(src + 17), *(src + 13),
62 *(src + 9), *(src + 5), *(src + 1));
63 *r_val = _mm_set_epi16(*(src + 30), *(src + 26), *(src + 22), *(src + 18), *(src + 14),
64 *(src + 10), *(src + 6), *(src + 2));
65 *a_val = _mm_set_epi16(*(src + 31), *(src + 27), *(src + 23), *(src + 19), *(src + 15),
66 *(src + 11), *(src + 7), *(src + 3));
67 return 32;
68}
69
70static inline size_t nsc_encode_next_rgbx32(const BYTE* src, __m128i* r_val, __m128i* g_val,
71 __m128i* b_val, __m128i* a_val)
72{
73 *r_val = _mm_set_epi16(*(src + 28), *(src + 24), *(src + 20), *(src + 16), *(src + 12),
74 *(src + 8), *(src + 4), *src);
75 *g_val = _mm_set_epi16(*(src + 29), *(src + 25), *(src + 21), *(src + 17), *(src + 13),
76 *(src + 9), *(src + 5), *(src + 1));
77 *b_val = _mm_set_epi16(*(src + 30), *(src + 26), *(src + 22), *(src + 18), *(src + 14),
78 *(src + 10), *(src + 6), *(src + 2));
79 *a_val = _mm_set1_epi16(0xFF);
80 return 32;
81}
82
83static inline size_t nsc_encode_next_rgba32(const BYTE* src, __m128i* r_val, __m128i* g_val,
84 __m128i* b_val, __m128i* a_val)
85{
86 *r_val = _mm_set_epi16(*(src + 28), *(src + 24), *(src + 20), *(src + 16), *(src + 12),
87 *(src + 8), *(src + 4), *src);
88 *g_val = _mm_set_epi16(*(src + 29), *(src + 25), *(src + 21), *(src + 17), *(src + 13),
89 *(src + 9), *(src + 5), *(src + 1));
90 *b_val = _mm_set_epi16(*(src + 30), *(src + 26), *(src + 22), *(src + 18), *(src + 14),
91 *(src + 10), *(src + 6), *(src + 2));
92 *a_val = _mm_set_epi16(*(src + 31), *(src + 27), *(src + 23), *(src + 19), *(src + 15),
93 *(src + 11), *(src + 7), *(src + 3));
94 return 32;
95}
96
97static inline size_t nsc_encode_next_bgr24(const BYTE* src, __m128i* r_val, __m128i* g_val,
98 __m128i* b_val, __m128i* a_val)
99{
100 *b_val = _mm_set_epi16(*(src + 21), *(src + 18), *(src + 15), *(src + 12), *(src + 9),
101 *(src + 6), *(src + 3), *src);
102 *g_val = _mm_set_epi16(*(src + 22), *(src + 19), *(src + 16), *(src + 13), *(src + 10),
103 *(src + 7), *(src + 4), *(src + 1));
104 *r_val = _mm_set_epi16(*(src + 23), *(src + 20), *(src + 17), *(src + 14), *(src + 11),
105 *(src + 8), *(src + 5), *(src + 2));
106 *a_val = _mm_set1_epi16(0xFF);
107 return 24;
108}
109
110static inline size_t nsc_encode_next_rgb24(const BYTE* src, __m128i* r_val, __m128i* g_val,
111 __m128i* b_val, __m128i* a_val)
112{
113 *r_val = _mm_set_epi16(*(src + 21), *(src + 18), *(src + 15), *(src + 12), *(src + 9),
114 *(src + 6), *(src + 3), *src);
115 *g_val = _mm_set_epi16(*(src + 22), *(src + 19), *(src + 16), *(src + 13), *(src + 10),
116 *(src + 7), *(src + 4), *(src + 1));
117 *b_val = _mm_set_epi16(*(src + 23), *(src + 20), *(src + 17), *(src + 14), *(src + 11),
118 *(src + 8), *(src + 5), *(src + 2));
119 *a_val = _mm_set1_epi16(0xFF);
120 return 24;
121}
122
123static inline size_t nsc_encode_next_bgr16(const BYTE* src, __m128i* r_val, __m128i* g_val,
124 __m128i* b_val, __m128i* a_val)
125{
126 *b_val = _mm_set_epi16(
127 WINPR_ASSERTING_INT_CAST(INT16, ((*(src + 15)) & 0xF8) | ((*(src + 15)) >> 5)),
128 WINPR_ASSERTING_INT_CAST(INT16, ((*(src + 13)) & 0xF8) | ((*(src + 13)) >> 5)),
129 WINPR_ASSERTING_INT_CAST(INT16, ((*(src + 11)) & 0xF8) | ((*(src + 11)) >> 5)),
130 WINPR_ASSERTING_INT_CAST(INT16, ((*(src + 9)) & 0xF8) | ((*(src + 9)) >> 5)),
131 WINPR_ASSERTING_INT_CAST(INT16, ((*(src + 7)) & 0xF8) | ((*(src + 7)) >> 5)),
132 WINPR_ASSERTING_INT_CAST(INT16, ((*(src + 5)) & 0xF8) | ((*(src + 5)) >> 5)),
133 WINPR_ASSERTING_INT_CAST(INT16, ((*(src + 3)) & 0xF8) | ((*(src + 3)) >> 5)),
134 WINPR_ASSERTING_INT_CAST(INT16, ((*(src + 1)) & 0xF8) | ((*(src + 1)) >> 5)));
135 *g_val = _mm_set_epi16(
136 WINPR_ASSERTING_INT_CAST(INT16,
137 (((*(src + 15)) & 0x07) << 5) | (((*(src + 14)) & 0xE0) >> 3)),
138 WINPR_ASSERTING_INT_CAST(INT16,
139 (((*(src + 13)) & 0x07) << 5) | (((*(src + 12)) & 0xE0) >> 3)),
140 WINPR_ASSERTING_INT_CAST(INT16,
141 (((*(src + 11)) & 0x07) << 5) | (((*(src + 10)) & 0xE0) >> 3)),
142 WINPR_ASSERTING_INT_CAST(INT16,
143 (((*(src + 9)) & 0x07) << 5) | (((*(src + 8)) & 0xE0) >> 3)),
144 WINPR_ASSERTING_INT_CAST(INT16,
145 (((*(src + 7)) & 0x07) << 5) | (((*(src + 6)) & 0xE0) >> 3)),
146 WINPR_ASSERTING_INT_CAST(INT16,
147 (((*(src + 5)) & 0x07) << 5) | (((*(src + 4)) & 0xE0) >> 3)),
148 WINPR_ASSERTING_INT_CAST(INT16,
149 (((*(src + 3)) & 0x07) << 5) | (((*(src + 2)) & 0xE0) >> 3)),
150 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 1)) & 0x07) << 5) | (((*src) & 0xE0) >> 3)));
151 *r_val = _mm_set_epi16(
152 WINPR_ASSERTING_INT_CAST(INT16,
153 (((*(src + 14)) & 0x1F) << 3) | (((*(src + 14)) >> 2) & 0x07)),
154 WINPR_ASSERTING_INT_CAST(INT16,
155 (((*(src + 12)) & 0x1F) << 3) | (((*(src + 12)) >> 2) & 0x07)),
156 WINPR_ASSERTING_INT_CAST(INT16,
157 (((*(src + 10)) & 0x1F) << 3) | (((*(src + 10)) >> 2) & 0x07)),
158 WINPR_ASSERTING_INT_CAST(INT16,
159 (((*(src + 8)) & 0x1F) << 3) | (((*(src + 8)) >> 2) & 0x07)),
160 WINPR_ASSERTING_INT_CAST(INT16,
161 (((*(src + 6)) & 0x1F) << 3) | (((*(src + 6)) >> 2) & 0x07)),
162 WINPR_ASSERTING_INT_CAST(INT16,
163 (((*(src + 4)) & 0x1F) << 3) | (((*(src + 4)) >> 2) & 0x07)),
164 WINPR_ASSERTING_INT_CAST(INT16,
165 (((*(src + 2)) & 0x1F) << 3) | (((*(src + 2)) >> 2) & 0x07)),
166 WINPR_ASSERTING_INT_CAST(INT16, (((*src) & 0x1F) << 3) | (((*src) >> 2) & 0x07)));
167 *a_val = _mm_set1_epi16(0xFF);
168 return 16;
169}
170
171static inline size_t nsc_encode_next_rgb16(const BYTE* src, __m128i* r_val, __m128i* g_val,
172 __m128i* b_val, __m128i* a_val)
173{
174 *r_val = _mm_set_epi16(WINPR_ASSERTING_INT_CAST(INT16, ((src[15] & 0xF8) | (src[15] >> 5))),
175 WINPR_ASSERTING_INT_CAST(INT16, ((src[13] & 0xF8) | (src[13] >> 5))),
176 WINPR_ASSERTING_INT_CAST(INT16, ((src[11] & 0xF8) | (src[11] >> 5))),
177 WINPR_ASSERTING_INT_CAST(INT16, ((src[9] & 0xF8) | (src[9] >> 5))),
178 WINPR_ASSERTING_INT_CAST(INT16, ((src[7] & 0xF8) | (src[7] >> 5))),
179 WINPR_ASSERTING_INT_CAST(INT16, ((src[5] & 0xF8) | (src[5] >> 5))),
180 WINPR_ASSERTING_INT_CAST(INT16, ((src[3] & 0xF8) | (src[3] >> 5))),
181 WINPR_ASSERTING_INT_CAST(INT16, ((src[1] & 0xF8) | (src[1] >> 5))));
182 *g_val = _mm_set_epi16(
183 WINPR_ASSERTING_INT_CAST(INT16,
184 (((*(src + 15)) & 0x07) << 5) | (((*(src + 14)) & 0xE0) >> 3)),
185 WINPR_ASSERTING_INT_CAST(INT16,
186 (((*(src + 13)) & 0x07) << 5) | (((*(src + 12)) & 0xE0) >> 3)),
187 WINPR_ASSERTING_INT_CAST(INT16,
188 (((*(src + 11)) & 0x07) << 5) | (((*(src + 10)) & 0xE0) >> 3)),
189 WINPR_ASSERTING_INT_CAST(INT16,
190 (((*(src + 9)) & 0x07) << 5) | (((*(src + 8)) & 0xE0) >> 3)),
191 WINPR_ASSERTING_INT_CAST(INT16,
192 (((*(src + 7)) & 0x07) << 5) | (((*(src + 6)) & 0xE0) >> 3)),
193 WINPR_ASSERTING_INT_CAST(INT16,
194 (((*(src + 5)) & 0x07) << 5) | (((*(src + 4)) & 0xE0) >> 3)),
195 WINPR_ASSERTING_INT_CAST(INT16,
196 (((*(src + 3)) & 0x07) << 5) | (((*(src + 2)) & 0xE0) >> 3)),
197 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 1)) & 0x07) << 5) | (((*src) & 0xE0) >> 3)));
198 *b_val = _mm_set_epi16(
199 WINPR_ASSERTING_INT_CAST(INT16,
200 (((*(src + 14)) & 0x1F) << 3) | (((*(src + 14)) >> 2) & 0x07)),
201 WINPR_ASSERTING_INT_CAST(INT16,
202 (((*(src + 12)) & 0x1F) << 3) | (((*(src + 12)) >> 2) & 0x07)),
203 WINPR_ASSERTING_INT_CAST(INT16,
204 (((*(src + 10)) & 0x1F) << 3) | (((*(src + 10)) >> 2) & 0x07)),
205 WINPR_ASSERTING_INT_CAST(INT16,
206 (((*(src + 8)) & 0x1F) << 3) | (((*(src + 8)) >> 2) & 0x07)),
207 WINPR_ASSERTING_INT_CAST(INT16,
208 (((*(src + 6)) & 0x1F) << 3) | (((*(src + 6)) >> 2) & 0x07)),
209 WINPR_ASSERTING_INT_CAST(INT16,
210 (((*(src + 4)) & 0x1F) << 3) | (((*(src + 4)) >> 2) & 0x07)),
211 WINPR_ASSERTING_INT_CAST(INT16,
212 (((*(src + 2)) & 0x1F) << 3) | (((*(src + 2)) >> 2) & 0x07)),
213 WINPR_ASSERTING_INT_CAST(INT16, (((*src) & 0x1F) << 3) | (((*src) >> 2) & 0x07)));
214 *a_val = _mm_set1_epi16(0xFF);
215 return 16;
216}
217
218static inline size_t nsc_encode_next_a4(const BYTE* src, const BYTE* palette, __m128i* r_val,
219 __m128i* g_val, __m128i* b_val, __m128i* a_val)
220{
221 BYTE idx[8] = { 0 };
222
223 for (int shift = 7; shift >= 0; shift--)
224 {
225 idx[shift] = ((*src) >> shift) & 1;
226 idx[shift] |= (((*(src + 1)) >> shift) & 1) << 1;
227 idx[shift] |= (((*(src + 2)) >> shift) & 1) << 2;
228 idx[shift] |= (((*(src + 3)) >> shift) & 1) << 3;
229 idx[shift] *= 3;
230 }
231
232 *r_val = _mm_set_epi16(palette[idx[0]], palette[idx[1]], palette[idx[2]], palette[idx[3]],
233 palette[idx[4]], palette[idx[5]], palette[idx[6]], palette[idx[7]]);
234 *g_val = _mm_set_epi16(palette[idx[0] + 1], palette[idx[1] + 1], palette[idx[2] + 1],
235 palette[idx[3] + 1], palette[idx[4] + 1], palette[idx[5] + 1],
236 palette[idx[6] + 1], palette[idx[7] + 1]);
237 *b_val = _mm_set_epi16(palette[idx[0] + 2], palette[idx[1] + 2], palette[idx[2] + 2],
238 palette[idx[3] + 2], palette[idx[4] + 2], palette[idx[5] + 2],
239 palette[idx[6] + 2], palette[idx[7] + 2]);
240 *a_val = _mm_set1_epi16(0xFF);
241 return 4;
242}
243
244static inline size_t nsc_encode_next_rgb8(const BYTE* src, const BYTE* palette, __m128i* r_val,
245 __m128i* g_val, __m128i* b_val, __m128i* a_val)
246{
247 *r_val = _mm_set_epi16(palette[(*(src + 7ULL)) * 3ULL], palette[(*(src + 6ULL)) * 3ULL],
248 palette[(*(src + 5ULL)) * 3ULL], palette[(*(src + 4ULL)) * 3ULL],
249 palette[(*(src + 3ULL)) * 3ULL], palette[(*(src + 2ULL)) * 3ULL],
250 palette[(*(src + 1ULL)) * 3ULL], palette[(*src) * 3ULL]);
251 *g_val = _mm_set_epi16(
252 palette[(*(src + 7ULL)) * 3ULL + 1ULL], palette[(*(src + 6ULL)) * 3ULL + 1ULL],
253 palette[(*(src + 5ULL)) * 3ULL + 1ULL], palette[(*(src + 4ULL)) * 3ULL + 1ULL],
254 palette[(*(src + 3ULL)) * 3ULL + 1ULL], palette[(*(src + 2ULL)) * 3ULL + 1ULL],
255 palette[(*(src + 1ULL)) * 3ULL + 1ULL], palette[(*src) * 3ULL + 1ULL]);
256 *b_val = _mm_set_epi16(
257 palette[(*(src + 7ULL)) * 3ULL + 2ULL], palette[(*(src + 6ULL)) * 3ULL + 2ULL],
258 palette[(*(src + 5ULL)) * 3ULL + 2ULL], palette[(*(src + 4ULL)) * 3ULL + 2ULL],
259 palette[(*(src + 3ULL)) * 3ULL + 2ULL], palette[(*(src + 2ULL)) * 3ULL + 2ULL],
260 palette[(*(src + 1ULL)) * 3ULL + 2ULL], palette[(*src) * 3ULL + 2ULL]);
261 *a_val = _mm_set1_epi16(0xFF);
262 return 8;
263}
264
265static inline size_t nsc_encode_next_rgba(UINT32 format, const BYTE* src, const BYTE* palette,
266 __m128i* r_val, __m128i* g_val, __m128i* b_val,
267 __m128i* a_val)
268{
269 switch (format)
270 {
271 case PIXEL_FORMAT_BGRX32:
272 return nsc_encode_next_bgrx32(src, r_val, g_val, b_val, a_val);
273
274 case PIXEL_FORMAT_BGRA32:
275 return nsc_encode_next_bgra32(src, r_val, g_val, b_val, a_val);
276
277 case PIXEL_FORMAT_RGBX32:
278 return nsc_encode_next_rgbx32(src, r_val, g_val, b_val, a_val);
279
280 case PIXEL_FORMAT_RGBA32:
281 return nsc_encode_next_rgba32(src, r_val, g_val, b_val, a_val);
282
283 case PIXEL_FORMAT_BGR24:
284 return nsc_encode_next_bgr24(src, r_val, g_val, b_val, a_val);
285
286 case PIXEL_FORMAT_RGB24:
287 return nsc_encode_next_rgb24(src, r_val, g_val, b_val, a_val);
288
289 case PIXEL_FORMAT_BGR16:
290 return nsc_encode_next_bgr16(src, r_val, g_val, b_val, a_val);
291
292 case PIXEL_FORMAT_RGB16:
293 return nsc_encode_next_rgb16(src, r_val, g_val, b_val, a_val);
294
295 case PIXEL_FORMAT_A4:
296 return nsc_encode_next_a4(src, palette, r_val, g_val, b_val, a_val);
297
298 case PIXEL_FORMAT_RGB8:
299 return nsc_encode_next_rgb8(src, palette, r_val, g_val, b_val, a_val);
300
301 default:
302 return 0;
303 }
304}
305
306static BOOL nsc_encode_argb_to_aycocg_sse2(NSC_CONTEXT* context, const BYTE* data, UINT32 scanline)
307{
308 size_t y = 0;
309
310 if (!context || !data || (scanline == 0))
311 return FALSE;
312
313 const UINT16 tempWidth = ROUND_UP_TO(context->width, 8);
314 const UINT16 rw = (context->ChromaSubsamplingLevel > 0 ? tempWidth : context->width);
315
316 const BYTE ccl = WINPR_ASSERTING_INT_CAST(BYTE, context->ColorLossLevel);
317
318 for (; y < context->height; y++)
319 {
320 const BYTE* src = data + (context->height - 1 - y) * scanline;
321 BYTE* yplane = context->priv->PlaneBuffers[0] + y * rw;
322 BYTE* coplane = context->priv->PlaneBuffers[1] + y * rw;
323 BYTE* cgplane = context->priv->PlaneBuffers[2] + y * rw;
324 BYTE* aplane = context->priv->PlaneBuffers[3] + y * context->width;
325
326 for (UINT16 x = 0; x < context->width; x += 8)
327 {
328 __m128i r_val = { 0 };
329 __m128i g_val = { 0 };
330 __m128i b_val = { 0 };
331 __m128i a_val = { 0 };
332
333 const size_t rc = nsc_encode_next_rgba(context->format, src, context->palette, &r_val,
334 &g_val, &b_val, &a_val);
335 src += rc;
336
337 __m128i y_val = _mm_srai_epi16(r_val, 2);
338 y_val = _mm_add_epi16(y_val, _mm_srai_epi16(g_val, 1));
339 y_val = _mm_add_epi16(y_val, _mm_srai_epi16(b_val, 2));
340 __m128i co_val = _mm_sub_epi16(r_val, b_val);
341 co_val = _mm_srai_epi16(co_val, ccl);
342 __m128i cg_val = _mm_sub_epi16(g_val, _mm_srai_epi16(r_val, 1));
343 cg_val = _mm_sub_epi16(cg_val, _mm_srai_epi16(b_val, 1));
344 cg_val = _mm_srai_epi16(cg_val, ccl);
345 y_val = _mm_packus_epi16(y_val, y_val);
346 STORE_SI128(yplane, y_val);
347 co_val = _mm_packs_epi16(co_val, co_val);
348 STORE_SI128(coplane, co_val);
349 cg_val = _mm_packs_epi16(cg_val, cg_val);
350 STORE_SI128(cgplane, cg_val);
351 a_val = _mm_packus_epi16(a_val, a_val);
352 STORE_SI128(aplane, a_val);
353 yplane += 8;
354 coplane += 8;
355 cgplane += 8;
356 aplane += 8;
357 }
358
359 if (context->ChromaSubsamplingLevel > 0 && (context->width % 2) == 1)
360 {
361 context->priv->PlaneBuffers[0][y * rw + context->width] =
362 context->priv->PlaneBuffers[0][y * rw + context->width - 1];
363 context->priv->PlaneBuffers[1][y * rw + context->width] =
364 context->priv->PlaneBuffers[1][y * rw + context->width - 1];
365 context->priv->PlaneBuffers[2][y * rw + context->width] =
366 context->priv->PlaneBuffers[2][y * rw + context->width - 1];
367 }
368 }
369
370 if (context->ChromaSubsamplingLevel > 0 && (y % 2) == 1)
371 {
372 BYTE* yplane = context->priv->PlaneBuffers[0] + y * rw;
373 BYTE* coplane = context->priv->PlaneBuffers[1] + y * rw;
374 BYTE* cgplane = context->priv->PlaneBuffers[2] + y * rw;
375 CopyMemory(yplane, yplane - rw, rw);
376 CopyMemory(coplane, coplane - rw, rw);
377 CopyMemory(cgplane, cgplane - rw, rw);
378 }
379
380 return TRUE;
381}
382
383static void nsc_encode_subsampling_sse2(NSC_CONTEXT* context)
384{
385 BYTE* co_dst = NULL;
386 BYTE* cg_dst = NULL;
387 INT8* co_src0 = NULL;
388 INT8* co_src1 = NULL;
389 INT8* cg_src0 = NULL;
390 INT8* cg_src1 = NULL;
391 UINT32 tempWidth = 0;
392 UINT32 tempHeight = 0;
393 __m128i t;
394 __m128i val;
395 __m128i mask = _mm_set1_epi16(0xFF);
396 tempWidth = ROUND_UP_TO(context->width, 8);
397 tempHeight = ROUND_UP_TO(context->height, 2);
398
399 for (size_t y = 0; y < tempHeight >> 1; y++)
400 {
401 co_dst = context->priv->PlaneBuffers[1] + y * (tempWidth >> 1);
402 cg_dst = context->priv->PlaneBuffers[2] + y * (tempWidth >> 1);
403 co_src0 = (INT8*)context->priv->PlaneBuffers[1] + (y << 1) * tempWidth;
404 co_src1 = co_src0 + tempWidth;
405 cg_src0 = (INT8*)context->priv->PlaneBuffers[2] + (y << 1) * tempWidth;
406 cg_src1 = cg_src0 + tempWidth;
407
408 for (UINT32 x = 0; x < tempWidth >> 1; x += 8)
409 {
410 t = LOAD_SI128(co_src0);
411 t = _mm_avg_epu8(t, LOAD_SI128(co_src1));
412 val = _mm_and_si128(_mm_srli_si128(t, 1), mask);
413 val = _mm_avg_epu16(val, _mm_and_si128(t, mask));
414 val = _mm_packus_epi16(val, val);
415 STORE_SI128(co_dst, val);
416 co_dst += 8;
417 co_src0 += 16;
418 co_src1 += 16;
419 t = LOAD_SI128(cg_src0);
420 t = _mm_avg_epu8(t, LOAD_SI128(cg_src1));
421 val = _mm_and_si128(_mm_srli_si128(t, 1), mask);
422 val = _mm_avg_epu16(val, _mm_and_si128(t, mask));
423 val = _mm_packus_epi16(val, val);
424 STORE_SI128(cg_dst, val);
425 cg_dst += 8;
426 cg_src0 += 16;
427 cg_src1 += 16;
428 }
429 }
430}
431
432static BOOL nsc_encode_sse2(NSC_CONTEXT* WINPR_RESTRICT context, const BYTE* WINPR_RESTRICT data,
433 UINT32 scanline)
434{
435 if (!nsc_encode_argb_to_aycocg_sse2(context, data, scanline))
436 return FALSE;
437
438 if (context->ChromaSubsamplingLevel > 0)
439 nsc_encode_subsampling_sse2(context);
440
441 return TRUE;
442}
443#endif
444
445void nsc_init_sse2_int(NSC_CONTEXT* WINPR_RESTRICT context)
446{
447#if defined(SSE_AVX_INTRINSICS_ENABLED)
448 WLog_VRB(PRIM_TAG, "SSE2/SSE3 optimizations");
449 PROFILER_RENAME(context->priv->prof_nsc_encode, "nsc_encode_sse2")
450 context->encode = nsc_encode_sse2;
451#else
452 WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE2 intrinsics not available");
453 WINPR_UNUSED(context);
454#endif
455}