/* rfx_neon.c */
/*
 * FreeRDP: A Remote Desktop Protocol Implementation
 * RemoteFX Codec Library - NEON Optimizations
 *
 * Copyright 2011 Martin Fleisz <martin.fleisz@thincast.com>
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
19
20#include <winpr/platform.h>
21#include <freerdp/config.h>
22#include <freerdp/log.h>
23
24#include "../rfx_types.h"
25#include "../rfx_quantization.h"
26#include "rfx_neon.h"
27
28#include "../../core/simd.h"
29
30#if defined(NEON_INTRINSICS_ENABLED)
31
32#include <stdio.h>
33#include <stdlib.h>
34#include <string.h>
35#include <arm_neon.h>
36#include <winpr/sysinfo.h>
37
38/* rfx_decode_YCbCr_to_RGB_NEON code now resides in the primitives library. */
39
/*
 * Dequantize one frequency sub-band in place: every coefficient is
 * left-shifted by `factor` (undoing the encoder's quantization shift),
 * eight INT16 values per NEON vector.
 *
 * buffer      - first coefficient of the sub-band (16-byte alignment and a
 *               size that is a multiple of 8 are assumed by the vector loop)
 * buffer_size - number of INT16 coefficients in the sub-band
 * factor      - shift amount (quantization value - 1, see caller)
 */
static inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
rfx_quantization_decode_block_NEON(INT16* buffer, const size_t buffer_size, const UINT32 factor)
{
	const int16x8_t quantFactors = vdupq_n_s16((INT16)factor);
	INT16* buf = buffer;
	INT16* const buf_end = buffer + buffer_size;

	/* A while loop (not do/while): a zero-sized band must not trigger a
	 * one-iteration out-of-bounds load/store. */
	while (buf < buf_end)
	{
		int16x8_t val = vld1q_s16(buf);
		val = vshlq_s16(val, quantFactors);
		vst1q_s16(buf, val);
		buf += 8;
	}
}
55
56WINPR_ATTR_NODISCARD
57static BOOL rfx_quantization_decode_NEON(INT16* buffer, const UINT32* WINPR_RESTRICT quantVals,
58 size_t nrQuantVals)
59{
60 WINPR_ASSERT(buffer);
61 WINPR_ASSERT(quantVals);
62 WINPR_ASSERT(nrQuantVals == NR_QUANT_VALUES);
63
64 for (size_t x = 0; x < nrQuantVals; x++)
65 {
66 const UINT32 val = quantVals[x];
67 if (val < 1)
68 return FALSE;
69 }
70
71 rfx_quantization_decode_block_NEON(&buffer[0], 1024, quantVals[8] - 1); /* HL1 */
72 rfx_quantization_decode_block_NEON(&buffer[1024], 1024, quantVals[7] - 1); /* LH1 */
73 rfx_quantization_decode_block_NEON(&buffer[2048], 1024, quantVals[9] - 1); /* HH1 */
74 rfx_quantization_decode_block_NEON(&buffer[3072], 256, quantVals[5] - 1); /* HL2 */
75 rfx_quantization_decode_block_NEON(&buffer[3328], 256, quantVals[4] - 1); /* LH2 */
76 rfx_quantization_decode_block_NEON(&buffer[3584], 256, quantVals[6] - 1); /* HH2 */
77 rfx_quantization_decode_block_NEON(&buffer[3840], 64, quantVals[2] - 1); /* HL3 */
78 rfx_quantization_decode_block_NEON(&buffer[3904], 64, quantVals[1] - 1); /* LH3 */
79 rfx_quantization_decode_block_NEON(&buffer[3968], 64, quantVals[3] - 1); /* HH3 */
80 rfx_quantization_decode_block_NEON(&buffer[4032], 64, quantVals[0] - 1); /* LL3 */
81 return TRUE;
82}
83
/*
 * One horizontal inverse-DWT pass over a pair of square sub-bands.
 * For each of `subband_width` rows the even outputs are first computed
 * in place inside `l`, then the odd outputs are interleaved with them
 * into `dst` (row length 2 * subband_width) via vst2q.
 * subband_width is assumed to be a multiple of 8.
 */
static inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
rfx_dwt_2d_decode_block_horiz_NEON(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRICT h,
                                   INT16* WINPR_RESTRICT dst, size_t subband_width)
{
	INT16* l_ptr = l;
	INT16* h_ptr = h;
	INT16* dst_ptr = dst;

	for (size_t y = 0; y < subband_width; y++)
	{
		/* Even coefficients */
		for (size_t n = 0; n < subband_width; n += 8)
		{
			// dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
			int16x8_t l_n = vld1q_s16(l_ptr);
			int16x8_t h_n = vld1q_s16(h_ptr);
			/* NOTE(review): this load starts one element before the row; at
			 * n == 0 lane 0 is overwritten below, so the stray value is never
			 * used — but the read itself still touches h[-1]. */
			int16x8_t h_n_m = vld1q_s16(h_ptr - 1);

			if (n == 0)
			{
				/* Left edge: mirror, h[-1] := h[0]. */
				int16_t first = vgetq_lane_s16(h_n_m, 1);
				h_n_m = vsetq_lane_s16(first, h_n_m, 0);
			}

			int16x8_t tmp_n = vaddq_s16(h_n, h_n_m);
			tmp_n = vaddq_s16(tmp_n, vdupq_n_s16(1));
			tmp_n = vshrq_n_s16(tmp_n, 1);
			int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
			/* Store even results in place into l; they are re-read by the
			 * odd pass below. */
			vst1q_s16(l_ptr, dst_n);
			l_ptr += 8;
			h_ptr += 8;
		}

		/* Rewind to the start of the current row for the odd pass. */
		l_ptr -= subband_width;
		h_ptr -= subband_width;

		/* Odd coefficients */
		for (size_t n = 0; n < subband_width; n += 8)
		{
			// dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
			int16x8_t h_n = vld1q_s16(h_ptr);
			h_n = vshlq_n_s16(h_n, 1);
			int16x8x2_t dst_n;
			dst_n.val[0] = vld1q_s16(l_ptr);
			/* dst[2n + 2], i.e. the even value one position to the right. */
			int16x8_t dst_n_p = vld1q_s16(l_ptr + 1);

			if (n == subband_width - 8)
			{
				/* Right edge: mirror the last even coefficient. */
				int16_t last = vgetq_lane_s16(dst_n_p, 6);
				dst_n_p = vsetq_lane_s16(last, dst_n_p, 7);
			}

			dst_n.val[1] = vaddq_s16(dst_n_p, dst_n.val[0]);
			dst_n.val[1] = vshrq_n_s16(dst_n.val[1], 1);
			dst_n.val[1] = vaddq_s16(dst_n.val[1], h_n);
			/* Interleave even/odd pairs into the destination row. */
			vst2q_s16(dst_ptr, dst_n);
			l_ptr += 8;
			h_ptr += 8;
			dst_ptr += 16;
		}
	}
}
146
/*
 * One vertical inverse-DWT pass. `l` and `h` each hold subband_width rows
 * of 2 * subband_width coefficients (the output of the horizontal pass);
 * results go to `dst`, whose even rows are written by the first loop and
 * odd rows by the second. Row width is assumed to be a multiple of 8.
 */
static inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
rfx_dwt_2d_decode_block_vert_NEON(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRICT h,
                                  INT16* WINPR_RESTRICT dst, size_t subband_width)
{
	INT16* l_ptr = l;
	INT16* h_ptr = h;
	INT16* dst_ptr = dst;
	const size_t total_width = subband_width + subband_width;

	/* Even coefficients */
	for (size_t n = 0; n < subband_width; n++)
	{
		for (size_t x = 0; x < total_width; x += 8)
		{
			// dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
			int16x8_t l_n = vld1q_s16(l_ptr);
			int16x8_t h_n = vld1q_s16(h_ptr);
			int16x8_t tmp_n = vaddq_s16(h_n, vdupq_n_s16(1));

			if (n == 0)
				tmp_n = vaddq_s16(tmp_n, h_n); /* top edge: mirror h[-1] := h[0] */
			else
			{
				/* h row above the current one */
				int16x8_t h_n_m = vld1q_s16((h_ptr - total_width));
				tmp_n = vaddq_s16(tmp_n, h_n_m);
			}

			tmp_n = vshrq_n_s16(tmp_n, 1);
			int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
			vst1q_s16(dst_ptr, dst_n);
			l_ptr += 8;
			h_ptr += 8;
			dst_ptr += 8;
		}

		/* Skip the odd output row; the second pass fills it in. */
		dst_ptr += total_width;
	}

	h_ptr = h;
	dst_ptr = dst + total_width; /* first odd row */

	/* Odd coefficients */
	for (size_t n = 0; n < subband_width; n++)
	{
		for (size_t x = 0; x < total_width; x += 8)
		{
			// dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
			int16x8_t h_n = vld1q_s16(h_ptr);
			/* Even row directly above (already computed by the first pass). */
			int16x8_t dst_n_m = vld1q_s16(dst_ptr - total_width);
			h_n = vshlq_n_s16(h_n, 1);
			int16x8_t tmp_n = dst_n_m;

			if (n == subband_width - 1)
				tmp_n = vaddq_s16(tmp_n, dst_n_m); /* bottom edge: mirror */
			else
			{
				/* Even row directly below. */
				int16x8_t dst_n_p = vld1q_s16((dst_ptr + total_width));
				tmp_n = vaddq_s16(tmp_n, dst_n_p);
			}

			tmp_n = vshrq_n_s16(tmp_n, 1);
			int16x8_t dst_n = vaddq_s16(tmp_n, h_n);
			vst1q_s16(dst_ptr, dst_n);
			h_ptr += 8;
			dst_ptr += 8;
		}

		/* Skip the next even row. */
		dst_ptr += total_width;
	}
}
217
218static inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
219rfx_dwt_2d_decode_block_NEON(INT16* WINPR_RESTRICT buffer, INT16* WINPR_RESTRICT idwt,
220 size_t subband_width)
221{
222 INT16 *hl, *lh, *hh, *ll;
223 INT16 *l_dst, *h_dst;
224 /* Inverse DWT in horizontal direction, results in 2 sub-bands in L, H order in tmp buffer idwt.
225 */
226 /* The 4 sub-bands are stored in HL(0), LH(1), HH(2), LL(3) order. */
227 /* The lower part L uses LL(3) and HL(0). */
228 /* The higher part H uses LH(1) and HH(2). */
229 ll = buffer + subband_width * subband_width * 3;
230 hl = buffer;
231 l_dst = idwt;
232 rfx_dwt_2d_decode_block_horiz_NEON(ll, hl, l_dst, subband_width);
233 lh = buffer + subband_width * subband_width;
234 hh = buffer + subband_width * subband_width * 2;
235 h_dst = idwt + subband_width * subband_width * 2;
236 rfx_dwt_2d_decode_block_horiz_NEON(lh, hh, h_dst, subband_width);
237 /* Inverse DWT in vertical direction, results are stored in original buffer. */
238 rfx_dwt_2d_decode_block_vert_NEON(l_dst, h_dst, buffer, subband_width);
239}
240
241static void rfx_dwt_2d_decode_NEON(INT16* buffer, INT16* dwt_buffer)
242{
243 rfx_dwt_2d_decode_block_NEON(buffer + 3840, dwt_buffer, 8);
244 rfx_dwt_2d_decode_block_NEON(buffer + 3072, dwt_buffer, 16);
245 rfx_dwt_2d_decode_block_NEON(buffer, dwt_buffer, 32);
246}
247
/*
 * Horizontal inverse DWT pass for the extrapolated (progressive) transform.
 * Low/high bands may have odd widths (nLowCount = nHighCount + 1 at the
 * outermost level); even results are computed in place in pLowBand, then
 * interleaved with the odd results into pDstBand.
 *
 * NOTE(review): nLowStep, nHighStep and nDstStep are not used — rows appear
 * to be processed contiguously; confirm against the callers.
 * NOTE(review): the literal 24/32 lane tweaks assume the level-1 geometry
 * (batchSize == 32, i.e. a 64-wide tile).
 */
static inline void rfx_idwt_extrapolate_horiz_neon(INT16* restrict pLowBand, size_t nLowStep,
                                                   const INT16* restrict pHighBand,
                                                   size_t nHighStep, INT16* restrict pDstBand,
                                                   size_t nDstStep, size_t nLowCount,
                                                   size_t nHighCount, size_t nDstCount)
{
	WINPR_ASSERT(pLowBand);
	WINPR_ASSERT(pHighBand);
	WINPR_ASSERT(pDstBand);

	INT16* l_ptr = pLowBand;
	const INT16* h_ptr = pHighBand;
	INT16* dst_ptr = pDstBand;
	/* Number of even/odd pairs handled by the vector loops per row. */
	size_t batchSize = (nLowCount + nHighCount) >> 1;

	for (size_t y = 0; y < nDstCount; y++)
	{
		/* Even coefficients */
		size_t n = 0;
		for (; n < batchSize; n += 8)
		{
			// dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
			int16x8_t l_n = vld1q_s16(l_ptr);
			int16x8_t h_n = vld1q_s16(h_ptr);
			/* Loads one element before the row; lane 0 is patched at n == 0. */
			int16x8_t h_n_m = vld1q_s16(h_ptr - 1);

			if (n == 0)
			{
				/* Left edge: mirror, h[-1] := h[0]. */
				int16_t first = vgetq_lane_s16(h_n_m, 1);
				h_n_m = vsetq_lane_s16(first, h_n_m, 0);
			}
			else if (n == 24)
				/* Last vector of a 32-wide batch: extrapolate h[31] as 0. */
				h_n = vsetq_lane_s16(0, h_n, 7);

			int16x8_t tmp_n = vaddq_s16(h_n, h_n_m);
			tmp_n = vaddq_s16(tmp_n, vdupq_n_s16(1));
			tmp_n = vshrq_n_s16(tmp_n, 1);
			int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
			/* Even results are stored in place and re-read by the odd pass. */
			vst1q_s16(l_ptr, dst_n);
			l_ptr += 8;
			h_ptr += 8;
		}
		/* Trailing low coefficient when the batch is narrower than 32. */
		if (n < 32)
			*l_ptr -= *(h_ptr - 1);

		l_ptr -= batchSize;
		h_ptr -= batchSize;

		/* Odd coefficients */
		n = 0;
		for (; n < batchSize; n += 8)
		{
			// dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
			int16x8_t h_n = vld1q_s16(h_ptr);
			h_n = vshlq_n_s16(h_n, 1);
			int16x8x2_t dst_n;
			dst_n.val[0] = vld1q_s16(l_ptr);
			int16x8_t dst_n_p = vld1q_s16(l_ptr + 1);

			if (n == 24)
				/* Extrapolated high coefficient contributes nothing. */
				h_n = vsetq_lane_s16(0, h_n, 7);

			dst_n.val[1] = vaddq_s16(dst_n_p, dst_n.val[0]);
			dst_n.val[1] = vshrq_n_s16(dst_n.val[1], 1);
			dst_n.val[1] = vaddq_s16(dst_n.val[1], h_n);
			vst2q_s16(dst_ptr, dst_n);
			l_ptr += 8;
			h_ptr += 8;
			dst_ptr += 16;
		}
		/* Row tail: either realign the pointers (full 32 batch) or copy the
		 * final odd low coefficient straight through. */
		if (n == 32)
		{
			h_ptr -= 1;
			l_ptr += 1;
		}
		else
		{
			*dst_ptr = *l_ptr;
			l_ptr += 1;
			dst_ptr += 1;
		}
	}
}
331
/*
 * Vertical inverse DWT pass for the extrapolated (progressive) transform.
 * pLowBand/pHighBand hold the L/H rows produced by the horizontal pass;
 * even output rows are written first, odd rows second. Each inner loop
 * processes 8 columns at a time, with a scalar tail column when nDstCount
 * is not a multiple of 8.
 *
 * NOTE(review): the `n == 31` special cases assume forceBandSize == 32
 * (the level-1 geometry); at that row the high band is exhausted and the
 * extra low row at l_ptr is used instead.
 */
static inline void rfx_idwt_extrapolate_vert_neon(const INT16* restrict pLowBand, size_t nLowStep,
                                                  const INT16* restrict pHighBand, size_t nHighStep,
                                                  INT16* restrict pDstBand, size_t nDstStep,
                                                  size_t nLowCount, size_t nHighCount,
                                                  size_t nDstCount)
{
	WINPR_ASSERT(pLowBand);
	WINPR_ASSERT(pHighBand);
	WINPR_ASSERT(pDstBand);

	const INT16* l_ptr = pLowBand;
	const INT16* h_ptr = pHighBand;
	INT16* dst_ptr = pDstBand;
	/* Columns handled by the vector loops (nDstCount rounded down to 8). */
	size_t batchSize = (nDstCount >> 3) << 3;
	/* Number of even/odd row pairs. */
	size_t forceBandSize = (nLowCount + nHighCount) >> 1;

	/* Even coefficients */
	for (size_t n = 0; n < forceBandSize; n++)
	{
		for (size_t x = 0; x < batchSize; x += 8)
		{
			// dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
			int16x8_t l_n = vld1q_s16(l_ptr);
			/* Last row pair: reuse the previous high row (band exhausted). */
			int16x8_t h_n = vld1q_s16((n == 31) ? (h_ptr - nHighStep) : h_ptr);
			int16x8_t tmp_n = vaddq_s16(h_n, vdupq_n_s16(1));

			if (n == 0)
				tmp_n = vaddq_s16(tmp_n, h_n); /* top edge: mirror h[-1] := h[0] */
			else if (n < 31)
			{
				int16x8_t h_n_m = vld1q_s16((h_ptr - nHighStep)); /* row above */
				tmp_n = vaddq_s16(tmp_n, h_n_m);
			}

			tmp_n = vshrq_n_s16(tmp_n, 1);
			int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
			vst1q_s16(dst_ptr, dst_n);
			l_ptr += 8;
			h_ptr += 8;
			dst_ptr += 8;
		}

		/* Scalar tail column (same formula as the vector loop). */
		if (nDstCount > batchSize)
		{
			int16_t h_n = (n == 31) ? *(h_ptr - nHighStep) : *h_ptr;
			int16_t tmp_n = h_n + 1;
			if (n == 0)
				tmp_n += h_n;
			else if (n < 31)
				tmp_n += *(h_ptr - nHighStep);
			tmp_n >>= 1;
			*dst_ptr = *l_ptr - tmp_n;
			l_ptr += 1;
			h_ptr += 1;
			dst_ptr += 1;
		}

		/* Skip the odd output row. */
		dst_ptr += nDstStep;
	}

	/* Extra even row when the band pair is narrower than 32 rows:
	 * dst = l - h (mirrored high row), no rounding term. */
	if (forceBandSize < 32)
	{
		for (size_t x = 0; x < batchSize; x += 8)
		{
			int16x8_t l_n = vld1q_s16(l_ptr);
			int16x8_t h_n = vld1q_s16(h_ptr - nHighStep);
			int16x8_t tmp_n = vsubq_s16(l_n, h_n);
			vst1q_s16(dst_ptr, tmp_n);
			l_ptr += 8;
			h_ptr += 8;
			dst_ptr += 8;
		}

		if (nDstCount > batchSize)
		{
			*dst_ptr = *l_ptr - *(h_ptr - nHighStep);
			l_ptr += 1;
			h_ptr += 1;
			dst_ptr += 1;
		}
	}

	h_ptr = pHighBand;
	dst_ptr = pDstBand + nDstStep; /* first odd output row */

	/* Odd coefficients */
	for (size_t n = 0; n < forceBandSize; n++)
	{
		for (size_t x = 0; x < batchSize; x += 8)
		{
			// dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
			int16x8_t tmp_n = vld1q_s16(dst_ptr - nDstStep); /* even row above */
			if (n == 31)
			{
				/* Last odd row: average with the remaining low row; the high
				 * band no longer contributes. */
				int16x8_t dst_n_p = vld1q_s16(l_ptr);
				l_ptr += 8;
				tmp_n = vaddq_s16(tmp_n, dst_n_p);
				tmp_n = vshrq_n_s16(tmp_n, 1);
			}
			else
			{
				int16x8_t dst_n_p = vld1q_s16(dst_ptr + nDstStep); /* even row below */
				tmp_n = vaddq_s16(tmp_n, dst_n_p);
				tmp_n = vshrq_n_s16(tmp_n, 1);
				int16x8_t h_n = vld1q_s16(h_ptr);
				h_n = vshlq_n_s16(h_n, 1);
				tmp_n = vaddq_s16(tmp_n, h_n);
			}
			vst1q_s16(dst_ptr, tmp_n);
			h_ptr += 8;
			dst_ptr += 8;
		}

		/* Scalar tail column, mirroring the vector branch above. */
		if (nDstCount > batchSize)
		{
			int16_t tmp_n = *(dst_ptr - nDstStep);
			if (n == 31)
			{
				int16_t dst_n_p = *l_ptr;
				l_ptr += 1;
				tmp_n += dst_n_p;
				tmp_n >>= 1;
			}
			else
			{
				int16_t dst_n_p = *(dst_ptr + nDstStep);
				tmp_n += dst_n_p;
				tmp_n >>= 1;
				int16_t h_n = *h_ptr;
				h_n <<= 1;
				tmp_n += h_n;
			}
			*dst_ptr = tmp_n;
			h_ptr += 1;
			dst_ptr += 1;
		}

		/* Skip the next even row. */
		dst_ptr += nDstStep;
	}
}
472
/*
 * Width (in coefficients) of the low band at the given DWT level for the
 * extrapolated 64-sample transform: half the previous size, plus one
 * extrapolated coefficient.
 */
static inline size_t prfx_get_band_l_count(size_t level)
{
	const size_t halved = (size_t)64 >> level;
	return halved + 1;
}
477
/*
 * Width (in coefficients) of the high band at the given DWT level for the
 * extrapolated 64-sample transform. Level 1 is special-cased to one less
 * than half; deeper levels round half the size upwards.
 */
static inline size_t prfx_get_band_h_count(size_t level)
{
	if (level != 1)
		return (64 + (1u << (level - 1))) >> level;
	return (64 >> 1) - 1;
}
485
486static inline void rfx_dwt_2d_decode_extrapolate_block_neon(INT16* buffer, INT16* temp,
487 size_t level)
488{
489 size_t nDstStepX;
490 size_t nDstStepY;
491 INT16 *HL, *LH;
492 INT16 *HH, *LL;
493 INT16 *L, *H, *LLx;
494
495 const size_t nBandL = prfx_get_band_l_count(level);
496 const size_t nBandH = prfx_get_band_h_count(level);
497 size_t offset = 0;
498
499 WINPR_ASSERT(buffer);
500 WINPR_ASSERT(temp);
501
502 HL = &buffer[offset];
503 offset += (nBandH * nBandL);
504 LH = &buffer[offset];
505 offset += (nBandL * nBandH);
506 HH = &buffer[offset];
507 offset += (nBandH * nBandH);
508 LL = &buffer[offset];
509 nDstStepX = (nBandL + nBandH);
510 nDstStepY = (nBandL + nBandH);
511 offset = 0;
512 L = &temp[offset];
513 offset += (nBandL * nDstStepX);
514 H = &temp[offset];
515 LLx = &buffer[0];
516
517 /* horizontal (LL + HL -> L) */
518 rfx_idwt_extrapolate_horiz_neon(LL, nBandL, HL, nBandH, L, nDstStepX, nBandL, nBandH, nBandL);
519
520 /* horizontal (LH + HH -> H) */
521 rfx_idwt_extrapolate_horiz_neon(LH, nBandL, HH, nBandH, H, nDstStepX, nBandL, nBandH, nBandH);
522
523 /* vertical (L + H -> LL) */
524 rfx_idwt_extrapolate_vert_neon(L, nDstStepX, H, nDstStepX, LLx, nDstStepY, nBandL, nBandH,
525 nBandL + nBandH);
526}
527
528static void rfx_dwt_2d_extrapolate_decode_neon(INT16* buffer, INT16* temp)
529{
530 WINPR_ASSERT(buffer);
531 WINPR_ASSERT(temp);
532 rfx_dwt_2d_decode_extrapolate_block_neon(&buffer[3807], temp, 3);
533 rfx_dwt_2d_decode_extrapolate_block_neon(&buffer[3007], temp, 2);
534 rfx_dwt_2d_decode_extrapolate_block_neon(&buffer[0], temp, 1);
535}
536#endif // NEON_INTRINSICS_ENABLED
537
/*
 * Install the NEON-optimized RemoteFX decode primitives into the codec
 * context. When NEON intrinsics are not compiled in, only logs why and
 * leaves the context's generic implementations in place.
 */
void rfx_init_neon_int(RFX_CONTEXT* WINPR_RESTRICT context)
{
#if defined(NEON_INTRINSICS_ENABLED)
	WLog_VRB(PRIM_TAG, "NEON optimizations");
	/* Rename the profilers so reports identify the NEON implementations. */
	PROFILER_RENAME(context->priv->prof_rfx_ycbcr_to_rgb, "rfx_decode_YCbCr_to_RGB_NEON");
	PROFILER_RENAME(context->priv->prof_rfx_quantization_decode, "rfx_quantization_decode_NEON");
	PROFILER_RENAME(context->priv->prof_rfx_dwt_2d_decode, "rfx_dwt_2d_decode_NEON");
	context->quantization_decode = rfx_quantization_decode_NEON;
	context->dwt_2d_decode = rfx_dwt_2d_decode_NEON;
	context->dwt_2d_extrapolate_decode = rfx_dwt_2d_extrapolate_decode_neon;
#else
	WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or NEON intrinsics not available");
	WINPR_UNUSED(context);
#endif
}