20#include <winpr/platform.h>
21#include <freerdp/config.h>
22#include <freerdp/log.h>
24#include "../rfx_types.h"
25#include "../rfx_quantization.h"
28#include "../../core/simd.h"
30#if defined(NEON_INTRINSICS_ENABLED)
36#include <winpr/sysinfo.h>
40static inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
41rfx_quantization_decode_block_NEON(INT16* buffer,
const size_t buffer_size,
const UINT32 factor)
43 int16x8_t quantFactors = vdupq_n_s16(factor);
44 int16x8_t* buf = (int16x8_t*)buffer;
45 int16x8_t* buf_end = (int16x8_t*)(buffer + buffer_size);
49 int16x8_t val = vld1q_s16((INT16*)buf);
50 val = vshlq_s16(val, quantFactors);
51 vst1q_s16((INT16*)buf, val);
53 }
while (buf < buf_end);
/*
 * NEON quantization decode for one coefficient buffer (installed as
 * context->quantization_decode by rfx_init_neon_int).
 *
 * Asserts that quantVals is non-NULL and that nrQuantVals equals
 * NR_QUANT_VALUES, walks all quantization values once, then dequantizes ten
 * consecutive runs of the buffer (3 x 1024, 3 x 256 and 4 x 64 = 4096 values)
 * with rfx_quantization_decode_block_NEON, each with its own quantVals entry
 * minus one as the shift factor.
 *
 * NOTE(review): this listing appears to have lost lines - the nrQuantVals
 * parameter declaration, the body of the validation loop and the return
 * statements are not visible here. Confirm against the upstream file.
 */
57static BOOL rfx_quantization_decode_NEON(INT16* buffer,
const UINT32* WINPR_RESTRICT quantVals,
61 WINPR_ASSERT(quantVals);
62 WINPR_ASSERT(nrQuantVals == NR_QUANT_VALUES);
/* Visit every quantization value before any block is touched.
 * NOTE(review): what is done with val is not visible; presumably a range
 * check, since every call below computes quantVals[i] - 1 on an unsigned. */
64 for (
size_t x = 0; x < nrQuantVals; x++)
66 const UINT32 val = quantVals[x];
/* Three runs of 1024 coefficients at offsets 0, 1024 and 2048 */
71 rfx_quantization_decode_block_NEON(&buffer[0], 1024, quantVals[8] - 1);
72 rfx_quantization_decode_block_NEON(&buffer[1024], 1024, quantVals[7] - 1);
73 rfx_quantization_decode_block_NEON(&buffer[2048], 1024, quantVals[9] - 1);
/* Three runs of 256 coefficients at offsets 3072, 3328 and 3584 */
74 rfx_quantization_decode_block_NEON(&buffer[3072], 256, quantVals[5] - 1);
75 rfx_quantization_decode_block_NEON(&buffer[3328], 256, quantVals[4] - 1);
76 rfx_quantization_decode_block_NEON(&buffer[3584], 256, quantVals[6] - 1);
/* Four runs of 64 coefficients at offsets 3840, 3904, 3968 and 4032 */
77 rfx_quantization_decode_block_NEON(&buffer[3840], 64, quantVals[2] - 1);
78 rfx_quantization_decode_block_NEON(&buffer[3904], 64, quantVals[1] - 1);
79 rfx_quantization_decode_block_NEON(&buffer[3968], 64, quantVals[3] - 1);
80 rfx_quantization_decode_block_NEON(&buffer[4032], 64, quantVals[0] - 1);
/*
 * Horizontal inverse DWT pass over a subband_width x subband_width pair of
 * bands l and h, producing interleaved output in dst.
 *
 * For every row two sweeps of 8 lanes each are made:
 *   1. l[n] = l[n] - ((h[n] + h[n-1] + 1) >> 1), written back into l
 *   2. the updated l[n] and ((l[n] + l[n+1]) >> 1) + (h[n] << 1) are stored
 *      as an interleaved pair stream with vst2q_s16
 *
 * NOTE(review): this listing appears to have lost lines - the declarations of
 * l_ptr, h_ptr, dst_ptr and dst_n, the per-iteration pointer increments and
 * at least one edge guard are not visible here. Confirm against upstream.
 */
84static inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
85rfx_dwt_2d_decode_block_horiz_NEON(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRICT h,
86 INT16* WINPR_RESTRICT dst,
size_t subband_width)
/* One iteration per row of the band */
92 for (
size_t y = 0; y < subband_width; y++)
/* Sweep 1: update the l band in place, eight coefficients at a time */
95 for (
size_t n = 0; n < subband_width; n += 8)
98 int16x8_t l_n = vld1q_s16(l_ptr);
99 int16x8_t h_n = vld1q_s16(h_ptr);
/* Same h values shifted by one element: lane i holds h[n + i - 1] */
100 int16x8_t h_n_m = vld1q_s16(h_ptr - 1);
/* Replace lane 0 (the element before h_ptr) with lane 1.
 * NOTE(review): presumably only meant for the first batch of a row; the
 * guarding condition is not visible in this listing. */
104 int16_t first = vgetq_lane_s16(h_n_m, 1);
105 h_n_m = vsetq_lane_s16(first, h_n_m, 0);
/* tmp = (h[n] + h[n-1] + 1) >> 1, then l[n] -= tmp */
108 int16x8_t tmp_n = vaddq_s16(h_n, h_n_m);
109 tmp_n = vaddq_s16(tmp_n, vdupq_n_s16(1));
110 tmp_n = vshrq_n_s16(tmp_n, 1);
111 int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
112 vst1q_s16(l_ptr, dst_n);
/* Rewind both band pointers by one row width before the second sweep */
117 l_ptr -= subband_width;
118 h_ptr -= subband_width;
/* Sweep 2: build the interleaved output row */
121 for (
size_t n = 0; n < subband_width; n += 8)
124 int16x8_t h_n = vld1q_s16(h_ptr);
125 h_n = vshlq_n_s16(h_n, 1);
/* val[0] carries the updated l values; dst_n_p is the same run shifted by +1 */
127 dst_n.val[0] = vld1q_s16(l_ptr);
128 int16x8_t dst_n_p = vld1q_s16(l_ptr + 1);
/* Last batch of the row: lane 7 would be l[n + 8], so reuse lane 6 instead */
130 if (n == subband_width - 8)
132 int16_t last = vgetq_lane_s16(dst_n_p, 6);
133 dst_n_p = vsetq_lane_s16(last, dst_n_p, 7);
/* val[1] = ((l[n+1] + l[n]) >> 1) + (h[n] << 1) */
136 dst_n.val[1] = vaddq_s16(dst_n_p, dst_n.val[0]);
137 dst_n.val[1] = vshrq_n_s16(dst_n.val[1], 1);
138 dst_n.val[1] = vaddq_s16(dst_n.val[1], h_n);
/* Interleaving store: val[0] and val[1] lanes alternate in memory */
139 vst2q_s16(dst_ptr, dst_n);
/*
 * Vertical inverse DWT pass. l and h hold rows that are
 * total_width = 2 * subband_width coefficients wide; dst receives the result.
 *
 * Two sweeps over subband_width rows are made, each processing a row in
 * batches of 8 lanes:
 *   1. dst row = l - ((h + neighbouring h + 1) >> 1)
 *   2. starting at dst + total_width: row = ((row above + row below) >> 1)
 *      + (h << 1)
 *
 * NOTE(review): this listing appears to have lost lines - the l_ptr / h_ptr
 * declarations, the pointer increments and the conditions that choose between
 * the alternative additions are not visible here. Confirm against upstream.
 */
147static inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
148rfx_dwt_2d_decode_block_vert_NEON(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRICT h,
149 INT16* WINPR_RESTRICT dst,
size_t subband_width)
153 INT16* dst_ptr = dst;
154 const size_t total_width = subband_width + subband_width;
/* Sweep 1: one iteration per l/h row */
157 for (
size_t n = 0; n < subband_width; n++)
159 for (
size_t x = 0; x < total_width; x += 8)
162 int16x8_t l_n = vld1q_s16(l_ptr);
163 int16x8_t h_n = vld1q_s16(h_ptr);
/* tmp starts as h + 1 (the rounding term) */
164 int16x8_t tmp_n = vaddq_s16(h_n, vdupq_n_s16(1));
/* NOTE(review): the next addition and the h_n_m addition below look like two
 * alternatives (add h again vs. add the h row above); the selecting
 * condition is not visible in this listing. */
167 tmp_n = vaddq_s16(tmp_n, h_n);
/* h row directly above the current one */
170 int16x8_t h_n_m = vld1q_s16((h_ptr - total_width));
171 tmp_n = vaddq_s16(tmp_n, h_n_m);
/* dst = l - (tmp >> 1) */
174 tmp_n = vshrq_n_s16(tmp_n, 1);
175 int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
176 vst1q_s16(dst_ptr, dst_n);
/* Step dst by one row width */
182 dst_ptr += total_width;
/* Sweep 2 starts on the second dst row */
186 dst_ptr = dst + total_width;
189 for (
size_t n = 0; n < subband_width; n++)
191 for (
size_t x = 0; x < total_width; x += 8)
194 int16x8_t h_n = vld1q_s16(h_ptr);
/* Already computed dst row directly above */
195 int16x8_t dst_n_m = vld1q_s16(dst_ptr - total_width);
196 h_n = vshlq_n_s16(h_n, 1);
197 int16x8_t tmp_n = dst_n_m;
/* On the last iteration the row above is added a second time.
 * NOTE(review): the dst_n_p load below is presumably the alternative branch
 * for all other rows; the else is not visible in this listing. */
199 if (n == subband_width - 1)
200 tmp_n = vaddq_s16(tmp_n, dst_n_m);
/* dst row directly below */
203 int16x8_t dst_n_p = vld1q_s16((dst_ptr + total_width));
204 tmp_n = vaddq_s16(tmp_n, dst_n_p);
/* dst = (tmp >> 1) + (h << 1) */
207 tmp_n = vshrq_n_s16(tmp_n, 1);
208 int16x8_t dst_n = vaddq_s16(tmp_n, h_n);
209 vst1q_s16(dst_ptr, dst_n);
/* Step dst by one row width */
214 dst_ptr += total_width;
/*
 * One level of the inverse 2D DWT. buffer holds the bands of this level as
 * consecutive subband_width * subband_width runs; idwt is scratch space for
 * the two horizontal results. The vertical pass writes the final result back
 * to the start of buffer.
 *
 * NOTE(review): this listing appears to have lost lines - the assignments of
 * hl and l_dst are not visible here although both are used below. Confirm
 * against upstream.
 */
218static inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
219rfx_dwt_2d_decode_block_NEON(INT16* WINPR_RESTRICT buffer, INT16* WINPR_RESTRICT idwt,
220 size_t subband_width)
222 INT16 *hl, *lh, *hh, *ll;
223 INT16 *l_dst, *h_dst;
/* ll is the fourth band-sized run of buffer */
229 ll = buffer + subband_width * subband_width * 3;
/* Horizontal pass over (ll, hl) into l_dst */
232 rfx_dwt_2d_decode_block_horiz_NEON(ll, hl, l_dst, subband_width);
/* lh and hh are the second and third band-sized runs of buffer */
233 lh = buffer + subband_width * subband_width;
234 hh = buffer + subband_width * subband_width * 2;
/* h_dst sits two band sizes into the scratch buffer */
235 h_dst = idwt + subband_width * subband_width * 2;
/* Horizontal pass over (lh, hh) into h_dst */
236 rfx_dwt_2d_decode_block_horiz_NEON(lh, hh, h_dst, subband_width);
/* Vertical pass combines both horizontal results and overwrites buffer */
238 rfx_dwt_2d_decode_block_vert_NEON(l_dst, h_dst, buffer, subband_width);
241static void rfx_dwt_2d_decode_NEON(INT16* buffer, INT16* dwt_buffer)
243 rfx_dwt_2d_decode_block_NEON(buffer + 3840, dwt_buffer, 8);
244 rfx_dwt_2d_decode_block_NEON(buffer + 3072, dwt_buffer, 16);
245 rfx_dwt_2d_decode_block_NEON(buffer, dwt_buffer, 32);
/*
 * Horizontal inverse DWT pass of the extrapolating variant.
 *
 * Processes nDstCount rows. In each row, batchSize = (nLowCount + nHighCount) / 2
 * coefficients are handled in 8-lane NEON batches in two sweeps (update the
 * low band in place, then write the interleaved output with vst2q_s16); a
 * scalar statement handles a leftover low-band element after the first sweep.
 *
 * NOTE(review): this listing appears to have lost lines - the declaration of
 * the shared loop counter n and of dst_n, the pointer increments, the use of
 * the three step parameters, the edge guards and the scalar tail of the
 * second sweep are not visible here. Confirm against upstream.
 */
248static inline void rfx_idwt_extrapolate_horiz_neon(INT16* restrict pLowBand,
size_t nLowStep,
249 const INT16* restrict pHighBand,
250 size_t nHighStep, INT16* restrict pDstBand,
251 size_t nDstStep,
size_t nLowCount,
252 size_t nHighCount,
size_t nDstCount)
254 WINPR_ASSERT(pLowBand);
255 WINPR_ASSERT(pHighBand);
256 WINPR_ASSERT(pDstBand);
258 INT16* l_ptr = pLowBand;
259 const INT16* h_ptr = pHighBand;
260 INT16* dst_ptr = pDstBand;
/* Number of coefficients per row covered by the vector loops */
261 size_t batchSize = (nLowCount + nHighCount) >> 1;
/* One iteration per output row */
263 for (
size_t y = 0; y < nDstCount; y++)
/* Sweep 1: update the low band in place */
267 for (; n < batchSize; n += 8)
270 int16x8_t l_n = vld1q_s16(l_ptr);
271 int16x8_t h_n = vld1q_s16(h_ptr);
/* High band shifted by one element: lane i holds h[n + i - 1] */
272 int16x8_t h_n_m = vld1q_s16(h_ptr - 1);
/* Replace lane 0 with lane 1.
 * NOTE(review): presumably only for the first batch of a row; the guard is
 * not visible in this listing. */
276 int16_t first = vgetq_lane_s16(h_n_m, 1);
277 h_n_m = vsetq_lane_s16(first, h_n_m, 0);
/* Zero lane 7 of the high-band vector.
 * NOTE(review): presumably only for the final batch; the guard is not
 * visible in this listing. */
280 h_n = vsetq_lane_s16(0, h_n, 7);
/* l[n] -= (h[n] + h[n-1] + 1) >> 1 */
282 int16x8_t tmp_n = vaddq_s16(h_n, h_n_m);
283 tmp_n = vaddq_s16(tmp_n, vdupq_n_s16(1));
284 tmp_n = vshrq_n_s16(tmp_n, 1);
285 int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
286 vst1q_s16(l_ptr, dst_n);
/* Scalar tail: subtract the preceding high-band value from the current low-band element */
291 *l_ptr -= *(h_ptr - 1);
/* Sweep 2: build the interleaved output row */
298 for (; n < batchSize; n += 8)
301 int16x8_t h_n = vld1q_s16(h_ptr);
302 h_n = vshlq_n_s16(h_n, 1);
/* val[0] carries the updated low-band values; dst_n_p is the same run shifted by +1 */
304 dst_n.val[0] = vld1q_s16(l_ptr);
305 int16x8_t dst_n_p = vld1q_s16(l_ptr + 1);
/* Zero lane 7 of the doubled high-band vector.
 * NOTE(review): guard not visible, see above. */
308 h_n = vsetq_lane_s16(0, h_n, 7);
/* val[1] = ((l[n+1] + l[n]) >> 1) + (h[n] << 1) */
310 dst_n.val[1] = vaddq_s16(dst_n_p, dst_n.val[0]);
311 dst_n.val[1] = vshrq_n_s16(dst_n.val[1], 1);
312 dst_n.val[1] = vaddq_s16(dst_n.val[1], h_n);
/* Interleaving store: val[0] and val[1] lanes alternate in memory */
313 vst2q_s16(dst_ptr, dst_n);
/*
 * Vertical inverse DWT pass of the extrapolating variant.
 *
 * Each row is nDstCount coefficients wide: the first batchSize (nDstCount
 * rounded down to a multiple of 8) are processed with NEON, and a scalar path
 * runs when nDstCount > batchSize. forceBandSize = (nLowCount + nHighCount) / 2
 * is the number of row iterations of both sweeps.
 *
 * Sweep 1 writes dst = low - ((high + neighbouring high + 1) >> 1); when
 * forceBandSize < 32 one further row is written as low - previous high row.
 * Sweep 2 starts at pDstBand + nDstStep and writes
 * ((row above + other neighbour) >> 1) + (high << 1).
 *
 * NOTE(review): this listing appears to have lost lines - the nDstCount
 * parameter declaration, the pointer increments, the conditions that choose
 * between alternative statements and the stores of the scalar tail in the
 * second sweep are not visible here. Confirm against upstream.
 */
332static inline void rfx_idwt_extrapolate_vert_neon(
const INT16* restrict pLowBand,
size_t nLowStep,
333 const INT16* restrict pHighBand,
size_t nHighStep,
334 INT16* restrict pDstBand,
size_t nDstStep,
335 size_t nLowCount,
size_t nHighCount,
338 WINPR_ASSERT(pLowBand);
339 WINPR_ASSERT(pHighBand);
340 WINPR_ASSERT(pDstBand);
342 const INT16* l_ptr = pLowBand;
343 const INT16* h_ptr = pHighBand;
344 INT16* dst_ptr = pDstBand;
/* Row width rounded down to a whole number of 8-lane vectors */
345 size_t batchSize = (nDstCount >> 3) << 3;
346 size_t forceBandSize = (nLowCount + nHighCount) >> 1;
/* Sweep 1: one iteration per band row */
349 for (
size_t n = 0; n < forceBandSize; n++)
351 for (
size_t x = 0; x < batchSize; x += 8)
354 int16x8_t l_n = vld1q_s16(l_ptr);
/* On row index 31 the previous high-band row is read instead of the current one */
355 int16x8_t h_n = vld1q_s16((n == 31) ? (h_ptr - nHighStep) : h_ptr);
/* tmp starts as h + 1 (the rounding term) */
356 int16x8_t tmp_n = vaddq_s16(h_n, vdupq_n_s16(1));
/* NOTE(review): the next addition and the h_n_m addition below look like two
 * alternatives (add h again vs. add the previous high row); the selecting
 * condition is not visible in this listing. */
359 tmp_n = vaddq_s16(tmp_n, h_n);
362 int16x8_t h_n_m = vld1q_s16((h_ptr - nHighStep));
363 tmp_n = vaddq_s16(tmp_n, h_n_m);
/* dst = l - (tmp >> 1) */
366 tmp_n = vshrq_n_s16(tmp_n, 1);
367 int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
368 vst1q_s16(dst_ptr, dst_n);
/* Scalar path for the column(s) beyond the last full vector */
374 if (nDstCount > batchSize)
376 int16_t h_n = (n == 31) ? *(h_ptr - nHighStep) : *h_ptr;
377 int16_t tmp_n = h_n + 1;
381 tmp_n += *(h_ptr - nHighStep);
/* NOTE(review): as shown, tmp_n is not halved before the subtraction, unlike
 * the vector path; a shift statement may have been lost from this listing. */
383 *dst_ptr = *l_ptr - tmp_n;
/* Extra row when fewer than 32 band rows were processed: dst = l - previous high row */
392 if (forceBandSize < 32)
394 for (
size_t x = 0; x < batchSize; x += 8)
396 int16x8_t l_n = vld1q_s16(l_ptr);
397 int16x8_t h_n = vld1q_s16(h_ptr - nHighStep);
398 int16x8_t tmp_n = vsubq_s16(l_n, h_n);
399 vst1q_s16(dst_ptr, tmp_n);
/* Scalar tail of the extra row */
405 if (nDstCount > batchSize)
407 *dst_ptr = *l_ptr - *(h_ptr - nHighStep);
/* Sweep 2 starts on the second destination row */
415 dst_ptr = pDstBand + nDstStep;
418 for (
size_t n = 0; n < forceBandSize; n++)
420 for (
size_t x = 0; x < batchSize; x += 8)
/* Destination row directly above */
423 int16x8_t tmp_n = vld1q_s16(dst_ptr - nDstStep);
/* NOTE(review): dst_n_p is declared twice below (from l_ptr, then from the
 * destination row beneath), so these are two branches of a condition that is
 * not visible in this listing. */
426 int16x8_t dst_n_p = vld1q_s16(l_ptr);
428 tmp_n = vaddq_s16(tmp_n, dst_n_p);
429 tmp_n = vshrq_n_s16(tmp_n, 1);
433 int16x8_t dst_n_p = vld1q_s16(dst_ptr + nDstStep);
434 tmp_n = vaddq_s16(tmp_n, dst_n_p);
435 tmp_n = vshrq_n_s16(tmp_n, 1);
/* Add the doubled high-band row */
436 int16x8_t h_n = vld1q_s16(h_ptr);
437 h_n = vshlq_n_s16(h_n, 1);
438 tmp_n = vaddq_s16(tmp_n, h_n);
440 vst1q_s16(dst_ptr, tmp_n);
/* Scalar path for the column(s) beyond the last full vector; only its loads are visible here */
445 if (nDstCount > batchSize)
447 int16_t tmp_n = *(dst_ptr - nDstStep);
450 int16_t dst_n_p = *l_ptr;
457 int16_t dst_n_p = *(dst_ptr + nDstStep);
460 int16_t h_n = *h_ptr;
/**
 * Size of the L band at the given decomposition level: (64 >> level) + 1.
 *
 * The levels used in this file are 1, 2 and 3, giving 33, 17 and 9.
 *
 * @param level decomposition level
 * @return (64 >> level) + 1; for any level of 7 or more the shifted term is 0
 *         and the result is 1
 */
static inline size_t prfx_get_band_l_count(size_t level)
{
	/* 64 >> level is already 0 at level 7; testing first keeps the shift count
	 * below the operand width for every possible argument */
	const size_t halved = (level < 7) ? ((size_t)64 >> level) : 0;

	return halved + 1;
}
/**
 * Size of the H band at the given decomposition level.
 *
 * Level 1 is a special case and yields (64 >> 1) - 1 = 31; every other level
 * yields (64 + 2^(level - 1)) >> level, i.e. 16 for level 2 and 8 for level 3.
 *
 * The level 1 value is pinned by the buffer layout used in this file: with an
 * L band of 33 the level 1 bands occupy 2 * 33 * 31 + 31 * 31 = 3007
 * coefficients, and the level 2 bands (L = 17, H = 16) another 800, which
 * matches the 3007 and 3807 offsets the extrapolating decoder starts from.
 *
 * @param level decomposition level, must be at least 1 (level - 1 is used as
 *              a shift count)
 * @return number of H band coefficients for @p level
 */
static inline size_t prfx_get_band_h_count(size_t level)
{
	if (level == 1)
		return (64 >> 1) - 1;

	return (64 + (1u << (level - 1))) >> level;
}
/*
 * One level of the extrapolating inverse 2D DWT.
 *
 * The band sizes for this level come from prfx_get_band_l_count and
 * prfx_get_band_h_count. buffer is split into four consecutive bands in the
 * order HL (nBandH * nBandL), LH (nBandL * nBandH), HH (nBandH * nBandH) and
 * LL. Two horizontal passes produce L and H, and one vertical pass combines
 * them into LLx.
 *
 * NOTE(review): this listing appears to have lost lines - the level parameter
 * declaration, the local declarations, the initial value of offset, the
 * assignments of L, H and LLx, and the last argument of the final call are
 * not visible here. Confirm against upstream.
 */
486static inline void rfx_dwt_2d_decode_extrapolate_block_neon(INT16* buffer, INT16* temp,
495 const size_t nBandL = prfx_get_band_l_count(level);
496 const size_t nBandH = prfx_get_band_h_count(level);
499 WINPR_ASSERT(buffer);
/* Carve the four input bands out of buffer, advancing offset past each one */
502 HL = &buffer[offset];
503 offset += (nBandH * nBandL);
504 LH = &buffer[offset];
505 offset += (nBandL * nBandH);
506 HH = &buffer[offset];
507 offset += (nBandH * nBandH);
508 LL = &buffer[offset];
/* Both destination strides are one full output row: nBandL + nBandH */
509 nDstStepX = (nBandL + nBandH);
510 nDstStepY = (nBandL + nBandH);
/* Skip nBandL rows of nDstStepX coefficients.
 * NOTE(review): presumably offset was reset and indexes temp here; verify. */
513 offset += (nBandL * nDstStepX);
/* Horizontal pass over (LL, HL) into L, nBandL rows */
518 rfx_idwt_extrapolate_horiz_neon(LL, nBandL, HL, nBandH, L, nDstStepX, nBandL, nBandH, nBandL);
/* Horizontal pass over (LH, HH) into H, nBandH rows */
521 rfx_idwt_extrapolate_horiz_neon(LH, nBandL, HH, nBandH, H, nDstStepX, nBandL, nBandH, nBandH);
/* Vertical pass over (L, H) into LLx */
524 rfx_idwt_extrapolate_vert_neon(L, nDstStepX, H, nDstStepX, LLx, nDstStepY, nBandL, nBandH,
528static void rfx_dwt_2d_extrapolate_decode_neon(INT16* buffer, INT16* temp)
530 WINPR_ASSERT(buffer);
532 rfx_dwt_2d_decode_extrapolate_block_neon(&buffer[3807], temp, 3);
533 rfx_dwt_2d_decode_extrapolate_block_neon(&buffer[3007], temp, 2);
534 rfx_dwt_2d_decode_extrapolate_block_neon(&buffer[0], temp, 1);
538void rfx_init_neon_int(RFX_CONTEXT* WINPR_RESTRICT context)
540#if defined(NEON_INTRINSICS_ENABLED)
541 WLog_VRB(PRIM_TAG,
"NEON optimizations");
542 PROFILER_RENAME(context->priv->prof_rfx_ycbcr_to_rgb,
"rfx_decode_YCbCr_to_RGB_NEON");
543 PROFILER_RENAME(context->priv->prof_rfx_quantization_decode,
"rfx_quantization_decode_NEON");
544 PROFILER_RENAME(context->priv->prof_rfx_dwt_2d_decode,
"rfx_dwt_2d_decode_NEON");
545 context->quantization_decode = rfx_quantization_decode_NEON;
546 context->dwt_2d_decode = rfx_dwt_2d_decode_NEON;
547 context->dwt_2d_extrapolate_decode = rfx_dwt_2d_extrapolate_decode_neon;
549 WLog_VRB(PRIM_TAG,
"undefined WITH_SIMD or NEON intrinsics not available");
550 WINPR_UNUSED(context);