20#include <freerdp/config.h> 
   22#include <freerdp/types.h> 
   23#include <freerdp/primitives.h> 
   24#include <winpr/sysinfo.h> 
   26#include "prim_internal.h" 
   27#include "prim_colors.h" 
   30#if defined(NEON_INTRINSICS_ENABLED) 
   /*
    * Convert three planes of 16-bit Y, Cb and Cr samples (pSrc[0], pSrc[1],
    * pSrc[2]) into 4-channel 8-bit pixels.  rPos/gPos/bPos/aPos select which
    * of the four output channels receives red, green, blue and alpha; alpha is
    * always set to 0xFF.  Returns PRIMITIVES_SUCCESS.
    *
    * Rows are processed 8 pixels at a time with NEON; the remaining
    * (roi->width % 8) pixels of each row go through a scalar loop.
    *
    * NOTE(review): this view of the function carries fused listing line numbers
    * and does not show the braces, the declaration of bgrx, any store to pDst,
    * or the per-row pointer advances.  The comments below describe only the
    * statements that are visible.
    */
   35static pstatus_t neon_yCbCrToRGB_16s8u_P3AC4R_X(
const INT16* WINPR_RESTRICT pSrc[3], UINT32 srcStep,
 
   36                                                BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
 
   37                                                const prim_size_t* WINPR_RESTRICT roi, uint8_t rPos,
 
   38                                                uint8_t gPos, uint8_t bPos, uint8_t aPos)
 
   /* Read cursors for the three source planes. */
   41  const INT16* pY = pSrc[0];
 
   42  const INT16* pCb = pSrc[1];
 
   43  const INT16* pCr = pSrc[2];
 
   /* Row slack of the source, expressed in INT16 elements (srcStep is in bytes). */
   44  const size_t srcPad = (srcStep - (roi->width * 
sizeof(INT16))) / 
sizeof(INT16);
 
   /*
    * Row slack of the destination, expressed in 4-byte pixels.
    * NOTE(review): the statement that consumes dstPad is not visible here;
    * confirm its unit matches the type of the pointer it is added to.
    */
   45  const size_t dstPad = (dstStep - (roi->width * 4)) / 4;
 
   /* Number of trailing pixels per row that do not fill a full group of 8. */
   46  const size_t pad = roi->width % 8;
 
   /* Constant offset added to every Y sample before scaling. */
   47  const int16x4_t c4096 = vdup_n_s16(4096);
 
   49  for (UINT32 y = 0; y < roi->height; y++)
 
   /* Vector path: full groups of 8 pixels. */
   51    for (UINT32 x = 0; x < roi->width - pad; x += 8)
 
   /* Load 8 Y samples and split into two 4-lane halves. */
   53      const int16x8_t Y = vld1q_s16(pY);
 
   54      const int16x4_t Yh = vget_high_s16(Y);
 
   55      const int16x4_t Yl = vget_low_s16(Y);
 
   /* (Y + 4096) computed in 32 bits, then scaled by 2^16 to match the coefficients. */
   56      const int32x4_t YhAdd = vaddl_s16(Yh, c4096); 
 
   57      const int32x4_t YlAdd = vaddl_s16(Yl, c4096); 
 
   58      const int32x4_t YhW = vshlq_n_s32(YhAdd, 16);
 
   59      const int32x4_t YlW = vshlq_n_s32(YlAdd, 16);
 
   /* Load 8 Cr and 8 Cb samples, each split into high/low halves. */
   60      const int16x8_t Cr = vld1q_s16(pCr);
 
   61      const int16x4_t Crh = vget_high_s16(Cr);
 
   62      const int16x4_t Crl = vget_low_s16(Cr);
 
   63      const int16x8_t Cb = vld1q_s16(pCb);
 
   64      const int16x4_t Cbh = vget_high_s16(Cb);
 
   65      const int16x4_t Cbl = vget_low_s16(Cb);
 
   /*
    * Red: (Cr * 91916 + Yscaled) >> 21.  The coefficient does not fit in 16
    * bits, so Cr is widened first and multiplied in 32 bits.  The shift of 21
    * equals the tail loop's (>> 16) followed by (>> 5).
    * NOTE(review): 91916, 22527 and 46819 look like the rounded values of the
    * tail loop's float coefficients times 2^16, whereas the tail loop casts
    * (truncates) them - confirm the one-unit difference is intended.
    */
   69        const int32x4_t CrhR = vmulq_n_s32(vmovl_s16(Crh), 91916); 
 
   70        const int32x4_t CrlR = vmulq_n_s32(vmovl_s16(Crl), 91916); 
 
   71        const int32x4_t CrhRa = vaddq_s32(CrhR, YhW);
 
   72        const int32x4_t CrlRa = vaddq_s32(CrlR, YlW);
 
   73        const int16x4_t Rsh = vmovn_s32(vshrq_n_s32(CrhRa, 21));
 
   74        const int16x4_t Rsl = vmovn_s32(vshrq_n_s32(CrlRa, 21));
 
   75        const int16x8_t Rs = vcombine_s16(Rsl, Rsh);
 
   /* Narrow to unsigned 8-bit and place in the caller-selected red channel. */
   76        bgrx.val[rPos] = vqmovun_s16(Rs);
 
   /*
    * Green: (Yscaled - (Cb * 22527 + Cr * 46819)) >> 21.  22527 fits in 16
    * bits, so the widening multiply is used for Cb; 46819 does not, so Cr is
    * widened first.
    */
   80        const int32x4_t CbGh = vmull_n_s16(Cbh, 22527);            
 
   81        const int32x4_t CbGl = vmull_n_s16(Cbl, 22527);            
 
   82        const int32x4_t CrGh = vmulq_n_s32(vmovl_s16(Crh), 46819); 
 
   83        const int32x4_t CrGl = vmulq_n_s32(vmovl_s16(Crl), 46819); 
 
   84        const int32x4_t CbCrGh = vaddq_s32(CbGh, CrGh);
 
   85        const int32x4_t CbCrGl = vaddq_s32(CbGl, CrGl);
 
   86        const int32x4_t YCbCrGh = vsubq_s32(YhW, CbCrGh);
 
   87        const int32x4_t YCbCrGl = vsubq_s32(YlW, CbCrGl);
 
   88        const int16x4_t Gsh = vmovn_s32(vshrq_n_s32(YCbCrGh, 21));
 
   89        const int16x4_t Gsl = vmovn_s32(vshrq_n_s32(YCbCrGl, 21));
 
   90        const int16x8_t Gs = vcombine_s16(Gsl, Gsh);
 
   /* The assignment of G into bgrx is not visible in this view. */
   91        const uint8x8_t G = vqmovun_s16(Gs);
 
   /* Blue: (Cb * 115992 + Yscaled) >> 21, multiplied in 32 bits. */
   96        const int32x4_t CbBh = vmulq_n_s32(vmovl_s16(Cbh), 115992); 
 
   97        const int32x4_t CbBl = vmulq_n_s32(vmovl_s16(Cbl), 115992); 
 
   98        const int32x4_t YCbBh = vaddq_s32(CbBh, YhW);
 
   99        const int32x4_t YCbBl = vaddq_s32(CbBl, YlW);
 
  100        const int16x4_t Bsh = vmovn_s32(vshrq_n_s32(YCbBh, 21));
 
  101        const int16x4_t Bsl = vmovn_s32(vshrq_n_s32(YCbBl, 21));
 
  102        const int16x8_t Bs = vcombine_s16(Bsl, Bsh);
 
   /* The assignment of B into bgrx is not visible in this view. */
  103        const uint8x8_t B = vqmovun_s16(Bs);
 
   /* Alpha: fully opaque for all 8 pixels. */
  108        bgrx.val[aPos] = vdup_n_u8(0xFF);
 
   /* Scalar path: the remaining (width % 8) pixels of the row. */
  117    for (UINT32 x = 0; x < pad; x++)
 
   /* Fixed-point scale: coefficients and Y are expressed in units of 2^-16. */
  119      const INT32 divisor = 16;
 
  120      const INT32 Y = ((*pY++) + 4096) << divisor;
 
  121      const INT32 Cb = (*pCb++);
 
  122      const INT32 Cr = (*pCr++);
 
   /* Chroma contributions; each float coefficient is scaled by 2^16 and cast to INT32. */
  123      const INT32 CrR = Cr * (INT32)(1.402525f * (1 << divisor));
 
  124      const INT32 CrG = Cr * (INT32)(0.714401f * (1 << divisor));
 
  125      const INT32 CbG = Cb * (INT32)(0.343730f * (1 << divisor));
 
  126      const INT32 CbB = Cb * (INT32)(1.769905f * (1 << divisor));
 
   /* Drop the 2^16 scale, narrow to INT16, then shift right by a further 5 bits. */
  127      INT16 R = ((INT16)((CrR + Y) >> divisor) >> 5);
 
  128      INT16 G = ((INT16)((Y - CbG - CrG) >> divisor) >> 5);
 
  129      INT16 B = ((INT16)((CbB + Y) >> divisor) >> 5);
 
   /*
    * Clip each channel via CLIP (defined outside this view) and place it at
    * the caller-selected position.  Here bgrx is indexed directly, unlike the
    * .val[] access in the vector loop, so it is a different object whose
    * declaration is not visible.
    */
  131      bgrx[bPos] = CLIP(B);
 
  132      bgrx[gPos] = CLIP(G);
 
  133      bgrx[rPos] = CLIP(R);
 
  147  return PRIMITIVES_SUCCESS;
 
  /*
   * Format dispatcher for the NEON YCbCr -> 4-channel conversion.
   *
   * For each supported 32-bit pixel format it forwards to
   * neon_yCbCrToRGB_16s8u_P3AC4R_X with the channel positions of
   * (red, green, blue, alpha) inside the output pixel.  The final visible
   * return hands the call to the generic implementation instead.
   *
   * NOTE(review): the rest of the parameter list, the switch header and the
   * label guarding the generic fallback are not visible in this view;
   * DstFormat and roi are presumably parameters - confirm against the full file.
   */
  150static pstatus_t neon_yCbCrToRGB_16s8u_P3AC4R(
const INT16* WINPR_RESTRICT pSrc[3], UINT32 srcStep,
 
  151                                              BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
 
  /* BGRA/BGRX: blue at 0, green at 1, red at 2, alpha at 3. */
  157    case PIXEL_FORMAT_BGRA32:
 
  158    case PIXEL_FORMAT_BGRX32:
 
  159      return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);
 
  /* RGBA/RGBX: red at 0, green at 1, blue at 2, alpha at 3. */
  161    case PIXEL_FORMAT_RGBA32:
 
  162    case PIXEL_FORMAT_RGBX32:
 
  163      return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);
 
  /* ARGB/XRGB: alpha at 0, red at 1, green at 2, blue at 3. */
  165    case PIXEL_FORMAT_ARGB32:
 
  166    case PIXEL_FORMAT_XRGB32:
 
  167      return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);
 
  /* ABGR/XBGR: alpha at 0, blue at 1, green at 2, red at 3. */
  169    case PIXEL_FORMAT_ABGR32:
 
  170    case PIXEL_FORMAT_XBGR32:
 
  171      return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);
 
  /* Any other format: delegate to the generic (non-NEON) primitive. */
  174      return generic->yCbCrToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
 
  /*
   * Pack three planes of 16-bit R, G and B samples (pSrc[0], pSrc[1],
   * pSrc[2]) into 4-channel 8-bit pixels.  rPos/gPos/bPos/aPos select the
   * output channel for each component; alpha is set to 0xFF.  Returns
   * PRIMITIVES_SUCCESS.
   *
   * NOTE(review): the return type, part of the parameter list (srcStep,
   * dstStep and roi are used below), the declaration of bgrx, the stores to
   * dst, the pointer advances and the body of the tail loop are not visible
   * in this view.
   */
  179neon_RGBToRGB_16s8u_P3AC4R_X(
const INT16* WINPR_RESTRICT pSrc[3], 
 
  181                             BYTE* WINPR_RESTRICT pDst, 
 
  184                             uint8_t rPos, uint8_t gPos, uint8_t bPos, uint8_t aPos)
 
  /* Pixels per row left over after the groups of 8. */
  186  UINT32 pad = roi->width % 8;
 
  188  for (UINT32 y = 0; y < roi->height; y++)
 
  /*
   * Row start of each source plane.  srcStep is applied as a byte offset,
   * hence the cast through BYTE* before converting back to INT16*.
   */
  190    const INT16* pr = (
const INT16*)(((BYTE*)pSrc[0]) + y * srcStep);
 
  191    const INT16* pg = (
const INT16*)(((BYTE*)pSrc[1]) + y * srcStep);
 
  192    const INT16* pb = (
const INT16*)(((BYTE*)pSrc[2]) + y * srcStep);
 
  /* Row start of the destination; dstStep is a byte offset. */
  193    BYTE* dst = pDst + y * dstStep;
 
  /* Vector path: full groups of 8 pixels. */
  195    for (UINT32 x = 0; x < roi->width - pad; x += 8)
 
  /* Load 8 samples from each plane. */
  197      int16x8_t r = vld1q_s16(pr);
 
  198      int16x8_t g = vld1q_s16(pg);
 
  199      int16x8_t b = vld1q_s16(pb);
 
  /* Opaque alpha, then each colour narrowed to unsigned 8-bit with saturation. */
  201      bgrx.val[aPos] = vdup_n_u8(0xFF);
 
  202      bgrx.val[rPos] = vqmovun_s16(r);
 
  203      bgrx.val[gPos] = vqmovun_s16(g);
 
  204      bgrx.val[bPos] = vqmovun_s16(b);
 
  /* Scalar path for the remaining (width % 8) pixels; its body is not visible here. */
  212    for (UINT32 x = 0; x < pad; x++)
 
  226  return PRIMITIVES_SUCCESS;
 
  /*
   * Format dispatcher for the NEON planar RGB -> 4-channel conversion.
   *
   * For each supported 32-bit pixel format it forwards to
   * neon_RGBToRGB_16s8u_P3AC4R_X with the channel positions of
   * (red, green, blue, alpha) inside the output pixel.  The final visible
   * return hands the call to the generic implementation instead.
   *
   * NOTE(review): the return type, the rest of the parameter list, the
   * switch header and the label guarding the generic fallback are not
   * visible in this view - confirm against the full file.
   */
  230neon_RGBToRGB_16s8u_P3AC4R(
const INT16* WINPR_RESTRICT pSrc[3], 
 
  232                           BYTE* WINPR_RESTRICT pDst, 
 
  /* BGRA/BGRX: blue at 0, green at 1, red at 2, alpha at 3. */
  239    case PIXEL_FORMAT_BGRA32:
 
  240    case PIXEL_FORMAT_BGRX32:
 
  241      return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);
 
  /* RGBA/RGBX: red at 0, green at 1, blue at 2, alpha at 3. */
  243    case PIXEL_FORMAT_RGBA32:
 
  244    case PIXEL_FORMAT_RGBX32:
 
  245      return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);
 
  /* ARGB/XRGB: alpha at 0, red at 1, green at 2, blue at 3. */
  247    case PIXEL_FORMAT_ARGB32:
 
  248    case PIXEL_FORMAT_XRGB32:
 
  249      return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);
 
  /* ABGR/XBGR: alpha at 0, blue at 1, green at 2, red at 3. */
  251    case PIXEL_FORMAT_ABGR32:
 
  252    case PIXEL_FORMAT_XBGR32:
 
  253      return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);
 
  /* Any other format: delegate to the generic (non-NEON) primitive. */
  256      return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
 
  /*
   * Install the NEON colour-conversion primitives into prims.
   *
   * When NEON_INTRINSICS_ENABLED is defined, the generic primitive table is
   * fetched into the file-level generic pointer (used by the dispatchers as a
   * fallback), a verbose log line is emitted, and the RGBToRGB_16s8u_P3AC4R
   * and yCbCrToRGB_16s8u_P3AC4R entries of prims are replaced with the NEON
   * versions.
   *
   * NOTE(review): the braces and the directive separating the two log calls
   * are not visible in this view; the second log call is presumably the
   * branch taken when NEON is unavailable - confirm against the full file.
   */
  262void primitives_init_colors_neon_int(
primitives_t* WINPR_RESTRICT prims)
 
  264#if defined(NEON_INTRINSICS_ENABLED) 
  /* Cache the generic table so unsupported formats can fall back to it. */
  265  generic = primitives_get_generic();
 
  267  WLog_VRB(PRIM_TAG, 
"NEON optimizations");
 
  /* Override only the two conversions implemented in this file. */
  268  prims->RGBToRGB_16s8u_P3AC4R = neon_RGBToRGB_16s8u_P3AC4R;
 
  269  prims->yCbCrToRGB_16s8u_P3AC4R = neon_yCbCrToRGB_16s8u_P3AC4R;
 
  271  WLog_VRB(PRIM_TAG, 
"undefined WITH_SIMD or neon intrinsics not available");