16#include <winpr/sysinfo.h> 
   18#include <freerdp/config.h> 
   21#include <freerdp/types.h> 
   22#include <freerdp/primitives.h> 
   23#include <freerdp/log.h> 
   25#include "prim_internal.h" 
   26#include "prim_avxsse.h" 
   28#include "../codec/color.h" 
   30#include <freerdp/codec/color.h> 
   32#if defined(SSE_AVX_INTRINSICS_ENABLED) 
   /* Copy a rectangle of 3-byte source pixels into a 4-byte destination while
    * leaving the 4th byte of every destination pixel untouched in the SIMD
    * path. Rows are addressed as multiplier * (y + nY) * step + offset for both
    * source and destination.
    * Returns PRIMITIVES_SUCCESS.
    * NOTE(review): this listing has gaps (nHeight is used below but its
    * parameter line, the braces, and the declaration of x are not visible). */
   36static inline pstatus_t sse_image_copy_bgr24_bgrx32(BYTE* WINPR_RESTRICT pDstData, UINT32 nDstStep,
 
   37                                                    UINT32 nXDst, UINT32 nYDst, UINT32 nWidth,
 
   39                                                    const BYTE* WINPR_RESTRICT pSrcData,
 
   40                                                    UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
 
   41                                                    int64_t srcVMultiplier, int64_t srcVOffset,
 
   42                                                    int64_t dstVMultiplier, int64_t dstVOffset)
 
   /* Bytes per pixel on each side: 3 for the source, 4 for the destination. */
   45  const int64_t srcByte = 3;
 
   46  const int64_t dstByte = 4;
 
   /* Blend selector: high bit set only in the top byte of each 32-bit value,
    * so _mm_blendv_epi8 below takes that byte from its second operand. */
   48  const __m128i mask = mm_set_epu32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000);
 
   /* Shuffle control: expands source bytes 0..11 into four 4-byte groups; the
    * 0xff control bytes make _mm_shuffle_epi8 write zero there.
    * NOTE(review): presumably mm_set_epu32 orders lanes like _mm_set_epi32
    * (highest lane first) - confirm in prim_avxsse.h. */
   49  const __m128i smask = mm_set_epu32(0xff0b0a09, 0xff080706, 0xff050403, 0xff020100);
 
   /* width is nWidth rounded down to a multiple of 4 pixels; the vector loop
    * stops there and the second loop handles the remaining rem pixels. */
   50  const UINT32 rem = nWidth % 4;
 
   52  const int64_t width = nWidth - rem;
 
   53  for (int64_t y = 0; y < nHeight; y++)
 
   55    const BYTE* WINPR_RESTRICT srcLine =
 
   56        &pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
 
   57    BYTE* WINPR_RESTRICT dstLine =
 
   58        &pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];
 
   /* Vector path: 4 pixels per iteration. */
   62    for (; x < width; x += 4)
 
   /* NOTE(review): 4 source pixels are only 12 bytes, but src is loaded as a
    * full __m128i (assuming LOAD_SI128 reads 16 bytes). The last group of the
    * last row may therefore read 4 bytes past the pixels consumed - confirm
    * the source buffer guarantees that slack. */
   64      const __m128i* src = (
const __m128i*)&srcLine[(x + nXSrc) * srcByte];
 
   65      __m128i* dst = (__m128i*)&dstLine[(x + nXDst) * dstByte];
 
   66      const __m128i s0 = LOAD_SI128(src);
 
   67      const __m128i s1 = _mm_shuffle_epi8(s0, smask);
 
   68      const __m128i s2 = LOAD_SI128(dst);
 
   /* Take the expanded source bytes, except where mask is set: there the
    * existing destination byte (s2) is kept.
    * NOTE(review): the store of d0 back to dst is not visible in this listing. */
   70      __m128i d0 = _mm_blendv_epi8(s1, s2, mask);
 
   /* Scalar tail for the last nWidth % 4 pixels of the row.
    * NOTE(review): the per-byte copy statements are not visible here. */
   74    for (; x < nWidth; x++)
 
   76      const BYTE* src = &srcLine[(x + nXSrc) * srcByte];
 
   77      BYTE* dst = &dstLine[(x + nXDst) * dstByte];
 
   84  return PRIMITIVES_SUCCESS;
 
   /* Copy a rectangle between two 4-byte-per-pixel buffers, overwriting the
    * first three bytes of every destination pixel and keeping its 4th byte in
    * the SIMD path. Rows are addressed as multiplier * (y + nY) * step + offset.
    * Returns PRIMITIVES_SUCCESS.
    * NOTE(review): this listing has gaps (nHeight is used below but its
    * parameter line, the braces, and the declaration of x are not visible). */
   87static inline pstatus_t sse_image_copy_bgrx32_bgrx32(BYTE* WINPR_RESTRICT pDstData, UINT32 nDstStep,
 
   88                                                     UINT32 nXDst, UINT32 nYDst, UINT32 nWidth,
 
   90                                                     const BYTE* WINPR_RESTRICT pSrcData,
 
   91                                                     UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
 
   92                                                     int64_t srcVMultiplier, int64_t srcVOffset,
 
   93                                                     int64_t dstVMultiplier, int64_t dstVOffset)
 
   /* Both sides use 4 bytes per pixel. */
   96  const int64_t srcByte = 4;
 
   97  const int64_t dstByte = 4;
 
   /* Blend selector in memory order: 0xFF for bytes 0..2 of each pixel, 0x00
    * for byte 3. _mm_blendv_epi8 takes its second operand where the selector
    * byte has its high bit set. */
   99  const __m128i mask = _mm_setr_epi8((
char)0xFF, (
char)0xFF, (
char)0xFF, 0x00, (
char)0xFF,
 
  100                                     (
char)0xFF, (
char)0xFF, 0x00, (
char)0xFF, (
char)0xFF,
 
  101                                     (
char)0xFF, 0x00, (
char)0xFF, (
char)0xFF, (
char)0xFF, 0x00);
 
   /* width is nWidth rounded down to a multiple of 4 pixels; the vector loop
    * stops there and the second loop handles the remaining rem pixels. */
  102  const UINT32 rem = nWidth % 4;
 
  103  const int64_t width = nWidth - rem;
 
  104  for (int64_t y = 0; y < nHeight; y++)
 
  106    const BYTE* WINPR_RESTRICT srcLine =
 
  107        &pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
 
  108    BYTE* WINPR_RESTRICT dstLine =
 
  109        &pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];
 
   /* Vector path: 4 pixels (16 bytes) per iteration on both sides. */
  112    for (; x < width; x += 4)
 
  114      const __m128i* src = (
const __m128i*)&srcLine[(x + nXSrc) * srcByte];
 
  115      __m128i* dst = (__m128i*)&dstLine[(x + nXDst) * dstByte];
 
  116      const __m128i s0 = LOAD_SI128(src);
 
  117      const __m128i s1 = LOAD_SI128(dst);
 
   /* Source bytes (s0) where mask is set, existing destination bytes (s1)
    * for the 4th byte of each pixel. */
  118      __m128i d0 = _mm_blendv_epi8(s1, s0, mask);
 
  119      STORE_SI128(dst, d0);
 
   /* Scalar tail for the last nWidth % 4 pixels of the row.
    * NOTE(review): the per-byte copy statements are not visible here. */
  122    for (; x < nWidth; x++)
 
  124      const BYTE* src = &srcLine[(x + nXSrc) * srcByte];
 
  125      BYTE* dst = &dstLine[(x + nXDst) * dstByte];
 
  132  return PRIMITIVES_SUCCESS;
 
   /* Dispatch a copy to one of the SSE helpers in this file based on pixel
    * formats, falling back to gen->copy_no_overlap for everything else.
    * pDstData and pSrcData must be non-NULL (asserted).
    * NOTE(review): the switch headers, default handling, the final parameter
    * line and the declaration of gen are not visible in this listing. The
    * nesting of the case labels suggests an outer switch on one format and an
    * inner switch on the other - confirm which is which. */
  135static pstatus_t sse_image_copy_no_overlap_dst_alpha(
 
  136    BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
 
  137    UINT32 nWidth, UINT32 nHeight, 
const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
 
  138    UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, 
const gdiPalette* WINPR_RESTRICT palette,
 
  139    UINT32 flags, int64_t srcVMultiplier, int64_t srcVOffset, int64_t dstVMultiplier,
 
  142  WINPR_ASSERT(pDstData);
 
  143  WINPR_ASSERT(pSrcData);
 
   /* BGR24 paired with BGRX32/BGRA32: 3-byte to 4-byte helper. */
  147    case PIXEL_FORMAT_BGR24:
 
  150        case PIXEL_FORMAT_BGRX32:
 
  151        case PIXEL_FORMAT_BGRA32:
 
  152          return sse_image_copy_bgr24_bgrx32(
 
  153              pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
 
  154              nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
 
   /* BGRX32/BGRA32 on both sides: 4-byte to 4-byte helper. */
  159    case PIXEL_FORMAT_BGRX32:
 
  160    case PIXEL_FORMAT_BGRA32:
 
  163        case PIXEL_FORMAT_BGRX32:
 
  164        case PIXEL_FORMAT_BGRA32:
 
  165          return sse_image_copy_bgrx32_bgrx32(
 
  166              pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
 
  167              nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
 
   /* RGBX32/RGBA32 on both sides reuse the same helper: it copies bytes 0..2
    * of each pixel without reordering, so channel order does not matter. */
  172    case PIXEL_FORMAT_RGBX32:
 
  173    case PIXEL_FORMAT_RGBA32:
 
  176        case PIXEL_FORMAT_RGBX32:
 
  177        case PIXEL_FORMAT_RGBA32:
 
  178          return sse_image_copy_bgrx32_bgrx32(
 
  179              pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
 
  180              nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
 
   /* No SSE helper matched: hand the copy to gen->copy_no_overlap. */
  190  return gen->
copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight,
 
  191                              pSrcData, SrcFormat, nSrcStep, nXSrc, nYSrc, palette, flags);
 
   /* Entry point installed as prims->copy_no_overlap by
    * primitives_init_copy_sse41_int. Validates arguments, fills in default
    * strides, then picks one of three copy paths (see the dispatch below).
    * Returns PRIMITIVES_SUCCESS immediately for an empty rectangle.
    * NOTE(review): several guard conditions and their bodies (original lines
    * 211-226, 238-240), the braces, and the declaration of gen are not visible
    * in this listing. */
  194static pstatus_t sse_image_copy_no_overlap(BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat,
 
  195                                           UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
 
  196                                           UINT32 nWidth, UINT32 nHeight,
 
  197                                           const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
 
  198                                           UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
 
  199                                           const gdiPalette* WINPR_RESTRICT palette, UINT32 flags)
 
  201  const BOOL vSrcVFlip = (flags & FREERDP_FLIP_VERTICAL) ? TRUE : FALSE;
 
   /* Row addressing defaults: offset 0, multiplier 1 on both sides. */
  202  int64_t srcVOffset = 0;
 
  203  int64_t srcVMultiplier = 1;
 
  204  int64_t dstVOffset = 0;
 
  205  int64_t dstVMultiplier = 1;
 
   /* Empty rectangle: nothing to copy. */
  207  if ((nWidth == 0) || (nHeight == 0))
 
  208    return PRIMITIVES_SUCCESS;
 
   /* NOTE(review): the bodies of the next two checks are not visible;
    * presumably they return an error status - confirm. */
  210  if ((nHeight > INT32_MAX) || (nWidth > INT32_MAX))
 
  213  if (!pDstData || !pSrcData)
 
   /* Strides derived from width * bytes-per-pixel.
    * NOTE(review): the guards are not visible; presumably these apply only
    * when the caller passed a step of 0 - confirm. */
  217    nDstStep = nWidth * FreeRDPGetBytesPerPixel(DstFormat);
 
  220    nSrcStep = nWidth * FreeRDPGetBytesPerPixel(SrcFormat);
 
   /* Offset of the last source row (1ll forces 64-bit arithmetic).
    * NOTE(review): presumably guarded by vSrcVFlip together with a negative
    * srcVMultiplier - the guard is not visible, confirm. */
  224    srcVOffset = (nHeight - 1ll) * nSrcStep;
 
   /* Dispatch:
    * 1. FREERDP_KEEP_DST_ALPHA set and DstFormat has alpha: alpha-preserving
    *    SSE path.
    * 2. Formats equal ignoring alpha: generic memcpy-based path.
    * 3. Otherwise: gen->copy_no_overlap. */
  228  if (((flags & FREERDP_KEEP_DST_ALPHA) != 0) && FreeRDPColorHasAlpha(DstFormat))
 
  229    return sse_image_copy_no_overlap_dst_alpha(pDstData, DstFormat, nDstStep, nXDst, nYDst,
 
  230                                               nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
 
  231                                               nXSrc, nYSrc, palette, flags, srcVMultiplier,
 
  232                                               srcVOffset, dstVMultiplier, dstVOffset);
 
  233  else if (FreeRDPAreColorFormatsEqualNoAlpha(SrcFormat, DstFormat))
 
  234    return generic_image_copy_no_overlap_memcpy(pDstData, DstFormat, nDstStep, nXDst, nYDst,
 
  235                                                nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
 
  236                                                nXSrc, nYSrc, palette, srcVMultiplier,
 
  237                                                srcVOffset, dstVMultiplier, dstVOffset, flags);
 
  241    return gen->
copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight,
 
  242                                pSrcData, SrcFormat, nSrcStep, nXSrc, nYSrc, palette, flags);
 
  /* Install the SSE4.1 copy primitive into prims when
   * SSE_AVX_INTRINSICS_ENABLED is defined: logs a verbose message and sets
   * prims->copy_no_overlap to sse_image_copy_no_overlap.
   * NOTE(review): the second log call presumably sits in an #else branch that
   * leaves prims unchanged; the #else/#endif and braces are not visible in
   * this listing - confirm. */
  248void primitives_init_copy_sse41_int(
primitives_t* WINPR_RESTRICT prims)
 
  250#if defined(SSE_AVX_INTRINSICS_ENABLED) 
  251  WLog_VRB(PRIM_TAG, 
"SSE4.1 optimizations");
 
  252  prims->copy_no_overlap = sse_image_copy_no_overlap;
 
  254  WLog_VRB(PRIM_TAG, 
"undefined WITH_SIMD or SSE4.1 intrinsics not available");
 
fn_copy_no_overlap_t copy_no_overlap