16#include <freerdp/config.h> 
   18#include <freerdp/types.h> 
   19#include <freerdp/primitives.h> 
   20#include <winpr/sysinfo.h> 
   22#include "prim_shift.h" 
   24#include "prim_internal.h" 
   25#include "prim_templates.h" 
   27#if defined(SSE_AVX_INTRINSICS_ENABLED) 
   /* Instantiates sse2_lShiftC_16s through the SSE3_SCD_ROUTINE template macro.
    * The macro itself is not defined in this view (presumably it comes from
    * prim_templates.h - confirm there how each argument is used).
    * Arguments passed: element type INT16, the scalar implementation
    * generic->lShiftC_16s, the vector intrinsic _mm_slli_epi16 (16-bit left
    * shift), the cast type int16_t, and a per-element scalar statement.
    * The scalar statement shifts the source element as UINT16, masks the result
    * to 16 bits and stores it as INT16, so the shift is never applied to a
    * negative signed value. */
   34SSE3_SCD_ROUTINE(sse2_lShiftC_16s, INT16, generic->lShiftC_16s, _mm_slli_epi16, int16_t,
 
   35                 *dptr++ = (INT16)(((UINT16)*sptr++ << val) & 0xFFFF))
 
   /* Instantiates sse2_rShiftC_16s through the SSE3_SCD_ROUTINE template macro
    * (macro definition not visible here; presumably in prim_templates.h).
    * Arguments passed: element type INT16, the scalar implementation
    * generic->rShiftC_16s, the vector intrinsic _mm_srai_epi16 (arithmetic,
    * i.e. sign-propagating, 16-bit right shift), the cast type int16_t, and a
    * per-element scalar statement that right-shifts the signed source element
    * by val. */
   37SSE3_SCD_ROUTINE(sse2_rShiftC_16s, INT16, generic->rShiftC_16s, _mm_srai_epi16, int16_t,
 
   38                 *dptr++ = *sptr++ >> val)
 
   /* Instantiates sse2_lShiftC_16u through the SSE3_SCD_ROUTINE template macro
    * (macro definition not visible here; presumably in prim_templates.h).
    * Arguments passed: element type UINT16, the scalar implementation
    * generic->lShiftC_16u, the vector intrinsic _mm_slli_epi16 (16-bit left
    * shift), the cast type int16_t, and a per-element scalar statement that
    * left-shifts the source element and masks the result to the low 16 bits. */
   40SSE3_SCD_ROUTINE(sse2_lShiftC_16u, UINT16, generic->lShiftC_16u, _mm_slli_epi16, int16_t,
 
   41                 *dptr++ = (((UINT16)*sptr++ << val) & 0xFFFF))
 
   /* Instantiates sse2_rShiftC_16u through the SSE3_SCD_ROUTINE template macro
    * (macro definition not visible here; presumably in prim_templates.h).
    * Arguments passed: element type UINT16, the scalar implementation
    * generic->rShiftC_16u, the vector intrinsic _mm_srli_epi16 (logical,
    * zero-filling, 16-bit right shift), the cast type int16_t, and a
    * per-element scalar statement that right-shifts the unsigned source
    * element by val. */
   43SSE3_SCD_ROUTINE(sse2_rShiftC_16u, UINT16, generic->rShiftC_16u, _mm_srli_epi16, int16_t,
 
   44                 *dptr++ = *sptr++ >> val)
 
   /* In-place left shift of a buffer of INT16 elements by val bits, using
    * 128-bit vector loads, _mm_slli_epi16 and vector stores for the bulk of the
    * data and generic->lShiftC_16s_inplace for everything else.
    *
    * pSrcDst  buffer that is read and overwritten with the shifted values
    * val      shift amount, passed to _mm_slli_epi16 as (int16_t)val
    * ulen     number of INT16 elements (it is forwarded unchanged as the
    *          length argument of the generic routine)
    *
    * Returns PRIMITIVES_SUCCESS, or whatever status the generic routine
    * reports on the paths that delegate to it.
    *
    * NOTE(review): several lines of this function are not visible in this
    * excerpt (braces, the declaration of len, and the conditions/loop headers
    * guarding the statements below). Comments about those parts are marked as
    * assumptions - confirm against the complete file. */
   46static pstatus_t sse2_lShiftC_16s_inplace(INT16* WINPR_RESTRICT pSrcDst, UINT32 val, UINT32 ulen)
 
   /* Used below to derive the alignment mask and the chunk-size shifts
    * (8 - shifts and 5 - shifts). */
   49  const INT32 shifts = 2;
 
   /* Early success exit; the guarding condition is not visible here
    * (presumably a no-op shift amount - TODO confirm). */
   51    return PRIMITIVES_SUCCESS;
 
   /* Delegate the whole buffer to the generic routine; the guarding condition
    * is not visible here (presumably the buffer is too short to be worth
    * vectorizing - TODO confirm). */
   55    return generic->lShiftC_16s_inplace(pSrcDst, val, ulen);
 
   /* With shifts == 2 this evaluates to (1 << 1) - 1 == 1, i.e. a mask for the
    * lowest address bit. */
   57  UINT32 offBeatMask = (1 << (shifts - 1)) - 1;
 
   /* Odd address: the pointer is not even 2-byte aligned, so hand everything
    * to the generic routine. (Presumably because stepping by whole INT16
    * elements can then never reach a 16-byte boundary.) */
   58  if ((ULONG_PTR)pSrcDst & offBeatMask)
 
   61    return generic->lShiftC_16s_inplace(pSrcDst, val, ulen);
 
   /* Number of INT16 elements by which the pointer lies past the previous
    * 16-byte boundary (byte offset 0..15 divided by the element size). */
   64  const UINT32 rem = ((UINT_PTR)pSrcDst & 0x0f) / 
sizeof(INT16);
 
   /* Leading elements are processed by the generic routine so the vector part
    * can start on a 16-byte boundary (the guarding condition and the pointer /
    * length adjustment that presumably follow are not visible here).
    * NOTE(review): rem is counted in INT16 elements (0..7), so 8 - rem elements
    * would already reach the next boundary; 16 - rem moves 8 elements
    * (16 bytes) further, which is still 16-byte aligned - confirm this is
    * intended. */
   67    const UINT32 add = 16 - rem;
 
   68    pstatus_t status = 
generic->lShiftC_16s_inplace(pSrcDst, val, add);
 
   /* Failure handling for the generic call; the statement executed on failure
    * is not visible here. */
   69    if (status != PRIMITIVES_SUCCESS)
 
   /* With shifts == 2: number of 64-element chunks (len >> 6). One chunk is
    * what the 8-vector section below handles: 8 x 128 bits = 64 INT16. */
   76  size_t count = len >> (8 - shifts);
 
   /* Keep only the elements not covered by those 64-element chunks. */
   77  len -= count << (8 - shifts);
 
   /* Unrolled section: load eight consecutive 128-bit vectors. The loop header
    * that presumably repeats this count times is not visible here.
    * LOAD_SI128 / STORE_SI128 are defined outside this view (presumably
    * aligned 128-bit load/store wrappers - TODO confirm). */
   81    const __m128i* src = (
const __m128i*)pSrcDst;
 
   83    __m128i xmm0 = LOAD_SI128(src++);
 
   84    __m128i xmm1 = LOAD_SI128(src++);
 
   85    __m128i xmm2 = LOAD_SI128(src++);
 
   86    __m128i xmm3 = LOAD_SI128(src++);
 
   87    __m128i xmm4 = LOAD_SI128(src++);
 
   88    __m128i xmm5 = LOAD_SI128(src++);
 
   89    __m128i xmm6 = LOAD_SI128(src++);
 
   90    __m128i xmm7 = LOAD_SI128(src);
 
   /* Shift every 16-bit lane of each vector left by val. */
   92    xmm0 = _mm_slli_epi16(xmm0, (int16_t)val);
 
   93    xmm1 = _mm_slli_epi16(xmm1, (int16_t)val);
 
   94    xmm2 = _mm_slli_epi16(xmm2, (int16_t)val);
 
   95    xmm3 = _mm_slli_epi16(xmm3, (int16_t)val);
 
   96    xmm4 = _mm_slli_epi16(xmm4, (int16_t)val);
 
   97    xmm5 = _mm_slli_epi16(xmm5, (int16_t)val);
 
   98    xmm6 = _mm_slli_epi16(xmm6, (int16_t)val);
 
   99    xmm7 = _mm_slli_epi16(xmm7, (int16_t)val);
 
  /* Write the eight results back over the locations they were loaded from. */
  101    __m128i* dst = (__m128i*)pSrcDst;
 
  103    STORE_SI128(dst++, xmm0);
 
  104    STORE_SI128(dst++, xmm1);
 
  105    STORE_SI128(dst++, xmm2);
 
  106    STORE_SI128(dst++, xmm3);
 
  107    STORE_SI128(dst++, xmm4);
 
  108    STORE_SI128(dst++, xmm5);
 
  109    STORE_SI128(dst++, xmm6);
 
  110    STORE_SI128(dst++, xmm7);
 
  /* dst now points just past the eight stored vectors; continue from there. */
  112    pSrcDst = (INT16*)dst;
 
  /* With shifts == 2: number of 8-element chunks (len >> 3), i.e. one 128-bit
   * vector each, among the elements left over from the section above. */
  116  count = len >> (5 - shifts);
 
  /* Keep only the elements that do not fill a whole vector. */
  117  len -= count << (5 - shifts);
 
  /* Single-vector section: load, shift, store back, advance. The loop header
   * that presumably repeats this count times is not visible here. */
  120    const __m128i* src = (
const __m128i*)pSrcDst;
 
  121    __m128i xmm0 = LOAD_SI128(src);
 
  123    xmm0 = _mm_slli_epi16(xmm0, (int16_t)val);
 
  125    __m128i* dst = (__m128i*)pSrcDst;
 
  126    STORE_SI128(dst++, xmm0);
 
  127    pSrcDst = (INT16*)dst;
 
  /* Remaining len elements are handed to the generic routine (the guarding
   * condition is not visible here; presumably len != 0). The length is
   * converted to uint32_t via WINPR_ASSERTING_INT_CAST, defined elsewhere. */
  132    return generic->lShiftC_16s_inplace(pSrcDst, val, WINPR_ASSERTING_INT_CAST(uint32_t, len));
 
  134  return PRIMITIVES_SUCCESS;
 
  /* Installs the SSE shift routines of this file into the primitives table
   * prims when SSE_AVX_INTRINSICS_ENABLED is defined.
   *
   * prims  table whose shift entries are overwritten; must be non-NULL on the
   *        enabled path because it is dereferenced unconditionally there.
   *
   * NOTE(review): the braces and the remaining preprocessor directives of this
   * function are not visible in this excerpt - confirm the structure against
   * the complete file. */
  144void primitives_init_shift_sse3_int(
primitives_t* WINPR_RESTRICT prims)
 
  146#if defined(SSE_AVX_INTRINSICS_ENABLED) 
  /* Fetch the generic primitives into the file-level 'generic' variable; the
   * SSE routines of this file call through it for their fallback paths. */
  147  generic = primitives_get_generic();
 
  149  WLog_VRB(PRIM_TAG, 
"SSE2/SSE3 optimizations");
 
  /* Override the five 16-bit shift entries with the SSE implementations. */
  150  prims->lShiftC_16s_inplace = sse2_lShiftC_16s_inplace;
 
  151  prims->lShiftC_16s = sse2_lShiftC_16s;
 
  152  prims->rShiftC_16s = sse2_rShiftC_16s;
 
  153  prims->lShiftC_16u = sse2_lShiftC_16u;
 
  154  prims->rShiftC_16u = sse2_rShiftC_16u;
 
  /* Presumably the alternative branch taken when the intrinsics are disabled
   * (the separating directive is not visible here): only a verbose log
   * message is visible on this path. */
  157  WLog_VRB(PRIM_TAG, 
"undefined WITH_SIMD or SSE3 intrinsics not available");