From dee9019e7c92127dc359fd51de931b81d042574b Mon Sep 17 00:00:00 2001 From: akallabeth Date: Fri, 7 Jun 2024 11:56:44 +0200 Subject: [PATCH] [codec,progressive] use add_16s_inplace --- include/freerdp/primitives.h | 10 ++- libfreerdp/codec/progressive.c | 18 ++-- libfreerdp/primitives/prim_add.c | 47 +++++----- libfreerdp/primitives/prim_add_opt.c | 129 ++++++++++++++++----------- 4 files changed, 114 insertions(+), 90 deletions(-) diff --git a/include/freerdp/primitives.h b/include/freerdp/primitives.h index b00011bc7..20c92e59e 100644 --- a/include/freerdp/primitives.h +++ b/include/freerdp/primitives.h @@ -102,8 +102,8 @@ typedef pstatus_t (*__alphaComp_argb_t)(const BYTE* WINPR_RESTRICT pSrc1, UINT32 typedef pstatus_t (*__add_16s_t)(const INT16* WINPR_RESTRICT pSrc1, const INT16* WINPR_RESTRICT pSrc2, INT16* WINPR_RESTRICT pDst, UINT32 len); -typedef pstatus_t (*__add_16s_inplace_t)(INT16* WINPR_RESTRICT WINPR_RESTRICT pSrcDst, - const INT16* WINPR_RESTRICT pSrc, UINT32 len); +typedef pstatus_t (*__add_16s_inplace_t)(INT16* WINPR_RESTRICT pSrcDst1, + INT16* WINPR_RESTRICT pSrcDst2, UINT32 len); typedef pstatus_t (*__lShiftC_16s_t)(const INT16* pSrc, UINT32 val, INT16* pSrcDst, UINT32 len); typedef pstatus_t (*__lShiftC_16u_t)(const UINT16* pSrc, UINT32 val, UINT16* pSrcDst, UINT32 len); typedef pstatus_t (*__rShiftC_16s_t)(const INT16* pSrc, UINT32 val, INT16* pSrcDst, UINT32 len); @@ -185,7 +185,6 @@ typedef struct __zero_t zero; /* bzero or faster */ /* Arithmetic functions */ __add_16s_t add_16s; - __add_16s_inplace_t add_16s_inplace; /* And/or */ __andC_32u_t andC_32u; __orC_32u_t orC_32u; @@ -217,6 +216,11 @@ typedef struct /* flags */ DWORD flags; primitives_uninit_t uninit; + + /** \brief Do vector addition, store result in both input buffers + * pSrcDst1 = pSrcDst2 = pSrcDst1 + pSrcDst2 + */ + __add_16s_inplace_t add_16s_inplace; } primitives_t; typedef enum diff --git a/libfreerdp/codec/progressive.c b/libfreerdp/codec/progressive.c index 
3ac8b87e4..692731c6b 100644 --- a/libfreerdp/codec/progressive.c +++ b/libfreerdp/codec/progressive.c @@ -833,20 +833,14 @@ static INLINE int progressive_rfx_dwt_2d_decode(PROGRESSIVE_CONTEXT* WINPR_RESTR if (!progressive || !buffer || !current) return -1; - INT16 dst[4096] = { 0 }; + const size_t belements = 4096; + const size_t bsize = belements * sizeof(INT16); if (reverse) - memcpy(buffer, current, sizeof(dst)); + memcpy(buffer, current, bsize); + else if (!coeffDiff) + memcpy(current, buffer, bsize); else - { - if (coeffDiff) - { - prims->add_16s(buffer, current, dst, ARRAYSIZE(dst)); - memcpy(current, dst, sizeof(dst)); - memcpy(buffer, dst, sizeof(dst)); - } - else - memcpy(current, buffer, sizeof(dst)); - } + prims->add_16s_inplace(buffer, current, belements); INT16* temp = (INT16*)BufferPool_Take(progressive->bufferPool, -1); /* DWT buffer */ diff --git a/libfreerdp/primitives/prim_add.c b/libfreerdp/primitives/prim_add.c index b1d4602c4..6a9a9994d 100644 --- a/libfreerdp/primitives/prim_add.c +++ b/libfreerdp/primitives/prim_add.c @@ -26,38 +26,43 @@ /* ---------------------------------------------------------------------------- * 16-bit signed add with saturation (under and over). 
*/ +static INLINE INT16 add(INT16 a, INT16 b) +{ + INT32 k = (INT32)a + (INT32)b; + + if (k > INT16_MAX) + return INT16_MAX; + + if (k < INT16_MIN) + return INT16_MIN; + + return (INT16)k; +} + static pstatus_t general_add_16s(const INT16* WINPR_RESTRICT pSrc1, const INT16* WINPR_RESTRICT pSrc2, INT16* WINPR_RESTRICT pDst, UINT32 len) { - while (len--) - { - INT32 k = (INT32)(*pSrc1++) + (INT32)(*pSrc2++); + const UINT32 rem = len % 16; + const UINT32 align = len - rem; - if (k > INT16_MAX) - *pDst++ = ((INT16)INT16_MAX); - else if (k < INT16_MIN) - *pDst++ = ((INT16)INT16_MIN); - else - *pDst++ = (INT16)k; - } + for (UINT32 x = 0; x < align; x++) + *pDst++ = add(*pSrc1++, *pSrc2++); + + for (UINT32 x = 0; x < rem; x++) + *pDst++ = add(*pSrc1++, *pSrc2++); return PRIMITIVES_SUCCESS; } -static pstatus_t general_add_16s_inplace(INT16* WINPR_RESTRICT pSrcDst, - const INT16* WINPR_RESTRICT pSrc, UINT32 len) +static pstatus_t general_add_16s_inplace(INT16* WINPR_RESTRICT pSrcDst1, + INT16* WINPR_RESTRICT pSrcDst2, UINT32 len) { - while (len--) + for (UINT32 x = 0; x < len; x++) { - INT32 k = (INT32)(*pSrcDst) + (INT32)(*pSrc++); - - if (k > INT16_MAX) - *pSrcDst++ = ((INT16)INT16_MAX); - else if (k < INT16_MIN) - *pSrcDst++ = ((INT16)INT16_MIN); - else - *pSrcDst++ = (INT16)k; + INT16 v = add(pSrcDst1[x], pSrcDst2[x]); + pSrcDst1[x] = v; + pSrcDst2[x] = v; } return PRIMITIVES_SUCCESS; diff --git a/libfreerdp/primitives/prim_add_opt.c b/libfreerdp/primitives/prim_add_opt.c index 87a3ea620..7274683a4 100644 --- a/libfreerdp/primitives/prim_add_opt.c +++ b/libfreerdp/primitives/prim_add_opt.c @@ -35,66 +35,74 @@ static primitives_t* generic = NULL; SSE3_SSD_ROUTINE(sse3_add_16s, INT16, generic->add_16s, _mm_adds_epi16, generic->add_16s(sptr1++, sptr2++, dptr++, 1)) -static pstatus_t sse3_add_16s_inplace(INT16* WINPR_RESTRICT pSrcDst, - const INT16* WINPR_RESTRICT pSrc, UINT32 len) +static pstatus_t sse3_add_16s_inplace(INT16* WINPR_RESTRICT pSrcDst1, + INT16* 
WINPR_RESTRICT pSrcDst2, UINT32 len) { const int shifts = 2; UINT32 offBeatMask; - INT16* dptr = pSrcDst; - const INT16* sptr = pSrc; + INT16* dptr1 = pSrcDst1; + INT16* dptr2 = pSrcDst2; size_t count; if (len < 16) /* pointless if too small */ - return generic->add_16s_inplace(pSrcDst, pSrc, len); + return generic->add_16s_inplace(pSrcDst1, pSrcDst2, len); offBeatMask = (1 << (shifts - 1)) - 1; - if ((ULONG_PTR)pSrcDst & offBeatMask) + if ((ULONG_PTR)pSrcDst1 & offBeatMask) { /* Incrementing the pointer skips over 16-byte boundary. */ - return generic->add_16s_inplace(pSrcDst, pSrc, len); + return generic->add_16s_inplace(pSrcDst1, pSrcDst2, len); } /* Get to the 16-byte boundary now. */ - const size_t rem = (ULONG_PTR)dptr & 0x0f; + const size_t rem = ((UINT_PTR)dptr1 & 0xf) / sizeof(INT16); if (rem != 0) { - pstatus_t status = generic->add_16s_inplace(dptr, sptr, rem); + const size_t add = 16 - rem; + pstatus_t status = generic->add_16s_inplace(dptr1, dptr2, add); if (status != PRIMITIVES_SUCCESS) return status; - dptr += rem; - sptr += rem; + dptr1 += add; + dptr2 += add; } /* Use 4 128-bit SSE registers. 
*/ count = len >> (7 - shifts); len -= count << (7 - shifts); - if (((const ULONG_PTR)dptr & 0x0f) || ((const ULONG_PTR)sptr & 0x0f)) + if (((const ULONG_PTR)dptr1 & 0x0f) || ((const ULONG_PTR)dptr2 & 0x0f)) { /* Unaligned loads */ while (count--) { - const __m128i* sptr1 = dptr; - const __m128i* sptr2 = sptr; - __m128i* dptr1 = dptr; - sptr += 4 * sizeof(__m128i); - dptr += 4 * sizeof(__m128i); + const __m128i* vsptr1 = (const __m128i*)dptr1; + const __m128i* vsptr2 = (const __m128i*)dptr2; + __m128i* vdptr1 = (__m128i*)dptr1; + __m128i* vdptr2 = (__m128i*)dptr2; - __m128i xmm0 = _mm_lddqu_si128(sptr1++); - __m128i xmm1 = _mm_lddqu_si128(sptr1++); - __m128i xmm2 = _mm_lddqu_si128(sptr1++); - __m128i xmm3 = _mm_lddqu_si128(sptr1++); - __m128i xmm4 = _mm_lddqu_si128(sptr2++); - __m128i xmm5 = _mm_lddqu_si128(sptr2++); - __m128i xmm6 = _mm_lddqu_si128(sptr2++); - __m128i xmm7 = _mm_lddqu_si128(sptr2++); + __m128i xmm0 = _mm_lddqu_si128(vsptr1++); + __m128i xmm1 = _mm_lddqu_si128(vsptr1++); + __m128i xmm2 = _mm_lddqu_si128(vsptr1++); + __m128i xmm3 = _mm_lddqu_si128(vsptr1++); + __m128i xmm4 = _mm_lddqu_si128(vsptr2++); + __m128i xmm5 = _mm_lddqu_si128(vsptr2++); + __m128i xmm6 = _mm_lddqu_si128(vsptr2++); + __m128i xmm7 = _mm_lddqu_si128(vsptr2++); xmm0 = _mm_adds_epi16(xmm0, xmm4); xmm1 = _mm_adds_epi16(xmm1, xmm5); xmm2 = _mm_adds_epi16(xmm2, xmm6); xmm3 = _mm_adds_epi16(xmm3, xmm7); - _mm_store_si128(dptr1++, xmm0); - _mm_store_si128(dptr1++, xmm1); - _mm_store_si128(dptr1++, xmm2); - _mm_store_si128(dptr1++, xmm3); + _mm_store_si128(vdptr1++, xmm0); + _mm_store_si128(vdptr1++, xmm1); + _mm_store_si128(vdptr1++, xmm2); + _mm_store_si128(vdptr1++, xmm3); + + _mm_store_si128(vdptr2++, xmm0); + _mm_store_si128(vdptr2++, xmm1); + _mm_store_si128(vdptr2++, xmm2); + _mm_store_si128(vdptr2++, xmm3); + + dptr1 = (INT16*)vdptr1; + dptr2 = (INT16*)vdptr2; } } else @@ -102,30 +110,37 @@ static pstatus_t sse3_add_16s_inplace(INT16* WINPR_RESTRICT pSrcDst, /* Aligned loads */ 
while (count--) { - const __m128i* sptr1 = dptr; - const __m128i* sptr2 = sptr; - __m128i* dptr1 = dptr; - sptr += 4 * sizeof(__m128i); - dptr += 4 * sizeof(__m128i); + const __m128i* vsptr1 = (const __m128i*)dptr1; + const __m128i* vsptr2 = (const __m128i*)dptr2; + __m128i* vdptr1 = (__m128i*)dptr1; + __m128i* vdptr2 = (__m128i*)dptr2; - __m128i xmm0 = _mm_load_si128(sptr1++); - __m128i xmm1 = _mm_load_si128(sptr1++); - __m128i xmm2 = _mm_load_si128(sptr1++); - __m128i xmm3 = _mm_load_si128(sptr1++); - __m128i xmm4 = _mm_load_si128(sptr2++); - __m128i xmm5 = _mm_load_si128(sptr2++); - __m128i xmm6 = _mm_load_si128(sptr2++); - __m128i xmm7 = _mm_load_si128(sptr2++); + __m128i xmm0 = _mm_load_si128(vsptr1++); + __m128i xmm1 = _mm_load_si128(vsptr1++); + __m128i xmm2 = _mm_load_si128(vsptr1++); + __m128i xmm3 = _mm_load_si128(vsptr1++); + __m128i xmm4 = _mm_load_si128(vsptr2++); + __m128i xmm5 = _mm_load_si128(vsptr2++); + __m128i xmm6 = _mm_load_si128(vsptr2++); + __m128i xmm7 = _mm_load_si128(vsptr2++); xmm0 = _mm_adds_epi16(xmm0, xmm4); xmm1 = _mm_adds_epi16(xmm1, xmm5); xmm2 = _mm_adds_epi16(xmm2, xmm6); xmm3 = _mm_adds_epi16(xmm3, xmm7); - _mm_store_si128(dptr1, xmm0); - _mm_store_si128(dptr1, xmm1); - _mm_store_si128(dptr1, xmm2); - _mm_store_si128(dptr1, xmm3); + _mm_store_si128(vdptr1++, xmm0); + _mm_store_si128(vdptr1++, xmm1); + _mm_store_si128(vdptr1++, xmm2); + _mm_store_si128(vdptr1++, xmm3); + + _mm_store_si128(vdptr2++, xmm0); + _mm_store_si128(vdptr2++, xmm1); + _mm_store_si128(vdptr2++, xmm2); + _mm_store_si128(vdptr2++, xmm3); + + dptr1 = (INT16*)vdptr1; + dptr2 = (INT16*)vdptr2; } } /* Use a single 128-bit SSE register. 
*/ @@ -133,19 +148,25 @@ static pstatus_t sse3_add_16s_inplace(INT16* WINPR_RESTRICT pSrcDst, len -= count << (5 - shifts); while (count--) { - const __m128i* sptr1 = sptr; - __m128i* dptr1 = dptr; - sptr += sizeof(__m128i); - dptr += sizeof(__m128i); + const __m128i* vsptr1 = (const __m128i*)dptr1; + const __m128i* vsptr2 = (const __m128i*)dptr2; + __m128i* vdptr1 = (__m128i*)dptr1; + __m128i* vdptr2 = (__m128i*)dptr2; + + __m128i xmm0 = LOAD_SI128(vsptr1); + __m128i xmm1 = LOAD_SI128(vsptr2); - __m128i xmm0 = LOAD_SI128(sptr1); - __m128i xmm1 = LOAD_SI128(dptr1); xmm0 = _mm_adds_epi16(xmm0, xmm1); - _mm_store_si128(dptr, xmm0); + + _mm_store_si128(vdptr1++, xmm0); + _mm_store_si128(vdptr2++, xmm0); + + dptr1 = (INT16*)vdptr1; + dptr2 = (INT16*)vdptr2; } /* Finish off the remainder. */ if (len > 0) - return generic->add_16s_inplace(dptr, sptr, len); + return generic->add_16s_inplace(dptr1, dptr2, len); return PRIMITIVES_SUCCESS; }