mirror of
https://github.com/morgan9e/FreeRDP
synced 2026-04-15 00:44:19 +09:00
[primitives,yuv] fix issues with SSE
* fix unaligned memory access
* fix handling of widths that are not a multiple of 16
This commit is contained in:
@@ -1053,14 +1053,16 @@ static pstatus_t general_RGBToYUV420_8u_P3AC4R(const BYTE* WINPR_RESTRICT pSrc,
|
||||
}
|
||||
}
|
||||
|
||||
static INLINE void general_RGBToAVC444YUV_BGRX_DOUBLE_ROW(
|
||||
const BYTE* WINPR_RESTRICT srcEven, const BYTE* WINPR_RESTRICT srcOdd,
|
||||
INLINE void general_RGBToAVC444YUV_BGRX_DOUBLE_ROW(
|
||||
size_t offset, const BYTE* WINPR_RESTRICT pSrcEven, const BYTE* WINPR_RESTRICT pSrcOdd,
|
||||
BYTE* WINPR_RESTRICT b1Even, BYTE* WINPR_RESTRICT b1Odd, BYTE* WINPR_RESTRICT b2,
|
||||
BYTE* WINPR_RESTRICT b3, BYTE* WINPR_RESTRICT b4, BYTE* WINPR_RESTRICT b5,
|
||||
BYTE* WINPR_RESTRICT b6, BYTE* WINPR_RESTRICT b7, UINT32 width)
|
||||
{
|
||||
for (UINT32 x = 0; x < width; x += 2)
|
||||
for (UINT32 x = offset; x < width; x += 2)
|
||||
{
|
||||
const BYTE* srcEven = &pSrcEven[4ULL * x];
|
||||
const BYTE* srcOdd = &pSrcOdd[4ULL * x];
|
||||
const BOOL lastX = (x + 1) >= width;
|
||||
BYTE Y1e = 0;
|
||||
BYTE Y2e = 0;
|
||||
@@ -1187,8 +1189,8 @@ static INLINE pstatus_t general_RGBToAVC444YUV_BGRX(const BYTE* WINPR_RESTRICT p
|
||||
BYTE* b5 = b4 + 8ULL * dst2Step[0];
|
||||
BYTE* b6 = pDst2[1] + 1ULL * (y / 2) * dst2Step[1];
|
||||
BYTE* b7 = pDst2[2] + 1ULL * (y / 2) * dst2Step[2];
|
||||
general_RGBToAVC444YUV_BGRX_DOUBLE_ROW(srcEven, srcOdd, b1Even, b1Odd, b2, b3, b4, b5, b6,
|
||||
b7, roi->width);
|
||||
general_RGBToAVC444YUV_BGRX_DOUBLE_ROW(0, srcEven, srcOdd, b1Even, b1Odd, b2, b3, b4, b5,
|
||||
b6, b7, roi->width);
|
||||
}
|
||||
|
||||
return PRIMITIVES_SUCCESS;
|
||||
@@ -1807,8 +1809,8 @@ static INLINE pstatus_t general_RGBToAVC444YUVv2_ANY(
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
static INLINE void general_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(
|
||||
const BYTE* WINPR_RESTRICT srcEven, const BYTE* WINPR_RESTRICT srcOdd,
|
||||
inline void general_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(
|
||||
size_t offset, const BYTE* WINPR_RESTRICT pSrcEven, const BYTE* WINPR_RESTRICT pSrcOdd,
|
||||
BYTE* WINPR_RESTRICT yLumaDstEven, BYTE* WINPR_RESTRICT yLumaDstOdd,
|
||||
BYTE* WINPR_RESTRICT uLumaDst, BYTE* WINPR_RESTRICT vLumaDst,
|
||||
BYTE* WINPR_RESTRICT yEvenChromaDst1, BYTE* WINPR_RESTRICT yEvenChromaDst2,
|
||||
@@ -1816,8 +1818,10 @@ static INLINE void general_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(
|
||||
BYTE* WINPR_RESTRICT uChromaDst1, BYTE* WINPR_RESTRICT uChromaDst2,
|
||||
BYTE* WINPR_RESTRICT vChromaDst1, BYTE* WINPR_RESTRICT vChromaDst2, UINT32 width)
|
||||
{
|
||||
for (UINT32 x = 0; x < width; x += 2)
|
||||
for (UINT32 x = offset; x < width; x += 2)
|
||||
{
|
||||
const BYTE* srcEven = &pSrcEven[4ULL * x];
|
||||
const BYTE* srcOdd = &pSrcOdd[4ULL * x];
|
||||
BYTE Ya = 0;
|
||||
BYTE Ua = 0;
|
||||
BYTE Va = 0;
|
||||
@@ -1966,7 +1970,7 @@ static INLINE pstatus_t general_RGBToAVC444YUVv2_BGRX(const BYTE* WINPR_RESTRICT
|
||||
BYTE* dstChromaU2 = dstChromaU1 + roi->width / 4;
|
||||
BYTE* dstChromaV2 = dstChromaV1 + roi->width / 4;
|
||||
general_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(
|
||||
srcEven, srcOdd, dstLumaYEven, dstLumaYOdd, dstLumaU, dstLumaV, dstEvenChromaY1,
|
||||
0, srcEven, srcOdd, dstLumaYEven, dstLumaYOdd, dstLumaU, dstLumaV, dstEvenChromaY1,
|
||||
dstEvenChromaY2, dstOddChromaY1, dstOddChromaY2, dstChromaU1, dstChromaU2, dstChromaV1,
|
||||
dstChromaV2, roi->width);
|
||||
}
|
||||
|
||||
@@ -320,6 +320,21 @@ static INLINE BYTE RGB2V(INT32 R, INT32 G, INT32 B)
|
||||
return WINPR_ASSERTING_INT_CAST(BYTE, val);
|
||||
}
|
||||
|
||||
FREERDP_LOCAL void general_RGBToAVC444YUV_BGRX_DOUBLE_ROW(
|
||||
size_t offset, const BYTE* WINPR_RESTRICT srcEven, const BYTE* WINPR_RESTRICT srcOdd,
|
||||
BYTE* WINPR_RESTRICT b1Even, BYTE* WINPR_RESTRICT b1Odd, BYTE* WINPR_RESTRICT b2,
|
||||
BYTE* WINPR_RESTRICT b3, BYTE* WINPR_RESTRICT b4, BYTE* WINPR_RESTRICT b5,
|
||||
BYTE* WINPR_RESTRICT b6, BYTE* WINPR_RESTRICT b7, UINT32 width);
|
||||
|
||||
FREERDP_LOCAL void general_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(
|
||||
size_t offset, const BYTE* WINPR_RESTRICT pSrcEven, const BYTE* WINPR_RESTRICT pSrcOdd,
|
||||
BYTE* WINPR_RESTRICT yLumaDstEven, BYTE* WINPR_RESTRICT yLumaDstOdd,
|
||||
BYTE* WINPR_RESTRICT uLumaDst, BYTE* WINPR_RESTRICT vLumaDst,
|
||||
BYTE* WINPR_RESTRICT yEvenChromaDst1, BYTE* WINPR_RESTRICT yEvenChromaDst2,
|
||||
BYTE* WINPR_RESTRICT yOddChromaDst1, BYTE* WINPR_RESTRICT yOddChromaDst2,
|
||||
BYTE* WINPR_RESTRICT uChromaDst1, BYTE* WINPR_RESTRICT uChromaDst2,
|
||||
BYTE* WINPR_RESTRICT vChromaDst1, BYTE* WINPR_RESTRICT vChromaDst2, UINT32 width);
|
||||
|
||||
/* Function prototypes for all the init/deinit routines. */
|
||||
FREERDP_LOCAL void primitives_init_copy(primitives_t* WINPR_RESTRICT prims);
|
||||
FREERDP_LOCAL void primitives_init_set(primitives_t* WINPR_RESTRICT prims);
|
||||
|
||||
@@ -41,8 +41,8 @@ static primitives_t* generic = NULL;
|
||||
/****************************************************************************/
|
||||
/* sse41 YUV420 -> RGB conversion */
|
||||
/****************************************************************************/
|
||||
static __m128i* sse41_YUV444Pixel(__m128i* WINPR_RESTRICT dst, __m128i Yraw, __m128i Uraw,
|
||||
__m128i Vraw, UINT8 pos)
|
||||
static inline __m128i* sse41_YUV444Pixel(__m128i* WINPR_RESTRICT dst, __m128i Yraw, __m128i Uraw,
|
||||
__m128i Vraw, UINT8 pos)
|
||||
{
|
||||
const __m128i mapY[] = { mm_set_epu32(0x80800380, 0x80800280, 0x80800180, 0x80800080),
|
||||
mm_set_epu32(0x80800780, 0x80800680, 0x80800580, 0x80800480),
|
||||
@@ -121,10 +121,10 @@ static __m128i* sse41_YUV444Pixel(__m128i* WINPR_RESTRICT dst, __m128i Yraw, __m
|
||||
return dst;
|
||||
}
|
||||
|
||||
static pstatus_t sse41_YUV420ToRGB_BGRX(const BYTE* WINPR_RESTRICT pSrc[],
|
||||
const UINT32* WINPR_RESTRICT srcStep,
|
||||
BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
|
||||
const prim_size_t* WINPR_RESTRICT roi)
|
||||
static inline pstatus_t sse41_YUV420ToRGB_BGRX(const BYTE* WINPR_RESTRICT pSrc[],
|
||||
const UINT32* WINPR_RESTRICT srcStep,
|
||||
BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
|
||||
const prim_size_t* WINPR_RESTRICT roi)
|
||||
{
|
||||
const UINT32 nWidth = roi->width;
|
||||
const UINT32 nHeight = roi->height;
|
||||
@@ -190,9 +190,9 @@ static pstatus_t sse41_YUV420ToRGB(const BYTE* WINPR_RESTRICT pSrc[3], const UIN
|
||||
}
|
||||
}
|
||||
|
||||
static void BGRX_fillRGB(size_t offset, BYTE* WINPR_RESTRICT pRGB[2],
|
||||
const BYTE* WINPR_RESTRICT pY[2], const BYTE* WINPR_RESTRICT pU[2],
|
||||
const BYTE* WINPR_RESTRICT pV[2])
|
||||
static inline void BGRX_fillRGB(size_t offset, BYTE* WINPR_RESTRICT pRGB[2],
|
||||
const BYTE* WINPR_RESTRICT pY[2], const BYTE* WINPR_RESTRICT pU[2],
|
||||
const BYTE* WINPR_RESTRICT pV[2], BOOL filter)
|
||||
{
|
||||
WINPR_ASSERT(pRGB);
|
||||
WINPR_ASSERT(pY);
|
||||
@@ -209,7 +209,7 @@ static void BGRX_fillRGB(size_t offset, BYTE* WINPR_RESTRICT pRGB[2],
|
||||
const BYTE Y = pY[i][offset + j];
|
||||
BYTE U = pU[i][offset + j];
|
||||
BYTE V = pV[i][offset + j];
|
||||
if ((i == 0) && (j == 0))
|
||||
if ((i == 0) && (j == 0) && filter)
|
||||
{
|
||||
const INT32 avgU =
|
||||
4 * pU[0][offset] - pU[0][offset + 1] - pU[1][offset] - pU[1][offset + 1];
|
||||
@@ -223,14 +223,15 @@ static void BGRX_fillRGB(size_t offset, BYTE* WINPR_RESTRICT pRGB[2],
|
||||
const BYTE r = YUV2R(Y, U, V);
|
||||
const BYTE g = YUV2G(Y, U, V);
|
||||
const BYTE b = YUV2B(Y, U, V);
|
||||
pRGB[i] = writePixelBGRX(pRGB[i], bpp, DstFormat, r, g, b, 0);
|
||||
writePixelBGRX(&pRGB[i][(j + offset) * bpp], bpp, DstFormat, r, g, b, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static inline void unpack_mul_add(__m128i toadd[2], __m128i wide, __m128i mul, __m128i sub)
|
||||
static inline void unpack_mul_add(__m128i toadd[2], __m128i narrow, short iMul, __m128i sub)
|
||||
{
|
||||
const __m128i usub = _mm_sub_epi16(wide, sub);
|
||||
const __m128i usub = _mm_sub_epi16(narrow, sub);
|
||||
const __m128i mul = _mm_set1_epi32(iMul);
|
||||
const __m128i umulhi = _mm_mulhi_epi16(usub, mul);
|
||||
const __m128i umullo = _mm_mullo_epi16(usub, mul);
|
||||
{
|
||||
@@ -243,54 +244,87 @@ static inline void unpack_mul_add(__m128i toadd[2], __m128i wide, __m128i mul, _
|
||||
}
|
||||
}
|
||||
|
||||
static inline __m128i sse41_yuv2x(const __m128i Y[2], __m128i U, __m128i V, const short iMulU,
|
||||
const short iMulV)
|
||||
/* Inputs are uint16_t vectors */
|
||||
static inline __m128i sse41_yuv2x_single(const __m128i Y, __m128i U, __m128i V, const short iMulU,
|
||||
const short iMulV)
|
||||
{
|
||||
const __m128i zero = _mm_set1_epi8(0);
|
||||
const __m128i addX = _mm_set1_epi16(128);
|
||||
|
||||
__m128i res[2][2] = { { _mm_unpackhi_epi16(Y[0], zero), _mm_unpacklo_epi16(Y[0], zero) },
|
||||
{
|
||||
_mm_unpackhi_epi16(Y[1], zero),
|
||||
_mm_unpacklo_epi16(Y[1], zero),
|
||||
} };
|
||||
|
||||
__m128i Ylo = _mm_unpacklo_epi16(Y, zero);
|
||||
__m128i Yhi = _mm_unpackhi_epi16(Y, zero);
|
||||
if (iMulU != 0)
|
||||
{
|
||||
const __m128i addX = _mm_set1_epi16(128);
|
||||
const __m128i D = _mm_sub_epi16(U, addX);
|
||||
const __m128i mulU = _mm_set1_epi16(iMulU);
|
||||
unpack_mul_add(res[0], _mm_unpackhi_epi8(U, zero), mulU, addX);
|
||||
unpack_mul_add(res[1], _mm_unpacklo_epi8(U, zero), mulU, addX);
|
||||
const __m128i mulDlo = _mm_mullo_epi16(D, mulU);
|
||||
const __m128i mulDhi = _mm_mulhi_epi16(D, mulU);
|
||||
const __m128i Dlo = _mm_unpacklo_epi16(mulDlo, mulDhi);
|
||||
Ylo = _mm_add_epi32(Ylo, Dlo);
|
||||
|
||||
const __m128i Dhi = _mm_unpackhi_epi16(mulDlo, mulDhi);
|
||||
Yhi = _mm_add_epi32(Yhi, Dhi);
|
||||
}
|
||||
if (iMulV != 0)
|
||||
{
|
||||
const __m128i mulV = _mm_set1_epi16(iMulV);
|
||||
unpack_mul_add(res[0], _mm_unpackhi_epi8(V, zero), mulV, addX);
|
||||
unpack_mul_add(res[1], _mm_unpacklo_epi8(V, zero), mulV, addX);
|
||||
const __m128i addX = _mm_set1_epi16(128);
|
||||
const __m128i E = _mm_sub_epi16(V, addX);
|
||||
const __m128i mul = _mm_set1_epi16(iMulV);
|
||||
const __m128i mulElo = _mm_mullo_epi16(E, mul);
|
||||
const __m128i mulEhi = _mm_mulhi_epi16(E, mul);
|
||||
const __m128i Elo = _mm_unpacklo_epi16(mulElo, mulEhi);
|
||||
const __m128i esumlo = _mm_add_epi32(Ylo, Elo);
|
||||
|
||||
const __m128i Ehi = _mm_unpackhi_epi16(mulElo, mulEhi);
|
||||
const __m128i esumhi = _mm_add_epi32(Yhi, Ehi);
|
||||
Ylo = esumlo;
|
||||
Yhi = esumhi;
|
||||
}
|
||||
|
||||
res[0][0] = _mm_srai_epi32(res[0][0], 8);
|
||||
res[0][1] = _mm_srai_epi32(res[0][1], 8);
|
||||
res[1][0] = _mm_srai_epi32(res[1][0], 8);
|
||||
res[1][1] = _mm_srai_epi32(res[1][1], 8);
|
||||
const __m128i pres1 = _mm_packs_epi32(res[0][0], res[0][1]);
|
||||
const __m128i pres2 = _mm_packs_epi32(res[1][0], res[1][1]);
|
||||
return _mm_packus_epi16(pres1, pres2);
|
||||
const __m128i rYlo = _mm_srai_epi32(Ylo, 8);
|
||||
const __m128i rYhi = _mm_srai_epi32(Yhi, 8);
|
||||
const __m128i rY = _mm_packs_epi32(rYlo, rYhi);
|
||||
return rY;
|
||||
}
|
||||
|
||||
/* Input are uint8_t vectors */
|
||||
static inline __m128i sse41_yuv2x(const __m128i Y, __m128i U, __m128i V, const short iMulU,
|
||||
const short iMulV)
|
||||
{
|
||||
const __m128i zero = _mm_set1_epi8(0);
|
||||
|
||||
/* Ylo = Y * 256
|
||||
* Ulo = uint8_t -> uint16_t
|
||||
* Vlo = uint8_t -> uint16_t
|
||||
*/
|
||||
const __m128i Ylo = _mm_unpacklo_epi8(zero, Y);
|
||||
const __m128i Ulo = _mm_unpacklo_epi8(U, zero);
|
||||
const __m128i Vlo = _mm_unpacklo_epi8(V, zero);
|
||||
const __m128i preslo = sse41_yuv2x_single(Ylo, Ulo, Vlo, iMulU, iMulV);
|
||||
|
||||
const __m128i Yhi = _mm_unpackhi_epi8(zero, Y);
|
||||
const __m128i Uhi = _mm_unpackhi_epi8(U, zero);
|
||||
const __m128i Vhi = _mm_unpackhi_epi8(V, zero);
|
||||
const __m128i preshi = sse41_yuv2x_single(Yhi, Uhi, Vhi, iMulU, iMulV);
|
||||
const __m128i res = _mm_packus_epi16(preslo, preshi);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
/* const INT32 r = ((256L * C(Y) + 0L * D(U) + 403L * E(V))) >> 8; */
|
||||
static inline __m128i sse41_yuv2r(const __m128i Y[2], __m128i U, __m128i V)
|
||||
static inline __m128i sse41_yuv2r(const __m128i Y, __m128i U, __m128i V)
|
||||
{
|
||||
return sse41_yuv2x(Y, U, V, 0, 403);
|
||||
}
|
||||
|
||||
/* const INT32 g = ((256L * C(Y) - 48L * D(U) - 120L * E(V))) >> 8; */
|
||||
static inline __m128i sse41_yuv2g(const __m128i Y[2], __m128i U, __m128i V)
|
||||
static inline __m128i sse41_yuv2g(const __m128i Y, __m128i U, __m128i V)
|
||||
{
|
||||
return sse41_yuv2x(Y, U, V, -48, -120);
|
||||
}
|
||||
|
||||
/* const INT32 b = ((256L * C(Y) + 475L * D(U) + 0L * E(V))) >> 8; */
|
||||
static inline __m128i sse41_yuv2b(const __m128i Y[2], __m128i U, __m128i V)
|
||||
static inline __m128i sse41_yuv2b(const __m128i Y, __m128i U, __m128i V)
|
||||
{
|
||||
return sse41_yuv2x(Y, U, V, 475, 0);
|
||||
}
|
||||
@@ -300,36 +334,101 @@ static inline void sse41_BGRX_fillRGB_pixel(BYTE* WINPR_RESTRICT pRGB, __m128i Y
|
||||
{
|
||||
const __m128i zero = _mm_set1_epi8(0);
|
||||
/* Y * 256 */
|
||||
const __m128i yY[] = { _mm_unpackhi_epi8(zero, Y), _mm_unpacklo_epi8(zero, Y) };
|
||||
const __m128i r = sse41_yuv2r(yY, U, V);
|
||||
const __m128i r = sse41_yuv2r(Y, U, V);
|
||||
const __m128i rx[2] = { _mm_unpackhi_epi8(r, zero), _mm_unpacklo_epi8(r, zero) };
|
||||
|
||||
const __m128i g = sse41_yuv2g(yY, U, V);
|
||||
const __m128i b = sse41_yuv2b(yY, U, V);
|
||||
const __m128i g = sse41_yuv2g(Y, U, V);
|
||||
const __m128i b = sse41_yuv2b(Y, U, V);
|
||||
|
||||
const __m128i bg[2] = { _mm_unpackhi_epi8(b, g), _mm_unpacklo_epi8(b, g) };
|
||||
|
||||
const __m128i mask = mm_set_epu8(0x00, 0xFF, 0xFF, 0xFF, 0x00, 0xFF, 0xFF, 0xFF, 0x00, 0xFF,
|
||||
0xFF, 0xFF, 0x00, 0xFF, 0xFF, 0xFF);
|
||||
|
||||
__m128i* rgb = (__m128i*)pRGB;
|
||||
const __m128i bgrx0 = _mm_unpackhi_epi16(bg[0], rx[0]);
|
||||
const __m128i bgrx0 = _mm_unpacklo_epi16(bg[1], rx[1]);
|
||||
_mm_maskmoveu_si128(bgrx0, mask, (char*)&rgb[0]);
|
||||
const __m128i bgrx1 = _mm_unpacklo_epi16(bg[0], rx[0]);
|
||||
const __m128i bgrx1 = _mm_unpackhi_epi16(bg[1], rx[1]);
|
||||
_mm_maskmoveu_si128(bgrx1, mask, (char*)&rgb[1]);
|
||||
const __m128i bgrx2 = _mm_unpackhi_epi16(bg[1], rx[1]);
|
||||
const __m128i bgrx2 = _mm_unpacklo_epi16(bg[0], rx[0]);
|
||||
_mm_maskmoveu_si128(bgrx2, mask, (char*)&rgb[2]);
|
||||
const __m128i bgrx3 = _mm_unpacklo_epi16(bg[1], rx[1]);
|
||||
const __m128i bgrx3 = _mm_unpackhi_epi16(bg[0], rx[0]);
|
||||
_mm_maskmoveu_si128(bgrx3, mask, (char*)&rgb[3]);
|
||||
}
|
||||
|
||||
static void sse41_BGRX_fillRGB(BYTE* WINPR_RESTRICT pRGB[2], const __m128i pY[2],
|
||||
const __m128i pU[2], const __m128i pV[2])
|
||||
/* Widen the 16 bytes of u1 to 16 bit lanes and add horizontally adjacent
 * lanes (saturating signed add), producing eight pair sums.
 * Called by sse41_filter() with pU[1]. */
static inline __m128i odd1sum(__m128i u1)
{
	const __m128i nil = _mm_set1_epi8(0);
	const __m128i lower = _mm_unpacklo_epi8(u1, nil);
	const __m128i upper = _mm_unpackhi_epi8(u1, nil);

	/* lower half first, so the pair sums keep the input order */
	return _mm_hadds_epi16(lower, upper);
}
|
||||
|
||||
/* Add selected bytes of u0, widened to 16 bit lanes, to the sums in u1sum
 * (saturating signed add).
 *
 * u0     vector of 16 bytes
 * u1sum  eight 16 bit sums, as produced by odd1sum()
 */
static inline __m128i odd0sum(__m128i u0, __m128i u1sum)
{
	/* Shuffle control: drops the even bytes of u0 and zero extends the
	 * remaining bytes to uint16_t (a control byte of 0x80 yields zero).
	 * NOTE(review): assumes mm_set_epu8 takes the most significant byte
	 * first, like _mm_set_epi8 - confirm against its definition. */
	const __m128i pick = mm_set_epu8(0x80, 0x0F, 0x80, 0x0D, 0x80, 0x0B, 0x80, 0x09, 0x80, 0x07,
	                                 0x80, 0x05, 0x80, 0x03, 0x80, 0x01);
	const __m128i widened = _mm_shuffle_epi8(u0, pick);

	return _mm_adds_epi16(u1sum, widened);
}
|
||||
|
||||
/* Per 16 bit lane compute 4 * u0even - sum, clamp the result to 0..255 and
 * return it in 16 bit lanes again.
 *
 * u0even  16 bit lanes holding the values to be scaled by four
 * sum     16 bit lanes holding the value to subtract
 */
static inline __m128i calcavg(__m128i u0even, __m128i sum)
{
	const __m128i nil = _mm_set1_epi8(0);
	/* Shuffle control used to spread the eight packed bytes back out to one
	 * byte per 16 bit lane (0x80 yields a zero byte).
	 * NOTE(review): assumes mm_set_epu8 takes the most significant byte
	 * first, like _mm_set_epi8 - confirm against its definition. */
	const __m128i spread = mm_set_epu8(0x80, 0x07, 0x80, 0x06, 0x80, 0x05, 0x80, 0x04, 0x80, 0x03,
	                                   0x80, 0x02, 0x80, 0x01, 0x80, 0x00);

	const __m128i scaled = _mm_slli_epi16(u0even, 2); /* * 4 */
	const __m128i delta = _mm_sub_epi16(scaled, sum);

	/* unsigned saturation: negative values become 0, values > 255 become 255 */
	const __m128i clamped = _mm_packus_epi16(delta, nil);

	return _mm_shuffle_epi8(clamped, spread);
}
|
||||
|
||||
/* Build a per lane selection mask from the distance between u0even and avg.
 *
 * A lane is all ones when |u0even - avg| < 30 and all zeros otherwise.
 * sse41_filter() keeps the original value where the mask is set and takes
 * avg everywhere else, i.e. avg is applied for distances >= 30.
 * The calculation is done on int16 lanes to avoid issues with signed 8 bit
 * integers.
 */
static inline __m128i diffmask(__m128i avg, __m128i u0even)
{
	const __m128i limit = _mm_set1_epi16(30);
	const __m128i distance = _mm_abs_epi16(_mm_subs_epi16(u0even, avg));

	return _mm_cmplt_epi16(distance, limit);
}
|
||||
|
||||
/* Filter one chroma vector pair in place.
 *
 * pU[0] is rewritten, pU[1] is only read. The bytes of pU[0] selected by
 * the 'even' mask are replaced by the value from calcavg() wherever
 * diffmask() reports a cleared lane; the bytes selected by the 'odd' mask
 * are carried over unchanged.
 */
static inline void sse41_filter(__m128i pU[2])
{
	/* Byte masks. The even mask removes the odd bytes, which at the same
	 * time leaves every kept byte as a valid uint16_t lane, so no further
	 * widening is needed. The odd mask is its complement. */
	const __m128i keepEven = mm_set_epu8(0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff,
	                                     0x00, 0xff, 0x00, 0xff, 0x00, 0xff);
	const __m128i keepOdd = mm_set_epu8(0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00,
	                                    0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00);

	/* sum of the three neighbours that get subtracted in calcavg() */
	const __m128i neighbours = odd0sum(pU[0], odd1sum(pU[1]));

	const __m128i even = _mm_and_si128(pU[0], keepEven);
	const __m128i filtered = calcavg(even, neighbours);
	const __m128i keepOrig = diffmask(filtered, even);

	/* blend: original value where keepOrig is set, filtered value elsewhere */
	const __m128i fromOrig = _mm_and_si128(even, keepOrig);
	const __m128i fromAvg = _mm_andnot_si128(keepOrig, filtered);
	const __m128i evenOut = _mm_or_si128(fromOrig, fromAvg);

	/* merge the untouched odd bytes back in */
	const __m128i oddOut = _mm_and_si128(pU[0], keepOdd);
	pU[0] = _mm_or_si128(evenOut, oddOut);
}
|
||||
|
||||
static inline void sse41_BGRX_fillRGB(BYTE* WINPR_RESTRICT pRGB[2], const __m128i pY[2],
|
||||
__m128i pU[2], __m128i pV[2])
|
||||
{
|
||||
WINPR_ASSERT(pRGB);
|
||||
WINPR_ASSERT(pY);
|
||||
WINPR_ASSERT(pU);
|
||||
WINPR_ASSERT(pV);
|
||||
|
||||
sse41_filter(pU);
|
||||
sse41_filter(pV);
|
||||
|
||||
for (size_t i = 0; i < 2; i++)
|
||||
{
|
||||
sse41_BGRX_fillRGB_pixel(pRGB[i], pY[i], pU[i], pV[i]);
|
||||
@@ -342,21 +441,24 @@ static inline pstatus_t sse41_YUV444ToRGB_8u_P3AC4R_BGRX_DOUBLE_ROW(
|
||||
{
|
||||
WINPR_ASSERT((nWidth % 2) == 0);
|
||||
const UINT32 pad = nWidth % 16;
|
||||
for (size_t x = 0; x < nWidth - pad; x += 16)
|
||||
|
||||
size_t x = 0;
|
||||
for (; x < nWidth - pad; x += 16)
|
||||
{
|
||||
const __m128i Y[] = { _mm_load_si128((const __m128i*)&YData[0][x]),
|
||||
_mm_load_si128((const __m128i*)&YData[1][x]) };
|
||||
const __m128i U[] = { _mm_load_si128((const __m128i*)&UData[0][x]),
|
||||
_mm_load_si128((const __m128i*)&UData[1][x]) };
|
||||
const __m128i V[] = { _mm_load_si128((const __m128i*)&VData[0][x]),
|
||||
_mm_load_si128((const __m128i*)&VData[1][x]) };
|
||||
const __m128i Y[] = { _mm_loadu_si128((const __m128i*)&YData[0][x]),
|
||||
_mm_loadu_si128((const __m128i*)&YData[1][x]) };
|
||||
__m128i U[] = { _mm_loadu_si128((const __m128i*)&UData[0][x]),
|
||||
_mm_loadu_si128((const __m128i*)&UData[1][x]) };
|
||||
__m128i V[] = { _mm_loadu_si128((const __m128i*)&VData[0][x]),
|
||||
_mm_loadu_si128((const __m128i*)&VData[1][x]) };
|
||||
|
||||
BYTE* dstp[] = { &pDst[0][x * 4], &pDst[1][x * 4] };
|
||||
sse41_BGRX_fillRGB(dstp, Y, U, V);
|
||||
}
|
||||
|
||||
for (size_t x = nWidth - pad; x < nWidth; x += 2)
|
||||
for (; x < nWidth; x += 2)
|
||||
{
|
||||
BGRX_fillRGB(x, pDst, YData, UData, VData);
|
||||
BGRX_fillRGB(x, pDst, YData, UData, VData, TRUE);
|
||||
}
|
||||
|
||||
return PRIMITIVES_SUCCESS;
|
||||
@@ -392,10 +494,6 @@ static pstatus_t sse41_YUV444ToRGB_8u_P3AC4R(const BYTE* WINPR_RESTRICT pSrc[],
|
||||
UINT32 dstStep, UINT32 DstFormat,
|
||||
const prim_size_t* WINPR_RESTRICT roi)
|
||||
{
|
||||
if ((uintptr_t)pSrc[0] % 16 || (uintptr_t)pSrc[1] % 16 || (uintptr_t)pSrc[2] % 16 ||
|
||||
srcStep[0] % 16 || srcStep[1] % 16 || srcStep[2] % 16)
|
||||
return generic->YUV444ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
|
||||
|
||||
switch (DstFormat)
|
||||
{
|
||||
case PIXEL_FORMAT_BGRX32:
|
||||
@@ -460,41 +558,63 @@ PRIM_ALIGN_128 static const BYTE rgbx_v_factors[] = {
|
||||
};
|
||||
*/
|
||||
|
||||
/* Scalar conversion of a single source pixel to Y, U and V.
 *
 * pLine1  source pixel; byte 0 is used as blue, byte 1 as green and
 *         byte 2 as red
 * pYLine  receives RGB2Y() of the pixel, skipped when NULL
 * pULine  receives RGB2U() of the pixel, skipped when NULL
 * pVLine  receives RGB2V() of the pixel, skipped when NULL
 */
static inline void sse41_BGRX_TO_YUV(const BYTE* WINPR_RESTRICT pLine1, BYTE* WINPR_RESTRICT pYLine,
                                     BYTE* WINPR_RESTRICT pULine, BYTE* WINPR_RESTRICT pVLine)
{
	const BYTE blue = pLine1[0];
	const BYTE green = pLine1[1];
	const BYTE red = pLine1[2];

	if (pYLine)
	{
		*pYLine = RGB2Y(red, green, blue);
	}

	if (pULine)
	{
		*pULine = RGB2U(red, green, blue);
	}

	if (pVLine)
	{
		*pVLine = RGB2V(red, green, blue);
	}
}
|
||||
|
||||
/* compute the luma (Y) component from a single rgb source line */
|
||||
|
||||
static INLINE void sse41_RGBToYUV420_BGRX_Y(const BYTE* WINPR_RESTRICT src, BYTE* dst, UINT32 width)
|
||||
{
|
||||
__m128i x0;
|
||||
__m128i x1;
|
||||
__m128i x2;
|
||||
__m128i x3;
|
||||
const __m128i y_factors = BGRX_Y_FACTORS;
|
||||
const __m128i* argb = (const __m128i*)src;
|
||||
__m128i* ydst = (__m128i*)dst;
|
||||
|
||||
for (UINT32 x = 0; x < width; x += 16)
|
||||
UINT32 x = 0;
|
||||
|
||||
for (; x < width - width % 16; x += 16)
|
||||
{
|
||||
/* store 16 rgba pixels in 4 128 bit registers */
|
||||
x0 = _mm_load_si128(argb++); // 1st 4 pixels
|
||||
x1 = _mm_load_si128(argb++); // 2nd 4 pixels
|
||||
x2 = _mm_load_si128(argb++); // 3rd 4 pixels
|
||||
x3 = _mm_load_si128(argb++); // 4th 4 pixels
|
||||
/* multiplications and subtotals */
|
||||
x0 = _mm_maddubs_epi16(x0, y_factors);
|
||||
x1 = _mm_maddubs_epi16(x1, y_factors);
|
||||
x2 = _mm_maddubs_epi16(x2, y_factors);
|
||||
x3 = _mm_maddubs_epi16(x3, y_factors);
|
||||
/* the total sums */
|
||||
x0 = _mm_hadd_epi16(x0, x1);
|
||||
x2 = _mm_hadd_epi16(x2, x3);
|
||||
/* shift the results */
|
||||
x0 = _mm_srli_epi16(x0, Y_SHIFT);
|
||||
x2 = _mm_srli_epi16(x2, Y_SHIFT);
|
||||
/* pack the 16 words into bytes */
|
||||
__m128i x0 = _mm_loadu_si128(argb++); // 1st 4 pixels
|
||||
{
|
||||
x0 = _mm_maddubs_epi16(x0, y_factors);
|
||||
|
||||
__m128i x1 = _mm_loadu_si128(argb++); // 2nd 4 pixels
|
||||
x1 = _mm_maddubs_epi16(x1, y_factors);
|
||||
x0 = _mm_hadds_epi16(x0, x1);
|
||||
x0 = _mm_srli_epi16(x0, Y_SHIFT);
|
||||
}
|
||||
|
||||
__m128i x2 = _mm_loadu_si128(argb++); // 3rd 4 pixels
|
||||
{
|
||||
x2 = _mm_maddubs_epi16(x2, y_factors);
|
||||
|
||||
__m128i x3 = _mm_loadu_si128(argb++); // 4th 4 pixels
|
||||
x3 = _mm_maddubs_epi16(x3, y_factors);
|
||||
x2 = _mm_hadds_epi16(x2, x3);
|
||||
x2 = _mm_srli_epi16(x2, Y_SHIFT);
|
||||
}
|
||||
|
||||
x0 = _mm_packus_epi16(x0, x2);
|
||||
/* save to y plane */
|
||||
_mm_storeu_si128(ydst++, x0);
|
||||
}
|
||||
|
||||
for (; x < width; x++)
|
||||
{
|
||||
sse41_BGRX_TO_YUV(&src[4ULL * x], &dst[x], NULL, NULL);
|
||||
}
|
||||
}
|
||||
|
||||
/* compute the chrominance (UV) components from two rgb source lines */
|
||||
@@ -507,32 +627,33 @@ static INLINE void sse41_RGBToYUV420_BGRX_UV(const BYTE* WINPR_RESTRICT src1,
|
||||
const __m128i u_factors = BGRX_U_FACTORS;
|
||||
const __m128i v_factors = BGRX_V_FACTORS;
|
||||
const __m128i vector128 = CONST128_FACTORS;
|
||||
__m128i x0;
|
||||
__m128i x1;
|
||||
__m128i x2;
|
||||
__m128i x3;
|
||||
__m128i x4;
|
||||
__m128i x5;
|
||||
const __m128i* rgb1 = (const __m128i*)src1;
|
||||
const __m128i* rgb2 = (const __m128i*)src2;
|
||||
__m64* udst = (__m64*)dst1;
|
||||
__m64* vdst = (__m64*)dst2;
|
||||
|
||||
for (UINT32 x = 0; x < width; x += 16)
|
||||
size_t x = 0;
|
||||
|
||||
for (; x < width - width % 16; x += 16)
|
||||
{
|
||||
const __m128i* rgb1 = (const __m128i*)&src1[4ULL * x];
|
||||
const __m128i* rgb2 = (const __m128i*)&src2[4ULL * x];
|
||||
__m64* udst = (__m64*)&dst1[x / 2];
|
||||
__m64* vdst = (__m64*)&dst2[x / 2];
|
||||
|
||||
/* subsample 16x2 pixels into 16x1 pixels */
|
||||
x0 = _mm_load_si128(rgb1++);
|
||||
x4 = _mm_load_si128(rgb2++);
|
||||
__m128i x0 = _mm_loadu_si128(&rgb1[0]);
|
||||
__m128i x4 = _mm_loadu_si128(&rgb2[0]);
|
||||
x0 = _mm_avg_epu8(x0, x4);
|
||||
x1 = _mm_load_si128(rgb1++);
|
||||
x4 = _mm_load_si128(rgb2++);
|
||||
|
||||
__m128i x1 = _mm_loadu_si128(&rgb1[1]);
|
||||
x4 = _mm_loadu_si128(&rgb2[1]);
|
||||
x1 = _mm_avg_epu8(x1, x4);
|
||||
x2 = _mm_load_si128(rgb1++);
|
||||
x4 = _mm_load_si128(rgb2++);
|
||||
|
||||
__m128i x2 = _mm_loadu_si128(&rgb1[2]);
|
||||
x4 = _mm_loadu_si128(&rgb2[2]);
|
||||
x2 = _mm_avg_epu8(x2, x4);
|
||||
x3 = _mm_load_si128(rgb1++);
|
||||
x4 = _mm_load_si128(rgb2++);
|
||||
|
||||
__m128i x3 = _mm_loadu_si128(&rgb1[3]);
|
||||
x4 = _mm_loadu_si128(&rgb2[3]);
|
||||
x3 = _mm_avg_epu8(x3, x4);
|
||||
|
||||
/* subsample these 16x1 pixels into 8x1 pixels */
|
||||
/**
|
||||
* shuffle controls
|
||||
@@ -549,7 +670,7 @@ static INLINE void sse41_RGBToYUV420_BGRX_UV(const BYTE* WINPR_RESTRICT src1,
|
||||
x2 = _mm_maddubs_epi16(x0, u_factors);
|
||||
x3 = _mm_maddubs_epi16(x1, u_factors);
|
||||
x4 = _mm_maddubs_epi16(x0, v_factors);
|
||||
x5 = _mm_maddubs_epi16(x1, v_factors);
|
||||
__m128i x5 = _mm_maddubs_epi16(x1, v_factors);
|
||||
/* the total sums */
|
||||
x0 = _mm_hadd_epi16(x2, x3);
|
||||
x1 = _mm_hadd_epi16(x4, x5);
|
||||
@@ -561,50 +682,60 @@ static INLINE void sse41_RGBToYUV420_BGRX_UV(const BYTE* WINPR_RESTRICT src1,
|
||||
/* add 128 */
|
||||
x0 = _mm_sub_epi8(x0, vector128);
|
||||
/* the lower 8 bytes go to the u plane */
|
||||
_mm_storel_pi(udst++, _mm_castsi128_ps(x0));
|
||||
_mm_storel_pi(udst, _mm_castsi128_ps(x0));
|
||||
/* the upper 8 bytes go to the v plane */
|
||||
_mm_storeh_pi(vdst++, _mm_castsi128_ps(x0));
|
||||
_mm_storeh_pi(vdst, _mm_castsi128_ps(x0));
|
||||
}
|
||||
|
||||
for (; x < width - width % 2; x += 2)
|
||||
{
|
||||
BYTE u[4] = { 0 };
|
||||
BYTE v[4] = { 0 };
|
||||
sse41_BGRX_TO_YUV(&src1[4ULL * x], NULL, &u[0], &v[0]);
|
||||
sse41_BGRX_TO_YUV(&src1[4ULL * (1ULL + x)], NULL, &u[1], &v[1]);
|
||||
sse41_BGRX_TO_YUV(&src2[4ULL * x], NULL, &u[2], &v[2]);
|
||||
sse41_BGRX_TO_YUV(&src2[4ULL * (1ULL + x)], NULL, &u[3], &v[3]);
|
||||
const INT16 u4 = (INT16)u[0] + u[1] + u[2] + u[3];
|
||||
const INT16 uu = u4 / 4;
|
||||
const BYTE u8 = CLIP(uu);
|
||||
dst1[x / 2] = u8;
|
||||
|
||||
const INT16 v4 = (INT16)v[0] + v[1] + v[2] + v[3];
|
||||
const INT16 vu = v4 / 4;
|
||||
const BYTE v8 = CLIP(vu);
|
||||
dst2[x / 2] = v8;
|
||||
}
|
||||
}
|
||||
|
||||
static pstatus_t sse41_RGBToYUV420_BGRX(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
|
||||
UINT32 srcStep, BYTE* WINPR_RESTRICT pDst[],
|
||||
const UINT32 dstStep[],
|
||||
static pstatus_t sse41_RGBToYUV420_BGRX(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcStep,
|
||||
BYTE* WINPR_RESTRICT pDst[], const UINT32 dstStep[],
|
||||
const prim_size_t* WINPR_RESTRICT roi)
|
||||
{
|
||||
const BYTE* argb = pSrc;
|
||||
BYTE* ydst = pDst[0];
|
||||
BYTE* udst = pDst[1];
|
||||
BYTE* vdst = pDst[2];
|
||||
|
||||
if (roi->height < 1 || roi->width < 1)
|
||||
{
|
||||
return !PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
if (roi->width % 16 || (uintptr_t)pSrc % 16 || srcStep % 16)
|
||||
size_t y = 0;
|
||||
for (; y < roi->height - roi->height % 2; y += 2)
|
||||
{
|
||||
return generic->RGBToYUV420_8u_P3AC4R(pSrc, srcFormat, srcStep, pDst, dstStep, roi);
|
||||
}
|
||||
const BYTE* line1 = &pSrc[y * srcStep];
|
||||
const BYTE* line2 = &pSrc[(1ULL + y) * srcStep];
|
||||
BYTE* ydst1 = &pDst[0][y * dstStep[0]];
|
||||
BYTE* ydst2 = &pDst[0][(1ULL + y) * dstStep[0]];
|
||||
BYTE* udst = &pDst[1][y / 2 * dstStep[1]];
|
||||
BYTE* vdst = &pDst[2][y / 2 * dstStep[2]];
|
||||
|
||||
for (UINT32 y = 0; y < roi->height - 1; y += 2)
|
||||
{
|
||||
const BYTE* line1 = argb;
|
||||
const BYTE* line2 = argb + srcStep;
|
||||
sse41_RGBToYUV420_BGRX_UV(line1, line2, udst, vdst, roi->width);
|
||||
sse41_RGBToYUV420_BGRX_Y(line1, ydst, roi->width);
|
||||
sse41_RGBToYUV420_BGRX_Y(line2, ydst + dstStep[0], roi->width);
|
||||
argb += 2ULL * srcStep;
|
||||
ydst += 2ULL * dstStep[0];
|
||||
udst += 1ULL * dstStep[1];
|
||||
vdst += 1ULL * dstStep[2];
|
||||
sse41_RGBToYUV420_BGRX_Y(line1, ydst1, roi->width);
|
||||
sse41_RGBToYUV420_BGRX_Y(line2, ydst2, roi->width);
|
||||
}
|
||||
|
||||
if (roi->height & 1)
|
||||
for (; y < roi->height; y++)
|
||||
{
|
||||
/* pass the same last line of an odd height twice for UV */
|
||||
sse41_RGBToYUV420_BGRX_UV(argb, argb, udst, vdst, roi->width);
|
||||
sse41_RGBToYUV420_BGRX_Y(argb, ydst, roi->width);
|
||||
const BYTE* line = &pSrc[y * srcStep];
|
||||
BYTE* ydst = &pDst[0][1ULL * y * dstStep[0]];
|
||||
sse41_RGBToYUV420_BGRX_Y(line, ydst, roi->width);
|
||||
}
|
||||
|
||||
return PRIMITIVES_SUCCESS;
|
||||
@@ -618,7 +749,7 @@ static pstatus_t sse41_RGBToYUV420(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFo
|
||||
{
|
||||
case PIXEL_FORMAT_BGRX32:
|
||||
case PIXEL_FORMAT_BGRA32:
|
||||
return sse41_RGBToYUV420_BGRX(pSrc, srcFormat, srcStep, pDst, dstStep, roi);
|
||||
return sse41_RGBToYUV420_BGRX(pSrc, srcStep, pDst, dstStep, roi);
|
||||
|
||||
default:
|
||||
return generic->RGBToYUV420_8u_P3AC4R(pSrc, srcFormat, srcStep, pDst, dstStep, roi);
|
||||
@@ -642,17 +773,18 @@ static INLINE void sse41_RGBToAVC444YUV_BGRX_DOUBLE_ROW(
|
||||
const __m128i v_factors = BGRX_V_FACTORS;
|
||||
const __m128i vector128 = CONST128_FACTORS;
|
||||
|
||||
for (UINT32 x = 0; x < width; x += 16)
|
||||
UINT32 x = 0;
|
||||
for (; x < width - width % 16; x += 16)
|
||||
{
|
||||
/* store 16 rgba pixels in 4 128 bit registers */
|
||||
const __m128i xe1 = _mm_load_si128(argbEven++); // 1st 4 pixels
|
||||
const __m128i xe2 = _mm_load_si128(argbEven++); // 2nd 4 pixels
|
||||
const __m128i xe3 = _mm_load_si128(argbEven++); // 3rd 4 pixels
|
||||
const __m128i xe4 = _mm_load_si128(argbEven++); // 4th 4 pixels
|
||||
const __m128i xo1 = _mm_load_si128(argbOdd++); // 1st 4 pixels
|
||||
const __m128i xo2 = _mm_load_si128(argbOdd++); // 2nd 4 pixels
|
||||
const __m128i xo3 = _mm_load_si128(argbOdd++); // 3rd 4 pixels
|
||||
const __m128i xo4 = _mm_load_si128(argbOdd++); // 4th 4 pixels
|
||||
const __m128i xe1 = _mm_loadu_si128(argbEven++); // 1st 4 pixels
|
||||
const __m128i xe2 = _mm_loadu_si128(argbEven++); // 2nd 4 pixels
|
||||
const __m128i xe3 = _mm_loadu_si128(argbEven++); // 3rd 4 pixels
|
||||
const __m128i xe4 = _mm_loadu_si128(argbEven++); // 4th 4 pixels
|
||||
const __m128i xo1 = _mm_loadu_si128(argbOdd++); // 1st 4 pixels
|
||||
const __m128i xo2 = _mm_loadu_si128(argbOdd++); // 2nd 4 pixels
|
||||
const __m128i xo3 = _mm_loadu_si128(argbOdd++); // 3rd 4 pixels
|
||||
const __m128i xo4 = _mm_loadu_si128(argbOdd++); // 4th 4 pixels
|
||||
{
|
||||
/* Y: multiplications with subtotals and horizontal sums */
|
||||
const __m128i ye1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, y_factors),
|
||||
@@ -743,7 +875,7 @@ static INLINE void sse41_RGBToAVC444YUV_BGRX_DOUBLE_ROW(
|
||||
|
||||
if (b1Odd) /* b4 */
|
||||
{
|
||||
_mm_store_si128((__m128i*)b4, uo);
|
||||
_mm_storeu_si128((__m128i*)b4, uo);
|
||||
b4 += 16;
|
||||
}
|
||||
|
||||
@@ -821,7 +953,7 @@ static INLINE void sse41_RGBToAVC444YUV_BGRX_DOUBLE_ROW(
|
||||
|
||||
if (b1Odd) /* b5 */
|
||||
{
|
||||
_mm_store_si128((__m128i*)b5, vo);
|
||||
_mm_storeu_si128((__m128i*)b5, vo);
|
||||
b5 += 16;
|
||||
}
|
||||
|
||||
@@ -836,6 +968,9 @@ static INLINE void sse41_RGBToAVC444YUV_BGRX_DOUBLE_ROW(
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
general_RGBToAVC444YUV_BGRX_DOUBLE_ROW(x, srcEven, srcOdd, b1Even, b1Odd, b2, b3, b4, b5, b6,
|
||||
b7, width);
|
||||
}
|
||||
|
||||
static pstatus_t sse41_RGBToAVC444YUV_BGRX(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
|
||||
@@ -849,10 +984,6 @@ static pstatus_t sse41_RGBToAVC444YUV_BGRX(const BYTE* WINPR_RESTRICT pSrc, UINT
|
||||
if (roi->height < 1 || roi->width < 1)
|
||||
return !PRIMITIVES_SUCCESS;
|
||||
|
||||
if (roi->width % 16 || (uintptr_t)pSrc % 16 || srcStep % 16)
|
||||
return generic->RGBToAVC444YUV(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, dst2Step,
|
||||
roi);
|
||||
|
||||
for (size_t y = 0; y < roi->height; y += 2)
|
||||
{
|
||||
const BOOL last = (y >= (roi->height - 1));
|
||||
@@ -920,19 +1051,20 @@ static INLINE void sse41_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(
|
||||
const __m128i* argbEven = (const __m128i*)srcEven;
|
||||
const __m128i* argbOdd = (const __m128i*)srcOdd;
|
||||
|
||||
for (UINT32 x = 0; x < width; x += 16)
|
||||
UINT32 x = 0;
|
||||
for (; x < width - width % 16; x += 16)
|
||||
{
|
||||
/* store 16 rgba pixels in 4 128 bit registers
|
||||
* for even and odd rows.
|
||||
*/
|
||||
const __m128i xe1 = _mm_load_si128(argbEven++); /* 1st 4 pixels */
|
||||
const __m128i xe2 = _mm_load_si128(argbEven++); /* 2nd 4 pixels */
|
||||
const __m128i xe3 = _mm_load_si128(argbEven++); /* 3rd 4 pixels */
|
||||
const __m128i xe4 = _mm_load_si128(argbEven++); /* 4th 4 pixels */
|
||||
const __m128i xo1 = _mm_load_si128(argbOdd++); /* 1st 4 pixels */
|
||||
const __m128i xo2 = _mm_load_si128(argbOdd++); /* 2nd 4 pixels */
|
||||
const __m128i xo3 = _mm_load_si128(argbOdd++); /* 3rd 4 pixels */
|
||||
const __m128i xo4 = _mm_load_si128(argbOdd++); /* 4th 4 pixels */
|
||||
const __m128i xe1 = _mm_loadu_si128(argbEven++); /* 1st 4 pixels */
|
||||
const __m128i xe2 = _mm_loadu_si128(argbEven++); /* 2nd 4 pixels */
|
||||
const __m128i xe3 = _mm_loadu_si128(argbEven++); /* 3rd 4 pixels */
|
||||
const __m128i xe4 = _mm_loadu_si128(argbEven++); /* 4th 4 pixels */
|
||||
const __m128i xo1 = _mm_loadu_si128(argbOdd++); /* 1st 4 pixels */
|
||||
const __m128i xo2 = _mm_loadu_si128(argbOdd++); /* 2nd 4 pixels */
|
||||
const __m128i xo3 = _mm_loadu_si128(argbOdd++); /* 3rd 4 pixels */
|
||||
const __m128i xo4 = _mm_loadu_si128(argbOdd++); /* 4th 4 pixels */
|
||||
{
|
||||
/* Y: multiplications with subtotals and horizontal sums */
|
||||
const __m128i y_factors = BGRX_Y_FACTORS;
|
||||
@@ -1150,6 +1282,11 @@ static INLINE void sse41_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
general_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(x, srcEven, srcOdd, yLumaDstEven, yLumaDstOdd,
|
||||
uLumaDst, vLumaDst, yEvenChromaDst1, yEvenChromaDst2,
|
||||
yOddChromaDst1, yOddChromaDst2, uChromaDst1,
|
||||
uChromaDst2, vChromaDst1, vChromaDst2, width);
|
||||
}
|
||||
|
||||
static pstatus_t sse41_RGBToAVC444YUVv2_BGRX(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
|
||||
@@ -1161,10 +1298,6 @@ static pstatus_t sse41_RGBToAVC444YUVv2_BGRX(const BYTE* WINPR_RESTRICT pSrc, UI
|
||||
if (roi->height < 1 || roi->width < 1)
|
||||
return !PRIMITIVES_SUCCESS;
|
||||
|
||||
if (roi->width % 16 || (uintptr_t)pSrc % 16 || srcStep % 16)
|
||||
return generic->RGBToAVC444YUVv2(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, dst2Step,
|
||||
roi);
|
||||
|
||||
for (size_t y = 0; y < roi->height; y += 2)
|
||||
{
|
||||
const BYTE* srcEven = (pSrc + y * srcStep);
|
||||
|
||||
Reference in New Issue
Block a user