[primitives,yuv] fix issues with SSE

* fix unaligned memory access
* fix handling of widths that are not a multiple of 16
This commit is contained in:
akallabeth
2025-01-09 23:19:34 +01:00
parent 6e8ab6241b
commit e2cdcd8a18
3 changed files with 318 additions and 166 deletions

View File

@@ -1053,14 +1053,16 @@ static pstatus_t general_RGBToYUV420_8u_P3AC4R(const BYTE* WINPR_RESTRICT pSrc,
}
}
static INLINE void general_RGBToAVC444YUV_BGRX_DOUBLE_ROW(
const BYTE* WINPR_RESTRICT srcEven, const BYTE* WINPR_RESTRICT srcOdd,
INLINE void general_RGBToAVC444YUV_BGRX_DOUBLE_ROW(
size_t offset, const BYTE* WINPR_RESTRICT pSrcEven, const BYTE* WINPR_RESTRICT pSrcOdd,
BYTE* WINPR_RESTRICT b1Even, BYTE* WINPR_RESTRICT b1Odd, BYTE* WINPR_RESTRICT b2,
BYTE* WINPR_RESTRICT b3, BYTE* WINPR_RESTRICT b4, BYTE* WINPR_RESTRICT b5,
BYTE* WINPR_RESTRICT b6, BYTE* WINPR_RESTRICT b7, UINT32 width)
{
for (UINT32 x = 0; x < width; x += 2)
for (UINT32 x = offset; x < width; x += 2)
{
const BYTE* srcEven = &pSrcEven[4ULL * x];
const BYTE* srcOdd = &pSrcOdd[4ULL * x];
const BOOL lastX = (x + 1) >= width;
BYTE Y1e = 0;
BYTE Y2e = 0;
@@ -1187,8 +1189,8 @@ static INLINE pstatus_t general_RGBToAVC444YUV_BGRX(const BYTE* WINPR_RESTRICT p
BYTE* b5 = b4 + 8ULL * dst2Step[0];
BYTE* b6 = pDst2[1] + 1ULL * (y / 2) * dst2Step[1];
BYTE* b7 = pDst2[2] + 1ULL * (y / 2) * dst2Step[2];
general_RGBToAVC444YUV_BGRX_DOUBLE_ROW(srcEven, srcOdd, b1Even, b1Odd, b2, b3, b4, b5, b6,
b7, roi->width);
general_RGBToAVC444YUV_BGRX_DOUBLE_ROW(0, srcEven, srcOdd, b1Even, b1Odd, b2, b3, b4, b5,
b6, b7, roi->width);
}
return PRIMITIVES_SUCCESS;
@@ -1807,8 +1809,8 @@ static INLINE pstatus_t general_RGBToAVC444YUVv2_ANY(
return PRIMITIVES_SUCCESS;
}
static INLINE void general_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(
const BYTE* WINPR_RESTRICT srcEven, const BYTE* WINPR_RESTRICT srcOdd,
inline void general_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(
size_t offset, const BYTE* WINPR_RESTRICT pSrcEven, const BYTE* WINPR_RESTRICT pSrcOdd,
BYTE* WINPR_RESTRICT yLumaDstEven, BYTE* WINPR_RESTRICT yLumaDstOdd,
BYTE* WINPR_RESTRICT uLumaDst, BYTE* WINPR_RESTRICT vLumaDst,
BYTE* WINPR_RESTRICT yEvenChromaDst1, BYTE* WINPR_RESTRICT yEvenChromaDst2,
@@ -1816,8 +1818,10 @@ static INLINE void general_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(
BYTE* WINPR_RESTRICT uChromaDst1, BYTE* WINPR_RESTRICT uChromaDst2,
BYTE* WINPR_RESTRICT vChromaDst1, BYTE* WINPR_RESTRICT vChromaDst2, UINT32 width)
{
for (UINT32 x = 0; x < width; x += 2)
for (UINT32 x = offset; x < width; x += 2)
{
const BYTE* srcEven = &pSrcEven[4ULL * x];
const BYTE* srcOdd = &pSrcOdd[4ULL * x];
BYTE Ya = 0;
BYTE Ua = 0;
BYTE Va = 0;
@@ -1966,7 +1970,7 @@ static INLINE pstatus_t general_RGBToAVC444YUVv2_BGRX(const BYTE* WINPR_RESTRICT
BYTE* dstChromaU2 = dstChromaU1 + roi->width / 4;
BYTE* dstChromaV2 = dstChromaV1 + roi->width / 4;
general_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(
srcEven, srcOdd, dstLumaYEven, dstLumaYOdd, dstLumaU, dstLumaV, dstEvenChromaY1,
0, srcEven, srcOdd, dstLumaYEven, dstLumaYOdd, dstLumaU, dstLumaV, dstEvenChromaY1,
dstEvenChromaY2, dstOddChromaY1, dstOddChromaY2, dstChromaU1, dstChromaU2, dstChromaV1,
dstChromaV2, roi->width);
}

View File

@@ -320,6 +320,21 @@ static INLINE BYTE RGB2V(INT32 R, INT32 G, INT32 B)
return WINPR_ASSERTING_INT_CAST(BYTE, val);
}
FREERDP_LOCAL void general_RGBToAVC444YUV_BGRX_DOUBLE_ROW(
size_t offset, const BYTE* WINPR_RESTRICT srcEven, const BYTE* WINPR_RESTRICT srcOdd,
BYTE* WINPR_RESTRICT b1Even, BYTE* WINPR_RESTRICT b1Odd, BYTE* WINPR_RESTRICT b2,
BYTE* WINPR_RESTRICT b3, BYTE* WINPR_RESTRICT b4, BYTE* WINPR_RESTRICT b5,
BYTE* WINPR_RESTRICT b6, BYTE* WINPR_RESTRICT b7, UINT32 width);
FREERDP_LOCAL void general_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(
size_t offset, const BYTE* WINPR_RESTRICT pSrcEven, const BYTE* WINPR_RESTRICT pSrcOdd,
BYTE* WINPR_RESTRICT yLumaDstEven, BYTE* WINPR_RESTRICT yLumaDstOdd,
BYTE* WINPR_RESTRICT uLumaDst, BYTE* WINPR_RESTRICT vLumaDst,
BYTE* WINPR_RESTRICT yEvenChromaDst1, BYTE* WINPR_RESTRICT yEvenChromaDst2,
BYTE* WINPR_RESTRICT yOddChromaDst1, BYTE* WINPR_RESTRICT yOddChromaDst2,
BYTE* WINPR_RESTRICT uChromaDst1, BYTE* WINPR_RESTRICT uChromaDst2,
BYTE* WINPR_RESTRICT vChromaDst1, BYTE* WINPR_RESTRICT vChromaDst2, UINT32 width);
/* Function prototypes for all the init/deinit routines. */
FREERDP_LOCAL void primitives_init_copy(primitives_t* WINPR_RESTRICT prims);
FREERDP_LOCAL void primitives_init_set(primitives_t* WINPR_RESTRICT prims);

View File

@@ -41,8 +41,8 @@ static primitives_t* generic = NULL;
/****************************************************************************/
/* sse41 YUV420 -> RGB conversion */
/****************************************************************************/
static __m128i* sse41_YUV444Pixel(__m128i* WINPR_RESTRICT dst, __m128i Yraw, __m128i Uraw,
__m128i Vraw, UINT8 pos)
static inline __m128i* sse41_YUV444Pixel(__m128i* WINPR_RESTRICT dst, __m128i Yraw, __m128i Uraw,
__m128i Vraw, UINT8 pos)
{
const __m128i mapY[] = { mm_set_epu32(0x80800380, 0x80800280, 0x80800180, 0x80800080),
mm_set_epu32(0x80800780, 0x80800680, 0x80800580, 0x80800480),
@@ -121,10 +121,10 @@ static __m128i* sse41_YUV444Pixel(__m128i* WINPR_RESTRICT dst, __m128i Yraw, __m
return dst;
}
static pstatus_t sse41_YUV420ToRGB_BGRX(const BYTE* WINPR_RESTRICT pSrc[],
const UINT32* WINPR_RESTRICT srcStep,
BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
const prim_size_t* WINPR_RESTRICT roi)
static inline pstatus_t sse41_YUV420ToRGB_BGRX(const BYTE* WINPR_RESTRICT pSrc[],
const UINT32* WINPR_RESTRICT srcStep,
BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
const prim_size_t* WINPR_RESTRICT roi)
{
const UINT32 nWidth = roi->width;
const UINT32 nHeight = roi->height;
@@ -190,9 +190,9 @@ static pstatus_t sse41_YUV420ToRGB(const BYTE* WINPR_RESTRICT pSrc[3], const UIN
}
}
static void BGRX_fillRGB(size_t offset, BYTE* WINPR_RESTRICT pRGB[2],
const BYTE* WINPR_RESTRICT pY[2], const BYTE* WINPR_RESTRICT pU[2],
const BYTE* WINPR_RESTRICT pV[2])
static inline void BGRX_fillRGB(size_t offset, BYTE* WINPR_RESTRICT pRGB[2],
const BYTE* WINPR_RESTRICT pY[2], const BYTE* WINPR_RESTRICT pU[2],
const BYTE* WINPR_RESTRICT pV[2], BOOL filter)
{
WINPR_ASSERT(pRGB);
WINPR_ASSERT(pY);
@@ -209,7 +209,7 @@ static void BGRX_fillRGB(size_t offset, BYTE* WINPR_RESTRICT pRGB[2],
const BYTE Y = pY[i][offset + j];
BYTE U = pU[i][offset + j];
BYTE V = pV[i][offset + j];
if ((i == 0) && (j == 0))
if ((i == 0) && (j == 0) && filter)
{
const INT32 avgU =
4 * pU[0][offset] - pU[0][offset + 1] - pU[1][offset] - pU[1][offset + 1];
@@ -223,14 +223,15 @@ static void BGRX_fillRGB(size_t offset, BYTE* WINPR_RESTRICT pRGB[2],
const BYTE r = YUV2R(Y, U, V);
const BYTE g = YUV2G(Y, U, V);
const BYTE b = YUV2B(Y, U, V);
pRGB[i] = writePixelBGRX(pRGB[i], bpp, DstFormat, r, g, b, 0);
writePixelBGRX(&pRGB[i][(j + offset) * bpp], bpp, DstFormat, r, g, b, 0);
}
}
}
static inline void unpack_mul_add(__m128i toadd[2], __m128i wide, __m128i mul, __m128i sub)
static inline void unpack_mul_add(__m128i toadd[2], __m128i narrow, short iMul, __m128i sub)
{
const __m128i usub = _mm_sub_epi16(wide, sub);
const __m128i usub = _mm_sub_epi16(narrow, sub);
const __m128i mul = _mm_set1_epi32(iMul);
const __m128i umulhi = _mm_mulhi_epi16(usub, mul);
const __m128i umullo = _mm_mullo_epi16(usub, mul);
{
@@ -243,54 +244,87 @@ static inline void unpack_mul_add(__m128i toadd[2], __m128i wide, __m128i mul, _
}
}
static inline __m128i sse41_yuv2x(const __m128i Y[2], __m128i U, __m128i V, const short iMulU,
const short iMulV)
/* input are uint16_t vectors */
static inline __m128i sse41_yuv2x_single(const __m128i Y, __m128i U, __m128i V, const short iMulU,
const short iMulV)
{
const __m128i zero = _mm_set1_epi8(0);
const __m128i addX = _mm_set1_epi16(128);
__m128i res[2][2] = { { _mm_unpackhi_epi16(Y[0], zero), _mm_unpacklo_epi16(Y[0], zero) },
{
_mm_unpackhi_epi16(Y[1], zero),
_mm_unpacklo_epi16(Y[1], zero),
} };
__m128i Ylo = _mm_unpacklo_epi16(Y, zero);
__m128i Yhi = _mm_unpackhi_epi16(Y, zero);
if (iMulU != 0)
{
const __m128i addX = _mm_set1_epi16(128);
const __m128i D = _mm_sub_epi16(U, addX);
const __m128i mulU = _mm_set1_epi16(iMulU);
unpack_mul_add(res[0], _mm_unpackhi_epi8(U, zero), mulU, addX);
unpack_mul_add(res[1], _mm_unpacklo_epi8(U, zero), mulU, addX);
const __m128i mulDlo = _mm_mullo_epi16(D, mulU);
const __m128i mulDhi = _mm_mulhi_epi16(D, mulU);
const __m128i Dlo = _mm_unpacklo_epi16(mulDlo, mulDhi);
Ylo = _mm_add_epi32(Ylo, Dlo);
const __m128i Dhi = _mm_unpackhi_epi16(mulDlo, mulDhi);
Yhi = _mm_add_epi32(Yhi, Dhi);
}
if (iMulV != 0)
{
const __m128i mulV = _mm_set1_epi16(iMulV);
unpack_mul_add(res[0], _mm_unpackhi_epi8(V, zero), mulV, addX);
unpack_mul_add(res[1], _mm_unpacklo_epi8(V, zero), mulV, addX);
const __m128i addX = _mm_set1_epi16(128);
const __m128i E = _mm_sub_epi16(V, addX);
const __m128i mul = _mm_set1_epi16(iMulV);
const __m128i mulElo = _mm_mullo_epi16(E, mul);
const __m128i mulEhi = _mm_mulhi_epi16(E, mul);
const __m128i Elo = _mm_unpacklo_epi16(mulElo, mulEhi);
const __m128i esumlo = _mm_add_epi32(Ylo, Elo);
const __m128i Ehi = _mm_unpackhi_epi16(mulElo, mulEhi);
const __m128i esumhi = _mm_add_epi32(Yhi, Ehi);
Ylo = esumlo;
Yhi = esumhi;
}
res[0][0] = _mm_srai_epi32(res[0][0], 8);
res[0][1] = _mm_srai_epi32(res[0][1], 8);
res[1][0] = _mm_srai_epi32(res[1][0], 8);
res[1][1] = _mm_srai_epi32(res[1][1], 8);
const __m128i pres1 = _mm_packs_epi32(res[0][0], res[0][1]);
const __m128i pres2 = _mm_packs_epi32(res[1][0], res[1][1]);
return _mm_packus_epi16(pres1, pres2);
const __m128i rYlo = _mm_srai_epi32(Ylo, 8);
const __m128i rYhi = _mm_srai_epi32(Yhi, 8);
const __m128i rY = _mm_packs_epi32(rYlo, rYhi);
return rY;
}
/* Input are uint8_t vectors */
/* Convert 16 packed uint8_t Y/U/V samples to one colour component.
 * Per lane this computes (256*Y + iMulU*(U-128) + iMulV*(V-128)) >> 8
 * (see sse41_yuv2x_single), then packs the 16-bit results back to
 * uint8_t with unsigned saturation.
 */
static inline __m128i sse41_yuv2x(const __m128i Y, __m128i U, __m128i V, const short iMulU,
const short iMulV)
{
const __m128i zero = _mm_set1_epi8(0);
/* Ylo = Y * 256
* Ulo = uint8_t -> uint16_t
* Vlo = uint8_t -> uint16_t
*/
/* Low 8 lanes: interleaving zero below Y multiplies by 256; U/V are zero extended */
const __m128i Ylo = _mm_unpacklo_epi8(zero, Y);
const __m128i Ulo = _mm_unpacklo_epi8(U, zero);
const __m128i Vlo = _mm_unpacklo_epi8(V, zero);
const __m128i preslo = sse41_yuv2x_single(Ylo, Ulo, Vlo, iMulU, iMulV);
/* High 8 lanes, same treatment */
const __m128i Yhi = _mm_unpackhi_epi8(zero, Y);
const __m128i Uhi = _mm_unpackhi_epi8(U, zero);
const __m128i Vhi = _mm_unpackhi_epi8(V, zero);
const __m128i preshi = sse41_yuv2x_single(Yhi, Uhi, Vhi, iMulU, iMulV);
/* Pack both halves back to 16 uint8_t lanes with saturation */
const __m128i res = _mm_packus_epi16(preslo, preshi);
return res;
}
/* const INT32 r = ((256L * C(Y) + 0L * D(U) + 403L * E(V))) >> 8; */
static inline __m128i sse41_yuv2r(const __m128i Y[2], __m128i U, __m128i V)
static inline __m128i sse41_yuv2r(const __m128i Y, __m128i U, __m128i V)
{
return sse41_yuv2x(Y, U, V, 0, 403);
}
/* const INT32 g = ((256L * C(Y) - 48L * D(U) - 120L * E(V))) >> 8; */
static inline __m128i sse41_yuv2g(const __m128i Y[2], __m128i U, __m128i V)
static inline __m128i sse41_yuv2g(const __m128i Y, __m128i U, __m128i V)
{
return sse41_yuv2x(Y, U, V, -48, -120);
}
/* const INT32 b = ((256L * C(Y) + 475L * D(U) + 0L * E(V))) >> 8; */
static inline __m128i sse41_yuv2b(const __m128i Y[2], __m128i U, __m128i V)
static inline __m128i sse41_yuv2b(const __m128i Y, __m128i U, __m128i V)
{
return sse41_yuv2x(Y, U, V, 475, 0);
}
@@ -300,36 +334,101 @@ static inline void sse41_BGRX_fillRGB_pixel(BYTE* WINPR_RESTRICT pRGB, __m128i Y
{
const __m128i zero = _mm_set1_epi8(0);
/* Y * 256 */
const __m128i yY[] = { _mm_unpackhi_epi8(zero, Y), _mm_unpacklo_epi8(zero, Y) };
const __m128i r = sse41_yuv2r(yY, U, V);
const __m128i r = sse41_yuv2r(Y, U, V);
const __m128i rx[2] = { _mm_unpackhi_epi8(r, zero), _mm_unpacklo_epi8(r, zero) };
const __m128i g = sse41_yuv2g(yY, U, V);
const __m128i b = sse41_yuv2b(yY, U, V);
const __m128i g = sse41_yuv2g(Y, U, V);
const __m128i b = sse41_yuv2b(Y, U, V);
const __m128i bg[2] = { _mm_unpackhi_epi8(b, g), _mm_unpacklo_epi8(b, g) };
const __m128i mask = mm_set_epu8(0x00, 0xFF, 0xFF, 0xFF, 0x00, 0xFF, 0xFF, 0xFF, 0x00, 0xFF,
0xFF, 0xFF, 0x00, 0xFF, 0xFF, 0xFF);
__m128i* rgb = (__m128i*)pRGB;
const __m128i bgrx0 = _mm_unpackhi_epi16(bg[0], rx[0]);
const __m128i bgrx0 = _mm_unpacklo_epi16(bg[1], rx[1]);
_mm_maskmoveu_si128(bgrx0, mask, (char*)&rgb[0]);
const __m128i bgrx1 = _mm_unpacklo_epi16(bg[0], rx[0]);
const __m128i bgrx1 = _mm_unpackhi_epi16(bg[1], rx[1]);
_mm_maskmoveu_si128(bgrx1, mask, (char*)&rgb[1]);
const __m128i bgrx2 = _mm_unpackhi_epi16(bg[1], rx[1]);
const __m128i bgrx2 = _mm_unpacklo_epi16(bg[0], rx[0]);
_mm_maskmoveu_si128(bgrx2, mask, (char*)&rgb[2]);
const __m128i bgrx3 = _mm_unpacklo_epi16(bg[1], rx[1]);
const __m128i bgrx3 = _mm_unpackhi_epi16(bg[0], rx[0]);
_mm_maskmoveu_si128(bgrx3, mask, (char*)&rgb[3]);
}
static void sse41_BGRX_fillRGB(BYTE* WINPR_RESTRICT pRGB[2], const __m128i pY[2],
const __m128i pU[2], const __m128i pV[2])
/* Zero-extend the 16 uint8_t lanes of u1 to uint16_t and saturating-add
 * adjacent pairs, i.e. result word i = u1[2*i] + u1[2*i+1]. Used to build
 * the odd-row part of the 2x2 chroma neighbourhood sum.
 */
static inline __m128i odd1sum(__m128i u1)
{
	const __m128i nil = _mm_setzero_si128();
	const __m128i low16 = _mm_unpacklo_epi8(u1, nil);
	const __m128i high16 = _mm_unpackhi_epi8(u1, nil);
	return _mm_hadds_epi16(low16, high16);
}
/* Widen the odd-indexed bytes of u0 to uint16_t and saturating-add them to
 * the pairwise odd-row sums in u1sum (from odd1sum). Per even sample this
 * yields the sum of its three 2x2 neighbours.
 */
static inline __m128i odd0sum(__m128i u0, __m128i u1sum)
{
/* Mask out even bytes, extend uint8_t to uint16_t by filling in zero bytes,
* horizontally add the values */
/* 0x80 selector lanes shuffle in zero; bytes 1,3,...,15 land in the low byte of each word */
const __m128i mask = mm_set_epu8(0x80, 0x0F, 0x80, 0x0D, 0x80, 0x0B, 0x80, 0x09, 0x80, 0x07,
0x80, 0x05, 0x80, 0x03, 0x80, 0x01);
const __m128i u0odd = _mm_shuffle_epi8(u0, mask);
return _mm_adds_epi16(u1sum, u0odd);
}
/* Compute the neighbourhood average avg = 4*u0even - sum (mirrors the scalar
 * 4*U[0][x] - U[0][x+1] - U[1][x] - U[1][x+1] computation), clamp it to the
 * uint8_t range and zero-extend back to one uint16_t per lane.
 */
static inline __m128i calcavg(__m128i u0even, __m128i sum)
{
/* 4 * u0even via a left shift by 2; the values fit in 16 bit */
const __m128i u4zero = _mm_slli_epi16(u0even, 2);
const __m128i uavg = _mm_sub_epi16(u4zero, sum);
const __m128i zero = _mm_set1_epi8(0);
/* packus clamps negative results to 0 and values above 255 to 255 */
const __m128i savg = _mm_packus_epi16(uavg, zero);
/* re-expand the 8 packed bytes to 8 uint16_t lanes (0x80 shuffles in zero) */
const __m128i smask = mm_set_epu8(0x80, 0x07, 0x80, 0x06, 0x80, 0x05, 0x80, 0x04, 0x80, 0x03,
0x80, 0x02, 0x80, 0x01, 0x80, 0x00);
return _mm_shuffle_epi8(savg, smask);
}
/* Build a per-lane keep mask: 0xFFFF where |u0even - avg| < 30 (the sample
 * is kept as-is), 0x0000 where the deviation reaches the threshold and the
 * average should be substituted instead (see sse41_filter).
 */
static inline __m128i diffmask(__m128i avg, __m128i u0even)
{
/* Check for values >= 30 to apply the avg value to
* use int16 for calculations to avoid issues with signed 8bit integers
*/
const __m128i diff = _mm_subs_epi16(u0even, avg);
const __m128i absdiff = _mm_abs_epi16(diff);
const __m128i val30 = _mm_set1_epi16(30);
return _mm_cmplt_epi16(absdiff, val30);
}
/* In-place chroma filter over two rows of U (or V) samples:
 * pU[0] holds the even row, pU[1] the odd row. For every even-indexed byte
 * of the even row the 2x2 neighbourhood average (4*u - sum of the three
 * neighbours) is computed; samples deviating from it by 30 or more are
 * replaced by the clamped average, others are kept. The odd bytes of pU[0]
 * and the whole of pU[1] are left untouched.
 */
static inline void sse41_filter(__m128i pU[2])
{
const __m128i u1sum = odd1sum(pU[1]);
const __m128i sum = odd0sum(pU[0], u1sum);
/* Mask out the odd bytes. Nothing else is needed to widen the uint8_t to uint16_t */
const __m128i emask = mm_set_epu8(0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff,
0x00, 0xff, 0x00, 0xff, 0x00, 0xff);
const __m128i u0even = _mm_and_si128(pU[0], emask);
const __m128i avg = calcavg(u0even, sum);
const __m128i umask = diffmask(avg, u0even);
/* blend: keep the original where the mask is set, take the average elsewhere */
const __m128i u0orig = _mm_and_si128(u0even, umask);
const __m128i u0avg = _mm_andnot_si128(umask, avg);
const __m128i evenresult = _mm_or_si128(u0orig, u0avg);
/* re-insert the untouched odd bytes */
const __m128i omask = mm_set_epu8(0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00,
0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00);
const __m128i u0odd = _mm_and_si128(pU[0], omask);
const __m128i result = _mm_or_si128(evenresult, u0odd);
pU[0] = result;
}
static inline void sse41_BGRX_fillRGB(BYTE* WINPR_RESTRICT pRGB[2], const __m128i pY[2],
__m128i pU[2], __m128i pV[2])
{
WINPR_ASSERT(pRGB);
WINPR_ASSERT(pY);
WINPR_ASSERT(pU);
WINPR_ASSERT(pV);
sse41_filter(pU);
sse41_filter(pV);
for (size_t i = 0; i < 2; i++)
{
sse41_BGRX_fillRGB_pixel(pRGB[i], pY[i], pU[i], pV[i]);
@@ -342,21 +441,24 @@ static inline pstatus_t sse41_YUV444ToRGB_8u_P3AC4R_BGRX_DOUBLE_ROW(
{
WINPR_ASSERT((nWidth % 2) == 0);
const UINT32 pad = nWidth % 16;
for (size_t x = 0; x < nWidth - pad; x += 16)
size_t x = 0;
for (; x < nWidth - pad; x += 16)
{
const __m128i Y[] = { _mm_load_si128((const __m128i*)&YData[0][x]),
_mm_load_si128((const __m128i*)&YData[1][x]) };
const __m128i U[] = { _mm_load_si128((const __m128i*)&UData[0][x]),
_mm_load_si128((const __m128i*)&UData[1][x]) };
const __m128i V[] = { _mm_load_si128((const __m128i*)&VData[0][x]),
_mm_load_si128((const __m128i*)&VData[1][x]) };
const __m128i Y[] = { _mm_loadu_si128((const __m128i*)&YData[0][x]),
_mm_loadu_si128((const __m128i*)&YData[1][x]) };
__m128i U[] = { _mm_loadu_si128((const __m128i*)&UData[0][x]),
_mm_loadu_si128((const __m128i*)&UData[1][x]) };
__m128i V[] = { _mm_loadu_si128((const __m128i*)&VData[0][x]),
_mm_loadu_si128((const __m128i*)&VData[1][x]) };
BYTE* dstp[] = { &pDst[0][x * 4], &pDst[1][x * 4] };
sse41_BGRX_fillRGB(dstp, Y, U, V);
}
for (size_t x = nWidth - pad; x < nWidth; x += 2)
for (; x < nWidth; x += 2)
{
BGRX_fillRGB(x, pDst, YData, UData, VData);
BGRX_fillRGB(x, pDst, YData, UData, VData, TRUE);
}
return PRIMITIVES_SUCCESS;
@@ -392,10 +494,6 @@ static pstatus_t sse41_YUV444ToRGB_8u_P3AC4R(const BYTE* WINPR_RESTRICT pSrc[],
UINT32 dstStep, UINT32 DstFormat,
const prim_size_t* WINPR_RESTRICT roi)
{
if ((uintptr_t)pSrc[0] % 16 || (uintptr_t)pSrc[1] % 16 || (uintptr_t)pSrc[2] % 16 ||
srcStep[0] % 16 || srcStep[1] % 16 || srcStep[2] % 16)
return generic->YUV444ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
switch (DstFormat)
{
case PIXEL_FORMAT_BGRX32:
@@ -460,41 +558,63 @@ PRIM_ALIGN_128 static const BYTE rgbx_v_factors[] = {
};
*/
/* Convert one BGRX pixel (pLine1 points at B,G,R,X bytes) to YUV components.
 * Each output pointer may be NULL when that component is not wanted.
 */
static inline void sse41_BGRX_TO_YUV(const BYTE* WINPR_RESTRICT pLine1, BYTE* WINPR_RESTRICT pYLine,
                                     BYTE* WINPR_RESTRICT pULine, BYTE* WINPR_RESTRICT pVLine)
{
	const BYTE blue = pLine1[0];  /* BGRX byte order: B G R X */
	const BYTE green = pLine1[1];
	const BYTE red = pLine1[2];

	if (pYLine != NULL)
		*pYLine = RGB2Y(red, green, blue);

	if (pULine != NULL)
		*pULine = RGB2U(red, green, blue);

	if (pVLine != NULL)
		*pVLine = RGB2V(red, green, blue);
}
/* compute the luma (Y) component from a single rgb source line */
static INLINE void sse41_RGBToYUV420_BGRX_Y(const BYTE* WINPR_RESTRICT src, BYTE* dst, UINT32 width)
{
__m128i x0;
__m128i x1;
__m128i x2;
__m128i x3;
const __m128i y_factors = BGRX_Y_FACTORS;
const __m128i* argb = (const __m128i*)src;
__m128i* ydst = (__m128i*)dst;
for (UINT32 x = 0; x < width; x += 16)
UINT32 x = 0;
for (; x < width - width % 16; x += 16)
{
/* store 16 rgba pixels in 4 128 bit registers */
x0 = _mm_load_si128(argb++); // 1st 4 pixels
x1 = _mm_load_si128(argb++); // 2nd 4 pixels
x2 = _mm_load_si128(argb++); // 3rd 4 pixels
x3 = _mm_load_si128(argb++); // 4th 4 pixels
/* multiplications and subtotals */
x0 = _mm_maddubs_epi16(x0, y_factors);
x1 = _mm_maddubs_epi16(x1, y_factors);
x2 = _mm_maddubs_epi16(x2, y_factors);
x3 = _mm_maddubs_epi16(x3, y_factors);
/* the total sums */
x0 = _mm_hadd_epi16(x0, x1);
x2 = _mm_hadd_epi16(x2, x3);
/* shift the results */
x0 = _mm_srli_epi16(x0, Y_SHIFT);
x2 = _mm_srli_epi16(x2, Y_SHIFT);
/* pack the 16 words into bytes */
__m128i x0 = _mm_loadu_si128(argb++); // 1st 4 pixels
{
x0 = _mm_maddubs_epi16(x0, y_factors);
__m128i x1 = _mm_loadu_si128(argb++); // 2nd 4 pixels
x1 = _mm_maddubs_epi16(x1, y_factors);
x0 = _mm_hadds_epi16(x0, x1);
x0 = _mm_srli_epi16(x0, Y_SHIFT);
}
__m128i x2 = _mm_loadu_si128(argb++); // 3rd 4 pixels
{
x2 = _mm_maddubs_epi16(x2, y_factors);
__m128i x3 = _mm_loadu_si128(argb++); // 4th 4 pixels
x3 = _mm_maddubs_epi16(x3, y_factors);
x2 = _mm_hadds_epi16(x2, x3);
x2 = _mm_srli_epi16(x2, Y_SHIFT);
}
x0 = _mm_packus_epi16(x0, x2);
/* save to y plane */
_mm_storeu_si128(ydst++, x0);
}
for (; x < width; x++)
{
sse41_BGRX_TO_YUV(&src[4ULL * x], &dst[x], NULL, NULL);
}
}
/* compute the chrominance (UV) components from two rgb source lines */
@@ -507,32 +627,33 @@ static INLINE void sse41_RGBToYUV420_BGRX_UV(const BYTE* WINPR_RESTRICT src1,
const __m128i u_factors = BGRX_U_FACTORS;
const __m128i v_factors = BGRX_V_FACTORS;
const __m128i vector128 = CONST128_FACTORS;
__m128i x0;
__m128i x1;
__m128i x2;
__m128i x3;
__m128i x4;
__m128i x5;
const __m128i* rgb1 = (const __m128i*)src1;
const __m128i* rgb2 = (const __m128i*)src2;
__m64* udst = (__m64*)dst1;
__m64* vdst = (__m64*)dst2;
for (UINT32 x = 0; x < width; x += 16)
size_t x = 0;
for (; x < width - width % 16; x += 16)
{
const __m128i* rgb1 = (const __m128i*)&src1[4ULL * x];
const __m128i* rgb2 = (const __m128i*)&src2[4ULL * x];
__m64* udst = (__m64*)&dst1[x / 2];
__m64* vdst = (__m64*)&dst2[x / 2];
/* subsample 16x2 pixels into 16x1 pixels */
x0 = _mm_load_si128(rgb1++);
x4 = _mm_load_si128(rgb2++);
__m128i x0 = _mm_loadu_si128(&rgb1[0]);
__m128i x4 = _mm_loadu_si128(&rgb2[0]);
x0 = _mm_avg_epu8(x0, x4);
x1 = _mm_load_si128(rgb1++);
x4 = _mm_load_si128(rgb2++);
__m128i x1 = _mm_loadu_si128(&rgb1[1]);
x4 = _mm_loadu_si128(&rgb2[1]);
x1 = _mm_avg_epu8(x1, x4);
x2 = _mm_load_si128(rgb1++);
x4 = _mm_load_si128(rgb2++);
__m128i x2 = _mm_loadu_si128(&rgb1[2]);
x4 = _mm_loadu_si128(&rgb2[2]);
x2 = _mm_avg_epu8(x2, x4);
x3 = _mm_load_si128(rgb1++);
x4 = _mm_load_si128(rgb2++);
__m128i x3 = _mm_loadu_si128(&rgb1[3]);
x4 = _mm_loadu_si128(&rgb2[3]);
x3 = _mm_avg_epu8(x3, x4);
/* subsample these 16x1 pixels into 8x1 pixels */
/**
* shuffle controls
@@ -549,7 +670,7 @@ static INLINE void sse41_RGBToYUV420_BGRX_UV(const BYTE* WINPR_RESTRICT src1,
x2 = _mm_maddubs_epi16(x0, u_factors);
x3 = _mm_maddubs_epi16(x1, u_factors);
x4 = _mm_maddubs_epi16(x0, v_factors);
x5 = _mm_maddubs_epi16(x1, v_factors);
__m128i x5 = _mm_maddubs_epi16(x1, v_factors);
/* the total sums */
x0 = _mm_hadd_epi16(x2, x3);
x1 = _mm_hadd_epi16(x4, x5);
@@ -561,50 +682,60 @@ static INLINE void sse41_RGBToYUV420_BGRX_UV(const BYTE* WINPR_RESTRICT src1,
/* add 128 */
x0 = _mm_sub_epi8(x0, vector128);
/* the lower 8 bytes go to the u plane */
_mm_storel_pi(udst++, _mm_castsi128_ps(x0));
_mm_storel_pi(udst, _mm_castsi128_ps(x0));
/* the upper 8 bytes go to the v plane */
_mm_storeh_pi(vdst++, _mm_castsi128_ps(x0));
_mm_storeh_pi(vdst, _mm_castsi128_ps(x0));
}
for (; x < width - width % 2; x += 2)
{
BYTE u[4] = { 0 };
BYTE v[4] = { 0 };
sse41_BGRX_TO_YUV(&src1[4ULL * x], NULL, &u[0], &v[0]);
sse41_BGRX_TO_YUV(&src1[4ULL * (1ULL + x)], NULL, &u[1], &v[1]);
sse41_BGRX_TO_YUV(&src2[4ULL * x], NULL, &u[2], &v[2]);
sse41_BGRX_TO_YUV(&src2[4ULL * (1ULL + x)], NULL, &u[3], &v[3]);
const INT16 u4 = (INT16)u[0] + u[1] + u[2] + u[3];
const INT16 uu = u4 / 4;
const BYTE u8 = CLIP(uu);
dst1[x / 2] = u8;
const INT16 v4 = (INT16)v[0] + v[1] + v[2] + v[3];
const INT16 vu = v4 / 4;
const BYTE v8 = CLIP(vu);
dst2[x / 2] = v8;
}
}
static pstatus_t sse41_RGBToYUV420_BGRX(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
UINT32 srcStep, BYTE* WINPR_RESTRICT pDst[],
const UINT32 dstStep[],
static pstatus_t sse41_RGBToYUV420_BGRX(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcStep,
BYTE* WINPR_RESTRICT pDst[], const UINT32 dstStep[],
const prim_size_t* WINPR_RESTRICT roi)
{
const BYTE* argb = pSrc;
BYTE* ydst = pDst[0];
BYTE* udst = pDst[1];
BYTE* vdst = pDst[2];
if (roi->height < 1 || roi->width < 1)
{
return !PRIMITIVES_SUCCESS;
}
if (roi->width % 16 || (uintptr_t)pSrc % 16 || srcStep % 16)
size_t y = 0;
for (; y < roi->height - roi->height % 2; y += 2)
{
return generic->RGBToYUV420_8u_P3AC4R(pSrc, srcFormat, srcStep, pDst, dstStep, roi);
}
const BYTE* line1 = &pSrc[y * srcStep];
const BYTE* line2 = &pSrc[(1ULL + y) * srcStep];
BYTE* ydst1 = &pDst[0][y * dstStep[0]];
BYTE* ydst2 = &pDst[0][(1ULL + y) * dstStep[0]];
BYTE* udst = &pDst[1][y / 2 * dstStep[1]];
BYTE* vdst = &pDst[2][y / 2 * dstStep[2]];
for (UINT32 y = 0; y < roi->height - 1; y += 2)
{
const BYTE* line1 = argb;
const BYTE* line2 = argb + srcStep;
sse41_RGBToYUV420_BGRX_UV(line1, line2, udst, vdst, roi->width);
sse41_RGBToYUV420_BGRX_Y(line1, ydst, roi->width);
sse41_RGBToYUV420_BGRX_Y(line2, ydst + dstStep[0], roi->width);
argb += 2ULL * srcStep;
ydst += 2ULL * dstStep[0];
udst += 1ULL * dstStep[1];
vdst += 1ULL * dstStep[2];
sse41_RGBToYUV420_BGRX_Y(line1, ydst1, roi->width);
sse41_RGBToYUV420_BGRX_Y(line2, ydst2, roi->width);
}
if (roi->height & 1)
for (; y < roi->height; y++)
{
/* pass the same last line of an odd height twice for UV */
sse41_RGBToYUV420_BGRX_UV(argb, argb, udst, vdst, roi->width);
sse41_RGBToYUV420_BGRX_Y(argb, ydst, roi->width);
const BYTE* line = &pSrc[y * srcStep];
BYTE* ydst = &pDst[0][1ULL * y * dstStep[0]];
sse41_RGBToYUV420_BGRX_Y(line, ydst, roi->width);
}
return PRIMITIVES_SUCCESS;
@@ -618,7 +749,7 @@ static pstatus_t sse41_RGBToYUV420(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFo
{
case PIXEL_FORMAT_BGRX32:
case PIXEL_FORMAT_BGRA32:
return sse41_RGBToYUV420_BGRX(pSrc, srcFormat, srcStep, pDst, dstStep, roi);
return sse41_RGBToYUV420_BGRX(pSrc, srcStep, pDst, dstStep, roi);
default:
return generic->RGBToYUV420_8u_P3AC4R(pSrc, srcFormat, srcStep, pDst, dstStep, roi);
@@ -642,17 +773,18 @@ static INLINE void sse41_RGBToAVC444YUV_BGRX_DOUBLE_ROW(
const __m128i v_factors = BGRX_V_FACTORS;
const __m128i vector128 = CONST128_FACTORS;
for (UINT32 x = 0; x < width; x += 16)
UINT32 x = 0;
for (; x < width - width % 16; x += 16)
{
/* store 16 rgba pixels in 4 128 bit registers */
const __m128i xe1 = _mm_load_si128(argbEven++); // 1st 4 pixels
const __m128i xe2 = _mm_load_si128(argbEven++); // 2nd 4 pixels
const __m128i xe3 = _mm_load_si128(argbEven++); // 3rd 4 pixels
const __m128i xe4 = _mm_load_si128(argbEven++); // 4th 4 pixels
const __m128i xo1 = _mm_load_si128(argbOdd++); // 1st 4 pixels
const __m128i xo2 = _mm_load_si128(argbOdd++); // 2nd 4 pixels
const __m128i xo3 = _mm_load_si128(argbOdd++); // 3rd 4 pixels
const __m128i xo4 = _mm_load_si128(argbOdd++); // 4th 4 pixels
const __m128i xe1 = _mm_loadu_si128(argbEven++); // 1st 4 pixels
const __m128i xe2 = _mm_loadu_si128(argbEven++); // 2nd 4 pixels
const __m128i xe3 = _mm_loadu_si128(argbEven++); // 3rd 4 pixels
const __m128i xe4 = _mm_loadu_si128(argbEven++); // 4th 4 pixels
const __m128i xo1 = _mm_loadu_si128(argbOdd++); // 1st 4 pixels
const __m128i xo2 = _mm_loadu_si128(argbOdd++); // 2nd 4 pixels
const __m128i xo3 = _mm_loadu_si128(argbOdd++); // 3rd 4 pixels
const __m128i xo4 = _mm_loadu_si128(argbOdd++); // 4th 4 pixels
{
/* Y: multiplications with subtotals and horizontal sums */
const __m128i ye1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, y_factors),
@@ -743,7 +875,7 @@ static INLINE void sse41_RGBToAVC444YUV_BGRX_DOUBLE_ROW(
if (b1Odd) /* b4 */
{
_mm_store_si128((__m128i*)b4, uo);
_mm_storeu_si128((__m128i*)b4, uo);
b4 += 16;
}
@@ -821,7 +953,7 @@ static INLINE void sse41_RGBToAVC444YUV_BGRX_DOUBLE_ROW(
if (b1Odd) /* b5 */
{
_mm_store_si128((__m128i*)b5, vo);
_mm_storeu_si128((__m128i*)b5, vo);
b5 += 16;
}
@@ -836,6 +968,9 @@ static INLINE void sse41_RGBToAVC444YUV_BGRX_DOUBLE_ROW(
}
}
}
general_RGBToAVC444YUV_BGRX_DOUBLE_ROW(x, srcEven, srcOdd, b1Even, b1Odd, b2, b3, b4, b5, b6,
b7, width);
}
static pstatus_t sse41_RGBToAVC444YUV_BGRX(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
@@ -849,10 +984,6 @@ static pstatus_t sse41_RGBToAVC444YUV_BGRX(const BYTE* WINPR_RESTRICT pSrc, UINT
if (roi->height < 1 || roi->width < 1)
return !PRIMITIVES_SUCCESS;
if (roi->width % 16 || (uintptr_t)pSrc % 16 || srcStep % 16)
return generic->RGBToAVC444YUV(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, dst2Step,
roi);
for (size_t y = 0; y < roi->height; y += 2)
{
const BOOL last = (y >= (roi->height - 1));
@@ -920,19 +1051,20 @@ static INLINE void sse41_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(
const __m128i* argbEven = (const __m128i*)srcEven;
const __m128i* argbOdd = (const __m128i*)srcOdd;
for (UINT32 x = 0; x < width; x += 16)
UINT32 x = 0;
for (; x < width - width % 16; x += 16)
{
/* store 16 rgba pixels in 4 128 bit registers
* for even and odd rows.
*/
const __m128i xe1 = _mm_load_si128(argbEven++); /* 1st 4 pixels */
const __m128i xe2 = _mm_load_si128(argbEven++); /* 2nd 4 pixels */
const __m128i xe3 = _mm_load_si128(argbEven++); /* 3rd 4 pixels */
const __m128i xe4 = _mm_load_si128(argbEven++); /* 4th 4 pixels */
const __m128i xo1 = _mm_load_si128(argbOdd++); /* 1st 4 pixels */
const __m128i xo2 = _mm_load_si128(argbOdd++); /* 2nd 4 pixels */
const __m128i xo3 = _mm_load_si128(argbOdd++); /* 3rd 4 pixels */
const __m128i xo4 = _mm_load_si128(argbOdd++); /* 4th 4 pixels */
const __m128i xe1 = _mm_loadu_si128(argbEven++); /* 1st 4 pixels */
const __m128i xe2 = _mm_loadu_si128(argbEven++); /* 2nd 4 pixels */
const __m128i xe3 = _mm_loadu_si128(argbEven++); /* 3rd 4 pixels */
const __m128i xe4 = _mm_loadu_si128(argbEven++); /* 4th 4 pixels */
const __m128i xo1 = _mm_loadu_si128(argbOdd++); /* 1st 4 pixels */
const __m128i xo2 = _mm_loadu_si128(argbOdd++); /* 2nd 4 pixels */
const __m128i xo3 = _mm_loadu_si128(argbOdd++); /* 3rd 4 pixels */
const __m128i xo4 = _mm_loadu_si128(argbOdd++); /* 4th 4 pixels */
{
/* Y: multiplications with subtotals and horizontal sums */
const __m128i y_factors = BGRX_Y_FACTORS;
@@ -1150,6 +1282,11 @@ static INLINE void sse41_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(
}
}
}
general_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(x, srcEven, srcOdd, yLumaDstEven, yLumaDstOdd,
uLumaDst, vLumaDst, yEvenChromaDst1, yEvenChromaDst2,
yOddChromaDst1, yOddChromaDst2, uChromaDst1,
uChromaDst2, vChromaDst1, vChromaDst2, width);
}
static pstatus_t sse41_RGBToAVC444YUVv2_BGRX(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
@@ -1161,10 +1298,6 @@ static pstatus_t sse41_RGBToAVC444YUVv2_BGRX(const BYTE* WINPR_RESTRICT pSrc, UI
if (roi->height < 1 || roi->width < 1)
return !PRIMITIVES_SUCCESS;
if (roi->width % 16 || (uintptr_t)pSrc % 16 || srcStep % 16)
return generic->RGBToAVC444YUVv2(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, dst2Step,
roi);
for (size_t y = 0; y < roi->height; y += 2)
{
const BYTE* srcEven = (pSrc + y * srcStep);