From 9501b6c58ec9fac3fce231ede68fd6e1c2946a78 Mon Sep 17 00:00:00 2001 From: erbth Date: Thu, 17 Jul 2014 16:25:34 +0200 Subject: [PATCH 01/31] OpenH264 first frame decode fix --- channels/rdpgfx/client/rdpgfx_codec.c | 6 +++ client/X11/xf_gfx.c | 10 +++- libfreerdp/codec/h264.c | 76 +++++++++++++++++++++++---- 3 files changed, 82 insertions(+), 10 deletions(-) diff --git a/channels/rdpgfx/client/rdpgfx_codec.c b/channels/rdpgfx/client/rdpgfx_codec.c index 4881db399..d621eea42 100644 --- a/channels/rdpgfx/client/rdpgfx_codec.c +++ b/channels/rdpgfx/client/rdpgfx_codec.c @@ -72,15 +72,19 @@ int rdpgfx_read_h264_metablock(RDPGFX_PLUGIN* gfx, wStream* s, RDPGFX_H264_METAB if (!meta->quantQualityVals) return -1; +#if 0 printf("H264_METABLOCK: numRegionRects: %d\n", (int) meta->numRegionRects); +#endif for (index = 0; index < meta->numRegionRects; index++) { regionRect = &(meta->regionRects[index]); rdpgfx_read_rect16(s, regionRect); +#if 0 printf("regionRects[%d]: left: %d top: %d right: %d bottom: %d\n", index, regionRect->left, regionRect->top, regionRect->right, regionRect->bottom); +#endif } if (Stream_GetRemainingLength(s) < (meta->numRegionRects * 2)) @@ -96,8 +100,10 @@ int rdpgfx_read_h264_metablock(RDPGFX_PLUGIN* gfx, wStream* s, RDPGFX_H264_METAB quantQualityVal->r = (quantQualityVal->qpVal >> 6) & 1; quantQualityVal->p = (quantQualityVal->qpVal >> 7) & 1; +#if 0 printf("quantQualityVals[%d]: qp: %d r: %d p: %d qualityVal: %d\n", index, quantQualityVal->qp, quantQualityVal->r, quantQualityVal->p, quantQualityVal->qualityVal); +#endif } return 1; diff --git a/client/X11/xf_gfx.c b/client/X11/xf_gfx.c index bf04042f6..da4d41101 100644 --- a/client/X11/xf_gfx.c +++ b/client/X11/xf_gfx.c @@ -297,6 +297,7 @@ int xf_SurfaceCommand_ClearCodec(xfContext* xfc, RdpgfxClientContext* context, R region16_union_rect(&(xfc->invalidRegion), &(xfc->invalidRegion), &invalidRect); + if (!xfc->inGfxFrame) xf_OutputUpdate(xfc); @@ -397,9 +398,12 @@ int 
xf_SurfaceCommand_H264(xfContext* xfc, RdpgfxClientContext* context, RDPGFX_ region16_init(&updateRegion); region16_intersect_rect(&updateRegion, &clippingRects, &updateRect); + updateRects = (RECTANGLE_16*) region16_rects(&updateRegion, &nbUpdateRects); +#if 0 printf("numRegionRects: %d nbUpdateRects: %d\n", meta->numRegionRects, nbUpdateRects); +#endif for (j = 0; j < nbUpdateRects; j++) { @@ -410,14 +414,17 @@ int xf_SurfaceCommand_H264(xfContext* xfc, RdpgfxClientContext* context, RDPGFX_ /* update region from decoded H264 buffer */ +#if 0 printf("nXDst: %d nYDst: %d nWidth: %d nHeight: %d decoded: width: %d height: %d cmd: left: %d top: %d right: %d bottom: %d\n", nXDst, nYDst, nWidth, nHeight, h264->width, h264->height, cmd->left, cmd->top, cmd->right, cmd->bottom); +#endif freerdp_image_copy(surface->data, PIXEL_FORMAT_XRGB32, surface->scanline, nXDst, nYDst, nWidth, nHeight, h264->data, PIXEL_FORMAT_XRGB32, h264->scanline, nXDst, nYDst); + region16_union_rect(&(xfc->invalidRegion), &(xfc->invalidRegion), &updateRects[j]); } @@ -430,8 +437,9 @@ int xf_SurfaceCommand_H264(xfContext* xfc, RdpgfxClientContext* context, RDPGFX_ cmd->left, cmd->top, cmd->width, cmd->height, 0xFF0000); #endif - if (!xfc->inGfxFrame) + if (!xfc->inGfxFrame){ xf_OutputUpdate(xfc); + } return 1; } diff --git a/libfreerdp/codec/h264.c b/libfreerdp/codec/h264.c index 4b0d1de68..c532bc81c 100644 --- a/libfreerdp/codec/h264.c +++ b/libfreerdp/codec/h264.c @@ -28,9 +28,9 @@ #include #include -#define USE_GRAY_SCALE 1 +#define USE_GRAY_SCALE 0 #define USE_UPCONVERT 0 -#define USE_TRACE 1 +#define USE_TRACE 0 static BYTE clip(int x) { @@ -189,11 +189,12 @@ int freerdp_image_copy_yuv420p_to_xrgb(BYTE* pDstData, int nDstStep, int nXDst, { int x, y; BYTE* pDstPixel8; - BYTE *pY, *pU, *pV; + BYTE *pY, *pU, *pV, *pUv, *pVv; + int temp1=0,temp2=0; pY = pSrcData[0]; - pU = pSrcData[1]; - pV = pSrcData[0]; + pUv = pU = pSrcData[1]; + pVv = pV = pSrcData[2]; pDstPixel8 = &pDstData[(nYDst * nDstStep) 
+ (nXDst * 4)]; @@ -201,13 +202,33 @@ int freerdp_image_copy_yuv420p_to_xrgb(BYTE* pDstData, int nDstStep, int nXDst, { for (x = 0; x < nWidth; x++) { - *((UINT32*) pDstPixel8) = RGB32(*pY, *pY, *pY); +/* *((UINT32*) pDstPixel8) = RGB32(*pY, *pY, *pY);*/ + *((UINT32*) pDstPixel8) = YUV_to_RGB(*pY,*pU,*pV); pDstPixel8 += 4; pY++; + + if(temp1){ + temp1=0; + pU++; + pV++; + }else{ + temp1=1; + } } pDstPixel8 += (nDstStep - (nWidth * 4)); pY += (nSrcStep[0] - nWidth); + if(temp2){ + temp2=0; + pU += (nSrcStep[1] - nWidth / 2); + pV += (nSrcStep[1] - nWidth / 2); + pUv = pU; + pVv = pV; + }else{ + temp2=1; + pU = pUv; + pV = pVv; + } } return 1; @@ -282,7 +303,7 @@ int h264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize, pSrcData = h264_strip_nal_unit_au_delimiter(pSrcData, &SrcSize); -#if 1 +#if 0 printf("h264_decompress: pSrcData=%p, SrcSize=%u, pDstData=%p, nDstStep=%d, nXDst=%d, nYDst=%d, nWidth=%d, nHeight=%d)\n", pSrcData, SrcSize, *ppDstData, nDstStep, nXDst, nYDst, nWidth, nHeight); #endif @@ -335,9 +356,17 @@ int h264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize, pYUVData, &sBufferInfo); + + state = (*h264->pDecoder)->DecodeFrame2( + h264->pDecoder, + NULL, + 0, + pYUVData, + &sBufferInfo); + pSystemBuffer = &sBufferInfo.UsrData.sSystemBuffer; -#if 1 +#if 0 printf("h264_decompress: state=%u, pYUVData=[%p,%p,%p], bufferStatus=%d, width=%d, height=%d, format=%d, stride=[%d,%d]\n", state, pYUVData[0], pYUVData[1], pYUVData[2], sBufferInfo.iBufferStatus, pSystemBuffer->iWidth, pSystemBuffer->iHeight, pSystemBuffer->iFormat, @@ -387,7 +416,6 @@ int h264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize, fclose(fp); } - g_H264FrameId++; if (h264_prepare_rgb_buffer(h264, pSystemBuffer->iWidth, pSystemBuffer->iHeight) < 0) return -1; @@ -395,6 +423,35 @@ int h264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize, freerdp_image_copy_yuv420p_to_xrgb(h264->data, h264->scanline, 0, 0, h264->width, h264->height, 
pYUVData, pSystemBuffer->iStride, 0, 0); + if (g_H264DumpFrames) + { + FILE* fp; + BYTE* srcp; + char buf[4096]; + + snprintf(buf, sizeof(buf), "/tmp/wlog/H264_%d_rgb.ppm", g_H264FrameId); + fp = fopen(buf, "wb"); + fwrite("P6\n", 1, 3, fp); + snprintf(buf, sizeof(buf), "%d %d\n", pSystemBuffer->iWidth, pSystemBuffer->iHeight); + fwrite(buf, 1, strlen(buf), fp); + fwrite("255\n", 1, 4, fp); + + srcp = h264->data; + + for (j = 0; j < h264->height; j++) + { + for(i=0;iwidth;i++){ + fwrite(srcp, 1, 3, fp); + srcp += 4; + } + } + + fflush(fp); + fclose(fp); + } + + g_H264FrameId++; + return 1; #if USE_UPCONVERT @@ -499,6 +556,7 @@ H264_CONTEXT* h264_context_new(BOOL Compressor) printf("Failed to set data format option on OpenH264 decoder (status=%ld)\n", status); } + #if USE_TRACE status = (*h264->pDecoder)->SetOption(h264->pDecoder, DECODER_OPTION_TRACE_LEVEL, &traceLevel); if (status != 0) From 20e76411dcd7c492d9157fe4c04082e815353144 Mon Sep 17 00:00:00 2001 From: erbth Date: Tue, 29 Jul 2014 21:42:04 +0200 Subject: [PATCH 02/31] H.264 hack and first port of YUV to XRGB format conversion to assembly --- .gitignore | 2 + client/X11/xf_gfx.c | 20 ++ libfreerdp/codec/CMakeLists.txt | 17 ++ libfreerdp/codec/h264.asm | 236 +++++++++++++++++++++ libfreerdp/codec/h264.asm.alt | 262 ++++++++++++++++++++++++ libfreerdp/codec/h264.c | 34 ++- libfreerdp/codec/test/TestOpenH264ASM.c | 57 ++++++ libfreerdp/codec/test/TestOpenH264ASM.h | 7 + 8 files changed, 631 insertions(+), 4 deletions(-) create mode 100644 libfreerdp/codec/h264.asm create mode 100644 libfreerdp/codec/h264.asm.alt create mode 100644 libfreerdp/codec/test/TestOpenH264ASM.c create mode 100644 libfreerdp/codec/test/TestOpenH264ASM.h diff --git a/.gitignore b/.gitignore index af133b4f7..928ef7b95 100755 --- a/.gitignore +++ b/.gitignore @@ -92,6 +92,7 @@ RelWithDebInfo # Binaries *.a +*.o *.so *.so.* *.dylib @@ -105,6 +106,7 @@ client/DirectFB/dfreerdp server/Sample/sfreerdp-server server/X11/xfreerdp-server xcode 
+libfreerdp/codec/test/TestOpenH264 # Other *~ diff --git a/client/X11/xf_gfx.c b/client/X11/xf_gfx.c index da4d41101..e1142f6ef 100644 --- a/client/X11/xf_gfx.c +++ b/client/X11/xf_gfx.c @@ -23,6 +23,8 @@ #include "xf_gfx.h" +#include + int xf_ResetGraphics(RdpgfxClientContext* context, RDPGFX_RESET_GRAPHICS_PDU* resetGraphics) { xfContext* xfc = (xfContext*) context->custom; @@ -353,6 +355,16 @@ int xf_SurfaceCommand_H264(xfContext* xfc, RdpgfxClientContext* context, RDPGFX_ RDPGFX_H264_METABLOCK* meta; RDPGFX_H264_BITMAP_STREAM* bs; + static struct timeval TGES1; + struct timeval TGES2,TDEC1,TDEC2; + + TGES2.tv_usec=TGES1.tv_usec; + TGES2.tv_sec=TGES1.tv_sec; + + gettimeofday(&TGES1,NULL); + printf("time since last xf_SurfaceCommand_H264: %d sec %d usec\n",(int)(TGES1.tv_sec-TGES2.tv_sec),(int)(TGES1.tv_usec-TGES2.tv_usec)); + + h264 = xfc->h264; bs = (RDPGFX_H264_BITMAP_STREAM*) cmd->extra; @@ -369,8 +381,13 @@ int xf_SurfaceCommand_H264(xfContext* xfc, RdpgfxClientContext* context, RDPGFX_ DstData = surface->data; + gettimeofday(&TDEC1,NULL); status = h264_decompress(xfc->h264, bs->data, bs->length, &DstData, PIXEL_FORMAT_XRGB32, surface->scanline, cmd->left, cmd->top, cmd->width, cmd->height); + gettimeofday(&TDEC2,NULL); + printf("decoding took %d sec %d usec\n",(int)(TDEC2.tv_sec-TDEC1.tv_sec),(int)(TDEC2.tv_usec-TDEC1.tv_usec)); + + free(bs->data); printf("xf_SurfaceCommand_H264: status: %d\n", status); @@ -440,6 +457,9 @@ int xf_SurfaceCommand_H264(xfContext* xfc, RdpgfxClientContext* context, RDPGFX_ if (!xfc->inGfxFrame){ xf_OutputUpdate(xfc); } + + gettimeofday(&TGES2,NULL); + printf("the whole command took %d sec %d usec\n",(int)(TGES2.tv_sec-TGES1.tv_sec),(int)(TGES2.tv_usec-TGES1.tv_usec)); return 1; } diff --git a/libfreerdp/codec/CMakeLists.txt b/libfreerdp/codec/CMakeLists.txt index 17f23d99f..fdef7f6ec 100644 --- a/libfreerdp/codec/CMakeLists.txt +++ b/libfreerdp/codec/CMakeLists.txt @@ -91,6 +91,19 @@ if(WITH_OPENH264) 
add_definitions(-DWITH_OPENH264) include_directories(${OPENH264_INCLUDE_DIR}) set(FREERDP_OPENH264_LIBS ${OPENH264_LIBRARIES}) + + if(WITH_OPENH264_ASM) + set(OPENH264_ASM OPENH264_ASM_o) + set(OBJ ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${OPENH264_ASM}.dir/h264.asm.o) + set(SRC ${CMAKE_CURRENT_SOURCE_DIR}/h264.asm) + + add_definitions(-DWITH_OPENH264_ASM) + add_custom_target(${OPENH264_ASM}) + add_custom_command(TARGET ${OPENH264_ASM} + COMMAND nasm ARGS -f elf64 -o ${OBJ} ${SRC} + COMMENT "building H.264 asm objects ...") + set(FREERDP_OPENH264_LIBS ${OPENH264_LIBRARIES} ${OBJ}) + endif() endif() add_complex_library(MODULE ${MODULE_NAME} TYPE "OBJECT" @@ -121,6 +134,10 @@ else() install(TARGETS ${MODULE_NAME} DESTINATION ${CMAKE_INSTALL_LIBDIR} EXPORT FreeRDPTargets) endif() +if(WITH_OPENH264_ASM) + add_dependencies(${MODULE_NAME} ${OPENH264_ASM}) +endif() + set_property(TARGET ${MODULE_NAME} PROPERTY FOLDER "FreeRDP/libfreerdp") if(BUILD_TESTING) diff --git a/libfreerdp/codec/h264.asm b/libfreerdp/codec/h264.asm new file mode 100644 index 000000000..1473849e0 --- /dev/null +++ b/libfreerdp/codec/h264.asm @@ -0,0 +1,236 @@ +;R=(256*Y+403*(V-128)+128)/265 =(256*Y+403*V-51456)/256 +;G=(256*Y-48*(U-128)-120*(V-128)+128)/256 =(256*Y-48*U-120*V+21632)/256 +;B=(256*Y+475*(U-128)+128)/256 =(256*Y+475*U-60672)/256 + +section .data + debug: db "DEBUG",10 + dblen: equ $-debug + +section .text + ;global YUV_to_RGB_asm +YUV_to_RGB_asm: + shl rdi,8 + + mov eax,edx + imul eax,403 + add eax,edi + sub eax,51456 + + jae YUV_to_RGB_asm1 + mov eax,0 + jmp YUV_to_RGB_asm11 + +YUV_to_RGB_asm1: + cmp eax, 0xFFFF + jbe YUV_to_RGB_asm11 + mov eax,0xFF00 + +YUV_to_RGB_asm11: + and eax,0xFF00 + shl eax,8 + + mov ebx,esi + imul ebx,475 + add ebx,edi + sub ebx,60672 + + jae YUV_to_RGB_asm2 + mov ebx, 0 + jmp YUV_to_RGB_asm21 + +YUV_to_RGB_asm2: + cmp ebx,0xFFFF + jbe YUV_to_RGB_asm21 + mov ebx,0xFF00 + +YUV_to_RGB_asm21: + and ebx,0xFF00 + shr ebx,8 + + imul edx,120 + sub edi,edx + imul 
esi,48 + sub edi,esi + add edi,21632 + + bt edi,31 + jae YUV_to_RGB_asm3 + mov edi, 0 + jmp YUV_to_RGB_asm31 + +YUV_to_RGB_asm3: + cmp edi,0xFFFF + jbe YUV_to_RGB_asm31 + mov edi, 0xFF00 + +YUV_to_RGB_asm31: + and edi,0xFF00 + + or eax,edi + or eax,ebx + + ret + +;extern int freerdp_image_yuv_to_xrgb_asm(unsigned char *pDstData,unsigned char **pSrcData,int nWidth,int nHeight); + global freerdp_image_yuv_to_xrgb_asm +freerdp_image_yuv_to_xrgb_asm: + push rbp + mov rbp, rsp + ;cWidth: cx + sub rsp,72 ;pDstData,pSrcData[3],nWidth,nHeight,cHeight,scanline,iStride[1] + push rbx + + + mov [rbp-8],rdi + + mov rax,[rsi] + mov [rbp-16],rax + mov rax,[rsi+8] + mov [rbp-24],rax + mov rax,[rsi+16] + mov [rbp-32],rax + + mov [rbp-40],rdx + + + shr rcx,1 ;/2 + mov [rbp-48],rcx + + + shl rdx,2 + mov [rbp-64],rdx + + + mov rax,[rbp-48] + mov [rbp-56],rax + + + mov [rbp-72],r8 + mov rax,[rbp-40] + shl dword [rbp-72],1 + sub [rbp-72],rax + + shr rax,1 + sub r9,rax + +freerdp_image_yuv_to_xrgb_asm_loopH: + mov rcx,[rbp-40] + shr rcx,1 + + +freerdp_image_yuv_to_xrgb_asm_loopW: + mov rax,[rbp-16] + mov edi,[rax] + and edi,0xFF + + mov rax,[rbp-24] + mov esi,[rax] + and esi,0xFF + + mov rax,[rbp-32] + mov edx,[rax] + and edx,0xFF + + call YUV_to_RGB_asm + + mov rbx,[rbp-8] + mov [rbx],eax + + + mov rax,[rbp-16] + mov edi,[rax+r8] + inc rax + mov [rbp-16],rax + and edi,0xFF + + mov rax,[rbp-24] + mov esi,[rax] + and esi,0xFF + + mov rax,[rbp-32] + mov edx,[rax] + and edx,0xFF + + call YUV_to_RGB_asm + + mov rbx,[rbp-8] + mov rdx,[rbp-64] + mov [rbx+rdx],eax + add rbx,4 + mov [rbp-8],rbx + + + mov rax,[rbp-16] + mov edi,[rax] + and edi,0xFF + + mov rax,[rbp-24] + mov esi,[rax] + and esi,0xFF + + mov rax,[rbp-32] + mov edx,[rax] + and edx,0xFF + + call YUV_to_RGB_asm + + mov rbx,[rbp-8] + mov [rbx],eax + + + mov rax,[rbp-16] + mov edi,[rax+r8] + inc rax + mov [rbp-16],rax + and edi,0xFF + + mov rax,[rbp-24] + mov esi,[rax] + inc rax + mov [rbp-24],rax + and esi,0xFF + + mov rax,[rbp-32] + 
mov edx,[rax] + inc rax + mov [rbp-32],rax + and edx,0xFF + + call YUV_to_RGB_asm + + mov rbx,[rbp-8] + mov rdx,[rbp-64] + mov [rbx+rdx],eax + add rbx,4 + mov [rbp-8],rbx + + dec cx + jne freerdp_image_yuv_to_xrgb_asm_loopW + + + mov rax,[rbp-8] + add rax,[rbp-64] + mov [rbp-8],rax + + mov rax,[rbp-16] + add rax,[rbp-72] + mov [rbp-16],rax + + mov rax,[rbp-24] + add rax,r9 + mov [rbp-24],rax + + mov rax,[rbp-32] + add rax,r9 + mov [rbp-32],rax + + dec qword [rbp-56] + jne freerdp_image_yuv_to_xrgb_asm_loopH + +;END + mov rax,0 +END: + pop rbx + mov rsp,rbp + pop rbp + ret \ No newline at end of file diff --git a/libfreerdp/codec/h264.asm.alt b/libfreerdp/codec/h264.asm.alt new file mode 100644 index 000000000..98ae6f950 --- /dev/null +++ b/libfreerdp/codec/h264.asm.alt @@ -0,0 +1,262 @@ +;R=(256*Y+403*(V-128)+128)/265 =(256*Y+403*V-51456)/256 +;G=(256*Y-48*(U-128)-120*(V-128)+128)/256 =(256*Y-48*U-120*V+21632)/256 +;B=(256*Y+475*(U-128)+128)/256 =(256*Y+475*U-60672)/256 + +section .data + dbg1: db "DEBUG1",10 + dbg2: db "DEBUG2",10 + dbg3: db "DEBUG3",10 + dbg4: db "DEBUG4",10 + dbg equ $-dbg4 + +section .bss + temp1: resd 1 + temp2: resd 1 + temp3: resd 1 + temp4: resd 1 + +section .text + extern printf + + ;global YUV_to_RGB_asm +YUV_to_RGB_asm: + shl edi,8 + + mov eax,edx + imul eax,403 + mov [temp1],eax + add eax,edi + sub eax,51456 + + jae YUV_to_RGB_asm1 + mov eax,0 + jmp YUV_to_RGB_asm11 + +YUV_to_RGB_asm1: + cmp eax, 0xFFFF + jbe YUV_to_RGB_asm11 + mov eax,0xFF00 + +YUV_to_RGB_asm11: + and eax,0xFF00 + shl eax,8 + + mov ebx,esi + imul ebx,475 + mov [temp2],ebx + add ebx,edi + sub ebx,60672 + + jae YUV_to_RGB_asm2 + mov ebx, 0 + jmp YUV_to_RGB_asm21 + +YUV_to_RGB_asm2: + cmp ebx,0xFFFF + jbe YUV_to_RGB_asm21 + mov ebx,0xFF00 + +YUV_to_RGB_asm21: + and ebx,0xFF00 + shr ebx,8 + + imul edx,120 + mov [temp3],edx + sub edi,edx + imul esi,48 + mov [temp4],esi + sub edi,esi + add edi,21632 + + jae YUV_to_RGB_asm3 + mov edi, 0 + jmp YUV_to_RGB_asm31 + 
+YUV_to_RGB_asm3: + cmp edi,0xFFFF + jbe YUV_to_RGB_asm31 + mov edi, 0xFF00 + +YUV_to_RGB_asm31: + and edi,0xFF00 + + or eax,edi + or eax,ebx + + ret + + + +YUV_to_RGB_2asm: + shl edi,8 + + mov eax,[temp1] + add eax,edi + sub eax,51456 + + jae YUV_to_RGB_2asm1 + mov eax,0 + jmp YUV_to_RGB_2asm11 + +YUV_to_RGB_2asm1: + cmp eax, 0xFFFF + jbe YUV_to_RGB_2asm11 + mov eax,0xFF00 + +YUV_to_RGB_2asm11: + and eax,0xFF00 + shl eax,8 + + mov ebx,[temp2] + add ebx,edi + sub ebx,60672 + + jae YUV_to_RGB_2asm2 + mov ebx, 0 + jmp YUV_to_RGB_2asm21 + +YUV_to_RGB_2asm2: + cmp ebx,0xFFFF + jbe YUV_to_RGB_2asm21 + mov ebx,0xFF00 + +YUV_to_RGB_2asm21: + and ebx,0xFF00 + shr ebx,8 + + sub edi,[temp3] + sub edi,[temp4] + add edi,21632 + + jae YUV_to_RGB_2asm3 + mov edi, 0 + jmp YUV_to_RGB_2asm31 + +YUV_to_RGB_2asm3: + cmp edi,0xFFFF + jbe YUV_to_RGB_2asm31 + mov edi, 0xFF00 + +YUV_to_RGB_2asm31: + and edi,0xFF00 + + or eax,edi + or eax,ebx + + ret + + +;extern int freerdp_image_yuv_to_xrgb_asm(unsigned char *pDstData,unsigned char **pSrcData,int nWidth,int nHeight); + global freerdp_image_yuv_to_xrgb_asm +freerdp_image_yuv_to_xrgb_asm: + push rbp + mov rbp, rsp + ;cWidth: cx + sub rsp,56 ;pDstData,pSrcData[3],nWidth,nHeight,cHeight + push rbx + + + mov [rbp-8],rdi + + mov rax,[rsi] + mov [rbp-16],rax + mov rax,[rsi+8] + mov [rbp-24],rax + mov rax,[rsi+16] + mov [rbp-32],rax + + mov [rbp-40],rdx + + + shr rcx,1 ;/2 + mov [rbp-48],rcx + + + mov rax,[rbp-48] + mov [rbp-56],rax + +freerdp_image_yuv_to_xrgb_asm_loopH: + mov rcx,[rbp-40] + shr rcx,1 + + +freerdp_image_yuv_to_xrgb_asm_loopW: + mov rax,[rbp-16] + mov edi,[rax] + + mov rax,[rbp-24] + mov esi,[rax] + inc rax + mov [rbp-24],rax + + mov rax,[rbp-32] + mov edx,[rax] + inc rax + mov [rbp-32],rax + + call YUV_to_RGB_asm + + mov rbx,[rbp-8] + mov [rbx],eax + + + mov rax,[rbp-16] + mov rbx,[rbp-40] + mov edi,[rax+rbx] + inc rax + mov [rbp-16],rax + + call YUV_to_RGB_2asm + + mov rbx,[rbp-8] + mov rdx,[rbp-40] + mov [rbx+rdx],eax + add 
rbx,4 + mov [rbp-8],rbx + + + mov rax,[rbp-16] + mov edi,[rax] + + call YUV_to_RGB_2asm + + mov rbx,[rbp-8] + mov [rbx],eax + + + mov rax,[rbp-16] + mov rbx,[rbp-40] + mov edi,[rax+rbx] + inc rax + mov [rbp-16],rax + + call YUV_to_RGB_2asm + + mov rbx,[rbp-8] + mov rdx,[rbp-40] + mov [rbx+rdx],eax + add rbx,4 + mov [rbp-8],rbx + + dec cx + jne freerdp_image_yuv_to_xrgb_asm_loopW + + + mov rax,[rbp-8] + add rax,[rbp-40] + mov [rbp-8],rax + + mov rax,[rbp-16] + add rax,[rbp-40] + mov [rbp-16],rax + + dec qword [rbp-56] + jne freerdp_image_yuv_to_xrgb_asm_loopH + +;END + mov rax,0 +END: + pop rbx + mov rsp,rbp + pop rbp + ret \ No newline at end of file diff --git a/libfreerdp/codec/h264.c b/libfreerdp/codec/h264.c index c532bc81c..67a81dc8c 100644 --- a/libfreerdp/codec/h264.c +++ b/libfreerdp/codec/h264.c @@ -32,6 +32,12 @@ #define USE_UPCONVERT 0 #define USE_TRACE 0 +#include + +#ifdef WITH_OPENH264_ASM +extern int freerdp_image_yuv_to_xrgb_asm(BYTE *pDstData,BYTE **pSrcData,int nWidth,int nHeight,int iStride0,int iStride1); +#endif + static BYTE clip(int x) { if (x < 0) return 0; @@ -39,7 +45,7 @@ static BYTE clip(int x) return (BYTE)x; } -static UINT32 YUV_to_RGB(BYTE Y, BYTE U, BYTE V) +UINT32 YUV_to_RGB(BYTE Y, BYTE U, BYTE V) { BYTE R, G, B; @@ -297,11 +303,15 @@ int h264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize, BYTE* pV; int Y, U, V; int i, j; + + struct timeval T1,T2,T3; + + gettimeofday(&T2,NULL); if (!h264 || !h264->pDecoder) return -1; - pSrcData = h264_strip_nal_unit_au_delimiter(pSrcData, &SrcSize); + //pSrcData = h264_strip_nal_unit_au_delimiter(pSrcData, &SrcSize); #if 0 printf("h264_decompress: pSrcData=%p, SrcSize=%u, pDstData=%p, nDstStep=%d, nXDst=%d, nYDst=%d, nWidth=%d, nHeight=%d)\n", @@ -349,6 +359,10 @@ int h264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize, ZeroMemory(&sBufferInfo, sizeof(sBufferInfo)); + gettimeofday(&T1,NULL); + printf("\ttime before first DecodeFrame2: %d sec %d 
usec\n",(int)(T1.tv_sec-T2.tv_sec),(int)(T1.tv_usec-T2.tv_usec)); + + gettimeofday(&T1,NULL); state = (*h264->pDecoder)->DecodeFrame2( h264->pDecoder, pSrcData, @@ -356,13 +370,17 @@ int h264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize, pYUVData, &sBufferInfo); - - state = (*h264->pDecoder)->DecodeFrame2( + gettimeofday(&T2,NULL); + state = (*h264->pDecoder)->DecodeFrame2( h264->pDecoder, NULL, 0, pYUVData, &sBufferInfo); + gettimeofday(&T3,NULL); + +// printf("\tfirst DecodeFrame2 took %d sec %d usec, second %d sec %d usec\n",(int)(T2.tv_sec-T1.tv_sec),(int)(T2.tv_usec-T1.tv_usec), +// (int)(T3.tv_sec-T2.tv_sec),(int)(T3.tv_usec-T2.tv_usec)); pSystemBuffer = &sBufferInfo.UsrData.sSystemBuffer; @@ -420,8 +438,16 @@ int h264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize, if (h264_prepare_rgb_buffer(h264, pSystemBuffer->iWidth, pSystemBuffer->iHeight) < 0) return -1; + gettimeofday(&T3,NULL); +#ifdef WITH_OPENH264_ASM + freerdp_image_yuv_to_xrgb_asm(h264->data,pYUVData,h264->width,h264->height,pSystemBuffer->iStride[0],pSystemBuffer->iStride[1]); +#else freerdp_image_copy_yuv420p_to_xrgb(h264->data, h264->scanline, 0, 0, h264->width, h264->height, pYUVData, pSystemBuffer->iStride, 0, 0); +#endif + + gettimeofday(&T1,NULL);//takes about 35ms!! 
+ printf("\tfreerdp_image_copy_yuv420p_to_xrgb took %d sec %d usec\n",(int)(T1.tv_sec-T3.tv_sec),(int)(T1.tv_usec-T3.tv_usec)); if (g_H264DumpFrames) { diff --git a/libfreerdp/codec/test/TestOpenH264ASM.c b/libfreerdp/codec/test/TestOpenH264ASM.c new file mode 100644 index 000000000..27dd46b08 --- /dev/null +++ b/libfreerdp/codec/test/TestOpenH264ASM.c @@ -0,0 +1,57 @@ +#include +#include +#include + +#include "TestOpenH264ASM.h" + +int main(void){ + int ret,i; + unsigned char *pDstData_c,*pDstData_asm,*pSrcData[3]; + int nSrcStep[2]; + + struct timeval t1,t2,t3; + + pSrcData[0]=malloc(1920*1080*sizeof(char)); + pSrcData[1]=malloc(1920*1080/4*sizeof(char)); + pSrcData[2]=malloc(1920*1080/4*sizeof(char)); + pDstData_asm=malloc(1920*1080*4*sizeof(char)); + pDstData_c=malloc(1920*1080*4*sizeof(char)); + + for(i=0;i<1920*1080;i++){ + pSrcData[0][i]=i%255; + pSrcData[1][i/4]=pSrcData[0][i]; + pSrcData[2][i/4]=255-pSrcData[0][i]; + } + + printf("%X\n",pSrcData[0][0]); + + nSrcStep[0]=1088; + nSrcStep[1]=544; + + gettimeofday(&t1,NULL); + ret=freerdp_image_yuv_to_xrgb_asm(pDstData_asm,pSrcData,1024,768,1088,544); + gettimeofday(&t2,NULL); + freerdp_image_copy_yuv420p_to_xrgb(pDstData_c,1024*4,0,0,1024,768,pSrcData,nSrcStep,0,0); + gettimeofday(&t3,NULL); + + printf("in asm (%d) it took %u sec %u usec,\nin c %u sec %u usec.\n",ret,(int)(t2.tv_sec-t1.tv_sec),(int)(t2.tv_usec-t1.tv_usec), + (int)(t3.tv_sec-t2.tv_sec),(int)(t3.tv_usec-t2.tv_usec)); + + printf("in asm the result was %X %X %X\n in c %X %X %X.\n",(unsigned char *)pDstData_asm[92],(unsigned char *)pDstData_asm[93],(unsigned char *)pDstData_asm[94], + (unsigned char *)pDstData_c[92],(unsigned char *)pDstData_c[93],(unsigned char *)pDstData_c[94]); + + for(i=0;i<(1920*1080*4);i++){ + if(pDstData_c[i]!=pDstData_asm[i]){ + printf("MISSMATCH at %d: %2X != %2X\n",i,(unsigned char)pDstData_asm[i],(unsigned char)pDstData_c[i]); + break; + } + } + + free(pSrcData[0]); + free(pSrcData[1]); + free(pSrcData[2]); + 
free(pDstData_c); + free(pDstData_asm); + + return 0; +} diff --git a/libfreerdp/codec/test/TestOpenH264ASM.h b/libfreerdp/codec/test/TestOpenH264ASM.h new file mode 100644 index 000000000..83537e038 --- /dev/null +++ b/libfreerdp/codec/test/TestOpenH264ASM.h @@ -0,0 +1,7 @@ +extern int YUV_to_RGB_asm(unsigned char Y,unsigned char U,unsigned char V); +extern int YUV_to_RGB_2asm(unsigned char Y); +extern int YUV_to_RGB(unsigned char Y,unsigned char U,unsigned char V); + +extern int freerdp_image_yuv_to_xrgb_asm(unsigned char *pDstData,unsigned char **pSrcData,int nWidth,int nHeight,int istride0,int istride1); +int freerdp_image_copy_yuv420p_to_xrgb(unsigned char* pDstData, int nDstStep, int nXDst, int nYDst, + int nWidth, int nHeight, unsigned char* pSrcData[3], int nSrcStep[2], int nXSrc, int nYSrc); \ No newline at end of file From de46a0c738acc3c4cbf4478a0fb928a7c749f962 Mon Sep 17 00:00:00 2001 From: erbth Date: Wed, 30 Jul 2014 12:46:52 +0200 Subject: [PATCH 03/31] repo prepared for merging --- libfreerdp/codec/{h264.c => h264.c.old} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename libfreerdp/codec/{h264.c => h264.c.old} (100%) diff --git a/libfreerdp/codec/h264.c b/libfreerdp/codec/h264.c.old similarity index 100% rename from libfreerdp/codec/h264.c rename to libfreerdp/codec/h264.c.old From 55407bd4e8fcbb4dbed53e48845238a2acde95aa Mon Sep 17 00:00:00 2001 From: erbth Date: Wed, 30 Jul 2014 13:08:08 +0200 Subject: [PATCH 04/31] repo prepared for merging #2 --- libfreerdp/codec/h264.c.old | 642 ------------------------------------ 1 file changed, 642 deletions(-) delete mode 100644 libfreerdp/codec/h264.c.old diff --git a/libfreerdp/codec/h264.c.old b/libfreerdp/codec/h264.c.old deleted file mode 100644 index 67a81dc8c..000000000 --- a/libfreerdp/codec/h264.c.old +++ /dev/null @@ -1,642 +0,0 @@ -/** - * FreeRDP: A Remote Desktop Protocol Implementation - * H.264 Bitmap Compression - * - * Copyright 2014 Mike McDonald - * - * Licensed under the Apache 
License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifdef HAVE_CONFIG_H -#include "config.h" -#endif - -#include -#include -#include - -#include -#include - -#define USE_GRAY_SCALE 0 -#define USE_UPCONVERT 0 -#define USE_TRACE 0 - -#include - -#ifdef WITH_OPENH264_ASM -extern int freerdp_image_yuv_to_xrgb_asm(BYTE *pDstData,BYTE **pSrcData,int nWidth,int nHeight,int iStride0,int iStride1); -#endif - -static BYTE clip(int x) -{ - if (x < 0) return 0; - if (x > 255) return 255; - return (BYTE)x; -} - -UINT32 YUV_to_RGB(BYTE Y, BYTE U, BYTE V) -{ - BYTE R, G, B; - -#if USE_GRAY_SCALE - /* - * Displays the Y plane as a gray-scale image. - */ - R = Y; - G = Y; - B = Y; -#else - int C, D, E; - -#if 0 - /* - * Documented colorspace conversion from YUV to RGB. - * See http://msdn.microsoft.com/en-us/library/ms893078.aspx - */ - - C = Y - 16; - D = U - 128; - E = V - 128; - - R = clip(( 298 * C + 409 * E + 128) >> 8); - G = clip(( 298 * C - 100 * D - 208 * E + 128) >> 8); - B = clip(( 298 * C + 516 * D + 128) >> 8); -#endif - -#if 0 - /* - * These coefficients produce better results. - * See http://www.microchip.com/forums/m599060.aspx - */ - - C = Y; - D = U - 128; - E = V - 128; - - R = clip(( 256 * C + 359 * E + 128) >> 8); - G = clip(( 256 * C - 88 * D - 183 * E + 128) >> 8); - B = clip(( 256 * C + 454 * D + 128) >> 8); -#endif - -#if 1 - /* - * These coefficients produce excellent results. 
- */ - - C = Y; - D = U - 128; - E = V - 128; - - R = clip(( 256 * C + 403 * E + 128) >> 8); - G = clip(( 256 * C - 48 * D - 120 * E + 128) >> 8); - B = clip(( 256 * C + 475 * D + 128) >> 8); -#endif - -#endif - - return RGB32(R, G, B); -} - -#if USE_UPCONVERT -static BYTE* convert_420_to_444(BYTE* chroma420, int chroma420Width, int chroma420Height, int chroma420Stride) -{ - BYTE *chroma444, *src, *dst; - int chroma444Width; - int chroma444Height; - int i, j; - - chroma444Width = chroma420Width * 2; - chroma444Height = chroma420Height * 2; - - chroma444 = (BYTE*) malloc(chroma444Width * chroma444Height); - - if (!chroma444) - return NULL; - - /* Upconvert in the horizontal direction. */ - - for (j = 0; j < chroma420Height; j++) - { - src = chroma420 + j * chroma420Stride; - dst = chroma444 + j * chroma444Width; - dst[0] = src[0]; - for (i = 1; i < chroma420Width; i++) - { - dst[2*i-1] = (3 * src[i-1] + src[i] + 2) >> 2; - dst[2*i] = (src[i-1] + 3 * src[i] + 2) >> 2; - } - dst[chroma444Width-1] = src[chroma420Width-1]; - } - - /* Upconvert in the vertical direction (in-place, bottom-up). 
*/ - - for (i = 0; i < chroma444Width; i++) - { - src = chroma444 + i + (chroma420Height-2) * chroma444Width; - dst = chroma444 + i + (2*(chroma420Height-2)+1) * chroma444Width; - dst[2*chroma444Width] = src[chroma444Width]; - for (j = chroma420Height - 2; j >= 0; j--) - { - dst[chroma444Width] = (src[0] + 3 * src[chroma444Width] + 2) >> 2; - dst[0] = (3 * src[0] + src[chroma444Width] + 2) >> 2; - dst -= 2 * chroma444Width; - src -= chroma444Width; - } - } - - return chroma444; -} -#endif - -#if USE_TRACE -static void trace_callback(H264_CONTEXT* h264, int level, const char* message) -{ - printf("%d - %s\n", level, message); -} -#endif - -static int g_H264FrameId = 0; -static BOOL g_H264DumpFrames = FALSE; - -int h264_prepare_rgb_buffer(H264_CONTEXT* h264, int width, int height) -{ - UINT32 size; - - h264->width = width; - h264->height = height; - h264->scanline = h264->width * 4; - size = h264->scanline * h264->height; - - if (size > h264->size) - { - h264->size = size; - h264->data = (BYTE*) realloc(h264->data, h264->size); - } - - if (!h264->data) - return -1; - - return 1; -} - -int freerdp_image_copy_yuv420p_to_xrgb(BYTE* pDstData, int nDstStep, int nXDst, int nYDst, - int nWidth, int nHeight, BYTE* pSrcData[3], int nSrcStep[2], int nXSrc, int nYSrc) -{ - int x, y; - BYTE* pDstPixel8; - BYTE *pY, *pU, *pV, *pUv, *pVv; - int temp1=0,temp2=0; - - pY = pSrcData[0]; - pUv = pU = pSrcData[1]; - pVv = pV = pSrcData[2]; - - pDstPixel8 = &pDstData[(nYDst * nDstStep) + (nXDst * 4)]; - - for (y = 0; y < nHeight; y++) - { - for (x = 0; x < nWidth; x++) - { -/* *((UINT32*) pDstPixel8) = RGB32(*pY, *pY, *pY);*/ - *((UINT32*) pDstPixel8) = YUV_to_RGB(*pY,*pU,*pV); - pDstPixel8 += 4; - pY++; - - if(temp1){ - temp1=0; - pU++; - pV++; - }else{ - temp1=1; - } - } - - pDstPixel8 += (nDstStep - (nWidth * 4)); - pY += (nSrcStep[0] - nWidth); - if(temp2){ - temp2=0; - pU += (nSrcStep[1] - nWidth / 2); - pV += (nSrcStep[1] - nWidth / 2); - pUv = pU; - pVv = pV; - }else{ - temp2=1; - 
pU = pUv; - pV = pVv; - } - } - - return 1; -} - -BYTE* h264_strip_nal_unit_au_delimiter(BYTE* pSrcData, UINT32* pSrcSize) -{ - BYTE* data = pSrcData; - UINT32 size = *pSrcSize; - BYTE forbidden_zero_bit = 0; - BYTE nal_ref_idc = 0; - BYTE nal_unit_type = 0; - - /* ITU-T H.264 B.1.1 Byte stream NAL unit syntax */ - - while (size > 0) - { - if (*data) - break; - - data++; - size--; - } - - if (*data != 1) - return pSrcData; - - data++; - size--; - - forbidden_zero_bit = (data[0] >> 7); - nal_ref_idc = (data[0] >> 5); - nal_unit_type = (data[0] & 0x1F); - - if (forbidden_zero_bit) - return pSrcData; /* invalid */ - - if (nal_unit_type == 9) - { - /* NAL Unit AU Delimiter */ - - printf("NAL Unit AU Delimiter: idc: %d\n", nal_ref_idc); - - data += 2; - size -= 2; - - *pSrcSize = size; - return data; - } - - return pSrcData; -} - -int h264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize, - BYTE** ppDstData, DWORD DstFormat, int nDstStep, int nXDst, int nYDst, int nWidth, int nHeight) -{ -#ifdef WITH_OPENH264 - DECODING_STATE state; - SBufferInfo sBufferInfo; - SSysMEMBuffer* pSystemBuffer; - UINT32 UncompressedSize; - BYTE* pDstData; - BYTE* pYUVData[3]; - BYTE* pY; - BYTE* pU; - BYTE* pV; - int Y, U, V; - int i, j; - - struct timeval T1,T2,T3; - - gettimeofday(&T2,NULL); - - if (!h264 || !h264->pDecoder) - return -1; - - //pSrcData = h264_strip_nal_unit_au_delimiter(pSrcData, &SrcSize); - -#if 0 - printf("h264_decompress: pSrcData=%p, SrcSize=%u, pDstData=%p, nDstStep=%d, nXDst=%d, nYDst=%d, nWidth=%d, nHeight=%d)\n", - pSrcData, SrcSize, *ppDstData, nDstStep, nXDst, nYDst, nWidth, nHeight); -#endif - - /* Allocate a destination buffer (if needed). 
*/ - - UncompressedSize = nWidth * nHeight * 4; - - if (UncompressedSize == 0) - return -1; - - pDstData = *ppDstData; - - if (!pDstData) - { - pDstData = (BYTE*) malloc(UncompressedSize); - - if (!pDstData) - return -1; - - *ppDstData = pDstData; - } - - if (g_H264DumpFrames) - { - FILE* fp; - char buf[4096]; - - snprintf(buf, sizeof(buf), "/tmp/wlog/bs_%d.h264", g_H264FrameId); - fp = fopen(buf, "wb"); - fwrite(pSrcData, 1, SrcSize, fp); - fflush(fp); - fclose(fp); - } - - /* - * Decompress the image. The RDP host only seems to send I420 format. - */ - - pYUVData[0] = NULL; - pYUVData[1] = NULL; - pYUVData[2] = NULL; - - ZeroMemory(&sBufferInfo, sizeof(sBufferInfo)); - - gettimeofday(&T1,NULL); - printf("\ttime before first DecodeFrame2: %d sec %d usec\n",(int)(T1.tv_sec-T2.tv_sec),(int)(T1.tv_usec-T2.tv_usec)); - - gettimeofday(&T1,NULL); - state = (*h264->pDecoder)->DecodeFrame2( - h264->pDecoder, - pSrcData, - SrcSize, - pYUVData, - &sBufferInfo); - - gettimeofday(&T2,NULL); - state = (*h264->pDecoder)->DecodeFrame2( - h264->pDecoder, - NULL, - 0, - pYUVData, - &sBufferInfo); - gettimeofday(&T3,NULL); - -// printf("\tfirst DecodeFrame2 took %d sec %d usec, second %d sec %d usec\n",(int)(T2.tv_sec-T1.tv_sec),(int)(T2.tv_usec-T1.tv_usec), -// (int)(T3.tv_sec-T2.tv_sec),(int)(T3.tv_usec-T2.tv_usec)); - - pSystemBuffer = &sBufferInfo.UsrData.sSystemBuffer; - -#if 0 - printf("h264_decompress: state=%u, pYUVData=[%p,%p,%p], bufferStatus=%d, width=%d, height=%d, format=%d, stride=[%d,%d]\n", - state, pYUVData[0], pYUVData[1], pYUVData[2], sBufferInfo.iBufferStatus, - pSystemBuffer->iWidth, pSystemBuffer->iHeight, pSystemBuffer->iFormat, - pSystemBuffer->iStride[0], pSystemBuffer->iStride[1]); -#endif - - if (state != 0) - return -1; - - if (!pYUVData[0] || !pYUVData[1] || !pYUVData[2]) - return -1; - - if (sBufferInfo.iBufferStatus != 1) - return -1; - - if (pSystemBuffer->iFormat != videoFormatI420) - return -1; - - /* Convert I420 (same as IYUV) to XRGB. 
*/ - - pY = pYUVData[0]; - pU = pYUVData[1]; - pV = pYUVData[2]; - - if (g_H264DumpFrames) - { - FILE* fp; - BYTE* srcp; - char buf[4096]; - - snprintf(buf, sizeof(buf), "/tmp/wlog/H264_%d.ppm", g_H264FrameId); - fp = fopen(buf, "wb"); - fwrite("P5\n", 1, 3, fp); - snprintf(buf, sizeof(buf), "%d %d\n", pSystemBuffer->iWidth, pSystemBuffer->iHeight); - fwrite(buf, 1, strlen(buf), fp); - fwrite("255\n", 1, 4, fp); - - srcp = pY; - - for (j = 0; j < pSystemBuffer->iHeight; j++) - { - fwrite(srcp, 1, pSystemBuffer->iWidth, fp); - srcp += pSystemBuffer->iStride[0]; - } - - fflush(fp); - fclose(fp); - } - - - if (h264_prepare_rgb_buffer(h264, pSystemBuffer->iWidth, pSystemBuffer->iHeight) < 0) - return -1; - - gettimeofday(&T3,NULL); -#ifdef WITH_OPENH264_ASM - freerdp_image_yuv_to_xrgb_asm(h264->data,pYUVData,h264->width,h264->height,pSystemBuffer->iStride[0],pSystemBuffer->iStride[1]); -#else - freerdp_image_copy_yuv420p_to_xrgb(h264->data, h264->scanline, 0, 0, - h264->width, h264->height, pYUVData, pSystemBuffer->iStride, 0, 0); -#endif - - gettimeofday(&T1,NULL);//takes about 35ms!! - printf("\tfreerdp_image_copy_yuv420p_to_xrgb took %d sec %d usec\n",(int)(T1.tv_sec-T3.tv_sec),(int)(T1.tv_usec-T3.tv_usec)); - - if (g_H264DumpFrames) - { - FILE* fp; - BYTE* srcp; - char buf[4096]; - - snprintf(buf, sizeof(buf), "/tmp/wlog/H264_%d_rgb.ppm", g_H264FrameId); - fp = fopen(buf, "wb"); - fwrite("P6\n", 1, 3, fp); - snprintf(buf, sizeof(buf), "%d %d\n", pSystemBuffer->iWidth, pSystemBuffer->iHeight); - fwrite(buf, 1, strlen(buf), fp); - fwrite("255\n", 1, 4, fp); - - srcp = h264->data; - - for (j = 0; j < h264->height; j++) - { - for(i=0;iwidth;i++){ - fwrite(srcp, 1, 3, fp); - srcp += 4; - } - } - - fflush(fp); - fclose(fp); - } - - g_H264FrameId++; - - return 1; - -#if USE_UPCONVERT - /* Convert 4:2:0 YUV to 4:4:4 YUV. 
*/ - pU = convert_420_to_444(pU, pSystemBuffer->iWidth / 2, pSystemBuffer->iHeight / 2, pSystemBuffer->iStride[1]); - pV = convert_420_to_444(pV, pSystemBuffer->iWidth / 2, pSystemBuffer->iHeight / 2, pSystemBuffer->iStride[1]); -#endif - - for (j = 0; j < nHeight; j++) - { - BYTE *pXRGB = pDstData + ((nYDst + j) * nDstStep) + (nXDst * 4); - int y = nYDst + j; - - for (i = 0; i < nWidth; i++) - { - int x = nXDst + i; - - Y = pY[(y * pSystemBuffer->iStride[0]) + x]; -#if USE_UPCONVERT - U = pU[(y * pSystemBuffer->iWidth) + x]; - V = pV[(y * pSystemBuffer->iWidth) + x]; -#else - U = pU[(y/2) * pSystemBuffer->iStride[1] + (x/2)]; - V = pV[(y/2) * pSystemBuffer->iStride[1] + (x/2)]; -#endif - - *(UINT32*)pXRGB = YUV_to_RGB(Y, U, V); - - pXRGB += 4; - } - } - -#if USE_UPCONVERT - free(pU); - free(pV); -#endif -#endif - - return 1; -} - -int h264_compress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize, BYTE** ppDstData, UINT32* pDstSize) -{ - return 1; -} - -void h264_context_reset(H264_CONTEXT* h264) -{ - -} - -H264_CONTEXT* h264_context_new(BOOL Compressor) -{ - H264_CONTEXT* h264; - - h264 = (H264_CONTEXT*) calloc(1, sizeof(H264_CONTEXT)); - - if (h264) - { - h264->Compressor = Compressor; - - if (h264_prepare_rgb_buffer(h264, 256, 256) < 0) - return NULL; - -#ifdef WITH_OPENH264 - { - static EVideoFormatType videoFormat = videoFormatI420; - -#if USE_TRACE - static int traceLevel = WELS_LOG_DEBUG; - static WelsTraceCallback traceCallback = (WelsTraceCallback) trace_callback; -#endif - - SDecodingParam sDecParam; - long status; - - WelsCreateDecoder(&h264->pDecoder); - - if (!h264->pDecoder) - { - printf("Failed to create OpenH264 decoder\n"); - goto EXCEPTION; - } - - ZeroMemory(&sDecParam, sizeof(sDecParam)); - sDecParam.iOutputColorFormat = videoFormatI420; - sDecParam.uiEcActiveFlag = 1; - sDecParam.sVideoProperty.eVideoBsType = VIDEO_BITSTREAM_DEFAULT; - - status = (*h264->pDecoder)->Initialize(h264->pDecoder, &sDecParam); - - if (status != 0) - { - 
printf("Failed to initialize OpenH264 decoder (status=%ld)\n", status); - goto EXCEPTION; - } - - status = (*h264->pDecoder)->SetOption(h264->pDecoder, DECODER_OPTION_DATAFORMAT, &videoFormat); - - if (status != 0) - { - printf("Failed to set data format option on OpenH264 decoder (status=%ld)\n", status); - } - - -#if USE_TRACE - status = (*h264->pDecoder)->SetOption(h264->pDecoder, DECODER_OPTION_TRACE_LEVEL, &traceLevel); - if (status != 0) - { - printf("Failed to set trace level option on OpenH264 decoder (status=%ld)\n", status); - } - - status = (*h264->pDecoder)->SetOption(h264->pDecoder, DECODER_OPTION_TRACE_CALLBACK, &traceCallback); - if (status != 0) - { - printf("Failed to set trace callback option on OpenH264 decoder (status=%ld)\n", status); - } - - status = (*h264->pDecoder)->SetOption(h264->pDecoder, DECODER_OPTION_TRACE_CALLBACK_CONTEXT, &h264); - if (status != 0) - { - printf("Failed to set trace callback context option on OpenH264 decoder (status=%ld)\n", status); - } -#endif - } -#endif - - h264_context_reset(h264); - } - - return h264; - -EXCEPTION: -#ifdef WITH_OPENH264 - if (h264->pDecoder) - { - WelsDestroyDecoder(h264->pDecoder); - } -#endif - - free(h264); - - return NULL; -} - -void h264_context_free(H264_CONTEXT* h264) -{ - if (h264) - { - free(h264->data); - -#ifdef WITH_OPENH264 - if (h264->pDecoder) - { - (*h264->pDecoder)->Uninitialize(h264->pDecoder); - WelsDestroyDecoder(h264->pDecoder); - } -#endif - - free(h264); - } -} From a8257b5201866135352a37aac5148aa5b3040ca2 Mon Sep 17 00:00:00 2001 From: erbth Date: Fri, 8 Aug 2014 15:19:49 +0200 Subject: [PATCH 05/31] fixed some memory leaks arround DVC and RDPEGFX --- channels/drdynvc/client/dvcman.c | 10 +++++--- channels/rdpgfx/client/rdpgfx_main.c | 13 +++++++++++ client/X11/xf_gfx.c | 15 ++++++++---- libfreerdp/codec/h264.c | 23 +++++++++++++++++-- libfreerdp/utils/svc_plugin.c | 6 ++--- winpr/libwinpr/utils/collections/StreamPool.c | 2 ++ 6 files changed, 56 insertions(+), 13 
deletions(-) diff --git a/channels/drdynvc/client/dvcman.c b/channels/drdynvc/client/dvcman.c index 9a6d80537..532a68575 100644 --- a/channels/drdynvc/client/dvcman.c +++ b/channels/drdynvc/client/dvcman.c @@ -429,6 +429,8 @@ int dvcman_close_channel(IWTSVirtualChannelManager* pChannelMgr, UINT32 ChannelI IWTSVirtualChannel* ichannel; DrdynvcClientContext* context; DVCMAN* dvcman = (DVCMAN*) pChannelMgr; + + printf("\t\tdvcman_close_channel\n"); channel = (DVCMAN_CHANNEL*) dvcman_find_channel_by_id(pChannelMgr, ChannelId); @@ -476,7 +478,7 @@ int dvcman_receive_channel_data_first(IWTSVirtualChannelManager* pChannelMgr, UI Stream_Release(channel->dvc_data); channel->dvc_data = StreamPool_Take(channel->dvcman->pool, length); - Stream_AddRef(channel->dvc_data); + //Stream_AddRef(channel->dvc_data); return 0; } @@ -498,7 +500,8 @@ int dvcman_receive_channel_data(IWTSVirtualChannelManager* pChannelMgr, UINT32 C if (channel->dvc_data) { /* Fragmented data */ - if (Stream_GetPosition(channel->dvc_data) + dataSize > (UINT32) Stream_Capacity(channel->dvc_data)) + //if (Stream_GetPosition(channel->dvc_data) + dataSize > (UINT32) Stream_Capacity(channel->dvc_data)) + if (Stream_GetPosition(channel->dvc_data) + dataSize > (UINT32) Stream_Length(channel->dvc_data)) { DEBUG_WARN("data exceeding declared length!"); Stream_Release(channel->dvc_data); @@ -508,7 +511,8 @@ int dvcman_receive_channel_data(IWTSVirtualChannelManager* pChannelMgr, UINT32 C Stream_Write(channel->dvc_data, Stream_Pointer(data), dataSize); - if (((size_t) Stream_GetPosition(channel->dvc_data)) >= Stream_Capacity(channel->dvc_data)) + //if (((size_t) Stream_GetPosition(channel->dvc_data)) >= Stream_Capacity(channel->dvc_data)-1) + if (((size_t) Stream_GetPosition(channel->dvc_data)) >= Stream_Length(channel->dvc_data)-1) { Stream_SealLength(channel->dvc_data); Stream_SetPosition(channel->dvc_data, 0); diff --git a/channels/rdpgfx/client/rdpgfx_main.c b/channels/rdpgfx/client/rdpgfx_main.c index 
412236f15..970640612 100644 --- a/channels/rdpgfx/client/rdpgfx_main.c +++ b/channels/rdpgfx/client/rdpgfx_main.c @@ -129,6 +129,8 @@ int rdpgfx_recv_caps_confirm_pdu(RDPGFX_CHANNEL_CALLBACK* callback, wStream* s) Stream_Read_UINT32(s, capsSet.version); /* version (4 bytes) */ Stream_Read_UINT32(s, capsDataLength); /* capsDataLength (4 bytes) */ Stream_Read_UINT32(s, capsSet.flags); /* capsData (4 bytes) */ + + /*TODO: interpret this answer*/ WLog_Print(gfx->log, WLOG_DEBUG, "RecvCapsConfirmPdu: version: 0x%04X flags: 0x%04X", capsSet.version, capsSet.flags); @@ -545,6 +547,8 @@ int rdpgfx_recv_solid_fill_pdu(RDPGFX_CHANNEL_CALLBACK* callback, wStream* s) { context->SolidFill(context, &pdu); } + + free(pdu.fillRects); return 1; } @@ -590,6 +594,8 @@ int rdpgfx_recv_surface_to_surface_pdu(RDPGFX_CHANNEL_CALLBACK* callback, wStrea context->SurfaceToSurface(context, &pdu); } + free(pdu.destPts); + return 1; } @@ -855,6 +861,9 @@ static int rdpgfx_on_data_received(IWTSVirtualChannelCallback* pChannelCallback, } Stream_Free(s, TRUE); + + //free(Stream_Buffer(data)); + //Stream_Free(data,TRUE); return status; } @@ -1056,6 +1065,10 @@ int rdpgfx_DVCPluginEntry(IDRDYNVC_ENTRY_POINTS* pEntryPoints) return -1; gfx->log = WLog_Get("com.freerdp.gfx.client"); +#if 0 + WLog_SetLogLevel(gfx->log, WLOG_DEBUG); +#endif + gfx->settings = (rdpSettings*) pEntryPoints->GetRdpSettings(pEntryPoints); gfx->iface.Initialize = rdpgfx_plugin_initialize; diff --git a/client/X11/xf_gfx.c b/client/X11/xf_gfx.c index a1a24a9c4..0b6ab8899 100644 --- a/client/X11/xf_gfx.c +++ b/client/X11/xf_gfx.c @@ -138,6 +138,9 @@ int xf_OutputUpdate(xfContext* xfc) int xf_OutputExpose(xfContext* xfc, int x, int y, int width, int height) { +/** ********************************* + * to be improved + * *********************************/ RECTANGLE_16 invalidRect; invalidRect.left = x; @@ -393,11 +396,9 @@ int xf_SurfaceCommand_H264(xfContext* xfc, RdpgfxClientContext* context, RDPGFX_ status = 
h264_decompress(xfc->h264, bs->data, bs->length, &DstData, PIXEL_FORMAT_XRGB32, surface->scanline, cmd->left, cmd->top, cmd->width, cmd->height); gettimeofday(&TDEC2,NULL); - printf("decoding took %d sec %d usec\n",(int)(TDEC2.tv_sec-TDEC1.tv_sec),(int)(TDEC2.tv_usec-TDEC1.tv_usec)); - - free(bs->data); + //printf("decoding took %d sec %d usec\n",(int)(TDEC2.tv_sec-TDEC1.tv_sec),(int)(TDEC2.tv_usec-TDEC1.tv_usec)); - printf("xf_SurfaceCommand_H264: status: %d\n", status); + //printf("xf_SurfaceCommand_H264: status: %d\n", status); if (status < 0) return -1; @@ -454,6 +455,7 @@ int xf_SurfaceCommand_H264(xfContext* xfc, RdpgfxClientContext* context, RDPGFX_ } region16_uninit(&updateRegion); + region16_uninit(&clippingRects); #if 0 /* fill with red for now to distinguish from the rest */ @@ -700,6 +702,7 @@ int xf_SurfaceToSurface(RdpgfxClientContext* context, RDPGFX_SURFACE_TO_SURFACE_ rectSrc = &(surfaceToSurface->rectSrc); destPt = &surfaceToSurface->destPts[0]; + /**not needed?*/ surfaceSrc = (xfGfxSurface*) context->GetSurfaceData(context, surfaceToSurface->surfaceIdSrc); @@ -726,6 +729,8 @@ int xf_SurfaceToSurface(RdpgfxClientContext* context, RDPGFX_SURFACE_TO_SURFACE_ invalidRect.top = destPt->y; invalidRect.right = destPt->x + rectSrc->right; invalidRect.bottom = destPt->y + rectSrc->bottom; + + /**width,height?*/ region16_union_rect(&(xfc->invalidRegion), &(xfc->invalidRegion), &invalidRect); } @@ -759,7 +764,7 @@ int xf_SurfaceToCache(RdpgfxClientContext* context, RDPGFX_SURFACE_TO_CACHE_PDU* cacheEntry->alpha = surface->alpha; cacheEntry->scanline = (cacheEntry->width + (cacheEntry->width % 4)) * 4; - cacheEntry->data = (BYTE*) calloc(1, surface->scanline * surface->height); + cacheEntry->data = (BYTE*) calloc(1, cacheEntry->scanline * cacheEntry->height); if (!cacheEntry->data) return -1; diff --git a/libfreerdp/codec/h264.c b/libfreerdp/codec/h264.c index c2fbedf10..abc8f9e0b 100644 --- a/libfreerdp/codec/h264.c +++ b/libfreerdp/codec/h264.c @@ -28,6 
+28,12 @@ #include #include +#include + +#ifdef WITH_OPENH264_ASM +extern int freerdp_image_yuv_to_xrgb_asm(BYTE *pDstData,BYTE **pSrcData,int nWidth,int nHeight,int iStride0,int iStride1); +#endif + #define USE_GRAY_SCALE 0 #define USE_UPCONVERT 0 @@ -340,6 +346,8 @@ static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSiz SBufferInfo sBufferInfo; SSysMEMBuffer* pSystemBuffer; BYTE* pYUVData[3]; + + struct timeval T1,T2; if (!h264->pDecoder) return -1; @@ -354,6 +362,7 @@ static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSiz ZeroMemory(&sBufferInfo, sizeof(sBufferInfo)); + gettimeofday(&T1,NULL); state = (*h264->pDecoder)->DecodeFrame2( h264->pDecoder, pSrcData, @@ -370,10 +379,13 @@ static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSiz if (sBufferInfo.iBufferStatus != 1) state = (*h264->pDecoder)->DecodeFrame2(h264->pDecoder, NULL, 0, pYUVData, &sBufferInfo); + + gettimeofday(&T2,NULL); + printf("\tdecoding took: %u sec %u usec\n",(unsigned int)(T2.tv_sec-T1.tv_sec),(unsigned int)(T2.tv_usec-T1.tv_usec)); pSystemBuffer = &sBufferInfo.UsrData.sSystemBuffer; -#if 1 +#if 0 printf("h264_decompress: state=%u, pYUVData=[%p,%p,%p], bufferStatus=%d, width=%d, height=%d, format=%d, stride=[%d,%d]\n", state, pYUVData[0], pYUVData[1], pYUVData[2], sBufferInfo.iBufferStatus, pSystemBuffer->iWidth, pSystemBuffer->iHeight, pSystemBuffer->iFormat, @@ -404,8 +416,15 @@ static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSiz if (h264_prepare_rgb_buffer(h264, pSystemBuffer->iWidth, pSystemBuffer->iHeight) < 0) return -1; +#ifdef WITH_OPENH264_ASM + gettimeofday(&T1,NULL); + freerdp_image_yuv_to_xrgb_asm(h264->data,pYUVData,h264->width,h264->height,pSystemBuffer->iStride[0],pSystemBuffer->iStride[1]); + gettimeofday(&T2,NULL); + printf("\tconverting took: %u sec %u usec\n",(unsigned int)(T2.tv_sec-T1.tv_sec),(unsigned int)(T2.tv_usec-T1.tv_usec)); +#else 
freerdp_image_copy_yuv420p_to_xrgb(h264->data, h264->scanline, 0, 0, h264->width, h264->height, pYUVData, pSystemBuffer->iStride, 0, 0); +#endif return 1; } @@ -630,7 +649,7 @@ int h264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize, pSrcData = h264_strip_nal_unit_au_delimiter(pSrcData, &SrcSize); #endif -#if 1 +#if 0 printf("h264_decompress: pSrcData=%p, SrcSize=%u, pDstData=%p, nDstStep=%d, nXDst=%d, nYDst=%d, nWidth=%d, nHeight=%d)\n", pSrcData, SrcSize, *ppDstData, nDstStep, nXDst, nYDst, nWidth, nHeight); #endif diff --git a/libfreerdp/utils/svc_plugin.c b/libfreerdp/utils/svc_plugin.c index 7a529d256..66dca1199 100644 --- a/libfreerdp/utils/svc_plugin.c +++ b/libfreerdp/utils/svc_plugin.c @@ -106,7 +106,7 @@ static void svc_plugin_process_received(rdpSvcPlugin* plugin, void* pData, UINT3 Stream_Release(plugin->data_in); plugin->data_in = StreamPool_Take(plugin->pool, totalLength); - Stream_AddRef(plugin->data_in); + //Stream_AddRef(plugin->data_in); } s = plugin->data_in; @@ -115,7 +115,7 @@ static void svc_plugin_process_received(rdpSvcPlugin* plugin, void* pData, UINT3 if (dataFlags & CHANNEL_FLAG_LAST) { - if (Stream_Capacity(s) != Stream_GetPosition(s)) + if (Stream_Length(s) != Stream_GetPosition(s)) { fprintf(stderr, "svc_plugin_process_received: read error\n"); } @@ -250,7 +250,7 @@ static void svc_plugin_process_terminated(rdpSvcPlugin* plugin) if (plugin->data_in) { - Stream_Free(plugin->data_in, TRUE); + Stream_Release(plugin->data_in); plugin->data_in = NULL; } diff --git a/winpr/libwinpr/utils/collections/StreamPool.c b/winpr/libwinpr/utils/collections/StreamPool.c index c95875fbe..696ecd971 100644 --- a/winpr/libwinpr/utils/collections/StreamPool.c +++ b/winpr/libwinpr/utils/collections/StreamPool.c @@ -155,6 +155,8 @@ wStream* StreamPool_Take(wStreamPool* pool, size_t size) Stream_SetPosition(s, 0); Stream_EnsureCapacity(s, size); + + Stream_SetLength(s,size); } s->pool = pool; From 095a7aba999b9a50257a700ff8c2c927d2d4fac5 Mon Sep 
17 00:00:00 2001 From: erbth Date: Wed, 13 Aug 2014 20:56:40 +0200 Subject: [PATCH 06/31] OpenH264 YUV data conversion with intel SSSE3 in assembly --- .gitignore | 2 +- channels/drdynvc/client/dvcman.c | 10 +- client/X11/xf_gfx.c | 42 +- libfreerdp/codec/CMakeLists.txt | 43 +- libfreerdp/codec/h264.asm.alt | 262 ---------- libfreerdp/codec/h264.c | 21 +- libfreerdp/codec/h264_ssse3_x64.asm | 447 ++++++++++++++++++ libfreerdp/codec/{h264.asm => h264_x64.asm} | 4 - .../codec/test/Makefile.TestOpenH264ASM | 20 + libfreerdp/codec/test/TestOpenH264ASM.c | 55 ++- libfreerdp/codec/test/TestOpenH264ASM.h | 5 +- 11 files changed, 574 insertions(+), 337 deletions(-) delete mode 100644 libfreerdp/codec/h264.asm.alt create mode 100644 libfreerdp/codec/h264_ssse3_x64.asm rename libfreerdp/codec/{h264.asm => h264_x64.asm} (98%) create mode 100644 libfreerdp/codec/test/Makefile.TestOpenH264ASM diff --git a/.gitignore b/.gitignore index 928ef7b95..94ec2bf89 100644 --- a/.gitignore +++ b/.gitignore @@ -106,7 +106,7 @@ client/DirectFB/dfreerdp server/Sample/sfreerdp-server server/X11/xfreerdp-server xcode -libfreerdp/codec/test/TestOpenH264 +libfreerdp/codec/test/TestOpenH264ASM # Other *~ diff --git a/channels/drdynvc/client/dvcman.c b/channels/drdynvc/client/dvcman.c index 532a68575..dd51a95ca 100644 --- a/channels/drdynvc/client/dvcman.c +++ b/channels/drdynvc/client/dvcman.c @@ -478,7 +478,6 @@ int dvcman_receive_channel_data_first(IWTSVirtualChannelManager* pChannelMgr, UI Stream_Release(channel->dvc_data); channel->dvc_data = StreamPool_Take(channel->dvcman->pool, length); - //Stream_AddRef(channel->dvc_data); return 0; } @@ -488,6 +487,7 @@ int dvcman_receive_channel_data(IWTSVirtualChannelManager* pChannelMgr, UINT32 C int status = 0; DVCMAN_CHANNEL* channel; UINT32 dataSize = Stream_GetRemainingLength(data); + wStream* s; channel = (DVCMAN_CHANNEL*) dvcman_find_channel_by_id(pChannelMgr, ChannelId); @@ -500,7 +500,6 @@ int 
dvcman_receive_channel_data(IWTSVirtualChannelManager* pChannelMgr, UINT32 C if (channel->dvc_data) { /* Fragmented data */ - //if (Stream_GetPosition(channel->dvc_data) + dataSize > (UINT32) Stream_Capacity(channel->dvc_data)) if (Stream_GetPosition(channel->dvc_data) + dataSize > (UINT32) Stream_Length(channel->dvc_data)) { DEBUG_WARN("data exceeding declared length!"); @@ -511,14 +510,15 @@ int dvcman_receive_channel_data(IWTSVirtualChannelManager* pChannelMgr, UINT32 C Stream_Write(channel->dvc_data, Stream_Pointer(data), dataSize); - //if (((size_t) Stream_GetPosition(channel->dvc_data)) >= Stream_Capacity(channel->dvc_data)-1) if (((size_t) Stream_GetPosition(channel->dvc_data)) >= Stream_Length(channel->dvc_data)-1) { Stream_SealLength(channel->dvc_data); Stream_SetPosition(channel->dvc_data, 0); - status = channel->channel_callback->OnDataReceived(channel->channel_callback, channel->dvc_data); - Stream_Release(channel->dvc_data); + s=channel->dvc_data; channel->dvc_data = NULL; + + status = channel->channel_callback->OnDataReceived(channel->channel_callback, s); + Stream_Release(s); } } else diff --git a/client/X11/xf_gfx.c b/client/X11/xf_gfx.c index 0b6ab8899..b7b7cbccc 100644 --- a/client/X11/xf_gfx.c +++ b/client/X11/xf_gfx.c @@ -139,7 +139,7 @@ int xf_OutputUpdate(xfContext* xfc) int xf_OutputExpose(xfContext* xfc, int x, int y, int width, int height) { /** ********************************* - * to be improved + * to be improved? 
* *********************************/ RECTANGLE_16 invalidRect; @@ -366,15 +366,6 @@ int xf_SurfaceCommand_H264(xfContext* xfc, RdpgfxClientContext* context, RDPGFX_ RDPGFX_H264_METABLOCK* meta; RDPGFX_H264_BITMAP_STREAM* bs; - static struct timeval TGES1; - struct timeval TGES2,TDEC1,TDEC2; - - TGES2.tv_usec=TGES1.tv_usec; - TGES2.tv_sec=TGES1.tv_sec; - - gettimeofday(&TGES1,NULL); - printf("time since last xf_SurfaceCommand_H264: %d sec %d usec\n",(int)(TGES1.tv_sec-TGES2.tv_sec),(int)(TGES1.tv_usec-TGES2.tv_usec)); - h264 = xfc->h264; @@ -392,13 +383,14 @@ int xf_SurfaceCommand_H264(xfContext* xfc, RdpgfxClientContext* context, RDPGFX_ DstData = surface->data; - gettimeofday(&TDEC1,NULL); status = h264_decompress(xfc->h264, bs->data, bs->length, &DstData, PIXEL_FORMAT_XRGB32, surface->scanline, cmd->left, cmd->top, cmd->width, cmd->height); - gettimeofday(&TDEC2,NULL); - //printf("decoding took %d sec %d usec\n",(int)(TDEC2.tv_sec-TDEC1.tv_sec),(int)(TDEC2.tv_usec-TDEC1.tv_usec)); - //printf("xf_SurfaceCommand_H264: status: %d\n", status); + if (status < 0) + { + printf("h264_decompress failure: %d\n",status); + return -1; + } if (status < 0) return -1; @@ -427,9 +419,6 @@ int xf_SurfaceCommand_H264(xfContext* xfc, RdpgfxClientContext* context, RDPGFX_ updateRects = (RECTANGLE_16*) region16_rects(&updateRegion, &nbUpdateRects); -#if 0 - printf("numRegionRects: %d nbUpdateRects: %d\n", meta->numRegionRects, nbUpdateRects); -#endif for (j = 0; j < nbUpdateRects; j++) { @@ -439,13 +428,6 @@ int xf_SurfaceCommand_H264(xfContext* xfc, RdpgfxClientContext* context, RDPGFX_ nHeight = updateRects[j].bottom - updateRects[j].top; /* update region from decoded H264 buffer */ - -#if 0 - printf("nXDst: %d nYDst: %d nWidth: %d nHeight: %d decoded: width: %d height: %d cmd: left: %d top: %d right: %d bottom: %d\n", - nXDst, nYDst, nWidth, nHeight, h264->width, h264->height, - cmd->left, cmd->top, cmd->right, cmd->bottom); -#endif - freerdp_image_copy(surface->data, 
PIXEL_FORMAT_XRGB32, surface->scanline, nXDst, nYDst, nWidth, nHeight, h264->data, PIXEL_FORMAT_XRGB32, h264->scanline, nXDst, nYDst); @@ -457,19 +439,9 @@ int xf_SurfaceCommand_H264(xfContext* xfc, RdpgfxClientContext* context, RDPGFX_ region16_uninit(&updateRegion); region16_uninit(&clippingRects); -#if 0 - /* fill with red for now to distinguish from the rest */ - freerdp_image_fill(surface->data, PIXEL_FORMAT_XRGB32, surface->scanline, - cmd->left, cmd->top, cmd->width, cmd->height, 0xFF0000); -#endif - - if (!xfc->inGfxFrame){ + if (!xfc->inGfxFrame) xf_OutputUpdate(xfc); - } - - gettimeofday(&TGES2,NULL); - printf("the whole command took %d sec %d usec\n",(int)(TGES2.tv_sec-TGES1.tv_sec),(int)(TGES2.tv_usec-TGES1.tv_usec)); return 1; } diff --git a/libfreerdp/codec/CMakeLists.txt b/libfreerdp/codec/CMakeLists.txt index ea20105ff..1289cd45e 100644 --- a/libfreerdp/codec/CMakeLists.txt +++ b/libfreerdp/codec/CMakeLists.txt @@ -92,17 +92,44 @@ if(WITH_OPENH264) add_definitions(-DWITH_OPENH264) include_directories(${OPENH264_INCLUDE_DIR}) set(FREERDP_OPENH264_LIBS ${OPENH264_LIBRARIES}) - + + if(CMAKE_SIZEOF_VOID_P EQUAL 8) + set(arch64 TRUE) + else() + set(arch64 FALSE) + endif() + if(WITH_OPENH264_ASM) set(OPENH264_ASM OPENH264_ASM_o) - set(OBJ ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${OPENH264_ASM}.dir/h264.asm.o) - set(SRC ${CMAKE_CURRENT_SOURCE_DIR}/h264.asm) - add_definitions(-DWITH_OPENH264_ASM) add_custom_target(${OPENH264_ASM}) - add_custom_command(TARGET ${OPENH264_ASM} - COMMAND nasm ARGS -f elf64 -o ${OBJ} ${SRC} - COMMENT "building H.264 asm objects ...") + + if(arch64) + set(SRC ${CMAKE_CURRENT_SOURCE_DIR}/h264_x64.asm) + set(OBJ ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${OPENH264_ASM}.dir/h264_x64.asm.o) + add_custom_command(TARGET ${OPENH264_ASM} + COMMAND nasm ARGS -f elf64 -o ${OBJ} ${SRC}) + else() + message(FATAL_ERROR "OpenH264 YUV data converting is not implemented in 32 bit assembly yet.") + endif() + + set(FREERDP_OPENH264_LIBS 
${OPENH264_LIBRARIES} ${OBJ}) + endif() + + if(WITH_OPENH264_SSSE3) + set(OPENH264_ASM OPENH264_ASM_o) + add_definitions(-DWITH_OPENH264_SSSE3) + add_custom_target(${OPENH264_ASM}) + + if(arch64) + set(SRC ${CMAKE_CURRENT_SOURCE_DIR}/h264_ssse3_x64.asm) + set(OBJ ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${OPENH264_ASM}.dir/h264_ssse3_x64.asm.o) + add_custom_command(TARGET ${OPENH264_ASM} + COMMAND nasm ARGS -f elf64 -o ${OBJ} ${SRC}) + else() + message(FATAL_ERROR "OpenH264 YUV data converting with SSSE3 is not implemented in 32 bit assembly yet.") + endif() + set(FREERDP_OPENH264_LIBS ${OPENH264_LIBRARIES} ${OBJ}) endif() endif() @@ -144,7 +171,7 @@ else() install(TARGETS ${MODULE_NAME} DESTINATION ${CMAKE_INSTALL_LIBDIR} EXPORT FreeRDPTargets) endif() -if(WITH_OPENH264_ASM) +if(WITH_OPENH264_ASM OR WITH_OPENH264_SSSE3) add_dependencies(${MODULE_NAME} ${OPENH264_ASM}) endif() diff --git a/libfreerdp/codec/h264.asm.alt b/libfreerdp/codec/h264.asm.alt deleted file mode 100644 index 98ae6f950..000000000 --- a/libfreerdp/codec/h264.asm.alt +++ /dev/null @@ -1,262 +0,0 @@ -;R=(256*Y+403*(V-128)+128)/265 =(256*Y+403*V-51456)/256 -;G=(256*Y-48*(U-128)-120*(V-128)+128)/256 =(256*Y-48*U-120*V+21632)/256 -;B=(256*Y+475*(U-128)+128)/256 =(256*Y+475*U-60672)/256 - -section .data - dbg1: db "DEBUG1",10 - dbg2: db "DEBUG2",10 - dbg3: db "DEBUG3",10 - dbg4: db "DEBUG4",10 - dbg equ $-dbg4 - -section .bss - temp1: resd 1 - temp2: resd 1 - temp3: resd 1 - temp4: resd 1 - -section .text - extern printf - - ;global YUV_to_RGB_asm -YUV_to_RGB_asm: - shl edi,8 - - mov eax,edx - imul eax,403 - mov [temp1],eax - add eax,edi - sub eax,51456 - - jae YUV_to_RGB_asm1 - mov eax,0 - jmp YUV_to_RGB_asm11 - -YUV_to_RGB_asm1: - cmp eax, 0xFFFF - jbe YUV_to_RGB_asm11 - mov eax,0xFF00 - -YUV_to_RGB_asm11: - and eax,0xFF00 - shl eax,8 - - mov ebx,esi - imul ebx,475 - mov [temp2],ebx - add ebx,edi - sub ebx,60672 - - jae YUV_to_RGB_asm2 - mov ebx, 0 - jmp YUV_to_RGB_asm21 - -YUV_to_RGB_asm2: - cmp 
ebx,0xFFFF - jbe YUV_to_RGB_asm21 - mov ebx,0xFF00 - -YUV_to_RGB_asm21: - and ebx,0xFF00 - shr ebx,8 - - imul edx,120 - mov [temp3],edx - sub edi,edx - imul esi,48 - mov [temp4],esi - sub edi,esi - add edi,21632 - - jae YUV_to_RGB_asm3 - mov edi, 0 - jmp YUV_to_RGB_asm31 - -YUV_to_RGB_asm3: - cmp edi,0xFFFF - jbe YUV_to_RGB_asm31 - mov edi, 0xFF00 - -YUV_to_RGB_asm31: - and edi,0xFF00 - - or eax,edi - or eax,ebx - - ret - - - -YUV_to_RGB_2asm: - shl edi,8 - - mov eax,[temp1] - add eax,edi - sub eax,51456 - - jae YUV_to_RGB_2asm1 - mov eax,0 - jmp YUV_to_RGB_2asm11 - -YUV_to_RGB_2asm1: - cmp eax, 0xFFFF - jbe YUV_to_RGB_2asm11 - mov eax,0xFF00 - -YUV_to_RGB_2asm11: - and eax,0xFF00 - shl eax,8 - - mov ebx,[temp2] - add ebx,edi - sub ebx,60672 - - jae YUV_to_RGB_2asm2 - mov ebx, 0 - jmp YUV_to_RGB_2asm21 - -YUV_to_RGB_2asm2: - cmp ebx,0xFFFF - jbe YUV_to_RGB_2asm21 - mov ebx,0xFF00 - -YUV_to_RGB_2asm21: - and ebx,0xFF00 - shr ebx,8 - - sub edi,[temp3] - sub edi,[temp4] - add edi,21632 - - jae YUV_to_RGB_2asm3 - mov edi, 0 - jmp YUV_to_RGB_2asm31 - -YUV_to_RGB_2asm3: - cmp edi,0xFFFF - jbe YUV_to_RGB_2asm31 - mov edi, 0xFF00 - -YUV_to_RGB_2asm31: - and edi,0xFF00 - - or eax,edi - or eax,ebx - - ret - - -;extern int freerdp_image_yuv_to_xrgb_asm(unsigned char *pDstData,unsigned char **pSrcData,int nWidth,int nHeight); - global freerdp_image_yuv_to_xrgb_asm -freerdp_image_yuv_to_xrgb_asm: - push rbp - mov rbp, rsp - ;cWidth: cx - sub rsp,56 ;pDstData,pSrcData[3],nWidth,nHeight,cHeight - push rbx - - - mov [rbp-8],rdi - - mov rax,[rsi] - mov [rbp-16],rax - mov rax,[rsi+8] - mov [rbp-24],rax - mov rax,[rsi+16] - mov [rbp-32],rax - - mov [rbp-40],rdx - - - shr rcx,1 ;/2 - mov [rbp-48],rcx - - - mov rax,[rbp-48] - mov [rbp-56],rax - -freerdp_image_yuv_to_xrgb_asm_loopH: - mov rcx,[rbp-40] - shr rcx,1 - - -freerdp_image_yuv_to_xrgb_asm_loopW: - mov rax,[rbp-16] - mov edi,[rax] - - mov rax,[rbp-24] - mov esi,[rax] - inc rax - mov [rbp-24],rax - - mov rax,[rbp-32] - mov 
edx,[rax] - inc rax - mov [rbp-32],rax - - call YUV_to_RGB_asm - - mov rbx,[rbp-8] - mov [rbx],eax - - - mov rax,[rbp-16] - mov rbx,[rbp-40] - mov edi,[rax+rbx] - inc rax - mov [rbp-16],rax - - call YUV_to_RGB_2asm - - mov rbx,[rbp-8] - mov rdx,[rbp-40] - mov [rbx+rdx],eax - add rbx,4 - mov [rbp-8],rbx - - - mov rax,[rbp-16] - mov edi,[rax] - - call YUV_to_RGB_2asm - - mov rbx,[rbp-8] - mov [rbx],eax - - - mov rax,[rbp-16] - mov rbx,[rbp-40] - mov edi,[rax+rbx] - inc rax - mov [rbp-16],rax - - call YUV_to_RGB_2asm - - mov rbx,[rbp-8] - mov rdx,[rbp-40] - mov [rbx+rdx],eax - add rbx,4 - mov [rbp-8],rbx - - dec cx - jne freerdp_image_yuv_to_xrgb_asm_loopW - - - mov rax,[rbp-8] - add rax,[rbp-40] - mov [rbp-8],rax - - mov rax,[rbp-16] - add rax,[rbp-40] - mov [rbp-16],rax - - dec qword [rbp-56] - jne freerdp_image_yuv_to_xrgb_asm_loopH - -;END - mov rax,0 -END: - pop rbx - mov rsp,rbp - pop rbp - ret \ No newline at end of file diff --git a/libfreerdp/codec/h264.c b/libfreerdp/codec/h264.c index abc8f9e0b..50d8cb330 100644 --- a/libfreerdp/codec/h264.c +++ b/libfreerdp/codec/h264.c @@ -30,9 +30,14 @@ #include +#ifdef WITH_OPENH264_SSSE3 +extern int check_ssse3(); +extern int freerdp_image_yuv420p_to_xrgb(BYTE *pDstData,BYTE **pSrcData,int nWidth,int nHeight,int iStride0,int iStride1); +#else #ifdef WITH_OPENH264_ASM extern int freerdp_image_yuv_to_xrgb_asm(BYTE *pDstData,BYTE **pSrcData,int nWidth,int nHeight,int iStride0,int iStride1); #endif +#endif #define USE_GRAY_SCALE 0 #define USE_UPCONVERT 0 @@ -381,7 +386,7 @@ static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSiz state = (*h264->pDecoder)->DecodeFrame2(h264->pDecoder, NULL, 0, pYUVData, &sBufferInfo); gettimeofday(&T2,NULL); - printf("\tdecoding took: %u sec %u usec\n",(unsigned int)(T2.tv_sec-T1.tv_sec),(unsigned int)(T2.tv_usec-T1.tv_usec)); + //printf("\tdecoding took: %u sec %u usec\n",(unsigned int)(T2.tv_sec-T1.tv_sec),(unsigned int)(T2.tv_usec-T1.tv_usec)); pSystemBuffer = 
&sBufferInfo.UsrData.sSystemBuffer; @@ -416,14 +421,15 @@ static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSiz if (h264_prepare_rgb_buffer(h264, pSystemBuffer->iWidth, pSystemBuffer->iHeight) < 0) return -1; +#ifdef WITH_OPENH264_SSSE3 + freerdp_image_yuv420p_to_xrgb(h264->data,pYUVData,h264->width,h264->height,pSystemBuffer->iStride[0],pSystemBuffer->iStride[1]); +#else #ifdef WITH_OPENH264_ASM - gettimeofday(&T1,NULL); freerdp_image_yuv_to_xrgb_asm(h264->data,pYUVData,h264->width,h264->height,pSystemBuffer->iStride[0],pSystemBuffer->iStride[1]); - gettimeofday(&T2,NULL); - printf("\tconverting took: %u sec %u usec\n",(unsigned int)(T2.tv_sec-T1.tv_sec),(unsigned int)(T2.tv_usec-T1.tv_usec)); #else freerdp_image_copy_yuv420p_to_xrgb(h264->data, h264->scanline, 0, 0, h264->width, h264->height, pYUVData, pSystemBuffer->iStride, 0, 0); +#endif #endif return 1; @@ -448,6 +454,13 @@ static BOOL openh264_init(H264_CONTEXT* h264) SDecodingParam sDecParam; long status; + +#ifdef WITH_OPENH264_SSSE3 + if(check_ssse3()){ + printf("SSSE3 seems to be not supported on this system, try without WITH_OPENH264_ASM ..."); + return FALSE; + } +#endif WelsCreateDecoder(&h264->pDecoder); diff --git a/libfreerdp/codec/h264_ssse3_x64.asm b/libfreerdp/codec/h264_ssse3_x64.asm new file mode 100644 index 000000000..f2198c9c6 --- /dev/null +++ b/libfreerdp/codec/h264_ssse3_x64.asm @@ -0,0 +1,447 @@ +section .text + global check_ssse3 + +check_ssse3: + push rbx + + pushf + pop rax + or rax,1<<21 + push rax + popf + pushf + pop rax + test rax,1<<21 + jz check_ssse3_end + + and rax,~(1<<21) + push rax + popf + + + mov eax,1 + mov ebx,0 + cpuid + test edx,1<<25 ;sse + jz check_ssse3_end + test edx,1<<26 ;sse2 + jz check_ssse3_end + test ecx,1<<0 ;sse3 + jz check_ssse3_end + test ecx,1<<9 ;ssse3 + jz check_ssse3_end + + + pop rbx + mov eax,0 + ret + + +check_ssse3_end: + pop rbx + mov eax,1 + ret + + +;extern int freerdp_image_yuv420p_to_xrgb(unsigned char 
*pDstData,unsigned char **pSrcData,int nWidth,int nHeight,int istride0,int istride1) + global freerdp_image_yuv420p_to_xrgb +freerdp_image_yuv420p_to_xrgb: + push rbx + push rbp + +;check wether stack is aligned to 16 byte boundary + mov rax,rsp + and rax,1111B + mov r15,22 + sub r15b,al + sub rsp,r15 + + mov rbp,rsp + + xor r10,r10 + xor r11,r11 + xor r12,r12 + xor r13,r13 + xor r14,r14 + + sub rsp,316 ;pDstData 8,Y 8,U 8,V 8,nWidth 2,nHeight 2,iStride0 2,iStride1 2,last_column 1,last_line 1,G 16,B 16,R 16,add:128 16 + ;sub:128 16,mul:48 16,mul:475 16,mul:403 16,mul:120 16,VaddY 2,VaddUV 2,res 12,cmp:255 16,cmp:0 16,shuflleR 16,andG 16,shuffleB 16,shuffleY 16,shuffleUV 16,scanline 2 + +;last_line: if the last (U,V doubled) line should be skipped, set to 1B +;last_column: if the last 4 columns should be skipped, set to 1B + + mov [rbp-8],rdi + + mov rax,[rsi] + mov [rbp-16],rax + mov rax,[rsi+8] + mov [rbp-24],rax + mov rax,[rsi+16] + mov [rbp-32],rax + + mov [rbp-34],dx + mov r13w,cx + + and r8,0FFFFH + mov [rbp-38],r8w + and r9,0FFFFH + mov [rbp-40],r9w + + + shl r8w,1 + sub r8w,dx + mov r11w,r8w + + mov r10w,dx + shr dx,1 + sub r9w,dx + mov r12w,r9w + + + mov r8w,[rbp-34] + shr r8w,2 + shl r10w,2 + + mov r9w,[rbp-38] + + ;and al,11B + ;jz no_column_rest + + ;inc word [rbp-34] + +;no_column_rest: + ;mov [rbp-41],al + + + + mov r14b,r13b + and r14b,1B + ;jz no_line_rest + + inc r13w + +;no_line_rest: + shr r13w,1 + + + +;init masks + mov eax,00000080H + mov [rbp-106],eax + mov [rbp-102],eax + mov [rbp-98],eax + mov [rbp-94],eax + + mov eax,00800080H + mov [rbp-122],eax + mov [rbp-118],eax + mov [rbp-114],eax + mov [rbp-110],eax + + mov eax,00300030H + mov [rbp-138],eax + mov [rbp-134],eax + mov [rbp-130],eax + mov [rbp-126],eax + + mov eax,01DB01DBH + mov [rbp-154],eax + mov [rbp-150],eax + mov [rbp-146],eax + mov [rbp-142],eax + + mov eax,01930193H + mov [rbp-170],eax + mov [rbp-166],eax + mov [rbp-162],eax + mov [rbp-158],eax + + mov eax,00780078H + mov 
[rbp-186],eax + mov [rbp-182],eax + mov [rbp-178],eax + mov [rbp-174],eax + + mov eax,000FF0000H + mov [rbp-218],eax + mov [rbp-214],eax + mov [rbp-210],eax + mov [rbp-206],eax + + mov eax,00000000H + mov [rbp-234],eax + mov [rbp-230],eax + mov [rbp-226],eax + mov [rbp-222],eax + +;shuffle masks + ;00 xx 00 00 00 xx 00 00 00 xx 00 00 00 xx 00 00 + ;00 rr gg bb 00 rr gg bb 00 rr gg bb 00 rr gg bb + mov eax,00FF0000H + mov [rbp-250],eax + mov [rbp-246],eax + mov [rbp-242],eax + mov [rbp-238],eax + + mov eax,80800280H + mov [rbp-266],eax + mov eax,80800680H + mov [rbp-262],eax + mov eax,80800A80H + mov [rbp-258],eax + mov eax,80800E80H + mov [rbp-254],eax + + mov eax,80808002H + mov [rbp-282],eax + mov eax,80808006H + mov [rbp-278],eax + mov eax,8080800AH + mov [rbp-274],eax + mov eax,8080800EH + mov [rbp-270],eax + + ;dd cc bb aa + ;00 00 dd 00 00 00 cc 00 00 00 bb 00 00 00 aa 00 + mov eax,80800080H + mov [rbp-298],eax + mov eax,80800180H + mov [rbp-294],eax + mov eax,80800280H + mov [rbp-290],eax + mov eax,80800380H + mov [rbp-286],eax + + ;dd cc bb aa + ;00 dd 00 dd 00 cc 00 cc 00 bb 00 bb 00 aa 00 aa + mov eax,80008000H + mov [rbp-314],eax + mov eax,80018001H + mov [rbp-310],eax + mov eax,80028002H + mov [rbp-306],eax + mov eax,80038003H + mov [rbp-302],eax + + + mov rsi,[rbp-16] + mov rax,[rbp-24] + mov rbx,[rbp-32] + + +freerdp_image_yuv420p_to_xrgb_hloop: + dec r13w + js freerdp_image_yuv420p_to_xrgb_hloop_end + jnz not_last_line + + shl r14b,1 +not_last_line: + + xor cx,cx +freerdp_image_yuv420p_to_xrgb_wloop: +;main loop +; C = Y; +; D = U - 128; +; E = V - 128; +; +; R = clip(( 256 * C + 403 * E + 128) >> 8); +; G = clip(( 256 * C - 48 * D - 120 * E + 128) >> 8); +; B = clip(( 256 * C + 475 * D + 128) >> 8); + + test cx,1B + jnz load_yuv_data + + + ;prepare U data + movd xmm0,[rax] + movdqa xmm5,[rbp-314] + pshufb xmm0,xmm5 + + add rax,4 + + movdqa xmm3,[rbp-122] + psubsw xmm0,xmm3 + + movdqa xmm2,xmm0 + + movdqa xmm4,xmm0 + movdqa xmm7,[rbp-138] + pmullw 
xmm0,xmm7 + pmulhw xmm4,xmm7 + + movdqa xmm7,xmm0 + punpcklwd xmm0,xmm4 ;what an awesome instruction! + punpckhwd xmm7,xmm4 + movdqa xmm4,xmm7 + + movdqa xmm6,[rbp-106] + psubd xmm0,xmm6 + psubd xmm4,xmm6 + + + movdqa xmm1,xmm2 + movdqa xmm7,[rbp-154] + pmullw xmm1,xmm7 + pmulhw xmm2,xmm7 + + movdqa xmm7,xmm1 + punpcklwd xmm1,xmm2 + punpckhwd xmm7,xmm2 + + paddd xmm1,xmm6 + paddd xmm7,xmm6 + + movdqa [rbp-74],xmm7 + + + ;prepare V data + movd xmm2,[rbx] + pshufb xmm2,xmm5 + + add rbx,4 + + psubsw xmm2,xmm3 + + movdqa xmm5,xmm2 + + movdqa xmm3,xmm2 + movdqa xmm7,[rbp-170] + pmullw xmm2,xmm7 + pmulhw xmm3,xmm7 + + movdqa xmm7,xmm2 + punpcklwd xmm2,xmm3 + punpckhwd xmm7,xmm3 + + paddd xmm2,xmm6 + paddd xmm7,xmm6 + + movdqa [rbp-90],xmm7 + + + movdqa xmm3,xmm5 + movdqa xmm7,[rbp-186] + pmullw xmm3,xmm7 + pmulhw xmm5,xmm7 + + movdqa xmm7,xmm3 + punpcklwd xmm3,xmm5 + punpckhwd xmm7,xmm5 + + paddd xmm0,xmm3 + paddd xmm4,xmm7 + + movdqa [rbp-58],xmm4 + + jmp valid_yuv_data + +load_yuv_data: + movdqa xmm1,[rbp-74] + movdqa xmm2,[rbp-90] + movdqa xmm0,[rbp-58] + +valid_yuv_data: + + + ;Y data processing + movd xmm4,[rsi] + pshufb xmm4,[rbp-298] + + movdqa xmm5,xmm4 + movdqa xmm6,xmm4 + + paddd xmm4,xmm2 + psubd xmm5,xmm0 + paddd xmm6,xmm1 + + pslld xmm4,8 + pslld xmm5,8 + pslld xmm6,8 + + movdqa xmm7,[rbp-234] + pmaxsw xmm4,xmm7 ;what an awesome instruction! + pmaxsw xmm5,xmm7 + pmaxsw xmm6,xmm7 + + movdqa xmm7,[rbp-218] + pminsw xmm4,xmm7 + pminsw xmm5,xmm7 + pminsw xmm6,xmm7 + + pand xmm4,[rbp-250] + pshufb xmm5,[rbp-266] + pshufb xmm6,[rbp-282] + + por xmm4,xmm5 + por xmm4,xmm6 + + movdqa [rdi],xmm4 + + + ;Y data processing in secound line + test r14b,2 + jnz skip_last_line1 + + movd xmm4,[rsi+r9] + pshufb xmm4,[rbp-298] + + + movdqa xmm5,xmm4 + movdqa xmm6,xmm4 + + paddd xmm4,xmm2 + psubd xmm5,xmm0 + paddd xmm6,xmm1 + + pslld xmm4,8 + pslld xmm5,8 + pslld xmm6,8 + + movdqa xmm7,[rbp-234] + pmaxsw xmm4,xmm7 ;what an awesome instruction! 
+ pmaxsw xmm5,xmm7 + pmaxsw xmm6,xmm7 + + movdqa xmm7,[rbp-218] + pminsw xmm4,xmm7 + pminsw xmm5,xmm7 + pminsw xmm6,xmm7 + + pand xmm4,[rbp-250] + pshufb xmm5,[rbp-266] + pshufb xmm6,[rbp-282] + + por xmm4,xmm5 + por xmm4,xmm6 + + movdqa [rdi+r10],xmm4 + +skip_last_line1: + add rdi,16 + add rsi,4 + + inc cx + cmp cx,r8w + jne freerdp_image_yuv420p_to_xrgb_wloop + +freerdp_image_yuv420p_to_xrgb_wloop_end: + add rdi,r10 + + add rsi,r11 + + add rax,r12 + add rbx,r12 + ;mov eax,r12d + ;jmp freerdp_image_yuv420p_to_xrgb_end + + jmp freerdp_image_yuv420p_to_xrgb_hloop + +freerdp_image_yuv420p_to_xrgb_hloop_end: + + mov eax,0 +freerdp_image_yuv420p_to_xrgb_end: + mov rsp,rbp + add rsp,r15 + pop rbp + pop rbx + ret \ No newline at end of file diff --git a/libfreerdp/codec/h264.asm b/libfreerdp/codec/h264_x64.asm similarity index 98% rename from libfreerdp/codec/h264.asm rename to libfreerdp/codec/h264_x64.asm index 1473849e0..f0bf1d640 100644 --- a/libfreerdp/codec/h264.asm +++ b/libfreerdp/codec/h264_x64.asm @@ -2,10 +2,6 @@ ;G=(256*Y-48*(U-128)-120*(V-128)+128)/256 =(256*Y-48*U-120*V+21632)/256 ;B=(256*Y+475*(U-128)+128)/256 =(256*Y+475*U-60672)/256 -section .data - debug: db "DEBUG",10 - dblen: equ $-debug - section .text ;global YUV_to_RGB_asm YUV_to_RGB_asm: diff --git a/libfreerdp/codec/test/Makefile.TestOpenH264ASM b/libfreerdp/codec/test/Makefile.TestOpenH264ASM new file mode 100644 index 000000000..8e747a647 --- /dev/null +++ b/libfreerdp/codec/test/Makefile.TestOpenH264ASM @@ -0,0 +1,20 @@ +TestOpenH264ASM: h264_ssse3.asm.o TestOpenH264ASM.c.o h264.c.o + gcc -o TestOpenH264ASM h264_ssse3.asm.o TestOpenH264ASM.c.o h264.c.o + +h264_ssse3.asm.o: ../h264_ssse3_x64.asm + nasm -f elf64 -o h264_ssse3.asm.o ../h264_ssse3_x64.asm + +h264.asm.o: ../h264.asm + nasm -f elf64 -o h264.asm.o ../h264.asm + +TestOpenH264ASM.c.o: TestOpenH264ASM.c + gcc -c -o TestOpenH264ASM.c.o TestOpenH264ASM.c + +h264.c.o: ../h264.c + gcc -c -O3 -o h264.c.o ../h264.c + +clean: + rm -f 
TestOpenH264ASM TestOpenH264ASM.c.o h264_ssse3.asm.o h264.c.o h264.asm.o + +old: h264.asm.o TestOpenH264ASM.c.o h264.c.o + gcc -o TestOpenH264ASM h264.asm.o TestOpenH264ASM.c.o h264.c.o diff --git a/libfreerdp/codec/test/TestOpenH264ASM.c b/libfreerdp/codec/test/TestOpenH264ASM.c index 27dd46b08..f1c463f0b 100644 --- a/libfreerdp/codec/test/TestOpenH264ASM.c +++ b/libfreerdp/codec/test/TestOpenH264ASM.c @@ -4,49 +4,70 @@ #include "TestOpenH264ASM.h" +#define WIDTH 1920 +#define HEIGHT 1080 + int main(void){ - int ret,i; + int i,j,k; + int ret; unsigned char *pDstData_c,*pDstData_asm,*pSrcData[3]; int nSrcStep[2]; + if(check_ssse3()){ + fprintf(stderr,"ssse3 not supported!\n"); + return EXIT_FAILURE; + } + struct timeval t1,t2,t3; - pSrcData[0]=malloc(1920*1080*sizeof(char)); - pSrcData[1]=malloc(1920*1080/4*sizeof(char)); - pSrcData[2]=malloc(1920*1080/4*sizeof(char)); - pDstData_asm=malloc(1920*1080*4*sizeof(char)); - pDstData_c=malloc(1920*1080*4*sizeof(char)); + pSrcData[0]=malloc(1984*HEIGHT*sizeof(char)); + pSrcData[1]=malloc(1984*HEIGHT/4*sizeof(char)); + pSrcData[2]=malloc(1984*HEIGHT/4*sizeof(char)); + pDstData_asm=malloc(WIDTH*HEIGHT*4*sizeof(char)); + pDstData_c=malloc(WIDTH*HEIGHT*4*sizeof(char)); - for(i=0;i<1920*1080;i++){ + for(i=0;i Date: Thu, 14 Aug 2014 18:46:10 +0200 Subject: [PATCH 07/31] drdynvc fix --- channels/drdynvc/client/dvcman.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/channels/drdynvc/client/dvcman.c b/channels/drdynvc/client/dvcman.c index dd51a95ca..b8834f103 100644 --- a/channels/drdynvc/client/dvcman.c +++ b/channels/drdynvc/client/dvcman.c @@ -429,8 +429,6 @@ int dvcman_close_channel(IWTSVirtualChannelManager* pChannelMgr, UINT32 ChannelI IWTSVirtualChannel* ichannel; DrdynvcClientContext* context; DVCMAN* dvcman = (DVCMAN*) pChannelMgr; - - printf("\t\tdvcman_close_channel\n"); channel = (DVCMAN_CHANNEL*) dvcman_find_channel_by_id(pChannelMgr, ChannelId); @@ -510,7 +508,7 @@ int 
dvcman_receive_channel_data(IWTSVirtualChannelManager* pChannelMgr, UINT32 C Stream_Write(channel->dvc_data, Stream_Pointer(data), dataSize); - if (((size_t) Stream_GetPosition(channel->dvc_data)) >= Stream_Length(channel->dvc_data)-1) + if (((size_t) Stream_GetPosition(channel->dvc_data)) >= Stream_Length(channel->dvc_data)) { Stream_SealLength(channel->dvc_data); Stream_SetPosition(channel->dvc_data, 0); From 497e130c21b0fa823ba070baaf04ea450e4de35d Mon Sep 17 00:00:00 2001 From: erbth Date: Mon, 18 Aug 2014 21:21:24 +0200 Subject: [PATCH 08/31] YUV data conversion using SSSE3/assembly with libavcodec implementation --- libfreerdp/codec/CMakeLists.txt | 86 +++++++++++++++++---------------- libfreerdp/codec/h264.c | 44 ++++++++++++----- 2 files changed, 76 insertions(+), 54 deletions(-) diff --git a/libfreerdp/codec/CMakeLists.txt b/libfreerdp/codec/CMakeLists.txt index 1289cd45e..39bcb033f 100644 --- a/libfreerdp/codec/CMakeLists.txt +++ b/libfreerdp/codec/CMakeLists.txt @@ -92,46 +92,6 @@ if(WITH_OPENH264) add_definitions(-DWITH_OPENH264) include_directories(${OPENH264_INCLUDE_DIR}) set(FREERDP_OPENH264_LIBS ${OPENH264_LIBRARIES}) - - if(CMAKE_SIZEOF_VOID_P EQUAL 8) - set(arch64 TRUE) - else() - set(arch64 FALSE) - endif() - - if(WITH_OPENH264_ASM) - set(OPENH264_ASM OPENH264_ASM_o) - add_definitions(-DWITH_OPENH264_ASM) - add_custom_target(${OPENH264_ASM}) - - if(arch64) - set(SRC ${CMAKE_CURRENT_SOURCE_DIR}/h264_x64.asm) - set(OBJ ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${OPENH264_ASM}.dir/h264_x64.asm.o) - add_custom_command(TARGET ${OPENH264_ASM} - COMMAND nasm ARGS -f elf64 -o ${OBJ} ${SRC}) - else() - message(FATAL_ERROR "OpenH264 YUV data converting is not implemented in 32 bit assembly yet.") - endif() - - set(FREERDP_OPENH264_LIBS ${OPENH264_LIBRARIES} ${OBJ}) - endif() - - if(WITH_OPENH264_SSSE3) - set(OPENH264_ASM OPENH264_ASM_o) - add_definitions(-DWITH_OPENH264_SSSE3) - add_custom_target(${OPENH264_ASM}) - - if(arch64) - set(SRC 
${CMAKE_CURRENT_SOURCE_DIR}/h264_ssse3_x64.asm) - set(OBJ ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${OPENH264_ASM}.dir/h264_ssse3_x64.asm.o) - add_custom_command(TARGET ${OPENH264_ASM} - COMMAND nasm ARGS -f elf64 -o ${OBJ} ${SRC}) - else() - message(FATAL_ERROR "OpenH264 YUV data converting with SSSE3 is not implemented in 32 bit assembly yet.") - endif() - - set(FREERDP_OPENH264_LIBS ${OPENH264_LIBRARIES} ${OBJ}) - endif() endif() if(WITH_LIBAVCODEC) @@ -141,6 +101,48 @@ if(WITH_LIBAVCODEC) set(FREERDP_LIBAVCODEC_LIBS ${LIBAVCODEC_LIB} ${LIBAVUTIL_LIB}) endif() +if(WITH_LIBAVCODEC OR WITH_OPENH264) + if(CMAKE_SIZEOF_VOID_P EQUAL 8) + set(arch64 TRUE) + else() + set(arch64 FALSE) + endif() + + if(WITH_H264_ASM) + set(H264_ASM H264_ASM_o) + add_definitions(-DWITH_H264_ASM) + add_custom_target(${H264_ASM}) + + if(arch64) + set(SRC ${CMAKE_CURRENT_SOURCE_DIR}/h264_x64.asm) + set(OBJ ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${H264_ASM}.dir/h264_x64.asm.o) + add_custom_command(TARGET ${H264_ASM} + COMMAND nasm ARGS -f elf64 -o ${OBJ} ${SRC}) + else() + message(FATAL_ERROR "H264 YUV data converting is not implemented in 32 bit assembly yet.") + endif() + + set(FREERDP_OPENH264_LIBS ${OPENH264_LIBRARIES} ${OBJ}) + endif() + + if(WITH_H264_SSSE3) + set(H264_ASM H264_ASM_o) + add_definitions(-DWITH_H264_SSSE3) + add_custom_target(${H264_ASM}) + + if(arch64) + set(SRC ${CMAKE_CURRENT_SOURCE_DIR}/h264_ssse3_x64.asm) + set(OBJ ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${H264_ASM}.dir/h264_ssse3_x64.asm.o) + add_custom_command(TARGET ${H264_ASM} + COMMAND nasm ARGS -f elf64 -o ${OBJ} ${SRC}) + else() + message(FATAL_ERROR "H264 YUV data converting with SSSE3 is not implemented in 32 bit assembly yet.") + endif() + + set(FREERDP_OPENH264_LIBS ${OPENH264_LIBRARIES} ${OBJ}) + endif() +endif() + add_complex_library(MODULE ${MODULE_NAME} TYPE "OBJECT" MONOLITHIC ${MONOLITHIC_BUILD} SOURCES ${${MODULE_PREFIX}_SRCS} @@ -171,8 +173,8 @@ else() install(TARGETS ${MODULE_NAME} DESTINATION 
${CMAKE_INSTALL_LIBDIR} EXPORT FreeRDPTargets) endif() -if(WITH_OPENH264_ASM OR WITH_OPENH264_SSSE3) - add_dependencies(${MODULE_NAME} ${OPENH264_ASM}) +if(WITH_H264_ASM OR WITH_H264_SSSE3) + add_dependencies(${MODULE_NAME} ${H264_ASM}) endif() set_property(TARGET ${MODULE_NAME} PROPERTY FOLDER "FreeRDP/libfreerdp") diff --git a/libfreerdp/codec/h264.c b/libfreerdp/codec/h264.c index 50d8cb330..5180ffa5b 100644 --- a/libfreerdp/codec/h264.c +++ b/libfreerdp/codec/h264.c @@ -30,11 +30,11 @@ #include -#ifdef WITH_OPENH264_SSSE3 +#ifdef WITH_H264_SSSE3 extern int check_ssse3(); extern int freerdp_image_yuv420p_to_xrgb(BYTE *pDstData,BYTE **pSrcData,int nWidth,int nHeight,int iStride0,int iStride1); #else -#ifdef WITH_OPENH264_ASM +#ifdef WITH_H264_ASM extern int freerdp_image_yuv_to_xrgb_asm(BYTE *pDstData,BYTE **pSrcData,int nWidth,int nHeight,int iStride0,int iStride1); #endif #endif @@ -386,7 +386,7 @@ static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSiz state = (*h264->pDecoder)->DecodeFrame2(h264->pDecoder, NULL, 0, pYUVData, &sBufferInfo); gettimeofday(&T2,NULL); - //printf("\tdecoding took: %u sec %u usec\n",(unsigned int)(T2.tv_sec-T1.tv_sec),(unsigned int)(T2.tv_usec-T1.tv_usec)); + printf("OpenH264: decoding took: %u sec %u usec\n",(unsigned int)(T2.tv_sec-T1.tv_sec),(unsigned int)(T2.tv_usec-T1.tv_usec)); pSystemBuffer = &sBufferInfo.UsrData.sSystemBuffer; @@ -421,16 +421,19 @@ static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSiz if (h264_prepare_rgb_buffer(h264, pSystemBuffer->iWidth, pSystemBuffer->iHeight) < 0) return -1; -#ifdef WITH_OPENH264_SSSE3 + gettimeofday(&T1,NULL); +#ifdef WITH_H264_SSSE3 freerdp_image_yuv420p_to_xrgb(h264->data,pYUVData,h264->width,h264->height,pSystemBuffer->iStride[0],pSystemBuffer->iStride[1]); #else -#ifdef WITH_OPENH264_ASM +#ifdef WITH_H264_ASM 
freerdp_image_yuv_to_xrgb_asm(h264->data,pYUVData,h264->width,h264->height,pSystemBuffer->iStride[0],pSystemBuffer->iStride[1]); #else freerdp_image_copy_yuv420p_to_xrgb(h264->data, h264->scanline, 0, 0, h264->width, h264->height, pYUVData, pSystemBuffer->iStride, 0, 0); #endif #endif + gettimeofday(&T2,NULL); + printf("\tconverting took %u sec %u usec\n",(unsigned int)(T2.tv_sec-T1.tv_sec),(unsigned int)(T2.tv_usec-T1.tv_usec)); return 1; } @@ -454,13 +457,6 @@ static BOOL openh264_init(H264_CONTEXT* h264) SDecodingParam sDecParam; long status; - -#ifdef WITH_OPENH264_SSSE3 - if(check_ssse3()){ - printf("SSSE3 seems to be not supported on this system, try without WITH_OPENH264_ASM ..."); - return FALSE; - } -#endif WelsCreateDecoder(&h264->pDecoder); @@ -537,13 +533,19 @@ static int libavcodec_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcS AVPacket packet; int gotFrame = 0; int status; + + struct timeval T1,T2; av_init_packet(&packet); packet.data = pSrcData; packet.size = SrcSize; + gettimeofday(&T1,NULL); status = avcodec_decode_video2(h264->codecContext, h264->videoFrame, &gotFrame, &packet); + gettimeofday(&T2,NULL); + + printf("libavcodec: decoding took: %u sec %u usec\n",(unsigned int)(T2.tv_sec-T1.tv_sec),(unsigned int)(T2.tv_usec-T1.tv_usec)); if (status < 0) { @@ -568,8 +570,19 @@ static int libavcodec_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcS if (h264_prepare_rgb_buffer(h264, h264->videoFrame->width, h264->videoFrame->height) < 0) return -1; + gettimeofday(&T1,NULL); +#ifdef WITH_H264_SSSE3 + freerdp_image_yuv420p_to_xrgb(h264->data,h264->videoFrame->data,h264->width,h264->height,h264->videoFrame->linesize[0],h264->videoFrame->linesize[1]); +#else +#ifdef WITH_H264_ASM + freerdp_image_yuv_to_xrgb_asm(h264->data,h264->videoFrame->data,h264->width,h264->height,h264->videoFrame->linesize[0],h264->videoFrame->linesize[1]); +#else freerdp_image_copy_yuv420p_to_xrgb(h264->data, h264->scanline, 0, 0, h264->width, h264->height, 
h264->videoFrame->data, h264->videoFrame->linesize, 0, 0); +#endif +#endif + gettimeofday(&T2,NULL); + printf("\tconverting took %u sec %u usec\n",(unsigned int)(T2.tv_sec-T1.tv_sec),(unsigned int)(T2.tv_usec-T1.tv_usec)); } return 1; @@ -723,6 +736,13 @@ H264_CONTEXT* h264_context_new(BOOL Compressor) h264 = (H264_CONTEXT*) calloc(1, sizeof(H264_CONTEXT)); +#ifdef WITH_H264_SSSE3 + if(check_ssse3()){ + printf("SSSE3 seems to be not supported on this system, try without WITH_H264_ASM ..."); + return FALSE; + } +#endif + if (h264) { h264->Compressor = Compressor; From 9eec9cb18aa141471450364ad877b47459a97d00 Mon Sep 17 00:00:00 2001 From: erbth Date: Thu, 21 Aug 2014 00:08:56 +0200 Subject: [PATCH 09/31] RDPEGFX H264 YUV data conversion in assembly/with SSSE3 in 32bit --- libfreerdp/codec/h264_ssse3_x32.asm | 449 ++++++++++++++++++ libfreerdp/codec/h264_ssse3_x64.asm | 30 +- libfreerdp/codec/h264_x32.asm | 240 ++++++++++ .../codec/test/Makefile.TestOpenH264ASM32 | 17 + .../codec/test/Makefile.TestOpenH264ASM64 | 17 + libfreerdp/codec/test/TestOpenH264ASM.c | 9 + libfreerdp/codec/test/TestOpenH264ASM.h | 7 +- 7 files changed, 754 insertions(+), 15 deletions(-) create mode 100644 libfreerdp/codec/h264_ssse3_x32.asm create mode 100644 libfreerdp/codec/h264_x32.asm create mode 100644 libfreerdp/codec/test/Makefile.TestOpenH264ASM32 create mode 100644 libfreerdp/codec/test/Makefile.TestOpenH264ASM64 diff --git a/libfreerdp/codec/h264_ssse3_x32.asm b/libfreerdp/codec/h264_ssse3_x32.asm new file mode 100644 index 000000000..66962b1ba --- /dev/null +++ b/libfreerdp/codec/h264_ssse3_x32.asm @@ -0,0 +1,449 @@ +section .text + global check_ssse3 + +check_ssse3: + push ebx + + pushf + pop eax + or eax,1<<21 + push eax + popf + pushf + pop eax + test eax,1<<21 + jz check_ssse3_end + + and eax,~(1<<21) + push eax + popf + + + mov eax,1 + mov ebx,0 + cpuid + test edx,1<<25 ;sse + jz check_ssse3_end + test edx,1<<26 ;sse2 + jz check_ssse3_end + test ecx,1<<0 ;sse3 + jz 
check_ssse3_end + test ecx,1<<9 ;ssse3 + jz check_ssse3_end + + + pop ebx + mov eax,0 + ret + + +check_ssse3_end: + pop ebx + mov eax,1 + ret + + +;extern int freerdp_image_yuv420p_to_xrgb(unsigned char *pDstData,unsigned char **pSrcData,int nWidth,int nHeight,int istride0,int istride1) + global freerdp_image_yuv420p_to_xrgb +freerdp_image_yuv420p_to_xrgb: + push ebx + push ebp + +;check wether stack is aligned to 16 byte boundary +; +; ---current stack value---|-----x-----|----42 byte---|---16 byte aligned stack--- +; lets say 508 2 506 464 +; 1FCH 2H 1FAH 1D0H +; 1F0H 1D0H +; |------1FCH&FH----|1FCH&^FH +; |1FCH&FH-AH |--AH-|---16 byte aligned stack------------ +; We've got only one problem: what if 1FCH&FH was smaller than AH? +; We could either add something to sp (impossible) or subtract 10H-(AH-1FCH&FH) [%10H] +; That's the same like (1FCH&FH-AH+10H)&FH and (1FCH+6H)&FH + mov eax,esp + add eax,6H + and eax,1111B + sub esp,eax + + mov ebp,esp + +;"local variables" + sub esp,318 ;res 8 -8,res 8 -16,res 8 -24,U 8 -32,nWidth 2 -34,nHeight 2 -36,iStride0 2 -38,iStride1 2 -40,last_column 1 -41,res 1 -42,G 16 -58,B 16 -74, + ;R 16 -90,add:128 16 -106,sub:128 16 -122,mul:48 16 -138,mul:475 16 -154,mul:403 16 -170,mul:120 16 -186,VaddY 4 -190,VaddUV 4 -194,stack offset 8 -202, + ;cmp:255 16 -218,cmp:0 16 -234,shuflleR 16 -250,andG 16 -266,shuffleB 16 -280,shuffleY 16 -296,shuffleUV 16 -314,scanline 4 -318 + + ;pDstData:edi, + + mov [ebp-202],eax + +;last_line: if the last (U,V doubled) line should be skipped, set to 1B + + mov edi,[ebp+eax+12] + + mov ecx,[ebp+eax+16] + mov esi,[ecx] + mov ebx,[ecx+4] + mov [ebp-32],ebx + mov ebx,[ecx+8] + + + mov edx,[ebp+eax+20] + mov [ebp-34],dx + + shr word [ebp-34],2 + + mov [ebp-318],edx + shl dword [ebp-318],2 + + + mov ecx,[ebp+eax+24] + + mov [ebp-41],cl + and byte [ebp-41],1B + + inc cx + shr cx,1 + mov [ebp-36],cx + + + mov ecx,[ebp+eax+28] + mov [ebp-38],cx + + shl cx,1 + sub cx,dx + mov [ebp-190],ecx + + + mov 
ecx,[ebp+eax+32] + mov [ebp-40],cx + + + shr dx,1 + sub cx,dx + mov [ebp-194],ecx + + + mov eax,[ebp-32] + + +;init masks + mov ecx,00000080H + mov [ebp-106],ecx + mov [ebp-102],ecx + mov [ebp-98],ecx + mov [ebp-94],ecx + + mov ecx,00800080H + mov [ebp-122],ecx + mov [ebp-118],ecx + mov [ebp-114],ecx + mov [ebp-110],ecx + + mov ecx,00300030H + mov [ebp-138],ecx + mov [ebp-134],ecx + mov [ebp-130],ecx + mov [ebp-126],ecx + + mov ecx,01DB01DBH + mov [ebp-154],ecx + mov [ebp-150],ecx + mov [ebp-146],ecx + mov [ebp-142],ecx + + mov ecx,01930193H + mov [ebp-170],ecx + mov [ebp-166],ecx + mov [ebp-162],ecx + mov [ebp-158],ecx + + mov ecx,00780078H + mov [ebp-186],ecx + mov [ebp-182],ecx + mov [ebp-178],ecx + mov [ebp-174],ecx + + mov ecx,000FF0000H + mov [ebp-218],ecx + mov [ebp-214],ecx + mov [ebp-210],ecx + mov [ebp-206],ecx + + mov ecx,00000000H + mov [ebp-234],ecx + mov [ebp-230],ecx + mov [ebp-226],ecx + mov [ebp-222],ecx + +;shuffle masks + ;00 xx 00 00 00 xx 00 00 00 xx 00 00 00 xx 00 00 + ;00 rr gg bb 00 rr gg bb 00 rr gg bb 00 rr gg bb + mov ecx,00FF0000H + mov [ebp-250],ecx + mov [ebp-246],ecx + mov [ebp-242],ecx + mov [ebp-238],ecx + + mov ecx,80800280H + mov [ebp-266],ecx + mov ecx,80800680H + mov [ebp-262],ecx + mov ecx,80800A80H + mov [ebp-258],ecx + mov ecx,80800E80H + mov [ebp-254],ecx + + mov ecx,80808002H + mov [ebp-282],ecx + mov ecx,80808006H + mov [ebp-278],ecx + mov ecx,8080800AH + mov [ebp-274],ecx + mov ecx,8080800EH + mov [ebp-270],ecx + + ;dd cc bb aa + ;00 00 dd 00 00 00 cc 00 00 00 bb 00 00 00 aa 00 + mov ecx,80800080H + mov [ebp-298],ecx + mov ecx,80800180H + mov [ebp-294],ecx + mov ecx,80800280H + mov [ebp-290],ecx + mov ecx,80800380H + mov [ebp-286],ecx + + ;dd cc bb aa + ;00 dd 00 dd 00 cc 00 cc 00 bb 00 bb 00 aa 00 aa + mov ecx,80008000H + mov [ebp-314],ecx + mov ecx,80018001H + mov [ebp-310],ecx + mov ecx,80028002H + mov [ebp-306],ecx + mov ecx,80038003H + mov [ebp-302],ecx + + + +freerdp_image_yuv420p_to_xrgb_hloop: + dec word [ebp-36] 
+ js freerdp_image_yuv420p_to_xrgb_hloop_end + jnz not_last_line + + shl byte [ebp-41],1 +not_last_line: + + mov cx,[ebp-34] +freerdp_image_yuv420p_to_xrgb_wloop: +;main loop +; C = Y; +; D = U - 128; +; E = V - 128; +; +; R = clip(( 256 * C + 403 * E + 128) >> 8); +; G = clip(( 256 * C - 48 * D - 120 * E + 128) >> 8); +; B = clip(( 256 * C + 475 * D + 128) >> 8); + + test cx,1B + jnz load_yuv_data + + + ;prepare U data + movd xmm0,[eax] + movdqa xmm5,[ebp-314] + pshufb xmm0,xmm5 ;but this is the omest instruction of all!! + + add eax,4 + + movdqa xmm3,[ebp-122] + psubsw xmm0,xmm3 + + movdqa xmm2,xmm0 + + movdqa xmm4,xmm0 + movdqa xmm7,[ebp-138] + pmullw xmm0,xmm7 + pmulhw xmm4,xmm7 + + movdqa xmm7,xmm0 + punpcklwd xmm0,xmm4 ;what an awesome instruction! + punpckhwd xmm7,xmm4 + movdqa xmm4,xmm7 + + movdqa xmm6,[ebp-106] + psubd xmm0,xmm6 + psubd xmm4,xmm6 + + + movdqa xmm1,xmm2 + movdqa xmm7,[ebp-154] + pmullw xmm1,xmm7 + pmulhw xmm2,xmm7 + + movdqa xmm7,xmm1 + punpcklwd xmm1,xmm2 + punpckhwd xmm7,xmm2 + + paddd xmm1,xmm6 + paddd xmm7,xmm6 + + movdqa [ebp-74],xmm7 + + + ;prepare V data + movd xmm2,[ebx] + pshufb xmm2,xmm5 + + add ebx,4 + + psubsw xmm2,xmm3 + + movdqa xmm5,xmm2 + + movdqa xmm3,xmm2 + movdqa xmm7,[ebp-170] + pmullw xmm2,xmm7 + pmulhw xmm3,xmm7 + + movdqa xmm7,xmm2 + punpcklwd xmm2,xmm3 + punpckhwd xmm7,xmm3 + + paddd xmm2,xmm6 + paddd xmm7,xmm6 + + movdqa [ebp-90],xmm7 + + + movdqa xmm3,xmm5 + movdqa xmm7,[ebp-186] + pmullw xmm3,xmm7 + pmulhw xmm5,xmm7 + + movdqa xmm7,xmm3 + punpcklwd xmm3,xmm5 + punpckhwd xmm7,xmm5 + + paddd xmm0,xmm3 + paddd xmm4,xmm7 + + movdqa [ebp-58],xmm4 + + jmp valid_yuv_data + +load_yuv_data: + movdqa xmm1,[ebp-74] + movdqa xmm2,[ebp-90] + movdqa xmm0,[ebp-58] + +valid_yuv_data: + + + ;Y data processing + movd xmm4,[esi] + pshufb xmm4,[ebp-298] + + movdqa xmm5,xmm4 + movdqa xmm6,xmm4 + + paddd xmm4,xmm2 + psubd xmm5,xmm0 + paddd xmm6,xmm1 + + pslld xmm4,8 + pslld xmm5,8 + pslld xmm6,8 + + movdqa xmm7,[ebp-234] + pmaxsw 
xmm4,xmm7 ;what an awesome instruction! + pmaxsw xmm5,xmm7 + pmaxsw xmm6,xmm7 + + movdqa xmm7,[ebp-218] + pminsw xmm4,xmm7 + pminsw xmm5,xmm7 + pminsw xmm6,xmm7 + + pand xmm4,[ebp-250] + pshufb xmm5,[ebp-266] + pshufb xmm6,[ebp-282] + + por xmm4,xmm5 + por xmm4,xmm6 + + movdqu [edi],xmm4 + + + ;Y data processing in secound line + test byte [ebp-41],2 + jnz skip_last_line1 + + mov dx,[ebp-38] + and edx,0FFFFH + movd xmm4,[esi+edx] + pshufb xmm4,[ebp-298] + + + movdqa xmm5,xmm4 + movdqa xmm6,xmm4 + + paddd xmm4,xmm2 + psubd xmm5,xmm0 + paddd xmm6,xmm1 + + pslld xmm4,8 + pslld xmm5,8 + pslld xmm6,8 + + movdqa xmm7,[ebp-234] + pmaxsw xmm4,xmm7 ;what an awesome instruction! + pmaxsw xmm5,xmm7 + pmaxsw xmm6,xmm7 + + movdqa xmm7,[ebp-218] + pminsw xmm4,xmm7 + pminsw xmm5,xmm7 + pminsw xmm6,xmm7 + + pand xmm4,[ebp-250] + pshufb xmm5,[ebp-266] + pshufb xmm6,[ebp-282] + + por xmm4,xmm5 + por xmm4,xmm6 + + mov edx,[ebp-318] + movdqu [edi+edx],xmm4 + +skip_last_line1: + add edi,16 + add esi,4 + + dec cx + jne freerdp_image_yuv420p_to_xrgb_wloop + +freerdp_image_yuv420p_to_xrgb_wloop_end: + mov edx,[ebp-318] + add edi,edx + + mov edx,[ebp-190] + add esi,edx + + mov edx,[ebp-194] + add eax,edx + add ebx,edx + + jmp freerdp_image_yuv420p_to_xrgb_hloop + +freerdp_image_yuv420p_to_xrgb_hloop_end: + + mov eax,0 +freerdp_image_yuv420p_to_xrgb_end: + mov edx,[ebp-202] + + mov esp,ebp + add esp,edx + pop ebp + pop ebx + ret diff --git a/libfreerdp/codec/h264_ssse3_x64.asm b/libfreerdp/codec/h264_ssse3_x64.asm index f2198c9c6..8b1fda229 100644 --- a/libfreerdp/codec/h264_ssse3_x64.asm +++ b/libfreerdp/codec/h264_ssse3_x64.asm @@ -50,10 +50,19 @@ freerdp_image_yuv420p_to_xrgb: push rbp ;check wether stack is aligned to 16 byte boundary - mov rax,rsp - and rax,1111B - mov r15,22 - sub r15b,al +; +; ---current stack value---|-----x-----|----42 byte---|---16 byte aligned stack--- +; lets say 508 2 506 464 +; 1FCH 2H 1FAH 1D0H +; 1F0H 1D0H +; |------1FCH&FH----|1FCH&^FH +; |1FCH&FH-AH 
|--AH-|---16 byte aligned stack------------ +; We've got only one problem: what if 1FCH&FH was smaller than AH? +; We could either add something to sp (impossible) or subtract 10H-(AH-1FCH&FH) [%10H] +; That's the same like (1FCH&FH-AH+10H)&FH and (1FCH+6H)&FH + mov r15,rsp + add r15,6H + and r15,1111B sub rsp,r15 mov rbp,rsp @@ -64,11 +73,12 @@ freerdp_image_yuv420p_to_xrgb: xor r13,r13 xor r14,r14 - sub rsp,316 ;pDstData 8,Y 8,U 8,V 8,nWidth 2,nHeight 2,iStride0 2,iStride1 2,last_column 1,last_line 1,G 16,B 16,R 16,add:128 16 - ;sub:128 16,mul:48 16,mul:475 16,mul:403 16,mul:120 16,VaddY 2,VaddUV 2,res 12,cmp:255 16,cmp:0 16,shuflleR 16,andG 16,shuffleB 16,shuffleY 16,shuffleUV 16,scanline 2 +;"local variables" + sub rsp,316 ;pDstData 8 -8,Y 8 -16,U 8 -24,V 8 -32,nWidth 2 -34,nHeight 2 -36,iStride0 2 -38,iStride1 2 -40,last_column 1 -41,res 1 -42,G 16 -58,B 16 -74, + ;R 16 -90,add:128 16 -106,sub:128 16 -122,mul:48 16 -138,mul:475 16 -154,mul:403 16 -170,mul:120 16 -186,VaddY 2 -188,VaddUV 2 -190,res 12 -202,cmp:255 16 -218, + ;cmp:0 16 -234,shuflleR 16 -250,andG 16 -266,shuffleB 16 -280,shuffleY 16 -296,shuffleUV 16 -314,scanline 2 -316 ;last_line: if the last (U,V doubled) line should be skipped, set to 1B -;last_column: if the last 4 columns should be skipped, set to 1B mov [rbp-8],rdi @@ -255,7 +265,7 @@ freerdp_image_yuv420p_to_xrgb_wloop: ;prepare U data movd xmm0,[rax] movdqa xmm5,[rbp-314] - pshufb xmm0,xmm5 + pshufb xmm0,xmm5 ;but this is the omest instruction of all!! 
add rax,4 @@ -375,7 +385,7 @@ valid_yuv_data: por xmm4,xmm5 por xmm4,xmm6 - movdqa [rdi],xmm4 + movdqu [rdi],xmm4 ;Y data processing in secound line @@ -414,7 +424,7 @@ valid_yuv_data: por xmm4,xmm5 por xmm4,xmm6 - movdqa [rdi+r10],xmm4 + movdqu [rdi+r10],xmm4 skip_last_line1: add rdi,16 diff --git a/libfreerdp/codec/h264_x32.asm b/libfreerdp/codec/h264_x32.asm new file mode 100644 index 000000000..09011d9e5 --- /dev/null +++ b/libfreerdp/codec/h264_x32.asm @@ -0,0 +1,240 @@ +;R=(256*Y+403*(V-128)+128)/265 =(256*Y+403*V-51456)/256 +;G=(256*Y-48*(U-128)-120*(V-128)+128)/256 =(256*Y-48*U-120*V+21632)/256 +;B=(256*Y+475*(U-128)+128)/256 =(256*Y+475*U-60672)/256 + +section .text + ;global YUV_to_RGB_asm +YUV_to_RGB_asm: + shl edi,8 + + mov eax,edx + imul eax,403 + add eax,edi + sub eax,51456 + + jae YUV_to_RGB_asm1 + mov eax,0 + jmp YUV_to_RGB_asm11 + +YUV_to_RGB_asm1: + cmp eax, 0xFFFF + jbe YUV_to_RGB_asm11 + mov eax,0xFF00 + +YUV_to_RGB_asm11: + and eax,0xFF00 + shl eax,8 + + mov ebx,esi + imul ebx,475 + add ebx,edi + sub ebx,60672 + + jae YUV_to_RGB_asm2 + mov ebx, 0 + jmp YUV_to_RGB_asm21 + +YUV_to_RGB_asm2: + cmp ebx,0xFFFF + jbe YUV_to_RGB_asm21 + mov ebx,0xFF00 + +YUV_to_RGB_asm21: + and ebx,0xFF00 + shr ebx,8 + + imul edx,120 + sub edi,edx + imul esi,48 + sub edi,esi + add edi,21632 + + bt edi,31 + jae YUV_to_RGB_asm3 + mov edi, 0 + jmp YUV_to_RGB_asm31 + +YUV_to_RGB_asm3: + cmp edi,0xFFFF + jbe YUV_to_RGB_asm31 + mov edi, 0xFF00 + +YUV_to_RGB_asm31: + and edi,0xFF00 + + or eax,edi + or eax,ebx + + ret + +;extern int freerdp_image_yuv_to_xrgb_asm(unsigned char *pDstData,unsigned char **pSrcData,int nWidth,int nHeight); + global freerdp_image_yuv_to_xrgb_asm +freerdp_image_yuv_to_xrgb_asm: + push ebp + mov ebp, esp + ;cWidth: cx + sub esp,36 ;pDstData,pSrcData[3],nWidth,nHeight,cHeight,scanline,iStride[0] addition + push ebx + + + mov edi,[ebp+8] + mov [ebp-4],edi + + mov esi,[ebp+12] + mov eax,[esi] + mov [ebp-8],eax + mov eax,[esi+4] + mov [ebp-12],eax + mov 
eax,[esi+8] + mov [ebp-16],eax + + mov edx,[ebp+16] + mov [ebp-20],edx + + + mov ecx,[ebp+20] + shr ecx,1 ;/2 + mov [ebp-24],ecx + + + shl edx,2 + mov [ebp-32],edx + + + mov eax,[ebp-24] + mov [ebp-28],eax + + + mov ebx,[ebp+24] + mov [ebp-36],ebx + mov eax,[ebp-20] + shl dword [ebp-36],1 + sub [ebp-36],eax + + shr eax,1 + sub [ebp+28],eax + +freerdp_image_yuv_to_xrgb_asm_loopH: + mov ecx,[ebp-20] + shr ecx,1 + + +freerdp_image_yuv_to_xrgb_asm_loopW: + mov eax,[ebp-8] + mov edi,[eax] + and edi,0xFF + + mov eax,[ebp-12] + mov esi,[eax] + and esi,0xFF + + mov eax,[ebp-16] + mov edx,[eax] + and edx,0xFF + + call YUV_to_RGB_asm + + mov ebx,[ebp-4] + mov [ebx],eax + + + mov eax,[ebp-8] + mov ebx,[ebp+24] + mov edi,[eax+ebx] + inc eax + mov [ebp-8],eax + and edi,0xFF + + mov eax,[ebp-12] + mov esi,[eax] + and esi,0xFF + + mov eax,[ebp-16] + mov edx,[eax] + and edx,0xFF + + call YUV_to_RGB_asm + + mov ebx,[ebp-4] + mov edx,[ebp-32] + mov [ebx+edx],eax + add ebx,4 + mov [ebp-4],ebx + + + mov eax,[ebp-8] + mov edi,[eax] + and edi,0xFF + + mov eax,[ebp-12] + mov esi,[eax] + and esi,0xFF + + mov eax,[ebp-16] + mov edx,[eax] + and edx,0xFF + + call YUV_to_RGB_asm + + mov ebx,[ebp-4] + mov [ebx],eax + + + mov eax,[ebp-8] + mov ebx,[ebp+24] + mov edi,[eax+ebx] + inc eax + mov [ebp-8],eax + and edi,0xFF + + mov eax,[ebp-12] + mov esi,[eax] + inc eax + mov [ebp-12],eax + and esi,0xFF + + mov eax,[ebp-16] + mov edx,[eax] + inc eax + mov [ebp-16],eax + and edx,0xFF + + call YUV_to_RGB_asm + + mov ebx,[ebp-4] + mov edx,[ebp-32] + mov [ebx+edx],eax + add ebx,4 + mov [ebp-4],ebx + + dec cx + jne freerdp_image_yuv_to_xrgb_asm_loopW + + + mov eax,[ebp-4] + add eax,[ebp-32] + mov [ebp-4],eax + + mov eax,[ebp-8] + add eax,[ebp-36] + mov [ebp-8],eax + + mov ebx,[ebp+28] + mov eax,[ebp-12] + add eax,ebx + mov [ebp-12],eax + + mov eax,[ebp-16] + add eax,ebx + mov [ebp-16],eax + + dec dword [ebp-28] + jne freerdp_image_yuv_to_xrgb_asm_loopH + +;END + mov eax,0 +END: + pop ebx + mov esp,ebp + 
pop ebp + ret diff --git a/libfreerdp/codec/test/Makefile.TestOpenH264ASM32 b/libfreerdp/codec/test/Makefile.TestOpenH264ASM32 new file mode 100644 index 000000000..ab52a3b7d --- /dev/null +++ b/libfreerdp/codec/test/Makefile.TestOpenH264ASM32 @@ -0,0 +1,17 @@ +TestOpenH264ASM: TestOpenH264ASM.c.o h264.c.o h264_ssse3.asm.o #h264.asm.o + gcc -o TestOpenH264ASM TestOpenH264ASM.c.o h264.c.o h264_ssse3.asm.o #h264.asm.o + +h264_ssse3.asm.o: ../h264_ssse3_x32.asm + nasm -f elf64 -o h264_ssse3.asm.o ../h264_ssse3_x32.asm + +h264.asm.o: ../h264_x32.asm + nasm -f elf64 -o h264.asm.o ../h264_x32.asm + +TestOpenH264ASM.c.o: TestOpenH264ASM.c + gcc -c -o TestOpenH264ASM.c.o TestOpenH264ASM.c + +h264.c.o: ../h264.c + gcc -c -o h264.c.o ../h264.c + +clean: + rm -f TestOpenH264ASM TestOpenH264ASM.c.o h264_ssse3.asm.o h264.c.o h264.asm.o diff --git a/libfreerdp/codec/test/Makefile.TestOpenH264ASM64 b/libfreerdp/codec/test/Makefile.TestOpenH264ASM64 new file mode 100644 index 000000000..ace4451ae --- /dev/null +++ b/libfreerdp/codec/test/Makefile.TestOpenH264ASM64 @@ -0,0 +1,17 @@ +TestOpenH264ASM: TestOpenH264ASM.c.o h264.c.o h264_ssse3.asm.o h264.asm.o + gcc -o TestOpenH264ASM TestOpenH264ASM.c.o h264.c.o h264_ssse3.asm.o h264.asm.o + +h264_ssse3.asm.o: ../h264_ssse3_x64.asm + nasm -f elf64 -o h264_ssse3.asm.o ../h264_ssse3_x64.asm + +h264.asm.o: ../h264_x64.asm + nasm -f elf64 -o h264.asm.o ../h264_x64.asm + +TestOpenH264ASM.c.o: TestOpenH264ASM.c + gcc -c -o TestOpenH264ASM.c.o TestOpenH264ASM.c + +h264.c.o: ../h264.c + gcc -c -o h264.c.o ../h264.c + +clean: + rm -f TestOpenH264ASM TestOpenH264ASM.c.o h264_ssse3.asm.o h264.c.o h264.asm.o \ No newline at end of file diff --git a/libfreerdp/codec/test/TestOpenH264ASM.c b/libfreerdp/codec/test/TestOpenH264ASM.c index f1c463f0b..dc0f2e6d5 100644 --- a/libfreerdp/codec/test/TestOpenH264ASM.c +++ b/libfreerdp/codec/test/TestOpenH264ASM.c @@ -7,16 +7,21 @@ #define WIDTH 1920 #define HEIGHT 1080 +#define SSSE3 1 + + int main(void){ 
int i,j,k; int ret; unsigned char *pDstData_c,*pDstData_asm,*pSrcData[3]; int nSrcStep[2]; +#if SSSE3 if(check_ssse3()){ fprintf(stderr,"ssse3 not supported!\n"); return EXIT_FAILURE; } +#endif struct timeval t1,t2,t3; @@ -36,7 +41,11 @@ int main(void){ nSrcStep[1]=992; gettimeofday(&t1,NULL); +#if SSSE3 ret=freerdp_image_yuv420p_to_xrgb(pDstData_asm,pSrcData,WIDTH,HEIGHT,nSrcStep[0],nSrcStep[1]); +#else + ret=freerdp_image_yuv_to_xrgb_asm(pDstData_asm,pSrcData,WIDTH,HEIGHT,nSrcStep[0],nSrcStep[1]); +#endif gettimeofday(&t2,NULL); freerdp_image_copy_yuv420p_to_xrgb(pDstData_c,WIDTH*4,0,0,WIDTH,HEIGHT,pSrcData,nSrcStep,0,0); gettimeofday(&t3,NULL); diff --git a/libfreerdp/codec/test/TestOpenH264ASM.h b/libfreerdp/codec/test/TestOpenH264ASM.h index c5f537cee..f13ff0db3 100644 --- a/libfreerdp/codec/test/TestOpenH264ASM.h +++ b/libfreerdp/codec/test/TestOpenH264ASM.h @@ -1,10 +1,7 @@ -extern int YUV_to_RGB_asm(unsigned char Y,unsigned char U,unsigned char V); -extern int YUV_to_RGB_2asm(unsigned char Y); -extern int YUV_to_RGB(unsigned char Y,unsigned char U,unsigned char V); - -extern int freerdp_image_yuv_to_xrgb_asm(unsigned char *pDstData,unsigned char **pSrcData,int nWidth,int nHeight,int istride0,int istride1); int freerdp_image_copy_yuv420p_to_xrgb(unsigned char* pDstData, int nDstStep, int nXDst, int nYDst, int nWidth, int nHeight, unsigned char* pSrcData[3], int nSrcStep[2], int nXSrc, int nYSrc); +extern int freerdp_image_yuv_to_xrgb_asm(unsigned char *pDstData,unsigned char **pSrcData,int nWidth,int nHeight,int istride0,int istride1); + extern int check_ssse3(); extern int freerdp_image_yuv420p_to_xrgb(unsigned char *pDstData,unsigned char **pSrcData,int nWidth,int nHeight,int istride0,int istride1); \ No newline at end of file From dee50a8ca248ae6273f9f5097c4855456b2d73dc Mon Sep 17 00:00:00 2001 From: erbth Date: Thu, 21 Aug 2014 00:58:08 +0200 Subject: [PATCH 10/31] H264 data alignement and 32 bit comilation ... 
--- libfreerdp/codec/CMakeLists.txt | 12 ++++++++--- libfreerdp/codec/h264.c | 8 ++------ libfreerdp/codec/h264_ssse3_x32.asm | 9 +++++++-- libfreerdp/codec/h264_ssse3_x64.asm | 9 +++++++-- .../codec/test/Makefile.TestOpenH264ASM | 20 ------------------- .../codec/test/Makefile.TestOpenH264ASM32 | 8 ++++---- .../codec/test/Makefile.TestOpenH264ASM64 | 2 +- libfreerdp/codec/test/TestOpenH264ASM.c | 6 ++++-- 8 files changed, 34 insertions(+), 40 deletions(-) delete mode 100644 libfreerdp/codec/test/Makefile.TestOpenH264ASM diff --git a/libfreerdp/codec/CMakeLists.txt b/libfreerdp/codec/CMakeLists.txt index 39bcb033f..bd714b760 100644 --- a/libfreerdp/codec/CMakeLists.txt +++ b/libfreerdp/codec/CMakeLists.txt @@ -117,9 +117,12 @@ if(WITH_LIBAVCODEC OR WITH_OPENH264) set(SRC ${CMAKE_CURRENT_SOURCE_DIR}/h264_x64.asm) set(OBJ ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${H264_ASM}.dir/h264_x64.asm.o) add_custom_command(TARGET ${H264_ASM} - COMMAND nasm ARGS -f elf64 -o ${OBJ} ${SRC}) + COMMAND nasm ARGS -f elf64 -o ${OBJ} ${SRC}) else() - message(FATAL_ERROR "H264 YUV data converting is not implemented in 32 bit assembly yet.") + set(SRC ${CMAKE_CURRENT_SOURCE_DIR}/h264_x32.asm) + set(OBJ ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${H264_ASM}.dir/h264_x32.asm.o) + add_custom_command(TARGET ${H264_ASM} + COMMAND nasm ARGS -f elf32 -o ${OBJ} ${SRC}) endif() set(FREERDP_OPENH264_LIBS ${OPENH264_LIBRARIES} ${OBJ}) @@ -136,7 +139,10 @@ if(WITH_LIBAVCODEC OR WITH_OPENH264) add_custom_command(TARGET ${H264_ASM} COMMAND nasm ARGS -f elf64 -o ${OBJ} ${SRC}) else() - message(FATAL_ERROR "H264 YUV data converting with SSSE3 is not implemented in 32 bit assembly yet.") + set(SRC ${CMAKE_CURRENT_SOURCE_DIR}/h264_ssse3_x32.asm) + set(OBJ ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${H264_ASM}.dir/h264_ssse3_x32.asm.o) + add_custom_command(TARGET ${H264_ASM} + COMMAND nasm ARGS -f elf32 -o ${OBJ} ${SRC}) endif() set(FREERDP_OPENH264_LIBS ${OPENH264_LIBRARIES} ${OBJ}) diff --git 
a/libfreerdp/codec/h264.c b/libfreerdp/codec/h264.c index 5180ffa5b..ef66cf8bc 100644 --- a/libfreerdp/codec/h264.c +++ b/libfreerdp/codec/h264.c @@ -216,8 +216,7 @@ int h264_prepare_rgb_buffer(H264_CONTEXT* h264, int width, int height) if (size > h264->size) { h264->size = size; - h264->data = (BYTE*) realloc(h264->data, h264->size); - memset(h264->data, 0, h264->size); + h264->data = (BYTE*) _aligned_realloc(h264->data, h264->size,16); } if (!h264->data) @@ -747,9 +746,6 @@ H264_CONTEXT* h264_context_new(BOOL Compressor) { h264->Compressor = Compressor; - if (h264_prepare_rgb_buffer(h264, 256, 256) < 0) - return NULL; - #ifdef WITH_OPENH264 if (!openh264_init(h264)) { @@ -776,7 +772,7 @@ void h264_context_free(H264_CONTEXT* h264) { if (h264) { - free(h264->data); + _aligne_free(h264->data); #ifdef WITH_OPENH264 openh264_free(h264); diff --git a/libfreerdp/codec/h264_ssse3_x32.asm b/libfreerdp/codec/h264_ssse3_x32.asm index 66962b1ba..b1a57e545 100644 --- a/libfreerdp/codec/h264_ssse3_x32.asm +++ b/libfreerdp/codec/h264_ssse3_x32.asm @@ -1,3 +1,8 @@ +; a entire function for converting YUV420p data to the RGB format (without any special upconverting) +; It's completely written in nasm-x86-assembly for intel processors supporting SSSE3 and higher. +; Restrictions are that output data has to be aligned to 16 byte (a question of REAL performance!) +; and the width of resolution must be divisable by four. 
+; section .text global check_ssse3 @@ -372,7 +377,7 @@ valid_yuv_data: por xmm4,xmm5 por xmm4,xmm6 - movdqu [edi],xmm4 + movdqa [edi],xmm4 ;Y data processing in secound line @@ -414,7 +419,7 @@ valid_yuv_data: por xmm4,xmm6 mov edx,[ebp-318] - movdqu [edi+edx],xmm4 + movdqa [edi+edx],xmm4 skip_last_line1: add edi,16 diff --git a/libfreerdp/codec/h264_ssse3_x64.asm b/libfreerdp/codec/h264_ssse3_x64.asm index 8b1fda229..51428b46f 100644 --- a/libfreerdp/codec/h264_ssse3_x64.asm +++ b/libfreerdp/codec/h264_ssse3_x64.asm @@ -1,3 +1,8 @@ +; a entire function for converting YUV420p data to the RGB format (without any special upconverting) +; It's completely written in nasm-x86-assembly for intel processors supporting SSSE3 and higher. +; Restrictions are that output data has to be aligned to 16 byte (a question of REAL performance!) +; and the width of resolution must be divisable by four. +; section .text global check_ssse3 @@ -385,7 +390,7 @@ valid_yuv_data: por xmm4,xmm5 por xmm4,xmm6 - movdqu [rdi],xmm4 + movdqa [rdi],xmm4 ;Y data processing in secound line @@ -424,7 +429,7 @@ valid_yuv_data: por xmm4,xmm5 por xmm4,xmm6 - movdqu [rdi+r10],xmm4 + movdqa [rdi+r10],xmm4 skip_last_line1: add rdi,16 diff --git a/libfreerdp/codec/test/Makefile.TestOpenH264ASM b/libfreerdp/codec/test/Makefile.TestOpenH264ASM deleted file mode 100644 index 8e747a647..000000000 --- a/libfreerdp/codec/test/Makefile.TestOpenH264ASM +++ /dev/null @@ -1,20 +0,0 @@ -TestOpenH264ASM: h264_ssse3.asm.o TestOpenH264ASM.c.o h264.c.o - gcc -o TestOpenH264ASM h264_ssse3.asm.o TestOpenH264ASM.c.o h264.c.o - -h264_ssse3.asm.o: ../h264_ssse3_x64.asm - nasm -f elf64 -o h264_ssse3.asm.o ../h264_ssse3_x64.asm - -h264.asm.o: ../h264.asm - nasm -f elf64 -o h264.asm.o ../h264.asm - -TestOpenH264ASM.c.o: TestOpenH264ASM.c - gcc -c -o TestOpenH264ASM.c.o TestOpenH264ASM.c - -h264.c.o: ../h264.c - gcc -c -O3 -o h264.c.o ../h264.c - -clean: - rm -f TestOpenH264ASM TestOpenH264ASM.c.o h264_ssse3.asm.o h264.c.o 
h264.asm.o - -old: h264.asm.o TestOpenH264ASM.c.o h264.c.o - gcc -o TestOpenH264ASM h264.asm.o TestOpenH264ASM.c.o h264.c.o diff --git a/libfreerdp/codec/test/Makefile.TestOpenH264ASM32 b/libfreerdp/codec/test/Makefile.TestOpenH264ASM32 index ab52a3b7d..2a0308db4 100644 --- a/libfreerdp/codec/test/Makefile.TestOpenH264ASM32 +++ b/libfreerdp/codec/test/Makefile.TestOpenH264ASM32 @@ -1,11 +1,11 @@ -TestOpenH264ASM: TestOpenH264ASM.c.o h264.c.o h264_ssse3.asm.o #h264.asm.o - gcc -o TestOpenH264ASM TestOpenH264ASM.c.o h264.c.o h264_ssse3.asm.o #h264.asm.o +TestOpenH264ASM: TestOpenH264ASM.c.o h264.c.o h264_ssse3.asm.o h264.asm.o + gcc -o TestOpenH264ASM TestOpenH264ASM.c.o h264.c.o h264_ssse3.asm.o h264.asm.o -lwinpr h264_ssse3.asm.o: ../h264_ssse3_x32.asm - nasm -f elf64 -o h264_ssse3.asm.o ../h264_ssse3_x32.asm + nasm -f elf32 -o h264_ssse3.asm.o ../h264_ssse3_x32.asm h264.asm.o: ../h264_x32.asm - nasm -f elf64 -o h264.asm.o ../h264_x32.asm + nasm -f elf32 -o h264.asm.o ../h264_x32.asm TestOpenH264ASM.c.o: TestOpenH264ASM.c gcc -c -o TestOpenH264ASM.c.o TestOpenH264ASM.c diff --git a/libfreerdp/codec/test/Makefile.TestOpenH264ASM64 b/libfreerdp/codec/test/Makefile.TestOpenH264ASM64 index ace4451ae..a060926b7 100644 --- a/libfreerdp/codec/test/Makefile.TestOpenH264ASM64 +++ b/libfreerdp/codec/test/Makefile.TestOpenH264ASM64 @@ -1,5 +1,5 @@ TestOpenH264ASM: TestOpenH264ASM.c.o h264.c.o h264_ssse3.asm.o h264.asm.o - gcc -o TestOpenH264ASM TestOpenH264ASM.c.o h264.c.o h264_ssse3.asm.o h264.asm.o + gcc -o TestOpenH264ASM TestOpenH264ASM.c.o h264.c.o h264_ssse3.asm.o h264.asm.o -lwinpr h264_ssse3.asm.o: ../h264_ssse3_x64.asm nasm -f elf64 -o h264_ssse3.asm.o ../h264_ssse3_x64.asm diff --git a/libfreerdp/codec/test/TestOpenH264ASM.c b/libfreerdp/codec/test/TestOpenH264ASM.c index dc0f2e6d5..d0c04787f 100644 --- a/libfreerdp/codec/test/TestOpenH264ASM.c +++ b/libfreerdp/codec/test/TestOpenH264ASM.c @@ -2,6 +2,8 @@ #include #include +#include + #include "TestOpenH264ASM.h" 
#define WIDTH 1920 @@ -28,7 +30,7 @@ int main(void){ pSrcData[0]=malloc(1984*HEIGHT*sizeof(char)); pSrcData[1]=malloc(1984*HEIGHT/4*sizeof(char)); pSrcData[2]=malloc(1984*HEIGHT/4*sizeof(char)); - pDstData_asm=malloc(WIDTH*HEIGHT*4*sizeof(char)); + pDstData_asm=_aligned_malloc(WIDTH*HEIGHT*4*sizeof(char),16); pDstData_c=malloc(WIDTH*HEIGHT*4*sizeof(char)); for(i=0;i Date: Tue, 2 Sep 2014 22:16:56 +0200 Subject: [PATCH 11/31] H.264: converting only clipping rects to XRGB --- client/X11/xf_gfx.c | 60 +---- include/freerdp/codec/h264.h | 11 +- libfreerdp/codec/h264.c | 113 +++++--- libfreerdp/codec/h264_ssse3_x32.asm | 2 +- libfreerdp/codec/h264_ssse3_x64.asm | 242 +++++++++++++++--- libfreerdp/codec/h264_x64.asm | 233 ++++++++++------- .../codec/test/Makefile.TestOpenH264ASM64 | 2 +- libfreerdp/codec/test/TestOpenH264ASM.c | 13 +- libfreerdp/codec/test/TestOpenH264ASM.h | 6 +- 9 files changed, 437 insertions(+), 245 deletions(-) diff --git a/client/X11/xf_gfx.c b/client/X11/xf_gfx.c index b7b7cbccc..0cac1e316 100644 --- a/client/X11/xf_gfx.c +++ b/client/X11/xf_gfx.c @@ -23,8 +23,6 @@ #include "xf_gfx.h" -#include - int xf_ResetGraphics(RdpgfxClientContext* context, RDPGFX_RESET_GRAPHICS_PDU* resetGraphics) { xfContext* xfc = (xfContext*) context->custom; @@ -350,19 +348,10 @@ int xf_SurfaceCommand_Planar(xfContext* xfc, RdpgfxClientContext* context, RDPGF int xf_SurfaceCommand_H264(xfContext* xfc, RdpgfxClientContext* context, RDPGFX_SURFACE_COMMAND* cmd) { int status; - UINT32 i, j; - int nXDst, nYDst; - int nWidth, nHeight; - int nbUpdateRects; + UINT32 i; BYTE* DstData = NULL; - RDPGFX_RECT16* rect; H264_CONTEXT* h264; xfGfxSurface* surface; - REGION16 updateRegion; - RECTANGLE_16 updateRect; - RECTANGLE_16* updateRects; - REGION16 clippingRects; - RECTANGLE_16 clippingRect; RDPGFX_H264_METABLOCK* meta; RDPGFX_H264_BITMAP_STREAM* bs; @@ -384,7 +373,7 @@ int xf_SurfaceCommand_H264(xfContext* xfc, RdpgfxClientContext* context, RDPGFX_ DstData = surface->data; 
status = h264_decompress(xfc->h264, bs->data, bs->length, &DstData, - PIXEL_FORMAT_XRGB32, surface->scanline, cmd->left, cmd->top, cmd->width, cmd->height); + PIXEL_FORMAT_XRGB32, surface->scanline , surface->height, meta->regionRects, meta->numRegionRects); if (status < 0) { @@ -392,54 +381,11 @@ int xf_SurfaceCommand_H264(xfContext* xfc, RdpgfxClientContext* context, RDPGFX_ return -1; } - if (status < 0) - return -1; - - region16_init(&clippingRects); - for (i = 0; i < meta->numRegionRects; i++) { - rect = &(meta->regionRects[i]); - - clippingRect.left = rect->left; - clippingRect.top = rect->top; - clippingRect.right = rect->right; - clippingRect.bottom = rect->bottom; - - region16_union_rect(&clippingRects, &clippingRects, &clippingRect); + region16_union_rect(&(xfc->invalidRegion), &(xfc->invalidRegion), (RECTANGLE_16*) &(meta->regionRects[i])); } - updateRect.left = cmd->left; - updateRect.top = cmd->top; - updateRect.right = cmd->right; - updateRect.bottom = cmd->bottom; - - region16_init(&updateRegion); - region16_intersect_rect(&updateRegion, &clippingRects, &updateRect); - - updateRects = (RECTANGLE_16*) region16_rects(&updateRegion, &nbUpdateRects); - - - for (j = 0; j < nbUpdateRects; j++) - { - nXDst = updateRects[j].left; - nYDst = updateRects[j].top; - nWidth = updateRects[j].right - updateRects[j].left; - nHeight = updateRects[j].bottom - updateRects[j].top; - - /* update region from decoded H264 buffer */ - freerdp_image_copy(surface->data, PIXEL_FORMAT_XRGB32, surface->scanline, - nXDst, nYDst, nWidth, nHeight, - h264->data, PIXEL_FORMAT_XRGB32, h264->scanline, nXDst, nYDst); - - - region16_union_rect(&(xfc->invalidRegion), &(xfc->invalidRegion), &updateRects[j]); - } - - region16_uninit(&updateRegion); - region16_uninit(&clippingRects); - - if (!xfc->inGfxFrame) xf_OutputUpdate(xfc); diff --git a/include/freerdp/codec/h264.h b/include/freerdp/codec/h264.h index 3c445d61a..ccc37be9e 100644 --- a/include/freerdp/codec/h264.h +++ 
b/include/freerdp/codec/h264.h @@ -22,6 +22,7 @@ #include #include +#include #ifdef WITH_LIBAVCODEC #ifdef WITH_OPENH264 @@ -43,14 +44,16 @@ struct _H264_CONTEXT { BOOL Compressor; - BYTE* data; - UINT32 size; + //BYTE* data; + //UINT32 size; UINT32 width; UINT32 height; - int scanline; + //int scanline; #ifdef WITH_OPENH264 ISVCDecoder* pDecoder; + BYTE* pYUVData[3]; + int iStride[2]; #endif #ifdef WITH_LIBAVCODEC @@ -69,7 +72,7 @@ extern "C" { FREERDP_API int h264_compress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize, BYTE** ppDstData, UINT32* pDstSize); FREERDP_API int h264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize, - BYTE** ppDstData, DWORD DstFormat, int nDstStep, int nXDst, int nYDst, int nWidth, int nHeight); + BYTE** ppDstData, DWORD DstFormat, int nDstStep, int nDstHeight, RDPGFX_RECT16* regionRects, int numRegionRect); FREERDP_API void h264_context_reset(H264_CONTEXT* h264); diff --git a/libfreerdp/codec/h264.c b/libfreerdp/codec/h264.c index ef66cf8bc..8c39d0fc6 100644 --- a/libfreerdp/codec/h264.c +++ b/libfreerdp/codec/h264.c @@ -32,7 +32,7 @@ #ifdef WITH_H264_SSSE3 extern int check_ssse3(); -extern int freerdp_image_yuv420p_to_xrgb(BYTE *pDstData,BYTE **pSrcData,int nWidth,int nHeight,int iStride0,int iStride1); +extern int freerdp_image_yuv420p_to_xrgb(BYTE *pDstData,BYTE **pSrcData,int nWidth,int nHeight,int *iStride,int scanline); #else #ifdef WITH_H264_ASM extern int freerdp_image_yuv_to_xrgb_asm(BYTE *pDstData,BYTE **pSrcData,int nWidth,int nHeight,int iStride0,int iStride1); @@ -204,6 +204,7 @@ void h264_dump_yuv_data(BYTE* yuv[], int width, int height, int stride[]) fclose(fp); } +#ifdef WITH_LIBAVCODEC int h264_prepare_rgb_buffer(H264_CONTEXT* h264, int width, int height) { UINT32 size; @@ -224,6 +225,7 @@ int h264_prepare_rgb_buffer(H264_CONTEXT* h264, int width, int height) return 1; } +#endif int freerdp_image_copy_yuv420p_to_xrgb(BYTE* pDstData, int nDstStep, int nXDst, int nYDst, int nWidth, int nHeight, BYTE* 
pSrcData[3], int nSrcStep[2], int nXSrc, int nYSrc) @@ -343,13 +345,11 @@ static void openh264_trace_callback(H264_CONTEXT* h264, int level, const char* m printf("%d - %s\n", level, message); } -static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize, - BYTE* pDstData, DWORD DstFormat, int nDstStep, int nXDst, int nYDst, int nWidth, int nHeight) +static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize) { DECODING_STATE state; SBufferInfo sBufferInfo; SSysMEMBuffer* pSystemBuffer; - BYTE* pYUVData[3]; struct timeval T1,T2; @@ -360,9 +360,9 @@ static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSiz * Decompress the image. The RDP host only seems to send I420 format. */ - pYUVData[0] = NULL; - pYUVData[1] = NULL; - pYUVData[2] = NULL; + h264->pYUVData[0] = NULL; + h264->pYUVData[1] = NULL; + h264->pYUVData[2] = NULL; ZeroMemory(&sBufferInfo, sizeof(sBufferInfo)); @@ -371,7 +371,7 @@ static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSiz h264->pDecoder, pSrcData, SrcSize, - pYUVData, + h264->pYUVData, &sBufferInfo); /** @@ -382,7 +382,7 @@ static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSiz */ if (sBufferInfo.iBufferStatus != 1) - state = (*h264->pDecoder)->DecodeFrame2(h264->pDecoder, NULL, 0, pYUVData, &sBufferInfo); + state = (*h264->pDecoder)->DecodeFrame2(h264->pDecoder, NULL, 0, h264->pYUVData, &sBufferInfo); gettimeofday(&T2,NULL); printf("OpenH264: decoding took: %u sec %u usec\n",(unsigned int)(T2.tv_sec-T1.tv_sec),(unsigned int)(T2.tv_usec-T1.tv_usec)); @@ -391,7 +391,7 @@ static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSiz #if 0 printf("h264_decompress: state=%u, pYUVData=[%p,%p,%p], bufferStatus=%d, width=%d, height=%d, format=%d, stride=[%d,%d]\n", - state, pYUVData[0], pYUVData[1], pYUVData[2], sBufferInfo.iBufferStatus, + state, h264->pYUVData[0], h264->pYUVData[1], h264->pYUVData[2], 
sBufferInfo.iBufferStatus, pSystemBuffer->iWidth, pSystemBuffer->iHeight, pSystemBuffer->iFormat, pSystemBuffer->iStride[0], pSystemBuffer->iStride[1]); #endif @@ -399,7 +399,7 @@ static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSiz if (state != 0) return -1; - if (!pYUVData[0] || !pYUVData[1] || !pYUVData[2]) + if (!h264->pYUVData[0] || !h264->pYUVData[1] || !h264->pYUVData[2]) return -1; if (sBufferInfo.iBufferStatus != 1) @@ -412,11 +412,18 @@ static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSiz if (g_H264DumpFrames) { - h264_dump_yuv_data(pYUVData, pSystemBuffer->iWidth, pSystemBuffer->iHeight, pSystemBuffer->iStride); + h264_dump_yuv_data(h264->pYUVData, pSystemBuffer->iWidth, pSystemBuffer->iHeight, pSystemBuffer->iStride); } g_H264FrameId++; + + h264->iStride[0] = pSystemBuffer->iStride[0]; + h264->iStride[1] = pSystemBuffer->iStride[1]; + h264->width = pSystemBuffer->iWidth; + h264->height = pSystemBuffer->iHeight; + +#if 0 if (h264_prepare_rgb_buffer(h264, pSystemBuffer->iWidth, pSystemBuffer->iHeight) < 0) return -1; @@ -433,6 +440,7 @@ static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSiz #endif gettimeofday(&T2,NULL); printf("\tconverting took %u sec %u usec\n",(unsigned int)(T2.tv_sec-T1.tv_sec),(unsigned int)(T2.tv_usec-T1.tv_usec)); +#endif return 1; } @@ -662,10 +670,20 @@ EXCEPTION: int h264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize, - BYTE** ppDstData, DWORD DstFormat, int nDstStep, int nXDst, int nYDst, int nWidth, int nHeight) + BYTE** ppDstData, DWORD DstFormat, int nDstStep, int nDstHeight, RDPGFX_RECT16* regionRects, int numRegionRects) { UINT32 UncompressedSize; BYTE* pDstData; + BYTE* pDstPoint; + + BYTE** pYUVData; + BYTE* pYUVPoint[3]; /* per-rect cursors into the Y, U and V planes; all three slots are written below */ + + RDPGFX_RECT16* rect; + int* iStride; + int ret, i, cx, cy; + + struct timeval T1,T2; if (!h264) return -1; @@ -675,39 +693,27 @@ int h264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize,
#endif #if 0 - printf("h264_decompress: pSrcData=%p, SrcSize=%u, pDstData=%p, nDstStep=%d, nXDst=%d, nYDst=%d, nWidth=%d, nHeight=%d)\n", - pSrcData, SrcSize, *ppDstData, nDstStep, nXDst, nYDst, nWidth, nHeight); + printf("h264_decompress: pSrcData=%p, SrcSize=%u, pDstData=%p, nDstStep=%d, numRegionRects=%d\n", + pSrcData, SrcSize, *ppDstData, nDstStep, numRegionRects); #endif - /* Allocate a destination buffer (if needed). */ - - UncompressedSize = nWidth * nHeight * 4; - - if (UncompressedSize == 0) + if (!(pDstData = *ppDstData)) return -1; - pDstData = *ppDstData; - - if (!pDstData) - { - pDstData = (BYTE*) malloc(UncompressedSize); - - if (!pDstData) - return -1; - - *ppDstData = pDstData; - } if (g_H264DumpFrames) { h264_dump_h264_data(pSrcData, SrcSize); } + #ifdef WITH_OPENH264 - return openh264_decompress( - h264, pSrcData, SrcSize, - pDstData, DstFormat, nDstStep, - nXDst, nYDst, nWidth, nHeight); + ret = openh264_decompress(h264, pSrcData, SrcSize); + if (ret != 1) + return ret; + + pYUVData = h264->pYUVData; + iStride = h264->iStride; #endif #ifdef WITH_LIBAVCODEC @@ -717,6 +723,38 @@ int h264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize, nXDst, nYDst, nWidth, nHeight); #endif + + /* Convert I420 (same as IYUV) to XRGB. 
*/ + UncompressedSize = h264->width * h264->height * 4; + if (UncompressedSize > (nDstStep * nDstHeight)) + return -1; + + + gettimeofday(&T1,NULL); + for (i = 0; i < numRegionRects; i++){ + rect = &(regionRects[i]); + cx = rect->right - rect->left; + cy = rect->bottom - rect->top; + + pDstPoint = pDstData + rect->top * nDstStep + rect->left * 4; + pYUVPoint[0] = pYUVData[0] + rect->top * iStride[0] + rect->left; + + ret = rect->top/2 * iStride[1] + rect->left/2; + pYUVPoint[1] = pYUVData[1] + ret; + pYUVPoint[2] = pYUVData[2] + ret; + +#if 1 + printf("regionRect: x: %d, y: %d, cx: %d, cy: %d\n", + rect->left, rect->top, cx, cy); +#endif + +#ifdef WITH_H264_SSSE3 + freerdp_image_yuv420p_to_xrgb(pDstPoint, pYUVPoint, cx, cy, iStride, nDstStep); +#endif + } + gettimeofday(&T2,NULL); + printf("converting took %u sec %u usec\n",(unsigned int)(T2.tv_sec-T1.tv_sec),(unsigned int)(T2.tv_usec-T1.tv_usec)); + return 1; } @@ -737,7 +775,7 @@ H264_CONTEXT* h264_context_new(BOOL Compressor) #ifdef WITH_H264_SSSE3 if(check_ssse3()){ - printf("SSSE3 seems to be not supported on this system, try without WITH_H264_ASM ..."); + printf("SSSE3 seems to be not supported on this system, try without WITH_H264_SSSE3 ..."); return FALSE; } #endif @@ -772,14 +810,13 @@ void h264_context_free(H264_CONTEXT* h264) { if (h264) { - _aligne_free(h264->data); - #ifdef WITH_OPENH264 openh264_free(h264); #endif #ifdef WITH_LIBAVCODEC libavcodec_free(h264); + _aligned_free(h264->data); #endif free(h264); diff --git a/libfreerdp/codec/h264_ssse3_x32.asm b/libfreerdp/codec/h264_ssse3_x32.asm index b1a57e545..c7f62b868 100644 --- a/libfreerdp/codec/h264_ssse3_x32.asm +++ b/libfreerdp/codec/h264_ssse3_x32.asm @@ -73,7 +73,7 @@ freerdp_image_yuv420p_to_xrgb: mov ebp,esp ;"local variables" - sub esp,318 ;res 8 -8,res 8 -16,res 8 -24,U 8 -32,nWidth 2 -34,nHeight 2 -36,iStride0 2 -38,iStride1 2 -40,last_column 1 -41,res 1 -42,G 16 -58,B 16 -74, + sub esp,318 ;res 8 -8,res 8 -16,res 8 -24,U 8 -32,nWidth 2 
-34,nHeight 2 -36,iStride0 2 -38,iStride1 2 -40,last_line 1 -41,res 1 -42,G 16 -58,B 16 -74, ;R 16 -90,add:128 16 -106,sub:128 16 -122,mul:48 16 -138,mul:475 16 -154,mul:403 16 -170,mul:120 16 -186,VaddY 4 -190,VaddUV 4 -194,stack offset 8 -202, ;cmp:255 16 -218,cmp:0 16 -234,shuflleR 16 -250,andG 16 -266,shuffleB 16 -280,shuffleY 16 -296,shuffleUV 16 -314,scanline 4 -318 diff --git a/libfreerdp/codec/h264_ssse3_x64.asm b/libfreerdp/codec/h264_ssse3_x64.asm index 51428b46f..b62febe2d 100644 --- a/libfreerdp/codec/h264_ssse3_x64.asm +++ b/libfreerdp/codec/h264_ssse3_x64.asm @@ -1,7 +1,8 @@ -; a entire function for converting YUV420p data to the RGB format (without any special upconverting) +; function for converting YUV420p data to the RGB format (but without any special upconverting) ; It's completely written in nasm-x86-assembly for intel processors supporting SSSE3 and higher. -; Restrictions are that output data has to be aligned to 16 byte (a question of REAL performance!) -; and the width of resolution must be divisable by four. +; The target scanline (6th parameter) must be a multiple of 16. 
+; iStride[0] must be (target scanline) / 4 or bigger and iStride[1] the next multiple of four +; of the half of iStride[0] or bigger ; section .text global check_ssse3 @@ -48,7 +49,7 @@ check_ssse3_end: ret -;extern int freerdp_image_yuv420p_to_xrgb(unsigned char *pDstData,unsigned char **pSrcData,int nWidth,int nHeight,int istride0,int istride1) +;extern int freerdp_image_yuv420p_to_xrgb(unsigned char *pDstData,unsigned char **pSrcData,int nWidth,int nHeight,int *istride,int scanline) global freerdp_image_yuv420p_to_xrgb freerdp_image_yuv420p_to_xrgb: push rbx @@ -79,11 +80,13 @@ freerdp_image_yuv420p_to_xrgb: xor r14,r14 ;"local variables" - sub rsp,316 ;pDstData 8 -8,Y 8 -16,U 8 -24,V 8 -32,nWidth 2 -34,nHeight 2 -36,iStride0 2 -38,iStride1 2 -40,last_column 1 -41,res 1 -42,G 16 -58,B 16 -74, - ;R 16 -90,add:128 16 -106,sub:128 16 -122,mul:48 16 -138,mul:475 16 -154,mul:403 16 -170,mul:120 16 -186,VaddY 2 -188,VaddUV 2 -190,res 12 -202,cmp:255 16 -218, - ;cmp:0 16 -234,shuflleR 16 -250,andG 16 -266,shuffleB 16 -280,shuffleY 16 -296,shuffleUV 16 -314,scanline 2 -316 + sub rsp,338 ;pDstData 8 -8,Y 8 -16,U 8 -24,V 8 -32,nWidth 2 -34,nHeight 2 -36,iStride0 2 -38,iStride1 2 -40,last_line 1 -41,last_column 1 -42, + ;G 16 -58,B 16 -74,R 16 -90,add:128 16 -106,sub:128 16 -122,mul:48 16 -138,mul:475 16 -154,mul:403 16 -170,mul:120 16 -186,VaddY 2 -188,VaddUV 2 -190, + ;res 12 -202,cmp:255 16 -218,cmp:0 16 -234,shuflleR 16 -250,andG 16 -266,shuffleB 16 -280,shuffleY 16 -296,shuffleUV 16 -314,andRemainingColumns 16 -330, + ;VddDst 8 -338 -;last_line: if the last (U,V doubled) line should be skipped, set to 1B +;last_line: if the last (U,V doubled) line should be skipped, set to 10B +;last_column: if it's the last column in a line, set to 10B (for handling line-endings not multiple by four) mov [rbp-8],rdi @@ -97,28 +100,46 @@ freerdp_image_yuv420p_to_xrgb: mov [rbp-34],dx mov r13w,cx - and r8,0FFFFH - mov [rbp-38],r8w - and r9,0FFFFH - mov [rbp-40],r9w + mov r10w,r9w + 
and r10,0FFFFH - shl r8w,1 - sub r8w,dx - mov r11w,r8w + mov ecx,[r8] + mov [rbp-38],ecx + mov r12d,[r8+4] + mov [rbp-40],r12w - mov r10w,dx - shr dx,1 - sub r9w,dx - mov r12w,r9w + mov [rbp-42],dl + and byte [rbp-42],11B + + + mov [rbp-338],r10 + shr word [rbp-338],1 + shl cx,1 mov r8w,[rbp-34] - shr r8w,2 - shl r10w,2 + add r8w,3 + and r8w, 0FFFCH + + sub [rbp-338],r8w + sub cx,r8w + + shr r8w,1 + + mov dx,r8w + add dx,2 + and dx,0FFFCH + sub r12w,dx + + shl dword [rbp-338],2 + mov r11w,cx + + shr r8w,1 mov r9w,[rbp-38] + ;and al,11B ;jz no_column_rest @@ -238,11 +259,40 @@ freerdp_image_yuv420p_to_xrgb: mov eax,80038003H mov [rbp-302],eax +;remaining columns and mask + cmp byte [rbp-42],0 + je freerdp_image_yuv420p_to_xrgb_no_columns_remain + + mov dl,[rbp-42] + xor ebx,ebx + xor ecx,ecx + xor esi,esi + + mov eax,0FFFFFFFFH + cmp dl,1H + je freerdp_image_yuv420p_to_xrgb_write_columns_remain + + mov ebx,0FFFFFFFFH + cmp dl,2H + je freerdp_image_yuv420p_to_xrgb_write_columns_remain + + mov ecx,0FFFFFFFFH + +freerdp_image_yuv420p_to_xrgb_write_columns_remain: + mov [rbp-330],eax + mov [rbp-326],ebx + mov [rbp-322],ecx + mov [rbp-318],esi + mov byte [rbp-42],1 + +freerdp_image_yuv420p_to_xrgb_no_columns_remain: + mov rsi,[rbp-16] mov rax,[rbp-24] mov rbx,[rbp-32] + ;jmp freerdp_image_yuv420p_to_xrgb_end freerdp_image_yuv420p_to_xrgb_hloop: dec r13w @@ -254,7 +304,7 @@ not_last_line: xor cx,cx freerdp_image_yuv420p_to_xrgb_wloop: -;main loop +; Well, in the end it should look like this: ; C = Y; ; D = U - 128; ; E = V - 128; @@ -264,21 +314,31 @@ freerdp_image_yuv420p_to_xrgb_wloop: ; B = clip(( 256 * C + 475 * D + 128) >> 8); test cx,1B - jnz load_yuv_data + jnz freerdp_image_yuv420p_to_xrgb_load_yuv_data - ;prepare U data +; Y-, U- and V-data is stored in different arrays. +; We start with processing U-data. 
+ +; at first we fetch four U-values from its array and shuffle them like this: +; 0d0d 0c0c 0b0b 0a0a +; we've done two things: converting the values to signed words and duplicating +; each value, because always two pixel "share" the same U- (and V-) data movd xmm0,[rax] movdqa xmm5,[rbp-314] - pshufb xmm0,xmm5 ;but this is the omest instruction of all!! + pshufb xmm0,xmm5 ;but this is the awesomest instruction of all!! add rax,4 +; then we subtract 128 from each value, so we get D movdqa xmm3,[rbp-122] psubsw xmm0,xmm3 +; we need to do two things with our D, so let's store it for later use movdqa xmm2,xmm0 +; now we can multiply our D with 48 and unpack it to xmm4:xmm0 +; this is what we need to get G data later on movdqa xmm4,xmm0 movdqa xmm7,[rbp-138] pmullw xmm0,xmm7 @@ -289,11 +349,16 @@ freerdp_image_yuv420p_to_xrgb_wloop: punpckhwd xmm7,xmm4 movdqa xmm4,xmm7 +; to complete this step, add (?) 128 to each value (rounding ?!) +; yeah, add. in the end this will be subtracted from something, +; because it's part of G: 256*C - (48*D + 120*E - 128), 48*D-128 ! +; by the way, our values have become signed dwords during multiplication! movdqa xmm6,[rbp-106] psubd xmm0,xmm6 psubd xmm4,xmm6 +; to get B data, we need to prepare a secound value, D*475+128 movdqa xmm1,xmm2 movdqa xmm7,[rbp-154] pmullw xmm1,xmm7 @@ -306,10 +371,14 @@ freerdp_image_yuv420p_to_xrgb_wloop: paddd xmm1,xmm6 paddd xmm7,xmm6 +; so we got something like this: xmm7:xmm1 +; this pair contains values for 16 pixel: +; aabbccdd +; aabbccdd, but we can only work on four pixel at once, so we need to save upper values movdqa [rbp-74],xmm7 - ;prepare V data +; Now we've prepared U-data. Preparing V-data is actually the same, just with other coefficients. 
movd xmm2,[rbx] pshufb xmm2,xmm5 @@ -319,6 +388,7 @@ freerdp_image_yuv420p_to_xrgb_wloop: movdqa xmm5,xmm2 +; this is also known as E*403+128, we need it to convert R data movdqa xmm3,xmm2 movdqa xmm7,[rbp-170] pmullw xmm2,xmm7 @@ -331,9 +401,11 @@ freerdp_image_yuv420p_to_xrgb_wloop: paddd xmm2,xmm6 paddd xmm7,xmm6 +; and preserve upper four values for future ... movdqa [rbp-90],xmm7 +; doing this step: E*120 movdqa xmm3,xmm5 movdqa xmm7,[rbp-186] pmullw xmm3,xmm7 @@ -343,59 +415,128 @@ freerdp_image_yuv420p_to_xrgb_wloop: punpcklwd xmm3,xmm5 punpckhwd xmm7,xmm5 +; now we complete what we've begun above: +; (48*D-128) + (120*E) = (48*D +120*E -128) paddd xmm0,xmm3 paddd xmm4,xmm7 +; and store to memory ! movdqa [rbp-58],xmm4 - jmp valid_yuv_data - -load_yuv_data: +; real assembly programmers do not only produce best results between 0 and 5 o'clock, +; but are also kangaroos! + jmp freerdp_image_yuv420p_to_xrgb_valid_yuv_data + +freerdp_image_yuv420p_to_xrgb_load_yuv_data: +; maybe you've wondered about the conditional jump to this label above ? +; Well, we prepared UV data for eight pixel in each line, but can only process four +; per loop. So we need to load the upper four pixel data from memory each secound loop! movdqa xmm1,[rbp-74] movdqa xmm2,[rbp-90] movdqa xmm0,[rbp-58] -valid_yuv_data: +freerdp_image_yuv420p_to_xrgb_valid_yuv_data: + inc cx + cmp cx,r8w + jne freerdp_image_yuv420p_to_xrgb_not_last_columns - ;Y data processing + shl byte [rbp-42],1 + + +freerdp_image_yuv420p_to_xrgb_not_last_columns: + +; We didn't produce any output yet, so let's do so! +; Ok, fetch four pixel from the Y-data array and shuffle them like this: +; 00d0 00c0 00b0 00a0, to get signed dwords and multiply by 256 movd xmm4,[rsi] pshufb xmm4,[rbp-298] movdqa xmm5,xmm4 movdqa xmm6,xmm4 +; no we can perform the "real" conversion itself and produce output! paddd xmm4,xmm2 psubd xmm5,xmm0 paddd xmm6,xmm1 +; in the end, we only need bytes for RGB values. +; So, what do we do? right! 
shifting left makes values bigger and thats always good. +; before we had dwords of data, and by shifting left and treating the result +; as packed words, we get not only signed words, but do also divide by 256 +; imagine, data is now ordered this way: ddx0 ccx0 bbx0 aax0, and x is the least +; significant byte, that we don't need anymore, because we've done some rounding pslld xmm4,8 pslld xmm5,8 pslld xmm6,8 +; one thing we still have to face is the clip() function ... +; we have still signed words, and there are those min/max instructions in SSE2 ... +; the max instruction takes always the bigger of the two operands and stores it in the first one, +; and it operates with signs ! +; if we feed it with our values and zeros, it takes the zeros if our values are smaller than +; zero and otherwise our values movdqa xmm7,[rbp-234] pmaxsw xmm4,xmm7 ;what an awesome instruction! pmaxsw xmm5,xmm7 pmaxsw xmm6,xmm7 +; the same thing just completely different can be used to limit our values to 255, +; but now using the min instruction and 255s movdqa xmm7,[rbp-218] pminsw xmm4,xmm7 pminsw xmm5,xmm7 pminsw xmm6,xmm7 +; Now we got our bytes. +; the moment has come to assemble the three channels R,G and B to the xrgb dwords +; on Red channel we just have to and each futural dword with 00FF0000H pand xmm4,[rbp-250] +; on Green channel we have to shuffle somehow, so we get something like this: +; 00d0 00c0 00b0 00a0 pshufb xmm5,[rbp-266] +; and on Blue channel that one: +; 000d 000c 000b 000a pshufb xmm6,[rbp-282] +; and at last we or it together and get this one: +; xrgb xrgb xrgb xrgb por xmm4,xmm5 por xmm4,xmm6 - movdqa [rdi],xmm4 +; Only thing to do know is writing data to memory, but this gets a bit more +; complicated if the width is not a multiple of four and it is the last column in line. 
+; but otherwise just play the kangaroo + test byte [rbp-42],2 + je freerdp_image_yuv420p_to_xrgb_column_process_complete + +; let's say, we need to only convert six pixel in width +; Ok, the first 4 pixel will be converted just like every 4 pixel else, but +; if it's the last loop in line, [rbp-42] is shifted left by one (curious? have a look above), +; and we land here. Through initialisation a mask was prepared. In this case it looks like +; 0000FFFFH 0000FFFFH 0000FFFFH 0000FFFFH + movdqa xmm6,[rbp-330] +; we and our output data with this mask to get only the valid pixel + pand xmm4,xmm6 +; then we fetch memory from the destination array ... + movdqu xmm5,[rdi] +; ... and and it with the inverse mask. We get only those pixel, which should not be updated + pandn xmm6,xmm5 +; we only have to or the two values together and write it back to the destination array, +; and only the pixel that should be updated really get changed. + por xmm4,xmm6 + +freerdp_image_yuv420p_to_xrgb_column_process_complete: + movdqu [rdi],xmm4 - ;Y data processing in secound line +; Because UV data is the same for two lines, we can process the secound line just here, +; in the same loop. Only thing we need to do is to add some offsets to the Y- and destination +; pointer. These offsets are iStride[0] and the target scanline. +; But if we don't need to process the secound line, like if we are in the last line of processing nine lines, +; we just skip all this. 
test r14b,2 - jnz skip_last_line1 + jnz freerdp_yuv420p_to_xrgb_skip_last_line movd xmm4,[rsi+r9] pshufb xmm4,[rbp-298] @@ -429,21 +570,46 @@ valid_yuv_data: por xmm4,xmm5 por xmm4,xmm6 - movdqa [rdi+r10],xmm4 + test byte [rbp-42],2 + je freerdp_image_yuv420p_to_xrgb_column_process_complete2 -skip_last_line1: + movdqa xmm6,[rbp-330] + pand xmm4,xmm6 + movdqu xmm5,[rdi+r10] + pandn xmm6,xmm5 + por xmm4,xmm6 + +; only thing is, we should shift [rbp-42] back here, because we have processed the last column, +; and this "special condition" can be released + shr byte [rbp-42],1 + +freerdp_image_yuv420p_to_xrgb_column_process_complete2: + movdqu [rdi+r10],xmm4 + + +freerdp_yuv420p_to_xrgb_skip_last_line: +; after all we have to increase the destination- and Y-data pointer by four pixel add rdi,16 add rsi,4 - inc cx cmp cx,r8w jne freerdp_image_yuv420p_to_xrgb_wloop freerdp_image_yuv420p_to_xrgb_wloop_end: - add rdi,r10 +; after each line we have to add the scanline to the destination pointer, because +; we are processing two lines at once, but only increasing the destination pointer +; in the first line. Well, we only have one pointer, so it's the easiest way to access +; the secound line with the one pointer and an offset (scanline) +; if we're not converting the full width of the scanline, like only 64 pixel, but the +; output buffer was "designed" for 1920p HD, we have to add the remaining length for each line, +; to get into the next line. 
+ add rdi,[rbp-338] +; same thing has to be done for Y-data, but with iStride[0] instead of the target scanline add rsi,r11 +; and again for UV data, but here it's enough to add the remaining length, because +; UV data is the same for two lines and there exists only one "UV line" on two "real lines" add rax,r12 add rbx,r12 ;mov eax,r12d diff --git a/libfreerdp/codec/h264_x64.asm b/libfreerdp/codec/h264_x64.asm index f0bf1d640..c7963220e 100644 --- a/libfreerdp/codec/h264_x64.asm +++ b/libfreerdp/codec/h264_x64.asm @@ -67,14 +67,17 @@ YUV_to_RGB_asm31: ret -;extern int freerdp_image_yuv_to_xrgb_asm(unsigned char *pDstData,unsigned char **pSrcData,int nWidth,int nHeight); +;extern int freerdp_image_yuv_to_xrgb_asm(unsigned char *pDstData,unsigned char **pSrcData,int nWidth,int nHeight,int *istride,int scanline); global freerdp_image_yuv_to_xrgb_asm freerdp_image_yuv_to_xrgb_asm: + push rbx push rbp mov rbp, rsp ;cWidth: cx - sub rsp,72 ;pDstData,pSrcData[3],nWidth,nHeight,cHeight,scanline,iStride[1] - push rbx + sub rsp,82 ;pDstData -8,pSrcData[3] -32,nWidth -40,nHeight -48,cHeight -56,scanline -64,iStride[0] -72,VaddDst -80,last_column 1 -81,last_line 1 -82 + +;last_column: set to 10B, if last column should be skipped ('cause UV data is the same for two columns and two columns are processed at once) +;last_line: set to 10B, if last line should be skipped ('cause UV data is the same for two lines and two lines are processed at once) mov [rbp-8],rdi @@ -86,126 +89,160 @@ freerdp_image_yuv_to_xrgb_asm: mov rax,[rsi+16] mov [rbp-32],rax - mov [rbp-40],rdx + and rdx,0FFFFH + ;mov [rbp-40],rdx shr rcx,1 ;/2 mov [rbp-48],rcx - shl rdx,2 - mov [rbp-64],rdx + and r9,0FFFFH + mov [rbp-64],r9 + + shr r9d,1 + sub r9d,edx + shl r9d,2 + mov [rbp-80],r9 mov rax,[rbp-48] mov [rbp-56],rax - mov [rbp-72],r8 - mov rax,[rbp-40] + mov rcx,[r8] + and rcx,0FFFFH + mov [rbp-72],rcx shl dword [rbp-72],1 - sub [rbp-72],rax + sub [rbp-72],rdx + mov r9,[r8+4] + mov r8,rcx + + and r9,0FFFFH shr 
rax,1 sub r9,rax + + mov al,dl + and al,1B + mov [rbp-81],al + inc dx + shr edx,1 + mov [rbp-40],rdx + freerdp_image_yuv_to_xrgb_asm_loopH: - mov rcx,[rbp-40] - shr rcx,1 + mov cx,[rbp-40] freerdp_image_yuv_to_xrgb_asm_loopW: - mov rax,[rbp-16] - mov edi,[rax] - and edi,0xFF - - mov rax,[rbp-24] - mov esi,[rax] - and esi,0xFF - - mov rax,[rbp-32] - mov edx,[rax] - and edx,0xFF - - call YUV_to_RGB_asm - - mov rbx,[rbp-8] - mov [rbx],eax - - - mov rax,[rbp-16] - mov edi,[rax+r8] - inc rax - mov [rbp-16],rax - and edi,0xFF - - mov rax,[rbp-24] - mov esi,[rax] - and esi,0xFF - - mov rax,[rbp-32] - mov edx,[rax] - and edx,0xFF - - call YUV_to_RGB_asm - - mov rbx,[rbp-8] - mov rdx,[rbp-64] - mov [rbx+rdx],eax - add rbx,4 - mov [rbp-8],rbx - - - mov rax,[rbp-16] - mov edi,[rax] - and edi,0xFF - - mov rax,[rbp-24] - mov esi,[rax] - and esi,0xFF - - mov rax,[rbp-32] - mov edx,[rax] - and edx,0xFF - - call YUV_to_RGB_asm - - mov rbx,[rbp-8] - mov [rbx],eax - - - mov rax,[rbp-16] - mov edi,[rax+r8] - inc rax - mov [rbp-16],rax - and edi,0xFF - - mov rax,[rbp-24] - mov esi,[rax] - inc rax - mov [rbp-24],rax - and esi,0xFF - - mov rax,[rbp-32] - mov edx,[rax] - inc rax - mov [rbp-32],rax - and edx,0xFF - - call YUV_to_RGB_asm - - mov rbx,[rbp-8] - mov rdx,[rbp-64] - mov [rbx+rdx],eax - add rbx,4 - mov [rbp-8],rbx - dec cx + jne freerdp_image_yuv_to_xrgb_asm_not_last_column + + shl byte [rbp-81],1 + +freerdp_image_yuv_to_xrgb_asm_not_last_column: + + + mov rax,[rbp-16] + mov edi,[rax] + and edi,0xFF + + mov rax,[rbp-24] + mov esi,[rax] + and esi,0xFF + + mov rax,[rbp-32] + mov edx,[rax] + and edx,0xFF + + call YUV_to_RGB_asm + + mov rbx,[rbp-8] + mov [rbx],eax + + + test byte [rbp-81],2 + jne freerdp_image_yuv_to_xrgb_asm_skip_last_column + + mov rax,[rbp-16] + mov edi,[rax+r8] + and edi,0xFF + + mov rax,[rbp-24] + mov esi,[rax] + and esi,0xFF + + mov rax,[rbp-32] + mov edx,[rax] + and edx,0xFF + + call YUV_to_RGB_asm + + mov rbx,[rbp-8] + mov rdx,[rbp-64] + mov [rbx+rdx],eax + 
+freerdp_image_yuv_to_xrgb_asm_skip_last_column: + add qword [rbp-8],4 + inc qword [rbp-16] + + + mov rax,[rbp-16] + mov edi,[rax] + and edi,0xFF + + mov rax,[rbp-24] + mov esi,[rax] + and esi,0xFF + + mov rax,[rbp-32] + mov edx,[rax] + and edx,0xFF + + call YUV_to_RGB_asm + + mov rbx,[rbp-8] + mov [rbx],eax + + + test byte [rbp-81],2 + jne freerdp_image_yuv_to_xrgb_asm_skip_last_column2 + + mov rax,[rbp-16] + mov edi,[rax+r8] + and edi,0xFF + + mov rax,[rbp-24] + mov esi,[rax] + and esi,0xFF + + mov rax,[rbp-32] + mov edx,[rax] + and edx,0xFF + + call YUV_to_RGB_asm + + ;shr [rbp-81],1 + + mov rbx,[rbp-8] + mov rdx,[rbp-64] + mov [rbx+rdx],eax + +freerdp_image_yuv_to_xrgb_asm_skip_last_column2: + add qword [rbp-8],4 + inc qword [rbp-16] + inc qword [rbp-24] + inc qword [rbp-32] + + + test cx,0FFFFH jne freerdp_image_yuv_to_xrgb_asm_loopW + jmp END mov rax,[rbp-8] - add rax,[rbp-64] + add rax,[rbp-80] mov [rbp-8],rax mov rax,[rbp-16] @@ -226,7 +263,7 @@ freerdp_image_yuv_to_xrgb_asm_loopW: ;END mov rax,0 END: - pop rbx mov rsp,rbp pop rbp + pop rbx ret \ No newline at end of file diff --git a/libfreerdp/codec/test/Makefile.TestOpenH264ASM64 b/libfreerdp/codec/test/Makefile.TestOpenH264ASM64 index a060926b7..53e208b69 100644 --- a/libfreerdp/codec/test/Makefile.TestOpenH264ASM64 +++ b/libfreerdp/codec/test/Makefile.TestOpenH264ASM64 @@ -14,4 +14,4 @@ h264.c.o: ../h264.c gcc -c -o h264.c.o ../h264.c clean: - rm -f TestOpenH264ASM TestOpenH264ASM.c.o h264_ssse3.asm.o h264.c.o h264.asm.o \ No newline at end of file + rm -f TestOpenH264ASM TestOpenH264ASM.c.o h264_ssse3.asm.o h264.c.o h264.asm.o diff --git a/libfreerdp/codec/test/TestOpenH264ASM.c b/libfreerdp/codec/test/TestOpenH264ASM.c index d0c04787f..040b1650d 100644 --- a/libfreerdp/codec/test/TestOpenH264ASM.c +++ b/libfreerdp/codec/test/TestOpenH264ASM.c @@ -19,7 +19,7 @@ int main(void){ int nSrcStep[2]; #if SSSE3 - if(check_ssse3()){ + if(freerdp_check_ssse3()){ fprintf(stderr,"ssse3 not supported!\n"); return 
EXIT_FAILURE; } @@ -30,8 +30,11 @@ int main(void){ pSrcData[0]=malloc(1984*HEIGHT*sizeof(char)); pSrcData[1]=malloc(1984*HEIGHT/4*sizeof(char)); pSrcData[2]=malloc(1984*HEIGHT/4*sizeof(char)); - pDstData_asm=_aligned_malloc(WIDTH*HEIGHT*4*sizeof(char),16); - pDstData_c=malloc(WIDTH*HEIGHT*4*sizeof(char)); + pDstData_asm=_aligned_malloc(WIDTH*(HEIGHT+1)*4*sizeof(char),16); + pDstData_c=malloc(WIDTH*(HEIGHT+1)*4*sizeof(char)); + + memset(pDstData_asm,0xFF,WIDTH*(HEIGHT+1)*4*sizeof(char)); + memset(pDstData_c,0xFF,WIDTH*(HEIGHT+1)*4*sizeof(char)); for(i=0;i Date: Fri, 5 Sep 2014 20:16:56 -0400 Subject: [PATCH 12/31] libfreerdp-codec: improve YUV to RGB color conversion --- libfreerdp/codec/h264.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/libfreerdp/codec/h264.c b/libfreerdp/codec/h264.c index 84095e7e7..0688bb038 100644 --- a/libfreerdp/codec/h264.c +++ b/libfreerdp/codec/h264.c @@ -37,16 +37,16 @@ static INLINE BYTE clip(int x) static INLINE UINT32 YUV_to_RGB(BYTE Y, BYTE U, BYTE V) { - int C, D, E; BYTE R, G, B; + int Yp, Up, Vp; - C = Y; - D = U - 128; - E = V - 128; + Yp = Y * 256; + Up = U - 128; + Vp = V - 128; - R = clip(( 256 * C + 403 * E + 128) >> 8); - G = clip(( 256 * C - 48 * D - 120 * E + 128) >> 8); - B = clip(( 256 * C + 475 * D + 128) >> 8); + R = clip((Yp + (403 * Vp)) >> 8); + G = clip((Yp - (48 * Up) - (120 * Vp)) >> 8); + B = clip((Yp + (475 * Up)) >> 8); return RGB32(R, G, B); } From 437583aa9ae388ac6f256ad8fc4edb8e0aca8621 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Moreau?= Date: Sat, 6 Sep 2014 17:10:27 -0400 Subject: [PATCH 13/31] libfreerdp-primitives: add YUV420 to RGB conversion --- include/freerdp/primitives.h | 5 + libfreerdp/codec/h264.c | 169 ++++++-------------------- libfreerdp/primitives/CMakeLists.txt | 1 + libfreerdp/primitives/prim_YUV.c | 97 +++++++++++++++ libfreerdp/primitives/prim_YUV.h | 27 ++++ libfreerdp/primitives/prim_internal.h | 63 ++++------ 
libfreerdp/primitives/primitives.c | 10 +- 7 files changed, 197 insertions(+), 175 deletions(-) create mode 100644 libfreerdp/primitives/prim_YUV.c create mode 100644 libfreerdp/primitives/prim_YUV.h diff --git a/include/freerdp/primitives.h b/include/freerdp/primitives.h index be0a01816..e75e8c69c 100644 --- a/include/freerdp/primitives.h +++ b/include/freerdp/primitives.h @@ -164,6 +164,10 @@ typedef pstatus_t (*__RGB565ToARGB_16u32u_C3C4_t)( UINT32* pDst, INT32 dstStep, UINT32 width, UINT32 height, BOOL alpha, BOOL invert); +typedef pstatus_t (*__YUV420ToRGB_8u_P3AC4R_t)( + const BYTE* pSrc[3], INT32 srcStep[3], + BYTE* pDst, INT32 dstStep, + const prim_size_t* roi); typedef pstatus_t (*__andC_32u_t)( const UINT32 *pSrc, UINT32 val, @@ -209,6 +213,7 @@ typedef struct __RGBToRGB_16s8u_P3AC4R_t RGBToRGB_16s8u_P3AC4R; __YCoCgRToRGB_8u_AC4R_t YCoCgRToRGB_8u_AC4R; __RGB565ToARGB_16u32u_C3C4_t RGB565ToARGB_16u32u_C3C4; + __YUV420ToRGB_8u_P3AC4R_t YUV420ToRGB_8u_P3AC4R; } primitives_t; #ifdef __cplusplus diff --git a/libfreerdp/codec/h264.c b/libfreerdp/codec/h264.c index c607a4895..1a02887e2 100644 --- a/libfreerdp/codec/h264.c +++ b/libfreerdp/codec/h264.c @@ -25,73 +25,10 @@ #include #include -#include +#include + #include -static INLINE BYTE clip(int x) -{ - if (x < 0) return 0; - if (x > 255) return 255; - return (BYTE) x; -} - -static INLINE UINT32 YUV_to_RGB(BYTE Y, BYTE U, BYTE V) -{ - BYTE R, G, B; - int Yp, Up, Vp; - - Yp = Y * 256; - Up = U - 128; - Vp = V - 128; - - R = clip((Yp + (403 * Vp)) >> 8); - G = clip((Yp - (48 * Up) - (120 * Vp)) >> 8); - B = clip((Yp + (475 * Up)) >> 8); - - return RGB32(R, G, B); -} - -static int g_H264FrameId = 0; -static BOOL g_H264DumpFrames = FALSE; - -static void h264_dump_h264_data(BYTE* data, int size) -{ - FILE* fp; - char buf[4096]; - - sprintf_s(buf, sizeof(buf), "/tmp/wlog/bs_%d.h264", g_H264FrameId); - fp = fopen(buf, "wb"); - fwrite(data, 1, size, fp); - fflush(fp); - fclose(fp); -} - -void h264_dump_yuv_data(BYTE* 
yuv[], int width, int height, int stride[]) -{ - FILE* fp; - BYTE* srcp; - char buf[4096]; - int j; - - sprintf_s(buf, sizeof(buf), "/tmp/wlog/H264_%d.ppm", g_H264FrameId); - fp = fopen(buf, "wb"); - fwrite("P5\n", 1, 3, fp); - sprintf_s(buf, sizeof(buf), "%d %d\n", width, height); - fwrite(buf, 1, strlen(buf), fp); - fwrite("255\n", 1, 4, fp); - - srcp = yuv[0]; - - for (j = 0; j < height; j++) - { - fwrite(srcp, 1, width, fp); - srcp += stride[0]; - } - - fflush(fp); - fclose(fp); -} - int h264_prepare_rgb_buffer(H264_CONTEXT* h264, int width, int height) { UINT32 size; @@ -104,8 +41,11 @@ int h264_prepare_rgb_buffer(H264_CONTEXT* h264, int width, int height) if (size > h264->size) { h264->size = size; - h264->data = (BYTE*) realloc(h264->data, h264->size); - memset(h264->data, 0, h264->size); + + if (!h264->data) + h264->data = (BYTE*) _aligned_malloc(h264->size, 16); + else + h264->data = (BYTE*) _aligned_realloc(h264->data, h264->size, 16); } if (!h264->data) @@ -114,44 +54,6 @@ int h264_prepare_rgb_buffer(H264_CONTEXT* h264, int width, int height) return 1; } -int freerdp_image_copy_yuv420p_to_xrgb(BYTE* pDstData, int nDstStep, int nXDst, int nYDst, - int nWidth, int nHeight, BYTE* pSrcData[3], int nSrcStep[2], int nXSrc, int nYSrc) -{ - int x, y; - BYTE* pDstPixel8; - BYTE *pY, *pU, *pV; - int shift = 1; - - pY = pSrcData[0] + (nYSrc * nSrcStep[0]) + nXSrc; - - pDstPixel8 = &pDstData[(nYDst * nDstStep) + (nXDst * 4)]; - - for (y = 0; y < nHeight; y++) - { - pU = pSrcData[1] + ((nYSrc + y) >> shift) * nSrcStep[1]; - pV = pSrcData[2] + ((nYSrc + y) >> shift) * nSrcStep[1]; - - for (x = 0; x < nWidth; x++) - { - BYTE Y, U, V; - - Y = *pY; - U = pU[(nXSrc + x) >> shift]; - V = pV[(nXSrc + x) >> shift]; - - *((UINT32*) pDstPixel8) = YUV_to_RGB(Y, U, V); - - pDstPixel8 += 4; - pY++; - } - - pDstPixel8 += (nDstStep - (nWidth * 4)); - pY += (nSrcStep[0] - nWidth); - } - - return 1; -} - /** * Dummy subsystem */ @@ -205,10 +107,13 @@ static void 
openh264_trace_callback(H264_CONTEXT* h264, int level, const char* m static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize, BYTE* pDstData, DWORD DstFormat, int nDstStep, int nXDst, int nYDst, int nWidth, int nHeight) { + int srcStep[3]; + prim_size_t roi; BYTE* pYUVData[3]; DECODING_STATE state; SBufferInfo sBufferInfo; SSysMEMBuffer* pSystemBuffer; + primitives_t* prims = primitives_get(); H264_CONTEXT_OPENH264* sys = (H264_CONTEXT_OPENH264*) h264->pSystemData; if (!sys->pDecoder) @@ -262,20 +167,18 @@ static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSiz if (pSystemBuffer->iFormat != videoFormatI420) return -1; - /* Convert I420 (same as IYUV) to XRGB. */ - - if (g_H264DumpFrames) - { - h264_dump_yuv_data(pYUVData, pSystemBuffer->iWidth, pSystemBuffer->iHeight, pSystemBuffer->iStride); - } - - g_H264FrameId++; - if (h264_prepare_rgb_buffer(h264, pSystemBuffer->iWidth, pSystemBuffer->iHeight) < 0) return -1; - freerdp_image_copy_yuv420p_to_xrgb(h264->data, h264->scanline, 0, 0, - h264->width, h264->height, pYUVData, pSystemBuffer->iStride, 0, 0); + roi.width = h264->width; + roi.height = h264->height; + + /* convert iStride[2] to srcStep[3] */ + srcStep[0] = pSystemBuffer->iStride[0]; + srcStep[1] = pSystemBuffer->iStride[1]; + srcStep[2] = pSystemBuffer->iStride[1]; + + prims->YUV420ToRGB_8u_P3AC4R((const BYTE**) pYUVData, srcStep, h264->data, h264->scanline, &roi); return 1; } @@ -408,8 +311,12 @@ static int libavcodec_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcS BYTE* pDstData, DWORD DstFormat, int nDstStep, int nXDst, int nYDst, int nWidth, int nHeight) { int status; + int srcStep[3]; int gotFrame = 0; AVPacket packet; + prim_size_t roi; + const BYTE* pSrc[3]; + primitives_t* prims = primitives_get(); H264_CONTEXT_LIBAVCODEC* sys = (H264_CONTEXT_LIBAVCODEC*) h264->pSystemData; av_init_packet(&packet); @@ -425,26 +332,31 @@ static int libavcodec_decompress(H264_CONTEXT* h264, BYTE* pSrcData, 
UINT32 SrcS return -1; } +#if 0 printf("libavcodec_decompress: frame decoded (status=%d, gotFrame=%d, width=%d, height=%d, Y=[%p,%d], U=[%p,%d], V=[%p,%d])\n", status, gotFrame, sys->videoFrame->width, sys->videoFrame->height, sys->videoFrame->data[0], sys->videoFrame->linesize[0], sys->videoFrame->data[1], sys->videoFrame->linesize[1], sys->videoFrame->data[2], sys->videoFrame->linesize[2]); - - fflush(stdout); +#endif if (gotFrame) { - if (g_H264DumpFrames) - { - h264_dump_yuv_data(sys->videoFrame->data, sys->videoFrame->width, sys->videoFrame->height, sys->videoFrame->linesize); - } - if (h264_prepare_rgb_buffer(h264, sys->videoFrame->width, sys->videoFrame->height) < 0) return -1; - freerdp_image_copy_yuv420p_to_xrgb(h264->data, h264->scanline, 0, 0, - h264->width, h264->height, sys->videoFrame->data, sys->videoFrame->linesize, 0, 0); + roi.width = h264->width; + roi.height = h264->height; + + pSrc[0] = sys->videoFrame->data[0]; + pSrc[1] = sys->videoFrame->data[1]; + pSrc[2] = sys->videoFrame->data[2]; + + srcStep[0] = sys->videoFrame->linesize[0]; + srcStep[1] = sys->videoFrame->linesize[1]; + srcStep[2] = sys->videoFrame->linesize[2]; + + prims->YUV420ToRGB_8u_P3AC4R(pSrc, srcStep, h264->data, h264->scanline, &roi); } return 1; @@ -586,11 +498,6 @@ int h264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize, *ppDstData = pDstData; } - if (g_H264DumpFrames) - { - h264_dump_h264_data(pSrcData, SrcSize); - } - return h264->subsystem->Decompress(h264, pSrcData, SrcSize, pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight); } @@ -650,7 +557,7 @@ void h264_context_free(H264_CONTEXT* h264) { if (h264) { - free(h264->data); + _aligned_free(h264->data); h264->subsystem->Uninit(h264); diff --git a/libfreerdp/primitives/CMakeLists.txt b/libfreerdp/primitives/CMakeLists.txt index 9bc898c18..2c4ef7414 100644 --- a/libfreerdp/primitives/CMakeLists.txt +++ b/libfreerdp/primitives/CMakeLists.txt @@ -26,6 +26,7 @@ set(${MODULE_PREFIX}_SRCS prim_set.c 
prim_shift.c prim_sign.c + prim_YUV.c prim_YCoCg.c primitives.c prim_internal.h) diff --git a/libfreerdp/primitives/prim_YUV.c b/libfreerdp/primitives/prim_YUV.c new file mode 100644 index 000000000..000f14d8e --- /dev/null +++ b/libfreerdp/primitives/prim_YUV.c @@ -0,0 +1,97 @@ +/** + * FreeRDP: A Remote Desktop Protocol Implementation + * + * Copyright 2014 Marc-Andre Moreau + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include +#include + +#include "prim_internal.h" +#include "prim_YUV.h" + +static INLINE BYTE clip(int x) +{ + if (x < 0) return 0; + if (x > 255) return 255; + return (BYTE) x; +} + +static INLINE UINT32 YUV_to_RGB(BYTE Y, BYTE U, BYTE V) +{ + BYTE R, G, B; + int Yp, Up, Vp; + + Yp = Y * 256; + Up = U - 128; + Vp = V - 128; + + R = clip((Yp + (403 * Vp)) >> 8); + G = clip((Yp - (48 * Up) - (120 * Vp)) >> 8); + B = clip((Yp + (475 * Up)) >> 8); + + return ARGB32(0xFF, R, G, B); +} + +pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], int srcStep[3], + BYTE* pDst, int dstStep, const prim_size_t* roi) +{ + int x, y; + BYTE Y, U, V; + const BYTE* pY; + const BYTE* pU; + const BYTE* pV; + BYTE* pRGB = pDst; + + pY = pSrc[0]; + + for (y = 0; y < roi->height; y++) + { + pU = pSrc[1] + (y / 2) * srcStep[1]; + pV = pSrc[2] + (y / 2) * srcStep[2]; + + for (x = 0; x < roi->width; x++) + { + Y = *pY; + U = pU[x / 2]; + V = pV[x / 2]; + + *((UINT32*) pRGB) = 
YUV_to_RGB(Y, U, V); + + pRGB += 4; + pY++; + } + + pRGB += (dstStep - (roi->width * 4)); + pY += (srcStep[0] - roi->width); + } + + return PRIMITIVES_SUCCESS; +} + +void primitives_init_YUV(primitives_t* prims) +{ + prims->YUV420ToRGB_8u_P3AC4R = general_YUV420ToRGB_8u_P3AC4R; +} + +void primitives_deinit_YUV(primitives_t* prims) +{ + +} diff --git a/libfreerdp/primitives/prim_YUV.h b/libfreerdp/primitives/prim_YUV.h new file mode 100644 index 000000000..12f796b61 --- /dev/null +++ b/libfreerdp/primitives/prim_YUV.h @@ -0,0 +1,27 @@ +/** + * FreeRDP: A Remote Desktop Protocol Implementation + * + * Copyright 2014 Marc-Andre Moreau + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef FREERDP_PRIMITIVES_YUV_H +#define FREERDP_PRIMITIVES_YUV_H + +pstatus_t general_yCbCrToRGB_16s8u_P3AC4R(const INT16* pSrc[3], int srcStep, BYTE* pDst, int dstStep, const prim_size_t* roi); + +void primitives_init_YUV(primitives_t* prims); +void primitives_deinit_YUV(primitives_t* prims); + +#endif /* FREERDP_PRIMITIVES_YUV_H */ diff --git a/libfreerdp/primitives/prim_internal.h b/libfreerdp/primitives/prim_internal.h index e1a248c69..04c830a1c 100644 --- a/libfreerdp/primitives/prim_internal.h +++ b/libfreerdp/primitives/prim_internal.h @@ -35,54 +35,37 @@ : _mm_load_si128((__m128i *) (_ptr_))) /* Function prototypes for all the init/deinit routines. 
*/ -extern void primitives_init_copy( - primitives_t *prims); -extern void primitives_deinit_copy( - primitives_t *prims); +extern void primitives_init_copy(primitives_t *prims); +extern void primitives_deinit_copy(primitives_t *prims); -extern void primitives_init_set( - primitives_t *prims); -extern void primitives_deinit_set( - primitives_t *prims); +extern void primitives_init_set(primitives_t *prims); +extern void primitives_deinit_set(primitives_t *prims); -extern void primitives_init_add( - primitives_t *prims); -extern void primitives_deinit_add( - primitives_t *prims); +extern void primitives_init_add(primitives_t *prims); +extern void primitives_deinit_add(primitives_t *prims); -extern void primitives_init_andor( - primitives_t *prims); -extern void primitives_deinit_andor( - primitives_t *prims); +extern void primitives_init_andor(primitives_t *prims); +extern void primitives_deinit_andor(primitives_t *prims); -extern void primitives_init_shift( - primitives_t *prims); -extern void primitives_deinit_shift( - primitives_t *prims); +extern void primitives_init_shift(primitives_t *prims); +extern void primitives_deinit_shift(primitives_t *prims); -extern void primitives_init_sign( - primitives_t *prims); -extern void primitives_deinit_sign( - primitives_t *prims); +extern void primitives_init_sign(primitives_t *prims); +extern void primitives_deinit_sign(primitives_t *prims); -extern void primitives_init_alphaComp( - primitives_t *prims); -extern void primitives_deinit_alphaComp( - primitives_t *prims); +extern void primitives_init_alphaComp(primitives_t *prims); +extern void primitives_deinit_alphaComp(primitives_t *prims); -extern void primitives_init_colors( - primitives_t *prims); -extern void primitives_deinit_colors( - primitives_t *prims); +extern void primitives_init_colors(primitives_t *prims); +extern void primitives_deinit_colors(primitives_t *prims); -extern void primitives_init_YCoCg( - primitives_t *prims); -extern void 
primitives_deinit_YCoCg( - primitives_t *prims); +extern void primitives_init_YCoCg(primitives_t *prims); +extern void primitives_deinit_YCoCg(primitives_t *prims); -extern void primitives_init_16to32bpp( - primitives_t *prims); -extern void primitives_deinit_16to32bpp( - primitives_t *prims); +extern void primitives_init_YUV(primitives_t *prims); +extern void primitives_deinit_YUV(primitives_t *prims); + +extern void primitives_init_16to32bpp(primitives_t *prims); +extern void primitives_deinit_16to32bpp(primitives_t *prims); #endif /* !__PRIM_INTERNAL_H_INCLUDED__ */ diff --git a/libfreerdp/primitives/primitives.c b/libfreerdp/primitives/primitives.c index dc8d038b9..dcdd5941a 100644 --- a/libfreerdp/primitives/primitives.c +++ b/libfreerdp/primitives/primitives.c @@ -32,11 +32,11 @@ static primitives_t* pPrimitives = NULL; /* ------------------------------------------------------------------------- */ void primitives_init(void) { - if (pPrimitives == NULL) + if (!pPrimitives) { pPrimitives = calloc(1, sizeof(primitives_t)); - if (pPrimitives == NULL) + if (!pPrimitives) return; } @@ -50,13 +50,14 @@ void primitives_init(void) primitives_init_sign(pPrimitives); primitives_init_colors(pPrimitives); primitives_init_YCoCg(pPrimitives); + primitives_init_YUV(pPrimitives); primitives_init_16to32bpp(pPrimitives); } /* ------------------------------------------------------------------------- */ primitives_t* primitives_get(void) { - if (pPrimitives == NULL) + if (!pPrimitives) primitives_init(); return pPrimitives; @@ -65,7 +66,7 @@ primitives_t* primitives_get(void) /* ------------------------------------------------------------------------- */ void primitives_deinit(void) { - if (pPrimitives == NULL) + if (!pPrimitives) return; /* Call each section's de-initialization routine. 
*/ @@ -78,6 +79,7 @@ void primitives_deinit(void) primitives_deinit_sign(pPrimitives); primitives_deinit_colors(pPrimitives); primitives_deinit_YCoCg(pPrimitives); + primitives_deinit_YUV(pPrimitives); primitives_deinit_16to32bpp(pPrimitives); free((void*) pPrimitives); From 3203d37bdfed491709f30880f6aa78bd293b7e6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Moreau?= Date: Sat, 6 Sep 2014 20:15:40 -0400 Subject: [PATCH 14/31] libfreerdp-primitives: optimize YUV420p to RGB conversion --- libfreerdp/primitives/prim_YUV.c | 206 +++++++++++++++++++++++++------ 1 file changed, 170 insertions(+), 36 deletions(-) diff --git a/libfreerdp/primitives/prim_YUV.c b/libfreerdp/primitives/prim_YUV.c index 000f14d8e..c57b122b8 100644 --- a/libfreerdp/primitives/prim_YUV.c +++ b/libfreerdp/primitives/prim_YUV.c @@ -27,60 +27,194 @@ #include "prim_internal.h" #include "prim_YUV.h" -static INLINE BYTE clip(int x) -{ - if (x < 0) return 0; - if (x > 255) return 255; - return (BYTE) x; -} - -static INLINE UINT32 YUV_to_RGB(BYTE Y, BYTE U, BYTE V) -{ - BYTE R, G, B; - int Yp, Up, Vp; - - Yp = Y * 256; - Up = U - 128; - Vp = V - 128; - - R = clip((Yp + (403 * Vp)) >> 8); - G = clip((Yp - (48 * Up) - (120 * Vp)) >> 8); - B = clip((Yp + (475 * Up)) >> 8); - - return ARGB32(0xFF, R, G, B); -} - pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], int srcStep[3], BYTE* pDst, int dstStep, const prim_size_t* roi) { int x, y; + int dstPad; + int srcPad[3]; BYTE Y, U, V; + int halfWidth; + int halfHeight; const BYTE* pY; const BYTE* pU; const BYTE* pV; + int R, G, B; + int Yp, Up, Vp; + int Up48, Up475; + int Vp403, Vp120; BYTE* pRGB = pDst; pY = pSrc[0]; + pU = pSrc[1]; + pV = pSrc[2]; - for (y = 0; y < roi->height; y++) + halfWidth = roi->width / 2; + halfHeight = roi->height / 2; + + srcPad[0] = (srcStep[0] - roi->width); + srcPad[1] = (srcStep[1] - halfWidth); + srcPad[2] = (srcStep[2] - halfWidth); + + dstPad = (dstStep - (roi->width * 4)); + + for (y = 0; y < 
halfHeight; y++) { - pU = pSrc[1] + (y / 2) * srcStep[1]; - pV = pSrc[2] + (y / 2) * srcStep[2]; - - for (x = 0; x < roi->width; x++) + for (x = 0; x < halfWidth; x++) { - Y = *pY; - U = pU[x / 2]; - V = pV[x / 2]; + U = *pU++; + V = *pV++; - *((UINT32*) pRGB) = YUV_to_RGB(Y, U, V); + Up = U - 128; + Vp = V - 128; - pRGB += 4; - pY++; + Up48 = 48 * Up; + Up475 = 475 * Up; + + Vp403 = Vp * 403; + Vp120 = Vp * 120; + + /* 1st pixel */ + + Y = *pY++; + Yp = Y << 8; + + R = (Yp + Vp403) >> 8; + G = (Yp - Up48 - Vp120) >> 8; + B = (Yp + Up475) >> 8; + + if (R < 0) + R = 0; + else if (R > 255) + R = 255; + + if (G < 0) + G = 0; + else if (G > 255) + G = 255; + + if (B < 0) + B = 0; + else if (B > 255) + B = 255; + + *pRGB++ = (BYTE) B; + *pRGB++ = (BYTE) G; + *pRGB++ = (BYTE) R; + *pRGB++ = 0xFF; + + /* 2nd pixel */ + + Y = *pY++; + Yp = Y << 8; + + R = (Yp + Vp403) >> 8; + G = (Yp - Up48 - Vp120) >> 8; + B = (Yp + Up475) >> 8; + + if (R < 0) + R = 0; + else if (R > 255) + R = 255; + + if (G < 0) + G = 0; + else if (G > 255) + G = 255; + + if (B < 0) + B = 0; + else if (B > 255) + B = 255; + + *pRGB++ = (BYTE) B; + *pRGB++ = (BYTE) G; + *pRGB++ = (BYTE) R; + *pRGB++ = 0xFF; } - pRGB += (dstStep - (roi->width * 4)); - pY += (srcStep[0] - roi->width); + pY += srcPad[0]; + pU -= halfWidth; + pV -= halfWidth; + pRGB += dstPad; + + for (x = 0; x < halfWidth; x++) + { + U = *pU++; + V = *pV++; + + Up = U - 128; + Vp = V - 128; + + Up48 = 48 * Up; + Up475 = 475 * Up; + + Vp403 = Vp * 403; + Vp120 = Vp * 120; + + /* 3rd pixel */ + + Y = *pY++; + Yp = Y << 8; + + R = (Yp + Vp403) >> 8; + G = (Yp - Up48 - Vp120) >> 8; + B = (Yp + Up475) >> 8; + + if (R < 0) + R = 0; + else if (R > 255) + R = 255; + + if (G < 0) + G = 0; + else if (G > 255) + G = 255; + + if (B < 0) + B = 0; + else if (B > 255) + B = 255; + + *pRGB++ = (BYTE) B; + *pRGB++ = (BYTE) G; + *pRGB++ = (BYTE) R; + *pRGB++ = 0xFF; + + /* 4th pixel */ + + Y = *pY++; + Yp = Y << 8; + + R = (Yp + Vp403) >> 8; + G = (Yp - Up48 
- Vp120) >> 8; + B = (Yp + Up475) >> 8; + + if (R < 0) + R = 0; + else if (R > 255) + R = 255; + + if (G < 0) + G = 0; + else if (G > 255) + G = 255; + + if (B < 0) + B = 0; + else if (B > 255) + B = 255; + + *pRGB++ = (BYTE) B; + *pRGB++ = (BYTE) G; + *pRGB++ = (BYTE) R; + *pRGB++ = 0xFF; + } + + pY += srcPad[0]; + pU += srcPad[1]; + pV += srcPad[2]; + pRGB += dstPad; } return PRIMITIVES_SUCCESS; From bd516e04fa6726c3a69966209a0d8f0575c6cd44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Moreau?= Date: Sat, 6 Sep 2014 21:13:37 -0400 Subject: [PATCH 15/31] libfreerdp-primitives: cleanup YCoCg --- client/X11/xf_client.c | 1 + include/freerdp/primitives.h | 4 +- libfreerdp/codec/planar.c | 4 +- libfreerdp/core/capabilities.c | 13 ++- libfreerdp/core/settings.c | 4 + libfreerdp/primitives/prim_YCoCg.c | 102 ++++++++++-------- libfreerdp/primitives/prim_YCoCg.h | 2 +- libfreerdp/primitives/prim_YCoCg_opt.c | 14 +-- .../primitives/test/TestPrimitivesYCoCg.c | 8 +- 9 files changed, 86 insertions(+), 66 deletions(-) diff --git a/client/X11/xf_client.c b/client/X11/xf_client.c index a803edf74..d43ed6359 100644 --- a/client/X11/xf_client.c +++ b/client/X11/xf_client.c @@ -810,6 +810,7 @@ BOOL xf_pre_connect(freerdp *instance) xfc->fullscreen_toggle = settings->ToggleFullscreen; xf_detect_monitors(xfc, settings); xfc->colormap = DefaultColormap(xfc->display, xfc->screen_number); + return TRUE; } diff --git a/include/freerdp/primitives.h b/include/freerdp/primitives.h index e75e8c69c..d47300c01 100644 --- a/include/freerdp/primitives.h +++ b/include/freerdp/primitives.h @@ -152,7 +152,7 @@ typedef pstatus_t (*__RGBToRGB_16s8u_P3AC4R_t)( const INT16 *pSrc[3], INT32 srcStep, BYTE *pDst, INT32 dstStep, const prim_size_t *roi); -typedef pstatus_t (*__YCoCgRToRGB_8u_AC4R_t)( +typedef pstatus_t (*__YCoCgToRGB_8u_AC4R_t)( const BYTE *pSrc, INT32 srcStep, BYTE *pDst, INT32 dstStep, UINT32 width, UINT32 height, @@ -211,7 +211,7 @@ typedef struct __yCbCrToRGB_16s16s_P3P3_t 
yCbCrToRGB_16s16s_P3P3; __RGBToYCbCr_16s16s_P3P3_t RGBToYCbCr_16s16s_P3P3; __RGBToRGB_16s8u_P3AC4R_t RGBToRGB_16s8u_P3AC4R; - __YCoCgRToRGB_8u_AC4R_t YCoCgRToRGB_8u_AC4R; + __YCoCgToRGB_8u_AC4R_t YCoCgToRGB_8u_AC4R; __RGB565ToARGB_16u32u_C3C4_t RGB565ToARGB_16u32u_C3C4; __YUV420ToRGB_8u_P3AC4R_t YUV420ToRGB_8u_P3AC4R; } primitives_t; diff --git a/libfreerdp/codec/planar.c b/libfreerdp/codec/planar.c index a48795a21..5a3e35e6a 100644 --- a/libfreerdp/codec/planar.c +++ b/libfreerdp/codec/planar.c @@ -336,7 +336,7 @@ int planar_decompress(BITMAP_PLANAR_CONTEXT* planar, BYTE* pSrcData, UINT32 SrcS { static BOOL been_warned = FALSE; if (!been_warned) - DEBUG_WARN( "Chroma-Subsampling is not implemented.\n"); + DEBUG_WARN("Chroma-Subsampling is not implemented.\n"); been_warned = TRUE; } else @@ -346,7 +346,7 @@ int planar_decompress(BITMAP_PLANAR_CONTEXT* planar, BYTE* pSrcData, UINT32 SrcS alpha = (FormatHeader & PLANAR_FORMAT_HEADER_NA) ? FALSE : TRUE; cll = FormatHeader & PLANAR_FORMAT_HEADER_CLL_MASK; - primitives_get()->YCoCgRToRGB_8u_AC4R( + primitives_get()->YCoCgToRGB_8u_AC4R( pDstData, nDstStep, pDstData, nDstStep, nWidth, nHeight, cll, alpha, FALSE); } diff --git a/libfreerdp/core/capabilities.c b/libfreerdp/core/capabilities.c index 168f05d8e..91bc8a931 100644 --- a/libfreerdp/core/capabilities.c +++ b/libfreerdp/core/capabilities.c @@ -361,7 +361,15 @@ void rdp_write_bitmap_capability_set(wStream* s, rdpSettings* settings) header = rdp_capability_set_start(s); - drawingFlags |= DRAW_ALLOW_SKIP_ALPHA; + if (settings->DrawAllowSkipAlpha) + drawingFlags |= DRAW_ALLOW_SKIP_ALPHA; + + if (settings->DrawAllowColorSubsampling) + drawingFlags |= DRAW_ALLOW_DYNAMIC_COLOR_FIDELITY; + + if (settings->DrawAllowDynamicColorFidelity) + drawingFlags |= DRAW_ALLOW_COLOR_SUBSAMPLING; /* currently unimplemented */ + /* While bitmap_decode.c now implements YCoCg, in turning it * on we have found Microsoft is inconsistent on whether to invert R & B. 
* And it's not only from one server to another; on Win7/2008R2, it appears @@ -370,9 +378,6 @@ void rdp_write_bitmap_capability_set(wStream* s, rdpSettings* settings) * will not send it. YCoCg is still needed for EGFX, but it at least * appears consistent in its use. */ - /* drawingFlags |= DRAW_ALLOW_DYNAMIC_COLOR_FIDELITY; */ - /* YCoCg with chroma subsampling is not implemented in bitmap_decode.c. */ - /* drawingFlags |= DRAW_ALLOW_COLOR_SUBSAMPLING; */ if (settings->RdpVersion > 5) preferredBitsPerPixel = settings->ColorDepth; diff --git a/libfreerdp/core/settings.c b/libfreerdp/core/settings.c index 0eec9b087..3c070827d 100644 --- a/libfreerdp/core/settings.c +++ b/libfreerdp/core/settings.c @@ -321,6 +321,10 @@ rdpSettings* freerdp_settings_new(DWORD flags) settings->DrawGdiPlusEnabled = FALSE; + settings->DrawAllowSkipAlpha = TRUE; + settings->DrawAllowColorSubsampling = FALSE; + settings->DrawAllowDynamicColorFidelity = FALSE; + settings->FrameMarkerCommandEnabled = TRUE; settings->SurfaceFrameMarkerEnabled = TRUE; settings->BitmapCacheV3Enabled = FALSE; diff --git a/libfreerdp/primitives/prim_YCoCg.c b/libfreerdp/primitives/prim_YCoCg.c index 3e7505676..ca6484795 100644 --- a/libfreerdp/primitives/prim_YCoCg.c +++ b/libfreerdp/primitives/prim_YCoCg.c @@ -33,7 +33,7 @@ #endif /* !MINMAX */ /* ------------------------------------------------------------------------- */ -pstatus_t general_YCoCgRToRGB_8u_AC4R( +pstatus_t general_YCoCgToRGB_8u_AC4R( const BYTE *pSrc, INT32 srcStep, BYTE *pDst, INT32 dstStep, UINT32 width, UINT32 height, @@ -41,75 +41,85 @@ pstatus_t general_YCoCgRToRGB_8u_AC4R( BOOL withAlpha, BOOL invert) { - const BYTE *sptr = pSrc; + BYTE A; + int x, y; BYTE *dptr = pDst; + const BYTE *sptr = pSrc; + INT16 Cg, Co, Y, T, R, G, B; int cll = shift - 1; /* -1 builds in the /2's */ - int x,y; - int srcRowBump = srcStep - width*sizeof(UINT32); - int dstRowBump = dstStep - width*sizeof(UINT32); + int srcPad = srcStep - (width * 4); + int dstPad = 
dstStep - (width * 4); + if (invert) { - for (y=0; yINT16 */ - a = *sptr++; - if (!withAlpha) a = 0xFFU; - t = y - cg; - r = t + co; - g = y + cg; - b = t - co; - *dptr++ = (BYTE) MINMAX(r, 0, 255); - *dptr++ = (BYTE) MINMAX(g, 0, 255); - *dptr++ = (BYTE) MINMAX(b, 0, 255); - *dptr++ = a; + Cg = (INT16) ((INT8) ((*sptr++) << cll)); + Co = (INT16) ((INT8) ((*sptr++) << cll)); + Y = (INT16) (*sptr++); /* UINT8->INT16 */ + + A = *sptr++; + + if (!withAlpha) + A = 0xFFU; + + T = Y - Cg; + R = T + Co; + G = Y + Cg; + B = T - Co; + + *dptr++ = (BYTE) MINMAX(R, 0, 255); + *dptr++ = (BYTE) MINMAX(G, 0, 255); + *dptr++ = (BYTE) MINMAX(B, 0, 255); + *dptr++ = A; } - sptr += srcRowBump; - dptr += dstRowBump; + + sptr += srcPad; + dptr += dstPad; } } else { - for (y=0; yINT16 */ - a = *sptr++; - if (!withAlpha) a = 0xFFU; - t = y - cg; - r = t + co; - g = y + cg; - b = t - co; - *dptr++ = (BYTE) MINMAX(b, 0, 255); - *dptr++ = (BYTE) MINMAX(g, 0, 255); - *dptr++ = (BYTE) MINMAX(r, 0, 255); - *dptr++ = a; + Cg = (INT16) ((INT8) ((*sptr++) << cll)); + Co = (INT16) ((INT8) ((*sptr++) << cll)); + Y = (INT16) (*sptr++); /* UINT8->INT16 */ + + A = *sptr++; + + if (!withAlpha) + A = 0xFFU; + + T = Y - Cg; + R = T + Co; + G = Y + Cg; + B = T - Co; + + *dptr++ = (BYTE) MINMAX(B, 0, 255); + *dptr++ = (BYTE) MINMAX(G, 0, 255); + *dptr++ = (BYTE) MINMAX(R, 0, 255); + *dptr++ = A; } - sptr += srcRowBump; - dptr += dstRowBump; + + sptr += srcPad; + dptr += dstPad; } } + return PRIMITIVES_SUCCESS; } /* ------------------------------------------------------------------------- */ void primitives_init_YCoCg(primitives_t* prims) { - prims->YCoCgRToRGB_8u_AC4R = general_YCoCgRToRGB_8u_AC4R; + prims->YCoCgToRGB_8u_AC4R = general_YCoCgToRGB_8u_AC4R; primitives_init_YCoCg_opt(prims); } diff --git a/libfreerdp/primitives/prim_YCoCg.h b/libfreerdp/primitives/prim_YCoCg.h index aa3929aff..c03715bda 100644 --- a/libfreerdp/primitives/prim_YCoCg.h +++ b/libfreerdp/primitives/prim_YCoCg.h @@ -24,7 +24,7 @@ 
#ifndef __PRIM_YCOCG_H_INCLUDED__ #define __PRIM_YCOCG_H_INCLUDED__ -pstatus_t general_YCoCgRToRGB_8u_AC4R(const BYTE *pSrc, INT32 srcStep, BYTE *pDst, INT32 dstStep, UINT32 width, UINT32 height, UINT8 shift, BOOL withAlpha, BOOL invert); +pstatus_t general_YCoCgToRGB_8u_AC4R(const BYTE *pSrc, INT32 srcStep, BYTE *pDst, INT32 dstStep, UINT32 width, UINT32 height, UINT8 shift, BOOL withAlpha, BOOL invert); void primitives_init_YCoCg_opt(primitives_t* prims); diff --git a/libfreerdp/primitives/prim_YCoCg_opt.c b/libfreerdp/primitives/prim_YCoCg_opt.c index 51fce1fc3..e022662b3 100644 --- a/libfreerdp/primitives/prim_YCoCg_opt.c +++ b/libfreerdp/primitives/prim_YCoCg_opt.c @@ -69,7 +69,7 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert( if ((width < 8) || (ULONG_PTR) dptr & 0x03) { /* Too small, or we'll never hit a 16-byte boundary. Punt. */ - return general_YCoCgRToRGB_8u_AC4R(pSrc, srcStep, + return general_YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, dstStep, width, height, shift, withAlpha, TRUE); } @@ -83,7 +83,7 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert( { int startup = (16 - ((ULONG_PTR) dptr & 0x0f)) / 4; if (startup > width) startup = width; - general_YCoCgRToRGB_8u_AC4R(sptr, srcStep, dptr, dstStep, + general_YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, dstStep, startup, 1, shift, withAlpha, TRUE); sptr += startup * sizeof(UINT32); dptr += startup * sizeof(UINT32); @@ -185,7 +185,7 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert( /* Handle any remainder pixels. */ if (w > 0) { - general_YCoCgRToRGB_8u_AC4R(sptr, srcStep, dptr, dstStep, + general_YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, dstStep, w, 1, shift, withAlpha, TRUE); sptr += w * sizeof(UINT32); dptr += w * sizeof(UINT32); @@ -228,7 +228,7 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert( if ((width < 8) || (ULONG_PTR) dptr & 0x03) { /* Too small, or we'll never hit a 16-byte boundary. Punt. 
*/ - return general_YCoCgRToRGB_8u_AC4R(pSrc, srcStep, + return general_YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, dstStep, width, height, shift, withAlpha, FALSE); } @@ -242,7 +242,7 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert( { int startup = (16 - ((ULONG_PTR) dptr & 0x0f)) / 4; if (startup > width) startup = width; - general_YCoCgRToRGB_8u_AC4R(sptr, srcStep, dptr, dstStep, + general_YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, dstStep, startup, 1, shift, withAlpha, FALSE); sptr += startup * sizeof(UINT32); dptr += startup * sizeof(UINT32); @@ -348,7 +348,7 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert( /* Handle any remainder pixels. */ if (w > 0) { - general_YCoCgRToRGB_8u_AC4R(sptr, srcStep, dptr, dstStep, + general_YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, dstStep, w, 1, shift, withAlpha, FALSE); sptr += w * sizeof(UINT32); dptr += w * sizeof(UINT32); @@ -393,7 +393,7 @@ void primitives_init_YCoCg_opt(primitives_t* prims) if (IsProcessorFeaturePresentEx(PF_EX_SSSE3) && IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) { - prims->YCoCgRToRGB_8u_AC4R = ssse3_YCoCgRToRGB_8u_AC4R; + prims->YCoCgToRGB_8u_AC4R = ssse3_YCoCgRToRGB_8u_AC4R; } #endif /* WITH_SSE2 */ } diff --git a/libfreerdp/primitives/test/TestPrimitivesYCoCg.c b/libfreerdp/primitives/test/TestPrimitivesYCoCg.c index d6f4d4289..c280b5be3 100644 --- a/libfreerdp/primitives/test/TestPrimitivesYCoCg.c +++ b/libfreerdp/primitives/test/TestPrimitivesYCoCg.c @@ -28,7 +28,7 @@ static const float TEST_TIME = 4.0; extern BOOL g_TestPrimitivesPerformance; -extern pstatus_t general_YCoCgRToRGB_8u_AC4R(const BYTE *pSrc, INT32 srcStep, +extern pstatus_t general_YCoCgToRGB_8u_AC4R(const BYTE *pSrc, INT32 srcStep, BYTE *pDst, INT32 dstStep, UINT32 width, UINT32 height, UINT8 shift, BOOL withAlpha, BOOL invert); extern pstatus_t ssse3_YCoCgRToRGB_8u_AC4R(const BYTE *pSrc, INT32 srcStep, @@ -48,9 +48,9 @@ int test_YCoCgRToRGB_8u_AC4R_func(void) testStr[0] = '\0'; get_random_data(in, 
sizeof(in)); - general_YCoCgRToRGB_8u_AC4R((const BYTE *) (in+1), 63*4, + general_YCoCgToRGB_8u_AC4R((const BYTE *) (in+1), 63*4, (BYTE *) out_c, 63*4, 63, 61, 2, TRUE, FALSE); - general_YCoCgRToRGB_8u_AC4R((const BYTE *) (in+1), 63*4, + general_YCoCgToRGB_8u_AC4R((const BYTE *) (in+1), 63*4, (BYTE *) out_c_inv, 63*4, 63, 61, 2, TRUE, TRUE); #ifdef WITH_SSE2 if (IsProcessorFeaturePresentEx(PF_EX_SSSE3)) @@ -86,7 +86,7 @@ int test_YCoCgRToRGB_8u_AC4R_func(void) /* ------------------------------------------------------------------------- */ STD_SPEED_TEST( ycocg_to_rgb_speed, const BYTE, BYTE, PRIM_NOP, - TRUE, general_YCoCgRToRGB_8u_AC4R(src1, 64*4, dst, 64*4, 64, 64, 2, FALSE, FALSE), + TRUE, general_YCoCgToRGB_8u_AC4R(src1, 64*4, dst, 64*4, 64, 64, 2, FALSE, FALSE), #ifdef WITH_SSE2 TRUE, ssse3_YCoCgRToRGB_8u_AC4R(src1, 64*4, dst, 64*4, 64, 64, 2, FALSE, FALSE), PF_EX_SSSE3, TRUE, From 5b8fb70e8cc9d3c132d1802a02ef0c6c048956c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Moreau?= Date: Sun, 7 Sep 2014 14:08:29 -0400 Subject: [PATCH 16/31] libfreerdp-codec: simplify and optimize planar raw rgb decoding --- libfreerdp/codec/planar.c | 233 ++++++++++++++++++++++++++++++++++---- 1 file changed, 209 insertions(+), 24 deletions(-) diff --git a/libfreerdp/codec/planar.c b/libfreerdp/codec/planar.c index 5a3e35e6a..d6727f610 100644 --- a/libfreerdp/codec/planar.c +++ b/libfreerdp/codec/planar.c @@ -24,9 +24,9 @@ #include #include +#include #include #include -#include #include "planar.h" @@ -204,15 +204,78 @@ static int planar_decompress_plane_raw(BYTE* pSrcData, UINT32 SrcSize, BYTE* pDs return (int) (srcp - pSrcData); } +static int planar_decompress_rgb_planes_raw(const BYTE* pSrcData[4], int nSrcStep, BYTE* pDstData, + int nDstStep, int nXDst, int nYDst, int nWidth, int nHeight, BOOL alpha, BOOL vFlip) +{ + int x, y; + int beg, end, inc; + BYTE* pRGB = pDstData; + const BYTE* pR = pSrcData[0]; + const BYTE* pG = pSrcData[1]; + const BYTE* pB = pSrcData[2]; 
+ const BYTE* pA = pSrcData[3]; + + if (vFlip) + { + beg = nHeight - 1; + end = -1; + inc = -1; + } + else + { + beg = 0; + end = nHeight; + inc = 1; + } + + if (alpha) + { + for (y = beg; y != end; y += inc) + { + pRGB = &pDstData[((nYDst + y) * nDstStep) + (nXDst * 4)]; + + for (x = 0; x < nWidth; x++) + { + *pRGB++ = *pB++; + *pRGB++ = *pG++; + *pRGB++ = *pR++; + *pRGB++ = *pA++; + } + } + } + else + { + for (y = beg; y != end; y += inc) + { + pRGB = &pDstData[((nYDst + y) * nDstStep) + (nXDst * 4)]; + + for (x = 0; x < nWidth; x++) + { + *pRGB++ = *pB++; + *pRGB++ = *pG++; + *pRGB++ = *pR++; + *pRGB++ = 0xFF; + } + } + } + + return 1; +} + int planar_decompress(BITMAP_PLANAR_CONTEXT* planar, BYTE* pSrcData, UINT32 SrcSize, BYTE** ppDstData, DWORD DstFormat, int nDstStep, int nXDst, int nYDst, int nWidth, int nHeight) { + BOOL cs; + BOOL rle; + UINT32 cll; + BOOL alpha; int status; BYTE* srcp; BOOL vFlip; BYTE FormatHeader; BYTE* pDstData = NULL; UINT32 UncompressedSize; + const primitives_t* prims = primitives_get(); if ((nWidth * nHeight) <= 0) return -1; @@ -237,11 +300,142 @@ int planar_decompress(BITMAP_PLANAR_CONTEXT* planar, BYTE* pSrcData, UINT32 SrcS FormatHeader = *srcp; srcp++; + cll = (FormatHeader & PLANAR_FORMAT_HEADER_CLL_MASK); + cs = (FormatHeader & PLANAR_FORMAT_HEADER_CS) ? TRUE : FALSE; + rle = (FormatHeader & PLANAR_FORMAT_HEADER_RLE) ? TRUE : FALSE; + alpha = (FormatHeader & PLANAR_FORMAT_HEADER_NA) ? 
FALSE : TRUE; + + //printf("CLL: %d CS: %d RLE: %d ALPHA: %d\n", cll, cs, rle, alpha); + + if (!cll) /* RGB */ + { + if (!rle) /* RAW */ + { + int planeSize; + const BYTE* planes[4]; + + planeSize = nWidth * nHeight; + + if (alpha) + { + if ((SrcSize - (srcp - pSrcData)) < (planeSize * 4)) + return -1; + + planes[3] = &srcp[planeSize * 0]; /* AlphaPlane */ + planes[0] = &srcp[planeSize * 1]; /* RedPlane */ + planes[1] = &srcp[planeSize * 2]; /* GreenPlane */ + planes[2] = &srcp[planeSize * 3]; /* BluePlane */ + + planar_decompress_rgb_planes_raw(planes, nWidth, pDstData, nDstStep, + nXDst, nYDst, nWidth, nHeight, alpha, vFlip); + + srcp += (planeSize * 4); + srcp++; /* pad */ + } + else /* NoAlpha */ + { + if ((SrcSize - (srcp - pSrcData)) < (planeSize * 3)) + return -1; + + planes[0] = &srcp[planeSize * 0]; /* RedPlane */ + planes[1] = &srcp[planeSize * 1]; /* GreenPlane */ + planes[2] = &srcp[planeSize * 2]; /* BluePlane */ + + planar_decompress_rgb_planes_raw(planes, nWidth, pDstData, nDstStep, + nXDst, nYDst, nWidth, nHeight, alpha, vFlip); + + srcp += (planeSize * 3); + srcp++; /* pad */ + } + } + else /* RLE */ + { + if (alpha) + { + /* AlphaPlane */ + + status = planar_decompress_plane_rle(srcp, SrcSize - (srcp - pSrcData), + pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, 3, vFlip); + + if (status < 0) + return -1; + + srcp += status; + + /* RedPlane */ + + status = planar_decompress_plane_rle(srcp, SrcSize - (srcp - pSrcData), + pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, 2, vFlip); + + if (status < 0) + return -1; + + srcp += status; + + /* GreenPlane */ + + status = planar_decompress_plane_rle(srcp, SrcSize - (srcp - pSrcData), + pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, 1, vFlip); + + if (status < 0) + return -1; + + srcp += status; + + /* BluePlane */ + + status = planar_decompress_plane_rle(srcp, SrcSize - (srcp - pSrcData), + pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, 0, vFlip); + + if (status < 0) + return -1; + + srcp += 
status; + } + else /* NoAlpha */ + { + /* RedPlane */ + + status = planar_decompress_plane_rle(srcp, SrcSize - (srcp - pSrcData), + pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, 2, vFlip); + + if (status < 0) + return -1; + + srcp += status; + + /* GreenPlane */ + + status = planar_decompress_plane_rle(srcp, SrcSize - (srcp - pSrcData), + pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, 1, vFlip); + + if (status < 0) + return -1; + + srcp += status; + + /* BluePlane */ + + status = planar_decompress_plane_rle(srcp, SrcSize - (srcp - pSrcData), + pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, 0, vFlip); + + if (status < 0) + return -1; + + srcp += status; + } + } + + status = (SrcSize == (srcp - pSrcData)) ? 1 : -1; + + return status; + } + /* AlphaPlane */ - if (!(FormatHeader & PLANAR_FORMAT_HEADER_NA)) + if (alpha) { - if (FormatHeader & PLANAR_FORMAT_HEADER_RLE) + if (rle) { status = planar_decompress_plane_rle(srcp, SrcSize - (srcp - pSrcData), pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, 3, vFlip); @@ -263,9 +457,9 @@ int planar_decompress(BITMAP_PLANAR_CONTEXT* planar, BYTE* pSrcData, UINT32 SrcS } } - if (FormatHeader & PLANAR_FORMAT_HEADER_RLE) + if (rle) { - /* LumaOrRedPlane */ + /* LumaPlane */ status = planar_decompress_plane_rle(srcp, SrcSize - (srcp - pSrcData), pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, 2, vFlip); @@ -275,7 +469,7 @@ int planar_decompress(BITMAP_PLANAR_CONTEXT* planar, BYTE* pSrcData, UINT32 SrcS srcp += status; - /* OrangeChromaOrGreenPlane */ + /* OrangeChromaPlane */ status = planar_decompress_plane_rle(srcp, SrcSize - (srcp - pSrcData), pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, 1, vFlip); @@ -285,7 +479,7 @@ int planar_decompress(BITMAP_PLANAR_CONTEXT* planar, BYTE* pSrcData, UINT32 SrcS srcp += status; - /* GreenChromeOrBluePlane */ + /* GreenChromaPlane */ status = planar_decompress_plane_rle(srcp, SrcSize - (srcp - pSrcData), pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, 0, vFlip); @@ 
-297,7 +491,7 @@ int planar_decompress(BITMAP_PLANAR_CONTEXT* planar, BYTE* pSrcData, UINT32 SrcS } else { - /* LumaOrRedPlane */ + /* LumaPlane */ status = planar_decompress_plane_raw(srcp, SrcSize - (srcp - pSrcData), pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, 2, vFlip); @@ -307,7 +501,7 @@ int planar_decompress(BITMAP_PLANAR_CONTEXT* planar, BYTE* pSrcData, UINT32 SrcS srcp += status; - /* OrangeChromaOrGreenPlane */ + /* OrangeChromaPlane */ status = planar_decompress_plane_raw(srcp, SrcSize - (srcp - pSrcData), pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, 1, vFlip); @@ -317,7 +511,7 @@ int planar_decompress(BITMAP_PLANAR_CONTEXT* planar, BYTE* pSrcData, UINT32 SrcS srcp += status; - /* GreenChromeOrBluePlane */ + /* GreenChromaPlane */ status = planar_decompress_plane_raw(srcp, SrcSize - (srcp - pSrcData), pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, 0, vFlip); @@ -329,26 +523,17 @@ int planar_decompress(BITMAP_PLANAR_CONTEXT* planar, BYTE* pSrcData, UINT32 SrcS srcp++; } - if (FormatHeader & PLANAR_FORMAT_HEADER_CLL_MASK) + if (cll) { /* The data is in YCoCg colorspace rather than RGB. */ - if (FormatHeader & PLANAR_FORMAT_HEADER_CS) + if (cs) { - static BOOL been_warned = FALSE; - if (!been_warned) - DEBUG_WARN("Chroma-Subsampling is not implemented.\n"); - been_warned = TRUE; + DEBUG_WARN("Chroma-Subsampling is not implemented"); } else { - BOOL alpha; - int cll; - - alpha = (FormatHeader & PLANAR_FORMAT_HEADER_NA) ? 
FALSE : TRUE; - cll = FormatHeader & PLANAR_FORMAT_HEADER_CLL_MASK; - primitives_get()->YCoCgToRGB_8u_AC4R( - pDstData, nDstStep, pDstData, nDstStep, - nWidth, nHeight, cll, alpha, FALSE); + prims->YCoCgToRGB_8u_AC4R(pDstData, nDstStep, + pDstData, nDstStep, nWidth, nHeight, cll, alpha, FALSE); } } From ad9092baf957037e8d6d5a0ac9d26cba1f32e864 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Moreau?= Date: Sun, 7 Sep 2014 15:40:36 -0400 Subject: [PATCH 17/31] libfreerdp-codec: cleanup and restructure planar decompressor for chroma subsampling --- libfreerdp/codec/planar.c | 500 ++++++++++++++++++++------------------ 1 file changed, 269 insertions(+), 231 deletions(-) diff --git a/libfreerdp/codec/planar.c b/libfreerdp/codec/planar.c index d6727f610..37ce3ed7e 100644 --- a/libfreerdp/codec/planar.c +++ b/libfreerdp/codec/planar.c @@ -30,11 +30,60 @@ #include "planar.h" -static int planar_decompress_plane_rle(BYTE* pSrcData, UINT32 SrcSize, BYTE* pDstData, +static int planar_skip_plane_rle(const BYTE* pSrcData, UINT32 SrcSize, int nWidth, int nHeight) +{ + int x, y; + int cRawBytes; + int nRunLength; + BYTE controlByte; + const BYTE* pRLE = pSrcData; + const BYTE* pEnd = &pSrcData[SrcSize]; + + for (y = 0; y < nHeight; y++) + { + for (x = 0; x < nWidth; ) + { + if (pRLE >= pEnd) + return -1; + + controlByte = *pRLE++; + + nRunLength = PLANAR_CONTROL_BYTE_RUN_LENGTH(controlByte); + cRawBytes = PLANAR_CONTROL_BYTE_RAW_BYTES(controlByte); + + if (nRunLength == 1) + { + nRunLength = cRawBytes + 16; + cRawBytes = 0; + } + else if (nRunLength == 2) + { + nRunLength = cRawBytes + 32; + cRawBytes = 0; + } + + pRLE += cRawBytes; + x += cRawBytes; + cRawBytes = 0; + + x += nRunLength; + nRunLength = 0; + + if (x > nWidth) + return -1; + + if (pRLE > pEnd) + return -1; + } + } + + return (int) (pRLE - pSrcData); +} + +static int planar_decompress_plane_rle(const BYTE* pSrcData, UINT32 SrcSize, BYTE* pDstData, int nDstStep, int nXDst, int nYDst, int nWidth, int 
nHeight, int nChannel, BOOL vFlip) { int x, y; - BYTE* srcp; BYTE* dstp; UINT32 pixel; int cRawBytes; @@ -44,8 +93,8 @@ static int planar_decompress_plane_rle(BYTE* pSrcData, UINT32 SrcSize, BYTE* pDs BYTE controlByte; BYTE* currentScanline; BYTE* previousScanline; + const BYTE* srcp = pSrcData; - srcp = pSrcData; dstp = pDstData; previousScanline = NULL; @@ -168,43 +217,7 @@ static int planar_decompress_plane_rle(BYTE* pSrcData, UINT32 SrcSize, BYTE* pDs return (int) (srcp - pSrcData); } -static int planar_decompress_plane_raw(BYTE* pSrcData, UINT32 SrcSize, BYTE* pDstData, - int nDstStep, int nXDst, int nYDst, int nWidth, int nHeight, int nChannel, BOOL vFlip) -{ - int x, y; - int beg, end, inc; - BYTE* dstp = NULL; - BYTE* srcp = pSrcData; - - if (vFlip) - { - beg = nHeight - 1; - end = -1; - inc = -1; - } - else - { - beg = 0; - end = nHeight; - inc = 1; - } - - for (y = beg; y != end; y += inc) - { - dstp = &pDstData[((nYDst + y) * nDstStep) + (nXDst * 4) + nChannel]; - - for (x = 0; x < nWidth; x++) - { - *dstp = *srcp; - dstp += 4; - srcp++; - } - } - - return (int) (srcp - pSrcData); -} - -static int planar_decompress_rgb_planes_raw(const BYTE* pSrcData[4], int nSrcStep, BYTE* pDstData, +static int planar_decompress_planes_raw(const BYTE* pSrcData[4], int nSrcStep, BYTE* pDstData, int nDstStep, int nXDst, int nYDst, int nWidth, int nHeight, BOOL alpha, BOOL vFlip) { int x, y; @@ -272,8 +285,17 @@ int planar_decompress(BITMAP_PLANAR_CONTEXT* planar, BYTE* pSrcData, UINT32 SrcS int status; BYTE* srcp; BOOL vFlip; + int subSize; + int subWidth; + int subHeight; + int planeSize; + BYTE* pDstData; + int rleSizes[4]; + int rawSizes[4]; + int rawWidths[4]; + int rawHeights[4]; BYTE FormatHeader; - BYTE* pDstData = NULL; + const BYTE* planes[4]; UINT32 UncompressedSize; const primitives_t* prims = primitives_get(); @@ -297,8 +319,7 @@ int planar_decompress(BITMAP_PLANAR_CONTEXT* planar, BYTE* pSrcData, UINT32 SrcS *ppDstData = pDstData; } - FormatHeader = *srcp; - 
srcp++; + FormatHeader = *srcp++; cll = (FormatHeader & PLANAR_FORMAT_HEADER_CLL_MASK); cs = (FormatHeader & PLANAR_FORMAT_HEADER_CS) ? TRUE : FALSE; @@ -307,234 +328,251 @@ int planar_decompress(BITMAP_PLANAR_CONTEXT* planar, BYTE* pSrcData, UINT32 SrcS //printf("CLL: %d CS: %d RLE: %d ALPHA: %d\n", cll, cs, rle, alpha); + if (!cll && cs) + return -1; /* Chroma subsampling requires YCoCg */ + + subWidth = (nWidth / 2) + (nWidth % 2); + subHeight = (nHeight / 2) + (nHeight % 2); + + planeSize = nWidth * nHeight; + subSize = subWidth * subHeight; + + if (!cs) + { + rawSizes[0] = planeSize; /* LumaOrRedPlane */ + rawWidths[0] = nWidth; + rawHeights[0] = nHeight; + + rawSizes[1] = planeSize; /* OrangeChromaOrGreenPlane */ + rawWidths[1] = nWidth; + rawHeights[1] = nHeight; + + rawSizes[2] = planeSize; /* GreenChromaOrBluePlane */ + rawWidths[2] = nWidth; + rawHeights[2] = nHeight; + + rawSizes[3] = planeSize; /* AlphaPlane */ + rawWidths[3] = nWidth; + rawHeights[3] = nHeight; + } + else /* Chroma Subsampling */ + { + rawSizes[0] = planeSize; /* LumaOrRedPlane */ + rawWidths[0] = nWidth; + rawHeights[0] = nHeight; + + rawSizes[1] = subSize; /* OrangeChromaOrGreenPlane */ + rawWidths[1] = subWidth; + rawHeights[1] = subHeight; + + rawSizes[2] = subSize; /* GreenChromaOrBluePlane */ + rawWidths[2] = subWidth; + rawHeights[2] = subHeight; + + rawSizes[3] = planeSize; /* AlphaPlane */ + rawWidths[3] = nWidth; + rawHeights[3] = nHeight; + } + + if (!rle) /* RAW */ + { + if (alpha) + { + planes[3] = srcp; /* AlphaPlane */ + planes[0] = planes[3] + rawSizes[3]; /* LumaOrRedPlane */ + planes[1] = planes[0] + rawSizes[0]; /* OrangeChromaOrGreenPlane */ + planes[2] = planes[1] + rawSizes[1]; /* GreenChromaOrBluePlane */ + + if ((planes[2] + rawSizes[2]) > &pSrcData[SrcSize]) + return -1; + } + else + { + if ((SrcSize - (srcp - pSrcData)) < (planeSize * 3)) + return -1; + + planes[0] = srcp; /* LumaOrRedPlane */ + planes[1] = planes[0] + rawSizes[0]; /* OrangeChromaOrGreenPlane 
*/ + planes[2] = planes[1] + rawSizes[1]; /* GreenChromaOrBluePlane */ + + if ((planes[2] + rawSizes[2]) > &pSrcData[SrcSize]) + return -1; + } + } + else /* RLE */ + { + if (alpha) + { + planes[3] = srcp; + rleSizes[3] = planar_skip_plane_rle(planes[3], SrcSize - (planes[3] - pSrcData), + rawWidths[3], rawHeights[3]); /* AlphaPlane */ + + if (rleSizes[3] < 0) + return -1; + + planes[0] = planes[3] + rleSizes[3]; + rleSizes[0] = planar_skip_plane_rle(planes[0], SrcSize - (planes[0] - pSrcData), + rawWidths[0], rawHeights[0]); /* RedPlane */ + + if (rleSizes[0] < 0) + return -1; + + planes[1] = planes[0] + rleSizes[0]; + rleSizes[1] = planar_skip_plane_rle(planes[1], SrcSize - (planes[1] - pSrcData), + rawWidths[1], rawHeights[1]); /* GreenPlane */ + + if (rleSizes[1] < 1) + return -1; + + planes[2] = planes[1] + rleSizes[1]; + rleSizes[2] = planar_skip_plane_rle(planes[2], SrcSize - (planes[2] - pSrcData), + rawWidths[2], rawHeights[2]); /* BluePlane */ + + if (rleSizes[2] < 1) + return -1; + } + else + { + planes[0] = srcp; + rleSizes[0] = planar_skip_plane_rle(planes[0], SrcSize - (planes[0] - pSrcData), + rawWidths[0], rawHeights[0]); /* RedPlane */ + + if (rleSizes[0] < 0) + return -1; + + planes[1] = planes[0] + rleSizes[0]; + rleSizes[1] = planar_skip_plane_rle(planes[1], SrcSize - (planes[1] - pSrcData), + rawWidths[1], rawHeights[1]); /* GreenPlane */ + + if (rleSizes[1] < 1) + return -1; + + planes[2] = planes[1] + rleSizes[1]; + rleSizes[2] = planar_skip_plane_rle(planes[2], SrcSize - (planes[2] - pSrcData), + rawWidths[2], rawHeights[2]); /* BluePlane */ + + if (rleSizes[2] < 1) + return -1; + } + } + if (!cll) /* RGB */ { if (!rle) /* RAW */ { - int planeSize; - const BYTE* planes[4]; - - planeSize = nWidth * nHeight; - if (alpha) { - if ((SrcSize - (srcp - pSrcData)) < (planeSize * 4)) - return -1; - - planes[3] = &srcp[planeSize * 0]; /* AlphaPlane */ - planes[0] = &srcp[planeSize * 1]; /* RedPlane */ - planes[1] = &srcp[planeSize * 2]; /* GreenPlane 
*/ - planes[2] = &srcp[planeSize * 3]; /* BluePlane */ - - planar_decompress_rgb_planes_raw(planes, nWidth, pDstData, nDstStep, + planar_decompress_planes_raw(planes, nWidth, pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, alpha, vFlip); - srcp += (planeSize * 4); - srcp++; /* pad */ + srcp += rawSizes[0] + rawSizes[1] + rawSizes[2] + rawSizes[3]; } else /* NoAlpha */ { - if ((SrcSize - (srcp - pSrcData)) < (planeSize * 3)) - return -1; - - planes[0] = &srcp[planeSize * 0]; /* RedPlane */ - planes[1] = &srcp[planeSize * 1]; /* GreenPlane */ - planes[2] = &srcp[planeSize * 2]; /* BluePlane */ - - planar_decompress_rgb_planes_raw(planes, nWidth, pDstData, nDstStep, + planar_decompress_planes_raw(planes, nWidth, pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, alpha, vFlip); - srcp += (planeSize * 3); - srcp++; /* pad */ + srcp += rawSizes[0] + rawSizes[1] + rawSizes[2]; } + + if ((SrcSize - (srcp - pSrcData)) == 1) + srcp++; /* pad */ } else /* RLE */ { if (alpha) { - /* AlphaPlane */ + status = planar_decompress_plane_rle(planes[3], rleSizes[3], + pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, 3, vFlip); /* AlphaPlane */ - status = planar_decompress_plane_rle(srcp, SrcSize - (srcp - pSrcData), - pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, 3, vFlip); + status = planar_decompress_plane_rle(planes[0], rleSizes[0], + pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, 2, vFlip); /* RedPlane */ - if (status < 0) - return -1; + status = planar_decompress_plane_rle(planes[1], rleSizes[1], + pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, 1, vFlip); /* GreenPlane */ - srcp += status; + status = planar_decompress_plane_rle(planes[2], rleSizes[2], + pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, 0, vFlip); /* BluePlane */ - /* RedPlane */ - - status = planar_decompress_plane_rle(srcp, SrcSize - (srcp - pSrcData), - pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, 2, vFlip); - - if (status < 0) - return -1; - - srcp += status; - - /* GreenPlane */ - - 
status = planar_decompress_plane_rle(srcp, SrcSize - (srcp - pSrcData), - pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, 1, vFlip); - - if (status < 0) - return -1; - - srcp += status; - - /* BluePlane */ - - status = planar_decompress_plane_rle(srcp, SrcSize - (srcp - pSrcData), - pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, 0, vFlip); - - if (status < 0) - return -1; - - srcp += status; + srcp += rleSizes[0] + rleSizes[1] + rleSizes[2] + rleSizes[3]; } else /* NoAlpha */ { - /* RedPlane */ + status = planar_decompress_plane_rle(planes[0], rleSizes[0], + pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, 2, vFlip); /* RedPlane */ - status = planar_decompress_plane_rle(srcp, SrcSize - (srcp - pSrcData), - pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, 2, vFlip); + status = planar_decompress_plane_rle(planes[1], rleSizes[1], + pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, 1, vFlip); /* GreenPlane */ - if (status < 0) - return -1; + status = planar_decompress_plane_rle(planes[2], rleSizes[2], + pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, 0, vFlip); /* BluePlane */ - srcp += status; + srcp += rleSizes[0] + rleSizes[1] + rleSizes[2]; + } + } + } + else /* YCoCg */ + { + if (cs) + { + fprintf(stderr, "Chroma subsampling unimplemented\n"); + return -1; + } - /* GreenPlane */ + if (!rle) /* RAW */ + { + if (alpha) + { + planar_decompress_planes_raw(planes, nWidth, pDstData, nDstStep, + nXDst, nYDst, nWidth, nHeight, alpha, vFlip); - status = planar_decompress_plane_rle(srcp, SrcSize - (srcp - pSrcData), - pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, 1, vFlip); + srcp += rawSizes[0] + rawSizes[1] + rawSizes[2] + rawSizes[3]; + } + else /* NoAlpha */ + { + planar_decompress_planes_raw(planes, nWidth, pDstData, nDstStep, + nXDst, nYDst, nWidth, nHeight, alpha, vFlip); - if (status < 0) - return -1; + srcp += rawSizes[0] + rawSizes[1] + rawSizes[2]; + } - srcp += status; + if ((SrcSize - (srcp - pSrcData)) == 1) + srcp++; /* pad */ + } + else 
/* RLE */ + { + if (alpha) + { + status = planar_decompress_plane_rle(planes[3], rleSizes[3], + pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, 3, vFlip); /* AlphaPlane */ - /* BluePlane */ + status = planar_decompress_plane_rle(planes[0], rleSizes[0], + pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, 2, vFlip); /* LumaPlane */ - status = planar_decompress_plane_rle(srcp, SrcSize - (srcp - pSrcData), - pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, 0, vFlip); + status = planar_decompress_plane_rle(planes[1], rleSizes[1], + pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, 1, vFlip); /* OrangeChromaPlane */ - if (status < 0) - return -1; + status = planar_decompress_plane_rle(planes[2], rleSizes[2], + pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, 0, vFlip); /* GreenChromaPlane */ - srcp += status; + srcp += rleSizes[0] + rleSizes[1] + rleSizes[2] + rleSizes[3]; + } + else /* NoAlpha */ + { + status = planar_decompress_plane_rle(planes[0], rleSizes[0], + pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, 2, vFlip); /* LumaPlane */ + + status = planar_decompress_plane_rle(planes[1], rleSizes[1], + pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, 1, vFlip); /* OrangeChromaPlane */ + + status = planar_decompress_plane_rle(planes[2], rleSizes[2], + pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, 0, vFlip); /* GreenChromaPlane */ + + srcp += rleSizes[0] + rleSizes[1] + rleSizes[2]; } } - status = (SrcSize == (srcp - pSrcData)) ? 
1 : -1; - - return status; - } - - /* AlphaPlane */ - - if (alpha) - { - if (rle) - { - status = planar_decompress_plane_rle(srcp, SrcSize - (srcp - pSrcData), - pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, 3, vFlip); - - if (status < 0) - return -1; - - srcp += status; - } - else - { - status = planar_decompress_plane_raw(srcp, SrcSize - (srcp - pSrcData), - pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, 3, vFlip); - - if (status < 0) - return -1; - - srcp += status; - } - } - - if (rle) - { - /* LumaPlane */ - - status = planar_decompress_plane_rle(srcp, SrcSize - (srcp - pSrcData), - pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, 2, vFlip); - - if (status < 0) - return -1; - - srcp += status; - - /* OrangeChromaPlane */ - - status = planar_decompress_plane_rle(srcp, SrcSize - (srcp - pSrcData), - pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, 1, vFlip); - - if (status < 0) - return -1; - - srcp += status; - - /* GreenChromaPlane */ - - status = planar_decompress_plane_rle(srcp, SrcSize - (srcp - pSrcData), - pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, 0, vFlip); - - if (status < 0) - return -1; - - srcp += status; - } - else - { - /* LumaPlane */ - - status = planar_decompress_plane_raw(srcp, SrcSize - (srcp - pSrcData), - pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, 2, vFlip); - - if (status < 0) - return -1; - - srcp += status; - - /* OrangeChromaPlane */ - - status = planar_decompress_plane_raw(srcp, SrcSize - (srcp - pSrcData), - pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, 1, vFlip); - - if (status < 0) - return -1; - - srcp += status; - - /* GreenChromaPlane */ - - status = planar_decompress_plane_raw(srcp, SrcSize - (srcp - pSrcData), - pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, 0, vFlip); - - if (status < 0) - return -1; - - srcp += status; - srcp++; - } - - if (cll) - { - /* The data is in YCoCg colorspace rather than RGB. 
*/ - if (cs) - { - DEBUG_WARN("Chroma-Subsampling is not implemented"); - } - else - { - prims->YCoCgToRGB_8u_AC4R(pDstData, nDstStep, - pDstData, nDstStep, nWidth, nHeight, cll, alpha, FALSE); - } + prims->YCoCgToRGB_8u_AC4R(pDstData, nDstStep, pDstData, nDstStep, nWidth, nHeight, cll, alpha, FALSE); } status = (SrcSize == (srcp - pSrcData)) ? 1 : -1; From fee370e4b2a9327b14be3cfb24a24e70a1567544 Mon Sep 17 00:00:00 2001 From: erbth Date: Mon, 8 Sep 2014 16:29:01 +0200 Subject: [PATCH 18/31] YUV data conversion with SSSE3 using intrinsics --- libfreerdp/codec/CMakeLists.txt | 57 +- libfreerdp/codec/h264.c | 43 +- libfreerdp/codec/h264_ssse3.c | 298 +++++++++ libfreerdp/codec/h264_ssse3_x32.asm | 454 ------------- libfreerdp/codec/h264_ssse3_x64.asm | 628 ------------------ libfreerdp/codec/h264_x32.asm | 240 ------- libfreerdp/codec/h264_x64.asm | 269 -------- .../codec/test/Makefile.TestOpenH264SSSE3 | 14 + libfreerdp/codec/test/TestOpenH264 | Bin 0 -> 15584 bytes 9 files changed, 335 insertions(+), 1668 deletions(-) create mode 100644 libfreerdp/codec/h264_ssse3.c delete mode 100644 libfreerdp/codec/h264_ssse3_x32.asm delete mode 100644 libfreerdp/codec/h264_ssse3_x64.asm delete mode 100644 libfreerdp/codec/h264_x32.asm delete mode 100644 libfreerdp/codec/h264_x64.asm create mode 100644 libfreerdp/codec/test/Makefile.TestOpenH264SSSE3 create mode 100755 libfreerdp/codec/test/TestOpenH264 diff --git a/libfreerdp/codec/CMakeLists.txt b/libfreerdp/codec/CMakeLists.txt index bd714b760..f8ac3faa5 100644 --- a/libfreerdp/codec/CMakeLists.txt +++ b/libfreerdp/codec/CMakeLists.txt @@ -102,50 +102,21 @@ if(WITH_LIBAVCODEC) endif() if(WITH_LIBAVCODEC OR WITH_OPENH264) - if(CMAKE_SIZEOF_VOID_P EQUAL 8) - set(arch64 TRUE) - else() - set(arch64 FALSE) - endif() - - if(WITH_H264_ASM) - set(H264_ASM H264_ASM_o) - add_definitions(-DWITH_H264_ASM) - add_custom_target(${H264_ASM}) - - if(arch64) - set(SRC ${CMAKE_CURRENT_SOURCE_DIR}/h264_x64.asm) - set(OBJ 
${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${H264_ASM}.dir/h264_x64.asm.o) - add_custom_command(TARGET ${H264_ASM} - COMMAND nasm ARGS -f elf64 -o ${OBJ} ${SRC}) - else() - set(SRC ${CMAKE_CURRENT_SOURCE_DIR}/h264_x32.asm) - set(OBJ ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${H264_ASM}.dir/h264_x32.asm.o) - add_custom_command(TARGET ${H264_ASM} - COMMAND nasm ARGS -f elf32 -o ${OBJ} ${SRC}) - endif() - - set(FREERDP_OPENH264_LIBS ${OPENH264_LIBRARIES} ${OBJ}) - endif() - if(WITH_H264_SSSE3) - set(H264_ASM H264_ASM_o) add_definitions(-DWITH_H264_SSSE3) - add_custom_target(${H264_ASM}) - - if(arch64) - set(SRC ${CMAKE_CURRENT_SOURCE_DIR}/h264_ssse3_x64.asm) - set(OBJ ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${H264_ASM}.dir/h264_ssse3_x64.asm.o) - add_custom_command(TARGET ${H264_ASM} - COMMAND nasm ARGS -f elf64 -o ${OBJ} ${SRC}) - else() - set(SRC ${CMAKE_CURRENT_SOURCE_DIR}/h264_ssse3_x32.asm) - set(OBJ ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${H264_ASM}.dir/h264_ssse3_x32.asm.o) - add_custom_command(TARGET ${H264_ASM} - COMMAND nasm ARGS -f elf32 -o ${OBJ} ${SRC}) + set(${MODULE_PREFIX}_SRCS + ${${MODULE_PREFIX}_SRCS} + h264_ssse3.c) + + if(CMAKE_COMPILER_IS_GNUCC) + set(OPTIMIZATION "${OPTIMIZATION} -msse2 -mssse3") endif() - - set(FREERDP_OPENH264_LIBS ${OPENH264_LIBRARIES} ${OBJ}) + + if(MSVC) + set(OPTIMIZATION "${OPTIMIZATION} /arch:SSE2") + endif() + + set_property(SOURCE h264_ssse3.c PROPERTY COMPILE_FLAGS ${OPTIMIZATION}) endif() endif() @@ -179,10 +150,6 @@ else() install(TARGETS ${MODULE_NAME} DESTINATION ${CMAKE_INSTALL_LIBDIR} EXPORT FreeRDPTargets) endif() -if(WITH_H264_ASM OR WITH_H264_SSSE3) - add_dependencies(${MODULE_NAME} ${H264_ASM}) -endif() - set_property(TARGET ${MODULE_NAME} PROPERTY FOLDER "FreeRDP/libfreerdp") if(BUILD_TESTING) diff --git a/libfreerdp/codec/h264.c b/libfreerdp/codec/h264.c index 8c39d0fc6..4322231e7 100644 --- a/libfreerdp/codec/h264.c +++ b/libfreerdp/codec/h264.c @@ -31,12 +31,8 @@ #include #ifdef WITH_H264_SSSE3 -extern int 
check_ssse3(); -extern int freerdp_image_yuv420p_to_xrgb(BYTE *pDstData,BYTE **pSrcData,int nWidth,int nHeight,int *iStride,int scanline); -#else -#ifdef WITH_H264_ASM -extern int freerdp_image_yuv_to_xrgb_asm(BYTE *pDstData,BYTE **pSrcData,int nWidth,int nHeight,int iStride0,int iStride1); -#endif +extern int freerdp_check_ssse3(); +extern int freerdp_image_yuv420p_to_xrgb_ssse3(BYTE *pDstData,BYTE **pSrcData,int nWidth,int nHeight,int *iStride,int scanline); #endif #define USE_GRAY_SCALE 0 @@ -408,7 +404,6 @@ static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSiz if (pSystemBuffer->iFormat != videoFormatI420) return -1; - /* Convert I420 (same as IYUV) to XRGB. */ if (g_H264DumpFrames) { @@ -416,31 +411,12 @@ static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSiz } g_H264FrameId++; - + h264->iStride[0] = pSystemBuffer->iStride[0]; h264->iStride[1] = pSystemBuffer->iStride[1]; h264->width = pSystemBuffer->iWidth; h264->height = pSystemBuffer->iHeight; - -#if 0 - if (h264_prepare_rgb_buffer(h264, pSystemBuffer->iWidth, pSystemBuffer->iHeight) < 0) - return -1; - - gettimeofday(&T1,NULL); -#ifdef WITH_H264_SSSE3 - freerdp_image_yuv420p_to_xrgb(h264->data,pYUVData,h264->width,h264->height,pSystemBuffer->iStride[0],pSystemBuffer->iStride[1]); -#else -#ifdef WITH_H264_ASM - freerdp_image_yuv_to_xrgb_asm(h264->data,pYUVData,h264->width,h264->height,pSystemBuffer->iStride[0],pSystemBuffer->iStride[1]); -#else - freerdp_image_copy_yuv420p_to_xrgb(h264->data, h264->scanline, 0, 0, - h264->width, h264->height, pYUVData, pSystemBuffer->iStride, 0, 0); -#endif -#endif - gettimeofday(&T2,NULL); - printf("\tconverting took %u sec %u usec\n",(unsigned int)(T2.tv_sec-T1.tv_sec),(unsigned int)(T2.tv_usec-T1.tv_usec)); -#endif return 1; } @@ -677,7 +653,7 @@ int h264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize, BYTE* pDstPoint; BYTE** pYUVData; - BYTE* pYUVPoint[2]; + BYTE* pYUVPoint[3]; RDPGFX_RECT16* rect; int* 
iStride; @@ -743,13 +719,16 @@ int h264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize, pYUVPoint[1] = pYUVData[1] + ret; pYUVPoint[2] = pYUVData[2] + ret; -#if 1 +#if 0 printf("regionRect: x: %d, y: %d, cx: %d, cy: %d\n", rect->left, rect->top, cx, cy); #endif #ifdef WITH_H264_SSSE3 - freerdp_image_yuv420p_to_xrgb(pDstPoint, pYUVPoint, cx, cy, iStride, nDstStep); + freerdp_image_yuv420p_to_xrgb_ssse3(pDstPoint, pYUVPoint, cx, cy, iStride, nDstStep); +#else + freerdp_image_copy_yuv420p_to_xrgb(pDstPoint, nDstStep, 0, 0, + cx, cy, pYUVPoint, iStride, 0, 0); #endif } gettimeofday(&T2,NULL); @@ -774,9 +753,9 @@ H264_CONTEXT* h264_context_new(BOOL Compressor) h264 = (H264_CONTEXT*) calloc(1, sizeof(H264_CONTEXT)); #ifdef WITH_H264_SSSE3 - if(check_ssse3()){ + if(freerdp_check_ssse3()){ printf("SSSE3 seems to be not supported on this system, try without WITH_H264_SSSE3 ..."); - return FALSE; + return NULL; } #endif diff --git a/libfreerdp/codec/h264_ssse3.c b/libfreerdp/codec/h264_ssse3.c new file mode 100644 index 000000000..1774856b4 --- /dev/null +++ b/libfreerdp/codec/h264_ssse3.c @@ -0,0 +1,298 @@ +/** function for converting YUV420p data to the RGB format (but without any special upconverting) + * It's written in C using SSE2/SSSE3 compiler intrinsics for intel processors supporting SSSE3 and higher. + * The target scanline (6th parameter) must be a multiple of 16.
+ * iStride[0] must be (target scanline) / 4 or bigger and iStride[1] the next multiple of four + * of the half of iStride[0] or bigger + */ + +#include + +#include +//#include +#include + +#include +#include + +int freerdp_check_ssse3() +{ + if(IsProcessorFeaturePresentEx(PF_EX_SSSE3)&&IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) + return 0; + + return 1; +} + + +int freerdp_image_yuv420p_to_xrgb_ssse3(BYTE *pDstData,BYTE **pSrcData,int nWidth,int nHeight,int *iStride,int scanline) +{ + char last_line,last_column; + int i,VaddDst,VaddY,VaddUV; + + BYTE *UData,*VData,*YData; + + __m128i r0,r1,r2,r3,r4,r5,r6,r7; + __m128i *buffer; + + + buffer=_aligned_malloc(4*16,16); + + + YData=pSrcData[0]; + UData=pSrcData[1]; + VData=pSrcData[2]; + + + if((last_column=nWidth&3)){ + switch(last_column){ + case 1: r7=_mm_set_epi32(0,0,0,0xFFFFFFFF); break; + case 2: r7=_mm_set_epi32(0,0,0xFFFFFFFF,0xFFFFFFFF); break; + case 3: r7=_mm_set_epi32(0,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF); break; + } + _mm_store_si128(buffer+48,r7); + last_column=1; + } + + nWidth+=3; + nWidth=nWidth>>2; + + + last_line=nHeight&1; + nHeight++; + nHeight=nHeight>>1; + + + VaddDst=(scanline<<1)-(nWidth<<4); + VaddY=(iStride[0]<<1)-(nWidth<<2); + VaddUV=iStride[1]-(((nWidth<<1)+2)&0xFFFC); + + + + while(nHeight-- >0){ + if(nHeight==0){ + last_line=last_line<<1; + } + + i=0; + do{ +/* + * Well, in the end it should look like this: + * C = Y; + * D = U - 128; + * E = V - 128; + * + * R = clip(( 256 * C + 403 * E + 128) >> 8); + * G = clip(( 256 * C - 48 * D - 120 * E + 128) >> 8); + * B = clip(( 256 * C + 475 * D + 128) >> 8); + */ + if(!(i&0x01)){ +/* Y-, U- and V-data is stored in different arrays. + * We start with processing U-data. 
+ * + * at first we fetch four U-values from its array and shuffle them like this: + * 0d0d 0c0c 0b0b 0a0a + * we've done two things: converting the values to signed words and duplicating + * each value, because always two pixel "share" the same U- (and V-) data + */ + r0=_mm_cvtsi32_si128(*(UINT32 *)UData); + r5=_mm_set_epi32(0x80038003,0x80028002,0x80018001,0x80008000); + r0=_mm_shuffle_epi8(r0,r5); + + UData+=4; + + r3=_mm_set_epi16(128,128,128,128,128,128,128,128); + r0=_mm_subs_epi16(r0,r3); + + r2=r0; + + r4=r0; + r7=_mm_set_epi16(48,48,48,48,48,48,48,48); + r0=_mm_mullo_epi16(r0,r7); + r4=_mm_mulhi_epi16(r4,r7); + r7=r0; + r0=_mm_unpacklo_epi16(r0,r4); + r4=_mm_unpackhi_epi16(r7,r4); + + + r6=_mm_set_epi32(128,128,128,128); + r0=_mm_sub_epi32(r0,r6); + r4=_mm_sub_epi32(r4,r6); + + + r1=r2; + r7=_mm_set_epi16(475,475,475,475,475,475,475,475); + r1=_mm_mullo_epi16(r1,r7); + r2=_mm_mulhi_epi16(r2,r7); + r7=r1; + r1=_mm_unpacklo_epi16(r1,r2); + r7=_mm_unpackhi_epi16(r7,r2); + + r1=_mm_add_epi32(r1,r6); + r7=_mm_add_epi32(r7,r6); + + _mm_store_si128(buffer+16,r7); + +/* Now we've prepared U-data. 
Preparing V-data is actually the same, just with other coefficients */ + r2=_mm_cvtsi32_si128(*(UINT32 *)VData); + r2=_mm_shuffle_epi8(r2,r5); + + VData+=4; + + r2=_mm_subs_epi16(r2,r3); + + r5=r2; + + + r3=r2; + r7=_mm_set_epi16(403,403,403,403,403,403,403,403); + r2=_mm_mullo_epi16(r2,r7); + r3=_mm_mulhi_epi16(r3,r7); + r7=r2; + r2=_mm_unpacklo_epi16(r2,r3); + r7=_mm_unpackhi_epi16(r7,r3); + + r2=_mm_add_epi32(r2,r6); + r7=_mm_add_epi32(r7,r6); + + _mm_store_si128(buffer+32,r7); + + + + r3=r5; + r7=_mm_set_epi16(120,120,120,120,120,120,120,120); + r3=_mm_mullo_epi16(r3,r7); + r5=_mm_mulhi_epi16(r5,r7); + r7=r3; + r3=_mm_unpacklo_epi16(r3,r5); + r7=_mm_unpackhi_epi16(r7,r5); + + r0=_mm_add_epi32(r0,r3); + r4=_mm_add_epi32(r4,r7); + + _mm_store_si128(buffer,r4); + }else{ + r1=_mm_load_si128(buffer+16); + r2=_mm_load_si128(buffer+32); + r0=_mm_load_si128(buffer); + } + + if(++i==nWidth) + last_column=last_column<<1; + + //processing Y data + r4=_mm_cvtsi32_si128(*(UINT32 *)YData); + r7=_mm_set_epi32(0x80800380,0x80800280,0x80800180,0x80800080); + r4=_mm_shuffle_epi8(r4,r7); + + r5=r4; + r6=r4; + + r4=_mm_add_epi32(r4,r2); + r5=_mm_sub_epi32(r5,r0); + r6=_mm_add_epi32(r6,r1); + + + r4=_mm_slli_epi32(r4,8); + r5=_mm_slli_epi32(r5,8); + r6=_mm_slli_epi32(r6,8); + + r7=_mm_set_epi32(0,0,0,0); + r4=_mm_max_epi16(r4,r7); + r5=_mm_max_epi16(r5,r7); + r6=_mm_max_epi16(r6,r7); + + r7=_mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000); + r4=_mm_min_epi16(r4,r7); + r5=_mm_min_epi16(r5,r7); + r6=_mm_min_epi16(r6,r7); + + //r7=_mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000); + r4=_mm_and_si128(r4,r7); + + r7=_mm_set_epi32(0x80800E80,0x80800A80,0x80800680,0x80800280); + r5=_mm_shuffle_epi8(r5,r7); + + r7=_mm_set_epi32(0x8080800E,0x8080800A,0x80808006,0x80808002); + r6=_mm_shuffle_epi8(r6,r7); + + + r4=_mm_or_si128(r4,r5); + r4=_mm_or_si128(r4,r6); + + + if(last_column&0x02){ + r6=_mm_load_si128(buffer+48); + r4=_mm_and_si128(r4,r6); + 
r5=_mm_lddqu_si128((__m128i *)pDstData); + r6=_mm_andnot_si128(r6,r5); + r4=_mm_or_si128(r4,r6); + } + _mm_storeu_si128((__m128i *)pDstData,r4); + + //Y data processing in secound line + if(!(last_line&0x02)){ + r4=_mm_cvtsi32_si128(*(UINT32 *)(YData+iStride[0])); + r7=_mm_set_epi32(0x80800380,0x80800280,0x80800180,0x80800080); + r4=_mm_shuffle_epi8(r4,r7); + + r5=r4; + r6=r4; + + r4=_mm_add_epi32(r4,r2); + r5=_mm_sub_epi32(r5,r0); + r6=_mm_add_epi32(r6,r1); + + + r4=_mm_slli_epi32(r4,8); + r5=_mm_slli_epi32(r5,8); + r6=_mm_slli_epi32(r6,8); + + r7=_mm_set_epi32(0,0,0,0); + r4=_mm_max_epi16(r4,r7); + r5=_mm_max_epi16(r5,r7); + r6=_mm_max_epi16(r6,r7); + + r7=_mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000); + r4=_mm_min_epi16(r4,r7); + r5=_mm_min_epi16(r5,r7); + r6=_mm_min_epi16(r6,r7); + + r7=_mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000); + r4=_mm_and_si128(r4,r7); + + r7=_mm_set_epi32(0x80800E80,0x80800A80,0x80800680,0x80800280); + r5=_mm_shuffle_epi8(r5,r7); + + r7=_mm_set_epi32(0x8080800E,0x8080800A,0x80808006,0x80808002); + r6=_mm_shuffle_epi8(r6,r7); + + + r4=_mm_or_si128(r4,r5); + r4=_mm_or_si128(r4,r6); + + + if(last_column&0x02){ + r6=_mm_load_si128(buffer+48); + r4=_mm_and_si128(r4,r6); + r5=_mm_lddqu_si128((__m128i *)(pDstData+scanline)); + r6=_mm_andnot_si128(r6,r5); + r4=_mm_or_si128(r4,r6); + + last_column=last_column>>1; + } + _mm_storeu_si128((__m128i *)(pDstData+scanline),r4); + } + + pDstData+=16; + YData+=4; + + }while(i> 8); -; G = clip(( 256 * C - 48 * D - 120 * E + 128) >> 8); -; B = clip(( 256 * C + 475 * D + 128) >> 8); - - test cx,1B - jnz load_yuv_data - - - ;prepare U data - movd xmm0,[eax] - movdqa xmm5,[ebp-314] - pshufb xmm0,xmm5 ;but this is the omest instruction of all!! - - add eax,4 - - movdqa xmm3,[ebp-122] - psubsw xmm0,xmm3 - - movdqa xmm2,xmm0 - - movdqa xmm4,xmm0 - movdqa xmm7,[ebp-138] - pmullw xmm0,xmm7 - pmulhw xmm4,xmm7 - - movdqa xmm7,xmm0 - punpcklwd xmm0,xmm4 ;what an awesome instruction! 
- punpckhwd xmm7,xmm4 - movdqa xmm4,xmm7 - - movdqa xmm6,[ebp-106] - psubd xmm0,xmm6 - psubd xmm4,xmm6 - - - movdqa xmm1,xmm2 - movdqa xmm7,[ebp-154] - pmullw xmm1,xmm7 - pmulhw xmm2,xmm7 - - movdqa xmm7,xmm1 - punpcklwd xmm1,xmm2 - punpckhwd xmm7,xmm2 - - paddd xmm1,xmm6 - paddd xmm7,xmm6 - - movdqa [ebp-74],xmm7 - - - ;prepare V data - movd xmm2,[ebx] - pshufb xmm2,xmm5 - - add ebx,4 - - psubsw xmm2,xmm3 - - movdqa xmm5,xmm2 - - movdqa xmm3,xmm2 - movdqa xmm7,[ebp-170] - pmullw xmm2,xmm7 - pmulhw xmm3,xmm7 - - movdqa xmm7,xmm2 - punpcklwd xmm2,xmm3 - punpckhwd xmm7,xmm3 - - paddd xmm2,xmm6 - paddd xmm7,xmm6 - - movdqa [ebp-90],xmm7 - - - movdqa xmm3,xmm5 - movdqa xmm7,[ebp-186] - pmullw xmm3,xmm7 - pmulhw xmm5,xmm7 - - movdqa xmm7,xmm3 - punpcklwd xmm3,xmm5 - punpckhwd xmm7,xmm5 - - paddd xmm0,xmm3 - paddd xmm4,xmm7 - - movdqa [ebp-58],xmm4 - - jmp valid_yuv_data - -load_yuv_data: - movdqa xmm1,[ebp-74] - movdqa xmm2,[ebp-90] - movdqa xmm0,[ebp-58] - -valid_yuv_data: - - - ;Y data processing - movd xmm4,[esi] - pshufb xmm4,[ebp-298] - - movdqa xmm5,xmm4 - movdqa xmm6,xmm4 - - paddd xmm4,xmm2 - psubd xmm5,xmm0 - paddd xmm6,xmm1 - - pslld xmm4,8 - pslld xmm5,8 - pslld xmm6,8 - - movdqa xmm7,[ebp-234] - pmaxsw xmm4,xmm7 ;what an awesome instruction! - pmaxsw xmm5,xmm7 - pmaxsw xmm6,xmm7 - - movdqa xmm7,[ebp-218] - pminsw xmm4,xmm7 - pminsw xmm5,xmm7 - pminsw xmm6,xmm7 - - pand xmm4,[ebp-250] - pshufb xmm5,[ebp-266] - pshufb xmm6,[ebp-282] - - por xmm4,xmm5 - por xmm4,xmm6 - - movdqa [edi],xmm4 - - - ;Y data processing in secound line - test byte [ebp-41],2 - jnz skip_last_line1 - - mov dx,[ebp-38] - and edx,0FFFFH - movd xmm4,[esi+edx] - pshufb xmm4,[ebp-298] - - - movdqa xmm5,xmm4 - movdqa xmm6,xmm4 - - paddd xmm4,xmm2 - psubd xmm5,xmm0 - paddd xmm6,xmm1 - - pslld xmm4,8 - pslld xmm5,8 - pslld xmm6,8 - - movdqa xmm7,[ebp-234] - pmaxsw xmm4,xmm7 ;what an awesome instruction! 
- pmaxsw xmm5,xmm7 - pmaxsw xmm6,xmm7 - - movdqa xmm7,[ebp-218] - pminsw xmm4,xmm7 - pminsw xmm5,xmm7 - pminsw xmm6,xmm7 - - pand xmm4,[ebp-250] - pshufb xmm5,[ebp-266] - pshufb xmm6,[ebp-282] - - por xmm4,xmm5 - por xmm4,xmm6 - - mov edx,[ebp-318] - movdqa [edi+edx],xmm4 - -skip_last_line1: - add edi,16 - add esi,4 - - dec cx - jne freerdp_image_yuv420p_to_xrgb_wloop - -freerdp_image_yuv420p_to_xrgb_wloop_end: - mov edx,[ebp-318] - add edi,edx - - mov edx,[ebp-190] - add esi,edx - - mov edx,[ebp-194] - add eax,edx - add ebx,edx - - jmp freerdp_image_yuv420p_to_xrgb_hloop - -freerdp_image_yuv420p_to_xrgb_hloop_end: - - mov eax,0 -freerdp_image_yuv420p_to_xrgb_end: - mov edx,[ebp-202] - - mov esp,ebp - add esp,edx - pop ebp - pop ebx - ret diff --git a/libfreerdp/codec/h264_ssse3_x64.asm b/libfreerdp/codec/h264_ssse3_x64.asm deleted file mode 100644 index b62febe2d..000000000 --- a/libfreerdp/codec/h264_ssse3_x64.asm +++ /dev/null @@ -1,628 +0,0 @@ -; function for converting YUV420p data to the RGB format (but without any special upconverting) -; It's completely written in nasm-x86-assembly for intel processors supporting SSSE3 and higher. -; The target scanline (6th parameter) must be a multiple of 16. 
-; iStride[0] must be (target scanline) / 4 or bigger and iStride[1] the next multiple of four -; of the half of iStride[0] or bigger -; -section .text - global check_ssse3 - -check_ssse3: - push rbx - - pushf - pop rax - or rax,1<<21 - push rax - popf - pushf - pop rax - test rax,1<<21 - jz check_ssse3_end - - and rax,~(1<<21) - push rax - popf - - - mov eax,1 - mov ebx,0 - cpuid - test edx,1<<25 ;sse - jz check_ssse3_end - test edx,1<<26 ;sse2 - jz check_ssse3_end - test ecx,1<<0 ;sse3 - jz check_ssse3_end - test ecx,1<<9 ;ssse3 - jz check_ssse3_end - - - pop rbx - mov eax,0 - ret - - -check_ssse3_end: - pop rbx - mov eax,1 - ret - - -;extern int freerdp_image_yuv420p_to_xrgb(unsigned char *pDstData,unsigned char **pSrcData,int nWidth,int nHeight,int *istride,int scanline) - global freerdp_image_yuv420p_to_xrgb -freerdp_image_yuv420p_to_xrgb: - push rbx - push rbp - -;check wether stack is aligned to 16 byte boundary -; -; ---current stack value---|-----x-----|----42 byte---|---16 byte aligned stack--- -; lets say 508 2 506 464 -; 1FCH 2H 1FAH 1D0H -; 1F0H 1D0H -; |------1FCH&FH----|1FCH&^FH -; |1FCH&FH-AH |--AH-|---16 byte aligned stack------------ -; We've got only one problem: what if 1FCH&FH was smaller than AH? 
-; We could either add something to sp (impossible) or subtract 10H-(AH-1FCH&FH) [%10H] -; That's the same like (1FCH&FH-AH+10H)&FH and (1FCH+6H)&FH - mov r15,rsp - add r15,6H - and r15,1111B - sub rsp,r15 - - mov rbp,rsp - - xor r10,r10 - xor r11,r11 - xor r12,r12 - xor r13,r13 - xor r14,r14 - -;"local variables" - sub rsp,338 ;pDstData 8 -8,Y 8 -16,U 8 -24,V 8 -32,nWidth 2 -34,nHeight 2 -36,iStride0 2 -38,iStride1 2 -40,last_line 1 -41,last_column 1 -42, - ;G 16 -58,B 16 -74,R 16 -90,add:128 16 -106,sub:128 16 -122,mul:48 16 -138,mul:475 16 -154,mul:403 16 -170,mul:120 16 -186,VaddY 2 -188,VaddUV 2 -190, - ;res 12 -202,cmp:255 16 -218,cmp:0 16 -234,shuflleR 16 -250,andG 16 -266,shuffleB 16 -280,shuffleY 16 -296,shuffleUV 16 -314,andRemainingColumns 16 -330, - ;VddDst 8 -338 - -;last_line: if the last (U,V doubled) line should be skipped, set to 10B -;last_column: if it's the last column in a line, set to 10B (for handling line-endings not multiple by four) - - mov [rbp-8],rdi - - mov rax,[rsi] - mov [rbp-16],rax - mov rax,[rsi+8] - mov [rbp-24],rax - mov rax,[rsi+16] - mov [rbp-32],rax - - mov [rbp-34],dx - mov r13w,cx - - mov r10w,r9w - and r10,0FFFFH - - - mov ecx,[r8] - mov [rbp-38],ecx - mov r12d,[r8+4] - mov [rbp-40],r12w - - - mov [rbp-42],dl - and byte [rbp-42],11B - - - mov [rbp-338],r10 - shr word [rbp-338],1 - shl cx,1 - - mov r8w,[rbp-34] - add r8w,3 - and r8w, 0FFFCH - - sub [rbp-338],r8w - sub cx,r8w - - shr r8w,1 - - mov dx,r8w - add dx,2 - and dx,0FFFCH - sub r12w,dx - - shl dword [rbp-338],2 - mov r11w,cx - - shr r8w,1 - - mov r9w,[rbp-38] - - - ;and al,11B - ;jz no_column_rest - - ;inc word [rbp-34] - -;no_column_rest: - ;mov [rbp-41],al - - - - mov r14b,r13b - and r14b,1B - ;jz no_line_rest - - inc r13w - -;no_line_rest: - shr r13w,1 - - - -;init masks - mov eax,00000080H - mov [rbp-106],eax - mov [rbp-102],eax - mov [rbp-98],eax - mov [rbp-94],eax - - mov eax,00800080H - mov [rbp-122],eax - mov [rbp-118],eax - mov [rbp-114],eax - mov 
[rbp-110],eax - - mov eax,00300030H - mov [rbp-138],eax - mov [rbp-134],eax - mov [rbp-130],eax - mov [rbp-126],eax - - mov eax,01DB01DBH - mov [rbp-154],eax - mov [rbp-150],eax - mov [rbp-146],eax - mov [rbp-142],eax - - mov eax,01930193H - mov [rbp-170],eax - mov [rbp-166],eax - mov [rbp-162],eax - mov [rbp-158],eax - - mov eax,00780078H - mov [rbp-186],eax - mov [rbp-182],eax - mov [rbp-178],eax - mov [rbp-174],eax - - mov eax,000FF0000H - mov [rbp-218],eax - mov [rbp-214],eax - mov [rbp-210],eax - mov [rbp-206],eax - - mov eax,00000000H - mov [rbp-234],eax - mov [rbp-230],eax - mov [rbp-226],eax - mov [rbp-222],eax - -;shuffle masks - ;00 xx 00 00 00 xx 00 00 00 xx 00 00 00 xx 00 00 - ;00 rr gg bb 00 rr gg bb 00 rr gg bb 00 rr gg bb - mov eax,00FF0000H - mov [rbp-250],eax - mov [rbp-246],eax - mov [rbp-242],eax - mov [rbp-238],eax - - mov eax,80800280H - mov [rbp-266],eax - mov eax,80800680H - mov [rbp-262],eax - mov eax,80800A80H - mov [rbp-258],eax - mov eax,80800E80H - mov [rbp-254],eax - - mov eax,80808002H - mov [rbp-282],eax - mov eax,80808006H - mov [rbp-278],eax - mov eax,8080800AH - mov [rbp-274],eax - mov eax,8080800EH - mov [rbp-270],eax - - ;dd cc bb aa - ;00 00 dd 00 00 00 cc 00 00 00 bb 00 00 00 aa 00 - mov eax,80800080H - mov [rbp-298],eax - mov eax,80800180H - mov [rbp-294],eax - mov eax,80800280H - mov [rbp-290],eax - mov eax,80800380H - mov [rbp-286],eax - - ;dd cc bb aa - ;00 dd 00 dd 00 cc 00 cc 00 bb 00 bb 00 aa 00 aa - mov eax,80008000H - mov [rbp-314],eax - mov eax,80018001H - mov [rbp-310],eax - mov eax,80028002H - mov [rbp-306],eax - mov eax,80038003H - mov [rbp-302],eax - -;remaining columns and mask - cmp byte [rbp-42],0 - je freerdp_image_yuv420p_to_xrgb_no_columns_remain - - mov dl,[rbp-42] - xor ebx,ebx - xor ecx,ecx - xor esi,esi - - mov eax,0FFFFFFFFH - cmp dl,1H - je freerdp_image_yuv420p_to_xrgb_write_columns_remain - - mov ebx,0FFFFFFFFH - cmp dl,2H - je freerdp_image_yuv420p_to_xrgb_write_columns_remain - - mov ecx,0FFFFFFFFH 
- -freerdp_image_yuv420p_to_xrgb_write_columns_remain: - mov [rbp-330],eax - mov [rbp-326],ebx - mov [rbp-322],ecx - mov [rbp-318],esi - mov byte [rbp-42],1 - -freerdp_image_yuv420p_to_xrgb_no_columns_remain: - - - mov rsi,[rbp-16] - mov rax,[rbp-24] - mov rbx,[rbp-32] - - ;jmp freerdp_image_yuv420p_to_xrgb_end - -freerdp_image_yuv420p_to_xrgb_hloop: - dec r13w - js freerdp_image_yuv420p_to_xrgb_hloop_end - jnz not_last_line - - shl r14b,1 -not_last_line: - - xor cx,cx -freerdp_image_yuv420p_to_xrgb_wloop: -; Well, in the end it should look like this: -; C = Y; -; D = U - 128; -; E = V - 128; -; -; R = clip(( 256 * C + 403 * E + 128) >> 8); -; G = clip(( 256 * C - 48 * D - 120 * E + 128) >> 8); -; B = clip(( 256 * C + 475 * D + 128) >> 8); - - test cx,1B - jnz freerdp_image_yuv420p_to_xrgb_load_yuv_data - - -; Y-, U- and V-data is stored in different arrays. -; We start with processing U-data. - -; at first we fetch four U-values from its array and shuffle them like this: -; 0d0d 0c0c 0b0b 0a0a -; we've done two things: converting the values to signed words and duplicating -; each value, because always two pixel "share" the same U- (and V-) data - movd xmm0,[rax] - movdqa xmm5,[rbp-314] - pshufb xmm0,xmm5 ;but this is the awesomest instruction of all!! - - add rax,4 - -; then we subtract 128 from each value, so we get D - movdqa xmm3,[rbp-122] - psubsw xmm0,xmm3 - -; we need to do two things with our D, so let's store it for later use - movdqa xmm2,xmm0 - -; now we can multiply our D with 48 and unpack it to xmm4:xmm0 -; this is what we need to get G data later on - movdqa xmm4,xmm0 - movdqa xmm7,[rbp-138] - pmullw xmm0,xmm7 - pmulhw xmm4,xmm7 - - movdqa xmm7,xmm0 - punpcklwd xmm0,xmm4 ;what an awesome instruction! - punpckhwd xmm7,xmm4 - movdqa xmm4,xmm7 - -; to complete this step, add (?) 128 to each value (rounding ?!) -; yeah, add. in the end this will be subtracted from something, -; because it's part of G: 256*C - (48*D + 120*E - 128), 48*D-128 ! 
-; by the way, our values have become signed dwords during multiplication! - movdqa xmm6,[rbp-106] - psubd xmm0,xmm6 - psubd xmm4,xmm6 - - -; to get B data, we need to prepare a secound value, D*475+128 - movdqa xmm1,xmm2 - movdqa xmm7,[rbp-154] - pmullw xmm1,xmm7 - pmulhw xmm2,xmm7 - - movdqa xmm7,xmm1 - punpcklwd xmm1,xmm2 - punpckhwd xmm7,xmm2 - - paddd xmm1,xmm6 - paddd xmm7,xmm6 - -; so we got something like this: xmm7:xmm1 -; this pair contains values for 16 pixel: -; aabbccdd -; aabbccdd, but we can only work on four pixel at once, so we need to save upper values - movdqa [rbp-74],xmm7 - - -; Now we've prepared U-data. Preparing V-data is actually the same, just with other coefficients. - movd xmm2,[rbx] - pshufb xmm2,xmm5 - - add rbx,4 - - psubsw xmm2,xmm3 - - movdqa xmm5,xmm2 - -; this is also known as E*403+128, we need it to convert R data - movdqa xmm3,xmm2 - movdqa xmm7,[rbp-170] - pmullw xmm2,xmm7 - pmulhw xmm3,xmm7 - - movdqa xmm7,xmm2 - punpcklwd xmm2,xmm3 - punpckhwd xmm7,xmm3 - - paddd xmm2,xmm6 - paddd xmm7,xmm6 - -; and preserve upper four values for future ... - movdqa [rbp-90],xmm7 - - -; doing this step: E*120 - movdqa xmm3,xmm5 - movdqa xmm7,[rbp-186] - pmullw xmm3,xmm7 - pmulhw xmm5,xmm7 - - movdqa xmm7,xmm3 - punpcklwd xmm3,xmm5 - punpckhwd xmm7,xmm5 - -; now we complete what we've begun above: -; (48*D-128) + (120*E) = (48*D +120*E -128) - paddd xmm0,xmm3 - paddd xmm4,xmm7 - -; and store to memory ! - movdqa [rbp-58],xmm4 - -; real assembly programmers do not only produce best results between 0 and 5 o'clock, -; but are also kangaroos! - jmp freerdp_image_yuv420p_to_xrgb_valid_yuv_data - -freerdp_image_yuv420p_to_xrgb_load_yuv_data: -; maybe you've wondered about the conditional jump to this label above ? -; Well, we prepared UV data for eight pixel in each line, but can only process four -; per loop. So we need to load the upper four pixel data from memory each secound loop! 
- movdqa xmm1,[rbp-74] - movdqa xmm2,[rbp-90] - movdqa xmm0,[rbp-58] - -freerdp_image_yuv420p_to_xrgb_valid_yuv_data: - - inc cx - cmp cx,r8w - jne freerdp_image_yuv420p_to_xrgb_not_last_columns - - shl byte [rbp-42],1 - - -freerdp_image_yuv420p_to_xrgb_not_last_columns: - -; We didn't produce any output yet, so let's do so! -; Ok, fetch four pixel from the Y-data array and shuffle them like this: -; 00d0 00c0 00b0 00a0, to get signed dwords and multiply by 256 - movd xmm4,[rsi] - pshufb xmm4,[rbp-298] - - movdqa xmm5,xmm4 - movdqa xmm6,xmm4 - -; no we can perform the "real" conversion itself and produce output! - paddd xmm4,xmm2 - psubd xmm5,xmm0 - paddd xmm6,xmm1 - -; in the end, we only need bytes for RGB values. -; So, what do we do? right! shifting left makes values bigger and thats always good. -; before we had dwords of data, and by shifting left and treating the result -; as packed words, we get not only signed words, but do also divide by 256 -; imagine, data is now ordered this way: ddx0 ccx0 bbx0 aax0, and x is the least -; significant byte, that we don't need anymore, because we've done some rounding - pslld xmm4,8 - pslld xmm5,8 - pslld xmm6,8 - -; one thing we still have to face is the clip() function ... -; we have still signed words, and there are those min/max instructions in SSE2 ... -; the max instruction takes always the bigger of the two operands and stores it in the first one, -; and it operates with signs ! -; if we feed it with our values and zeros, it takes the zeros if our values are smaller than -; zero and otherwise our values - movdqa xmm7,[rbp-234] - pmaxsw xmm4,xmm7 ;what an awesome instruction! - pmaxsw xmm5,xmm7 - pmaxsw xmm6,xmm7 - -; the same thing just completely different can be used to limit our values to 255, -; but now using the min instruction and 255s - movdqa xmm7,[rbp-218] - pminsw xmm4,xmm7 - pminsw xmm5,xmm7 - pminsw xmm6,xmm7 - -; Now we got our bytes. 
-; the moment has come to assemble the three channels R,G and B to the xrgb dwords -; on Red channel we just have to and each futural dword with 00FF0000H - pand xmm4,[rbp-250] -; on Green channel we have to shuffle somehow, so we get something like this: -; 00d0 00c0 00b0 00a0 - pshufb xmm5,[rbp-266] -; and on Blue channel that one: -; 000d 000c 000b 000a - pshufb xmm6,[rbp-282] - -; and at last we or it together and get this one: -; xrgb xrgb xrgb xrgb - por xmm4,xmm5 - por xmm4,xmm6 - -; Only thing to do know is writing data to memory, but this gets a bit more -; complicated if the width is not a multiple of four and it is the last column in line. -; but otherwise just play the kangaroo - test byte [rbp-42],2 - je freerdp_image_yuv420p_to_xrgb_column_process_complete - -; let's say, we need to only convert six pixel in width -; Ok, the first 4 pixel will be converted just like every 4 pixel else, but -; if it's the last loop in line, [rbp-42] is shifted left by one (curious? have a look above), -; and we land here. Through initialisation a mask was prepared. In this case it looks like -; 0000FFFFH 0000FFFFH 0000FFFFH 0000FFFFH - movdqa xmm6,[rbp-330] -; we and our output data with this mask to get only the valid pixel - pand xmm4,xmm6 -; then we fetch memory from the destination array ... - movdqu xmm5,[rdi] -; ... and and it with the inverse mask. We get only those pixel, which should not be updated - pandn xmm6,xmm5 -; we only have to or the two values together and write it back to the destination array, -; and only the pixel that should be updated really get changed. - por xmm4,xmm6 - -freerdp_image_yuv420p_to_xrgb_column_process_complete: - movdqu [rdi],xmm4 - - -; Because UV data is the same for two lines, we can process the secound line just here, -; in the same loop. Only thing we need to do is to add some offsets to the Y- and destination -; pointer. These offsets are iStride[0] and the target scanline. 
-; But if we don't need to process the secound line, like if we are in the last line of processing nine lines, -; we just skip all this. - test r14b,2 - jnz freerdp_yuv420p_to_xrgb_skip_last_line - - movd xmm4,[rsi+r9] - pshufb xmm4,[rbp-298] - - - movdqa xmm5,xmm4 - movdqa xmm6,xmm4 - - paddd xmm4,xmm2 - psubd xmm5,xmm0 - paddd xmm6,xmm1 - - pslld xmm4,8 - pslld xmm5,8 - pslld xmm6,8 - - movdqa xmm7,[rbp-234] - pmaxsw xmm4,xmm7 ;what an awesome instruction! - pmaxsw xmm5,xmm7 - pmaxsw xmm6,xmm7 - - movdqa xmm7,[rbp-218] - pminsw xmm4,xmm7 - pminsw xmm5,xmm7 - pminsw xmm6,xmm7 - - pand xmm4,[rbp-250] - pshufb xmm5,[rbp-266] - pshufb xmm6,[rbp-282] - - por xmm4,xmm5 - por xmm4,xmm6 - - test byte [rbp-42],2 - je freerdp_image_yuv420p_to_xrgb_column_process_complete2 - - movdqa xmm6,[rbp-330] - pand xmm4,xmm6 - movdqu xmm5,[rdi+r10] - pandn xmm6,xmm5 - por xmm4,xmm6 - -; only thing is, we should shift [rbp-42] back here, because we have processed the last column, -; and this "special condition" can be released - shr byte [rbp-42],1 - -freerdp_image_yuv420p_to_xrgb_column_process_complete2: - movdqu [rdi+r10],xmm4 - - -freerdp_yuv420p_to_xrgb_skip_last_line: -; after all we have to increase the destination- and Y-data pointer by four pixel - add rdi,16 - add rsi,4 - - cmp cx,r8w - jne freerdp_image_yuv420p_to_xrgb_wloop - -freerdp_image_yuv420p_to_xrgb_wloop_end: -; after each line we have to add the scanline to the destination pointer, because -; we are processing two lines at once, but only increasing the destination pointer -; in the first line. Well, we only have one pointer, so it's the easiest way to access -; the secound line with the one pointer and an offset (scanline) -; if we're not converting the full width of the scanline, like only 64 pixel, but the -; output buffer was "designed" for 1920p HD, we have to add the remaining length for each line, -; to get into the next line. 
- add rdi,[rbp-338] - -; same thing has to be done for Y-data, but with iStride[0] instead of the target scanline - add rsi,r11 - -; and again for UV data, but here it's enough to add the remaining length, because -; UV data is the same for two lines and there exists only one "UV line" on two "real lines" - add rax,r12 - add rbx,r12 - ;mov eax,r12d - ;jmp freerdp_image_yuv420p_to_xrgb_end - - jmp freerdp_image_yuv420p_to_xrgb_hloop - -freerdp_image_yuv420p_to_xrgb_hloop_end: - - mov eax,0 -freerdp_image_yuv420p_to_xrgb_end: - mov rsp,rbp - add rsp,r15 - pop rbp - pop rbx - ret \ No newline at end of file diff --git a/libfreerdp/codec/h264_x32.asm b/libfreerdp/codec/h264_x32.asm deleted file mode 100644 index 09011d9e5..000000000 --- a/libfreerdp/codec/h264_x32.asm +++ /dev/null @@ -1,240 +0,0 @@ -;R=(256*Y+403*(V-128)+128)/265 =(256*Y+403*V-51456)/256 -;G=(256*Y-48*(U-128)-120*(V-128)+128)/256 =(256*Y-48*U-120*V+21632)/256 -;B=(256*Y+475*(U-128)+128)/256 =(256*Y+475*U-60672)/256 - -section .text - ;global YUV_to_RGB_asm -YUV_to_RGB_asm: - shl edi,8 - - mov eax,edx - imul eax,403 - add eax,edi - sub eax,51456 - - jae YUV_to_RGB_asm1 - mov eax,0 - jmp YUV_to_RGB_asm11 - -YUV_to_RGB_asm1: - cmp eax, 0xFFFF - jbe YUV_to_RGB_asm11 - mov eax,0xFF00 - -YUV_to_RGB_asm11: - and eax,0xFF00 - shl eax,8 - - mov ebx,esi - imul ebx,475 - add ebx,edi - sub ebx,60672 - - jae YUV_to_RGB_asm2 - mov ebx, 0 - jmp YUV_to_RGB_asm21 - -YUV_to_RGB_asm2: - cmp ebx,0xFFFF - jbe YUV_to_RGB_asm21 - mov ebx,0xFF00 - -YUV_to_RGB_asm21: - and ebx,0xFF00 - shr ebx,8 - - imul edx,120 - sub edi,edx - imul esi,48 - sub edi,esi - add edi,21632 - - bt edi,31 - jae YUV_to_RGB_asm3 - mov edi, 0 - jmp YUV_to_RGB_asm31 - -YUV_to_RGB_asm3: - cmp edi,0xFFFF - jbe YUV_to_RGB_asm31 - mov edi, 0xFF00 - -YUV_to_RGB_asm31: - and edi,0xFF00 - - or eax,edi - or eax,ebx - - ret - -;extern int freerdp_image_yuv_to_xrgb_asm(unsigned char *pDstData,unsigned char **pSrcData,int nWidth,int nHeight); - global 
freerdp_image_yuv_to_xrgb_asm -freerdp_image_yuv_to_xrgb_asm: - push ebp - mov ebp, esp - ;cWidth: cx - sub esp,36 ;pDstData,pSrcData[3],nWidth,nHeight,cHeight,scanline,iStride[0] addition - push ebx - - - mov edi,[ebp+8] - mov [ebp-4],edi - - mov esi,[ebp+12] - mov eax,[esi] - mov [ebp-8],eax - mov eax,[esi+4] - mov [ebp-12],eax - mov eax,[esi+8] - mov [ebp-16],eax - - mov edx,[ebp+16] - mov [ebp-20],edx - - - mov ecx,[ebp+20] - shr ecx,1 ;/2 - mov [ebp-24],ecx - - - shl edx,2 - mov [ebp-32],edx - - - mov eax,[ebp-24] - mov [ebp-28],eax - - - mov ebx,[ebp+24] - mov [ebp-36],ebx - mov eax,[ebp-20] - shl dword [ebp-36],1 - sub [ebp-36],eax - - shr eax,1 - sub [ebp+28],eax - -freerdp_image_yuv_to_xrgb_asm_loopH: - mov ecx,[ebp-20] - shr ecx,1 - - -freerdp_image_yuv_to_xrgb_asm_loopW: - mov eax,[ebp-8] - mov edi,[eax] - and edi,0xFF - - mov eax,[ebp-12] - mov esi,[eax] - and esi,0xFF - - mov eax,[ebp-16] - mov edx,[eax] - and edx,0xFF - - call YUV_to_RGB_asm - - mov ebx,[ebp-4] - mov [ebx],eax - - - mov eax,[ebp-8] - mov ebx,[ebp+24] - mov edi,[eax+ebx] - inc eax - mov [ebp-8],eax - and edi,0xFF - - mov eax,[ebp-12] - mov esi,[eax] - and esi,0xFF - - mov eax,[ebp-16] - mov edx,[eax] - and edx,0xFF - - call YUV_to_RGB_asm - - mov ebx,[ebp-4] - mov edx,[ebp-32] - mov [ebx+edx],eax - add ebx,4 - mov [ebp-4],ebx - - - mov eax,[ebp-8] - mov edi,[eax] - and edi,0xFF - - mov eax,[ebp-12] - mov esi,[eax] - and esi,0xFF - - mov eax,[ebp-16] - mov edx,[eax] - and edx,0xFF - - call YUV_to_RGB_asm - - mov ebx,[ebp-4] - mov [ebx],eax - - - mov eax,[ebp-8] - mov ebx,[ebp+24] - mov edi,[eax+ebx] - inc eax - mov [ebp-8],eax - and edi,0xFF - - mov eax,[ebp-12] - mov esi,[eax] - inc eax - mov [ebp-12],eax - and esi,0xFF - - mov eax,[ebp-16] - mov edx,[eax] - inc eax - mov [ebp-16],eax - and edx,0xFF - - call YUV_to_RGB_asm - - mov ebx,[ebp-4] - mov edx,[ebp-32] - mov [ebx+edx],eax - add ebx,4 - mov [ebp-4],ebx - - dec cx - jne freerdp_image_yuv_to_xrgb_asm_loopW - - - mov eax,[ebp-4] - 
add eax,[ebp-32] - mov [ebp-4],eax - - mov eax,[ebp-8] - add eax,[ebp-36] - mov [ebp-8],eax - - mov ebx,[ebp+28] - mov eax,[ebp-12] - add eax,ebx - mov [ebp-12],eax - - mov eax,[ebp-16] - add eax,ebx - mov [ebp-16],eax - - dec dword [ebp-28] - jne freerdp_image_yuv_to_xrgb_asm_loopH - -;END - mov eax,0 -END: - pop ebx - mov esp,ebp - pop ebp - ret diff --git a/libfreerdp/codec/h264_x64.asm b/libfreerdp/codec/h264_x64.asm deleted file mode 100644 index c7963220e..000000000 --- a/libfreerdp/codec/h264_x64.asm +++ /dev/null @@ -1,269 +0,0 @@ -;R=(256*Y+403*(V-128)+128)/265 =(256*Y+403*V-51456)/256 -;G=(256*Y-48*(U-128)-120*(V-128)+128)/256 =(256*Y-48*U-120*V+21632)/256 -;B=(256*Y+475*(U-128)+128)/256 =(256*Y+475*U-60672)/256 - -section .text - ;global YUV_to_RGB_asm -YUV_to_RGB_asm: - shl rdi,8 - - mov eax,edx - imul eax,403 - add eax,edi - sub eax,51456 - - jae YUV_to_RGB_asm1 - mov eax,0 - jmp YUV_to_RGB_asm11 - -YUV_to_RGB_asm1: - cmp eax, 0xFFFF - jbe YUV_to_RGB_asm11 - mov eax,0xFF00 - -YUV_to_RGB_asm11: - and eax,0xFF00 - shl eax,8 - - mov ebx,esi - imul ebx,475 - add ebx,edi - sub ebx,60672 - - jae YUV_to_RGB_asm2 - mov ebx, 0 - jmp YUV_to_RGB_asm21 - -YUV_to_RGB_asm2: - cmp ebx,0xFFFF - jbe YUV_to_RGB_asm21 - mov ebx,0xFF00 - -YUV_to_RGB_asm21: - and ebx,0xFF00 - shr ebx,8 - - imul edx,120 - sub edi,edx - imul esi,48 - sub edi,esi - add edi,21632 - - bt edi,31 - jae YUV_to_RGB_asm3 - mov edi, 0 - jmp YUV_to_RGB_asm31 - -YUV_to_RGB_asm3: - cmp edi,0xFFFF - jbe YUV_to_RGB_asm31 - mov edi, 0xFF00 - -YUV_to_RGB_asm31: - and edi,0xFF00 - - or eax,edi - or eax,ebx - - ret - -;extern int freerdp_image_yuv_to_xrgb_asm(unsigned char *pDstData,unsigned char **pSrcData,int nWidth,int nHeight,int *istride,int scanline); - global freerdp_image_yuv_to_xrgb_asm -freerdp_image_yuv_to_xrgb_asm: - push rbx - push rbp - mov rbp, rsp - ;cWidth: cx - sub rsp,82 ;pDstData -8,pSrcData[3] -32,nWidth -40,nHeight -48,cHeight -56,scanline -64,iStride[0] -72,VaddDst -80,last_column 1 
-81,last_line 1 -82 - -;last_column: set to 10B, if last column should be skipped ('cause UV data is the same for two columns and two columns are processed at once) -;last_line: set to 10B, if last line should be skipped ('cause UV data is the same for two lines and two lines are processed at once) - - - mov [rbp-8],rdi - - mov rax,[rsi] - mov [rbp-16],rax - mov rax,[rsi+8] - mov [rbp-24],rax - mov rax,[rsi+16] - mov [rbp-32],rax - - and rdx,0FFFFH - ;mov [rbp-40],rdx - - - shr rcx,1 ;/2 - mov [rbp-48],rcx - - - and r9,0FFFFH - mov [rbp-64],r9 - - shr r9d,1 - sub r9d,edx - shl r9d,2 - mov [rbp-80],r9 - - - mov rax,[rbp-48] - mov [rbp-56],rax - - - mov rcx,[r8] - and rcx,0FFFFH - mov [rbp-72],rcx - shl dword [rbp-72],1 - sub [rbp-72],rdx - - mov r9,[r8+4] - mov r8,rcx - - and r9,0FFFFH - shr rax,1 - sub r9,rax - - - mov al,dl - and al,1B - mov [rbp-81],al - inc dx - shr edx,1 - mov [rbp-40],rdx - -freerdp_image_yuv_to_xrgb_asm_loopH: - mov cx,[rbp-40] - - -freerdp_image_yuv_to_xrgb_asm_loopW: - dec cx - jne freerdp_image_yuv_to_xrgb_asm_not_last_column - - shl byte [rbp-81],1 - -freerdp_image_yuv_to_xrgb_asm_not_last_column: - - - mov rax,[rbp-16] - mov edi,[rax] - and edi,0xFF - - mov rax,[rbp-24] - mov esi,[rax] - and esi,0xFF - - mov rax,[rbp-32] - mov edx,[rax] - and edx,0xFF - - call YUV_to_RGB_asm - - mov rbx,[rbp-8] - mov [rbx],eax - - - test byte [rbp-81],2 - jne freerdp_image_yuv_to_xrgb_asm_skip_last_column - - mov rax,[rbp-16] - mov edi,[rax+r8] - and edi,0xFF - - mov rax,[rbp-24] - mov esi,[rax] - and esi,0xFF - - mov rax,[rbp-32] - mov edx,[rax] - and edx,0xFF - - call YUV_to_RGB_asm - - mov rbx,[rbp-8] - mov rdx,[rbp-64] - mov [rbx+rdx],eax - -freerdp_image_yuv_to_xrgb_asm_skip_last_column: - add qword [rbp-8],4 - inc qword [rbp-16] - - - mov rax,[rbp-16] - mov edi,[rax] - and edi,0xFF - - mov rax,[rbp-24] - mov esi,[rax] - and esi,0xFF - - mov rax,[rbp-32] - mov edx,[rax] - and edx,0xFF - - call YUV_to_RGB_asm - - mov rbx,[rbp-8] - mov [rbx],eax - - - 
test byte [rbp-81],2 - jne freerdp_image_yuv_to_xrgb_asm_skip_last_column2 - - mov rax,[rbp-16] - mov edi,[rax+r8] - and edi,0xFF - - mov rax,[rbp-24] - mov esi,[rax] - and esi,0xFF - - mov rax,[rbp-32] - mov edx,[rax] - and edx,0xFF - - call YUV_to_RGB_asm - - ;shr [rbp-81],1 - - mov rbx,[rbp-8] - mov rdx,[rbp-64] - mov [rbx+rdx],eax - -freerdp_image_yuv_to_xrgb_asm_skip_last_column2: - add qword [rbp-8],4 - inc qword [rbp-16] - inc qword [rbp-24] - inc qword [rbp-32] - - - test cx,0FFFFH - jne freerdp_image_yuv_to_xrgb_asm_loopW - jmp END - - - mov rax,[rbp-8] - add rax,[rbp-80] - mov [rbp-8],rax - - mov rax,[rbp-16] - add rax,[rbp-72] - mov [rbp-16],rax - - mov rax,[rbp-24] - add rax,r9 - mov [rbp-24],rax - - mov rax,[rbp-32] - add rax,r9 - mov [rbp-32],rax - - dec qword [rbp-56] - jne freerdp_image_yuv_to_xrgb_asm_loopH - -;END - mov rax,0 -END: - mov rsp,rbp - pop rbp - pop rbx - ret \ No newline at end of file diff --git a/libfreerdp/codec/test/Makefile.TestOpenH264SSSE3 b/libfreerdp/codec/test/Makefile.TestOpenH264SSSE3 new file mode 100644 index 000000000..7709e9423 --- /dev/null +++ b/libfreerdp/codec/test/Makefile.TestOpenH264SSSE3 @@ -0,0 +1,14 @@ +TestOpenH264: TestOpenH264ASM.c.o h264.c.o h264_ssse3.c.o + gcc -o TestOpenH264 TestOpenH264ASM.c.o h264.c.o h264_ssse3.c.o -lwinpr + +h264_ssse3.c.o: ../h264_ssse3.c + gcc -c -O3 -o h264_ssse3.c.o ../h264_ssse3.c -mssse3 + +TestOpenH264ASM.c.o: TestOpenH264ASM.c + gcc -c -o TestOpenH264ASM.c.o TestOpenH264ASM.c + +h264.c.o: ../h264.c + gcc -c -o h264.c.o ../h264.c + +clean: + rm -f TestOpenH264 TestOpenH264ASM.c.o h264_ssse3.asm.o h264.c.o h264.asm.o diff --git a/libfreerdp/codec/test/TestOpenH264 b/libfreerdp/codec/test/TestOpenH264 new file mode 100755 index 0000000000000000000000000000000000000000..c92bd5af2190f0d681727a24c74e78bfb62ea1c6 GIT binary patch literal 15584 zcmcIre{@vUoxd{^AVkargc=d$iIbM>mJldlRID>F4_|bm6o}OpoP=Zo(Ik_d9}-$3 zamRS_c8Jnd_fWT-0%H;=e>8gs-6{Ai-pNxW#48*4d*#IWZW6+|2iiHR>pFflg(zc 
z*hC-`a5*`LVk0Hvm@qBlTp?#eoI`1=0IW_UA>vsEg&d3^J*1fO8w*a6G~&1C7^I9D zQ#e3+o>VKmXbw{hZ%WzkbQ0`3<%68tR+E zE%RE+mdsnSs3hnwS;+k_fI%wa9d}o9)3QEhf}*^qGcmmQZ3Q;?u`z82S2xURvKkLzk&WdCcpGJ~jg4pV<1 zA7mgO^CW@zN?cUlNw}yySa{ku3Q9EOw~u z3a-qc0H7-~dO5h5v%y0_8cy_+b zf=|xk=iV&x4`#vN%Yxqp`vvSwva6^H5@VrE;5#z#+q2+S;8W47rMw*Hzk$hVYzpg| z>>M+W)YBZdGtYJDIK#ZKslKT>KvThjk_F7`-O%W7@&-fIfsohBymvnAUFF+Q9}M{d z%NweLL0^!0%hyzRYkdI$uc=tx;BWG+sb1gUGkD{3svGJzH2G@1bpfA`l?PV_{58H{ z&>vXgs}6+&zLf!A(AN}FTbQ@5zNtRD1oMI&VqR&lvAUtbUqc0{p+YQSb#)El;6_&G zZ}v5@=5Q#;Y7CKJD9`|?(bpLCg;-rpgFonF!4QfPV0D`U^&ubI;0uN78-4z|+Um`y z3o!0Q*t62Of zHvS|gDWWa>&xmOrvamOyOY<0o@XUHlS5~$X?M|WJQN?4LtLTzJ_6M3PY0i^jRLIlZ zC`=eE(ccIcOBlyaC3QHC_aJOTxkSrp4BbiAIBoT^c5=~8;=;QP>qNzy}F;3?Z zO--5T;>5qt}CQVdv`j13YQzlk& z`Y6%Vgb637-z1uvE>X(q14L7kB@|BYC7PD-L;4f=|lzBM)f$M_S~>@X9siJ+ZD?PNwyA$9U9p@d_u4-vK)#+qY9) zYtae+d5yxf_E^Z8?BkWb-5mD(6b)6W&D()Ui;U)L`pHe!bqDt#Kdv;gZa=Y-1NPZ% zgzX_p%6E_2Ag}2MwC>ZlYnDDuKdc?O6o&p@lGWOehNS*SLci^{1l(casg=--KLSVB z9iH$3Aj`Rf)c#uJw%n^B5V|(y!qm$B19RY=elpp&F7ZRCBp$|X`*t$1T6=V87xLoL z@#|6VDCu03`iHQo>D_y&Q|_bgvwtI&i1-%N;U3^6 zS^Ts??J=n11{K>!EwZ2e@(~smJB0 z?$h6-WXD9P?SWdSW^0exweIdwD=b(JXweE`&C+Y%%{r`*i(M|)$K@)Aw3?%u+DSU1vb9N8{p4n)=WT>4*J`mla@e;YcG(a-9>G|HewVpdeMefKal z`$1Vix$V1SL~_O`QUfWmUO|zVeK%S`9pI^$zR02K-7b9;*~C)WBri}S#HqWAE;INL z^_3dZ?8kt!?=Il9VV5^~*@4h<=umxSh$6?{6g0rK$q7KnbU2miKOxh>D`jVR@)7pP zwn%H7h4S`ac|8Wu3t%I6v`|m{Gb6o+(jOb?7^VMUr28rTmXRKy^lVfcuR!h5Lo{f= zzwRK7VZHTYr1c`(Jc;uLuX?ogB8{O(Chb8xCI@X%^`aq&2~E+@=zY$P)}fAyc!zqz zz8l`F7ZJrHu{K>DMd}1kjX(?I=cC-HI#M5d3Uz4di^SWyM|0uGp?3m5b`q;5_n+v(f88~o`@-0B(EQajl14x{i z-1i1ADrPGMy^yL${M`s+Hu25GVCg{js1=YNdHb|&4qq}|MFErXXf8Szo zsV-L&h5|?1$p-Hd73gg3WS7r(B=iIcMb%MCzhp|+A)Ru_X*kqRxxf4#-Jj_rlX_<_ zqUem<4DSYrb7XJXW!~wd5bkUze#*O-3Dea$GLO?NOzX2T3*Swh=seibPRb?v4CN@) zT^OoI_O7}thi{DKKqcwm#%ZpL%Y12w5bHT&i9Bf!VB~o}<@t}C)lS9Kq7YU`d(;t# zu)zhI}qaNo@N7Vu?ihs2*%Iv>V=V~>*QB|R=dijc~4n)=G z!FVxRxvTplD;8Nk`nwEfyhk0TG9-#&3`Pnt)#SonR2`01%KZ0 
z!HXyywHLJ~s`l^IotzwqsKcm~;ak<8*tfkUHv`xgRf)-n<`^^>dTn?Bs)#XsP{Jq{ zvXR5;Id)h*i|Y& zHI{lj+D_S`lNa2o9uJCKU!xvv-Z;2WTDfz&158@WuxpzBIt>66w{WaK_t{@@=Gb4M zu8kzKzY-HGy!y5QIVYyLPAjpEErHrA*o4`C#oZD;*f0ZssT zlQQ;7Zy7tO{aq=ucR_`3u(8Hduam<{Z^bGU9@eqx)GG!&y4ugB#KI1)X^C$g+DAX2 z_R|Cz{{{I#gD}!M%0e^X#b~s0$mr(OyI$knQXLiRS!AdH!&gX8A!!m)VdRa2_~App ziT2a4$M{QUlF9biUdkghbwB+;i*E(A;3ziKPh*d5{8n;(*LAN;f5%nn(%*NjDc48% z4}nUL{`zW7|8lmEm8d_Hn<>!&NCQydjh>|`S!*(#!I9R>Oaq4{kWtaWz( zgryUa-Hf&iHX`gq&T_<4|6O@S8;FWlxN@yEB<|_XPWiV4`~DCDt=pSKkw?iJ86I70W84o z&IgkfH76=+VjjKLQQ&$F>ps(JlI4-tEp=|Y|2hRuROp{-`YeYB<;gAYK9$pcw8DPR zsq*fV)(Xo&`%#bXb||hrw6p?$9j;$_0l%$1mL5&t;=tmy#ZgGMW}E;@i7N+@ARdP& zd_pkTVZ`>?ZpM~(F1_7@`f7i}^*d@74+@;?w!4$0ytoNs9{nx#hOac+T1Vjmj?SZB z^L6&mjwQZCZHeENbuN1oRdCT2V3tP?A*o6OvqX6ff=W#;N?+*aGSZ6;Oud~;m ztg}~tMKWi(%=^GjPB>-geGWe83m@US0J-5h`=!%prd@6D6zQ{+tF1h8(8#S>itiU+Pa4xVMj3+?o`3r(;Q^$8z%@K$WqNU3T|~>ouxFmoft5hj0x+>r|aRFp^7ZKmx<) zZl4}Rc|NPNpY8_mIkAo%06GXH!EGf#eGKYDPNB92(b9v5?CrmVb@j1N&bjPcE#YQQ zO&HXPy5;sJM=2MD67q%LCqPoJZZhO1ORD`CRb!9I2jowl!;4FBIB=d^JBMt~aCjJc zg9CIs6~6x*YUalBNOwUw_AgrGV8P879;tPn@3Gh)vtr&lXZLqen?TIJlPVBES>gkO zvy%qZS(kmdxXykIb?|4@kK(j1$c+e5C#8PLsY{3x!OVZwc6MqKK6TkgaOjVIXWeZ~8U*}ruTIYJe z<)ul-6wV#2qfla>{TTeEq0!*zr|~$TbCtB+$^dO~pq9Mmz4+>IBF3Y)*&wAW5<7N?0PlI?u{w<}+A5wzh=4O8& zM7Wo~IpQEAzI#Y%lh3HkjGDMeu==&PY9jQE>*kmHmIEHUYaHu@Ai zCl5D-lugw^rRYB6KTk3A1PP6b^3|&=Tx*tVN_9vns=Y-iT6mvw!)+AhG2g?!z~<1# z`lb!aJ$D)Y&kr>=&)?MG-!OlD&|6envT@;(MQqc0Him0Jh|}EM$W|`SV@0*Fo5vO| zMw+p^T^^+pkL49trQ-HA)HmW`df*nNzP1KgLyTSt_?uSwYC^XtEd+g2*=$5Lf~?^* z*`iS6Z-SE{a&eq`%Hxj&(J^@-c}h?$gTXOjZLH0LKP&!h_yd{tSIW}=+F;0h9kZnW zJ!MJ%Yhmesu#WM9%ZiJ#W8~i0#qo*Y<>AUV#7tzHHYOI{Ol^g1UL(E zDdM$=S0Zjh+>E#jaR*|C;o6V50P!%<2=bY>W{Yxp@G{7+ElM+uO2tpCOHAv=q}|LHmGqNolfpUT@}OcIZ-@|o3-}kn=VL_LQOEc{0sjQ}PP6_G$N2P1A?JKDd4rk1*5FgS z9tS@c{Q1Op=NCMfpUXT*jsRDHFUtY3EWhAM>$3d9Cv0lI(wU>?&)qhmJiqkuiSGQe z$RtmGvn@X-zifGasVjf(vU~-S&{zgbMxcD$s0Rg)yQn&3pn6&hpb)rD;CHr7czmKa z-?<_(=}GGowoc016#DS|ojPewKoKFG}zFDah|dt~E&h 
zT>&L$3c7zz6;DySP>|)5ZdO^HKpFe|QqoVcVslc&u?agm}faHvSS!doDK=TF!TQ&$?tSBGx}h%lS8LOd|t#8^9=lJBL2FFmxy?Ui0>6~ zoroV6@#7-?7ZLxvhh*|20VCchQ+Pq`3pBG5>_U7cu4U!(Le(1twV^3ovOZjo&w%P{ zSqV`atMRJ5q;_)?>=|(=U~sVh2C-(R5nk{DzJ_X25UJ*d5G&ztPDY;V=Z(gWoUS-0|C5d=NdBoU=74zy}Gf!23h-I5VsPZi(wIO+*wJD zzp)Wt3uIR3-%Qi^roJo34YN8yAqYT@Zvz~gUnG4a5;TUS{w%8#6y|dQD4ZI*$@xJd z3y?7DFSR;3YpKw8aBA$9sjq=WbBfHr%IXBgSd{=`oErN^AXm`RjvF+lcE8k@^OPbc zmZe$r1u#tREBeV@&Aty_^(A zOwU3n$oOfb&HD0t(qXZXtu=o%VbUWT6rROp)|czB=Q_?#`$mtDO#gQRBYj!_Ro6K| zq5lhg+Si(c^jl)PkTC1Z?=R(dnN&V=$jpB?m}Y%VadL4@;SqZ*BY)X`2A8oI64XXw zxTN+S^83|H`AnMjuy&Ka+?Vvv0f>+(pGnj23G!d&PhSvF82Bso{{&g;ds1Jn!*QWM zOvVw!a7g(h+u;;==KSS%P5ln8mXbH9vGT}r3})!deb9i=-z61AEcIkOlA+&GRK+Db ziVOwmr$LS7FX;4mW$ZqFQwxjvbV*-$HT-ATtM zGr69m<9ST36X`fsQaS&d7`Ab~=OnfbD}tP_P2#}wQ{VNc;UzM7_((&m`&P(a|49qiCW{9V4_!3;sBW9)zUuetuAssJF{r*YE zuVHdLr{iPa&8Fk>*WMV49*EwS&t_fW90QS}ahHSdc+)@-vv67P^?PQyih+|IXNH|` z3%of4|Bk>jf8TE4_Ghx4VxA%YC_IeIf~PZ?_52)g%C|4W{?EDms(@B;!rgV=&!(a#J2<@8|8gEQH#47>mZBs(t@IJt!%2|Etp zm;^;~YZm;+9KU>i`)L;WUuVG& zXTeW$JfA%;%0=rTg>wS`*J(~p^Jj%feGQAE!jbgbLf}--Qwp7&af4vHfltM*TF%3A zT&(8u={AV+Wx+QCS1uQ4PXR9&r@!swcJf)N@Mpfz{f)q9&v0^tKf6SVp4m>dUXwa^ zAjDapz_*L>EAc-GJM`Ht1)bmVyj0*-B47S&5xl7=IGvA>`zRsKt^-c>Rx#blA^xlt zsc&;!N{z*S9S z$2>SH5ejT(#)F~SaAPB#wJ=fqgh!g_8ecGU56*MYTPWA+3V2U%s1VgO)Hk#HEARD& z{N7b}EMptIB(j1YIhEH6${lWO=17ohd68qYm-M~-i4sro7k9i;R;+SWs9yDMHywZB zRV31PcHa`zW;7lMfq|@*@@F4M)u^$ z=J3OYALZH3#4zK~iPLF5U6NHACuoclF6r{(FiTA^Y@Dx=M_DqY43D{Ycn3#M^t^O1 z2j^`}y(r7UAUbQ4Zq9t1CY@tAVVvDb=kRko>Cj~danjks+s0;i8bq)1`2w}g-ulMs z4L)y;zj-rN@uG#L&D2L)0vp!af_x`A}IW}~m>AusQQqL2D$Oz{GFz0gl}$l9RayRo{dmJVAP=aZ;O`2$#K z%qV$Ap@nhti6vfCIA7oFZNhL4^Nt+G(Jt}+R-Bv4C>R}cGR{t=GmL{&>6meDUc3?b@heeGe!eT+=>G!h9cxDb literal 0 HcmV?d00001 From cc16ddea2dbeb3cf91af2d5ba90d304c30e1ea61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Moreau?= Date: Mon, 8 Sep 2014 12:28:35 -0400 Subject: [PATCH 19/31] libfreerdp-primitives: update YCbCr color converter --- 
libfreerdp/primitives/prim_colors.c | 16 +++-- .../primitives/test/TestPrimitivesYCbCr.c | 63 +++++++++++++++++++ 2 files changed, 73 insertions(+), 6 deletions(-) diff --git a/libfreerdp/primitives/prim_colors.c b/libfreerdp/primitives/prim_colors.c index 7478fceee..746415999 100644 --- a/libfreerdp/primitives/prim_colors.c +++ b/libfreerdp/primitives/prim_colors.c @@ -51,13 +51,13 @@ pstatus_t general_yCbCrToRGB_16s8u_P3AC4R(const INT16* pSrc[3], int srcStep, { for (x = 0; x < roi->width; x++) { - Y = (double) ((*pY++ >> 1) + 2048); - Cb = (double) (*pCb++ >> 1); - Cr = (double) (*pCr++ >> 1); + Y = (double) (pY[0] + 4096); + Cb = (double) (pCb[0]); + Cr = (double) (pCr[0]); - R = (INT16) (((int) (Y + (1.402524948120117L * Cr) + 8.0L)) >> 4); - G = (INT16) (((int) (Y - (0.3437300026416779L * Cb) - (0.7144010066986084L * Cr) + 8.0L)) >> 4); - B = (INT16) (((int) (Y + (1.769904971122742L * Cb) + 8.0L)) >> 4); + R = ((INT16) (((Cr * 1.402524948120117L) + Y + 16.0L)) >> 5); + G = ((INT16) ((Y - (Cb * 0.3437300026416779L) - (Cr * 0.7144010066986084L) + 16.0L)) >> 5); + B = ((INT16) (((Cb * 1.769904971122742L) + Y + 16.0L)) >> 5); if (R < 0) R = 0; @@ -78,6 +78,10 @@ pstatus_t general_yCbCrToRGB_16s8u_P3AC4R(const INT16* pSrc[3], int srcStep, *pRGB++ = (BYTE) G; *pRGB++ = (BYTE) R; *pRGB++ = 0xFF; + + pY++; + pCb++; + pCr++; } pY += srcPad; diff --git a/libfreerdp/primitives/test/TestPrimitivesYCbCr.c b/libfreerdp/primitives/test/TestPrimitivesYCbCr.c index 0a1301ec5..26c2169ee 100644 --- a/libfreerdp/primitives/test/TestPrimitivesYCbCr.c +++ b/libfreerdp/primitives/test/TestPrimitivesYCbCr.c @@ -2106,6 +2106,51 @@ static int test_memcmp_count(const BYTE* mem1, const BYTE* mem2, int size) return count; } +static void test_fill_bitmap_red_channel(BYTE* data, int width, int height, BYTE value) +{ + int i, j; + UINT32* pixel; + + for (i = 0; i < height; i++) + { + for (j = 0; j < width; j++) + { + pixel = (UINT32*) &data[((i * width) + j) * 4]; + *pixel = ((*pixel & 
0xFF00FFFF) | (value << 16)); + } + } +} + +static void test_fill_bitmap_green_channel(BYTE* data, int width, int height, BYTE value) +{ + int i, j; + UINT32* pixel; + + for (i = 0; i < height; i++) + { + for (j = 0; j < width; j++) + { + pixel = (UINT32*) &data[((i * width) + j) * 4]; + *pixel = ((*pixel & 0xFFFF00FF) | (value << 8)); + } + } +} + +static void test_fill_bitmap_blue_channel(BYTE* data, int width, int height, BYTE value) +{ + int i, j; + UINT32* pixel; + + for (i = 0; i < height; i++) + { + for (j = 0; j < width; j++) + { + pixel = (UINT32*) &data[((i * width) + j) * 4]; + *pixel = ((*pixel & 0xFFFFFF00) | (value)); + } + } +} + int TestPrimitivesYCbCr(int argc, char* argv[]) { int cmp; @@ -2159,6 +2204,24 @@ int TestPrimitivesYCbCr(int argc, char* argv[]) _aligned_free(pSrcDst[2]); } + if (0) + { + test_fill_bitmap_red_channel(actual, 64, 64, 0); + test_fill_bitmap_red_channel(expected, 64, 64, 0); + } + + if (0) + { + test_fill_bitmap_green_channel(actual, 64, 64, 0); + test_fill_bitmap_green_channel(expected, 64, 64, 0); + } + + if (0) + { + test_fill_bitmap_blue_channel(actual, 64, 64, 0); + test_fill_bitmap_blue_channel(expected, 64, 64, 0); + } + cmp = test_memcmp_offset(actual, expected, size); cnt = test_memcmp_count(actual, expected, size); From e21202ee616815b1a5814f47959c687aa114c293 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Moreau?= Date: Mon, 8 Sep 2014 15:16:03 -0400 Subject: [PATCH 20/31] libfreerdp-primitives: add per-pixel YCbCr test code --- libfreerdp/primitives/prim_colors.c | 14 ++-- .../primitives/test/TestPrimitivesYCbCr.c | 79 ++++++++++++++++++- 2 files changed, 84 insertions(+), 9 deletions(-) diff --git a/libfreerdp/primitives/prim_colors.c b/libfreerdp/primitives/prim_colors.c index 746415999..a1831597d 100644 --- a/libfreerdp/primitives/prim_colors.c +++ b/libfreerdp/primitives/prim_colors.c @@ -39,7 +39,7 @@ pstatus_t general_yCbCrToRGB_16s8u_P3AC4R(const INT16* pSrc[3], int srcStep, { int x, y; INT16 
R, G, B; - double Y, Cb, Cr; + float Y, Cb, Cr; BYTE* pRGB = pDst; const INT16* pY = pSrc[0]; const INT16* pCb = pSrc[1]; @@ -51,13 +51,13 @@ pstatus_t general_yCbCrToRGB_16s8u_P3AC4R(const INT16* pSrc[3], int srcStep, { for (x = 0; x < roi->width; x++) { - Y = (double) (pY[0] + 4096); - Cb = (double) (pCb[0]); - Cr = (double) (pCr[0]); + Y = (float) (pY[0] + 4096); + Cb = (float) (pCb[0]); + Cr = (float) (pCr[0]); - R = ((INT16) (((Cr * 1.402524948120117L) + Y + 16.0L)) >> 5); - G = ((INT16) ((Y - (Cb * 0.3437300026416779L) - (Cr * 0.7144010066986084L) + 16.0L)) >> 5); - B = ((INT16) (((Cb * 1.769904971122742L) + Y + 16.0L)) >> 5); + R = ((INT16) (((Cr * 1.402525f) + Y + 16.0f)) >> 5); + G = ((INT16) ((Y - (Cb * 0.343730f) - (Cr * 0.714401f) + 16.0f)) >> 5); + B = ((INT16) (((Cb * 1.769905f) + Y + 16.0f)) >> 5); if (R < 0) R = 0; diff --git a/libfreerdp/primitives/test/TestPrimitivesYCbCr.c b/libfreerdp/primitives/test/TestPrimitivesYCbCr.c index 26c2169ee..79e6347e4 100644 --- a/libfreerdp/primitives/test/TestPrimitivesYCbCr.c +++ b/libfreerdp/primitives/test/TestPrimitivesYCbCr.c @@ -2151,6 +2151,79 @@ static void test_fill_bitmap_blue_channel(BYTE* data, int width, int height, BYT } } +static float TEST_YCbCrToRGB_01[4] = { 1.403f, 0.344f, 0.714f, 1.770f }; +static float TEST_YCbCrToRGB_02[4] = { 1.402525f, 0.343730f, 0.714401f, 1.769905f }; + +static INT16 TEST_YCbCr_01[3] = { +115, +1720, -2145 }; +static BYTE TEST_RGB_01[3] = { 37, 161, 227 }; /* incorrect red */ + +static INT16 TEST_YCbCr_02[3] = { -450, +1938, -2126 }; +static BYTE TEST_RGB_02[3] = { 21, 140, 221 }; /* incorrect green */ + +static INT16 TEST_YCbCr_03[3] = { -504, +1896, -2168 }; +static BYTE TEST_RGB_03[3] = { 17, 140, 217 }; /* incorrect blue */ + +int test_YCbCr_fp(float coeffs[4], INT16 YCbCr[3], BYTE RGB[3]) +{ + INT16 R, G, B; + float Y, Cb, Cr; + + Y = (float) (YCbCr[0] + 4096); + Cb = (float) (YCbCr[1]); + Cr = (float) (YCbCr[2]); + + R = ((INT16) (((Cr * coeffs[0]) + Y + 16.0f)) >> 
5); + G = ((INT16) ((Y - (Cb * coeffs[1]) - (Cr * coeffs[2]) + 16.0f)) >> 5); + B = ((INT16) (((Cb * coeffs[3]) + Y + 16.0f)) >> 5); + + if (R < 0) + R = 0; + else if (R > 255) + R = 255; + + if (G < 0) + G = 0; + else if (G > 255) + G = 255; + + if (B < 0) + B = 0; + else if (B > 255) + B = 255; + + printf("--------------------------------\n"); + printf("R: A: %3d E: %3d %s\n", R, RGB[0], (R == RGB[0]) ? "" : "***"); + printf("G: A: %3d E: %3d %s\n", G, RGB[1], (G == RGB[1]) ? "" : "***"); + printf("B: A: %3d E: %3d %s\n", B, RGB[2], (B == RGB[2]) ? "" : "***"); + printf("Y: %+5d Cb: %+5d Cr: %+5d\n", YCbCr[0], YCbCr[1], YCbCr[2]); + printf("[0]: %20.16f\n", coeffs[0]); + printf("[1]: %20.16f\n", coeffs[1]); + printf("[2]: %20.16f\n", coeffs[2]); + printf("[3]: %20.16f\n", coeffs[3]); + printf("--------------------------------\n"); + + return 0; +} + +int test_YCbCr_pixels() +{ + if (0) + { + test_YCbCr_fp(TEST_YCbCrToRGB_01, TEST_YCbCr_01, TEST_RGB_01); + test_YCbCr_fp(TEST_YCbCrToRGB_01, TEST_YCbCr_02, TEST_RGB_02); + test_YCbCr_fp(TEST_YCbCrToRGB_01, TEST_YCbCr_03, TEST_RGB_03); + } + + if (1) + { + test_YCbCr_fp(TEST_YCbCrToRGB_02, TEST_YCbCr_01, TEST_RGB_01); + test_YCbCr_fp(TEST_YCbCrToRGB_02, TEST_YCbCr_02, TEST_RGB_02); + test_YCbCr_fp(TEST_YCbCrToRGB_02, TEST_YCbCr_03, TEST_RGB_03); + } + + return 0; +} + int TestPrimitivesYCbCr(int argc, char* argv[]) { int cmp; @@ -2162,6 +2235,8 @@ int TestPrimitivesYCbCr(int argc, char* argv[]) const primitives_t* prims = primitives_get(); static const prim_size_t roi_64x64 = { 64, 64 }; + return test_YCbCr_pixels(); + expected = (BYTE*) TEST_XRGB_IMAGE; size = 64 * 64 * 4; @@ -2204,13 +2279,13 @@ int TestPrimitivesYCbCr(int argc, char* argv[]) _aligned_free(pSrcDst[2]); } - if (0) + if (1) { test_fill_bitmap_red_channel(actual, 64, 64, 0); test_fill_bitmap_red_channel(expected, 64, 64, 0); } - if (0) + if (1) { test_fill_bitmap_green_channel(actual, 64, 64, 0); test_fill_bitmap_green_channel(expected, 64, 64, 0); 
From 81454c1171c02a165d9ed6809c1d177646dce03b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Moreau?= Date: Mon, 8 Sep 2014 15:47:03 -0400 Subject: [PATCH 21/31] libfreerdp-primitives: add more YCbCr test coefficients --- .../primitives/test/TestPrimitivesYCbCr.c | 32 ++++++++++++------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/libfreerdp/primitives/test/TestPrimitivesYCbCr.c b/libfreerdp/primitives/test/TestPrimitivesYCbCr.c index 79e6347e4..a56533a55 100644 --- a/libfreerdp/primitives/test/TestPrimitivesYCbCr.c +++ b/libfreerdp/primitives/test/TestPrimitivesYCbCr.c @@ -2151,8 +2151,11 @@ static void test_fill_bitmap_blue_channel(BYTE* data, int width, int height, BYT } } -static float TEST_YCbCrToRGB_01[4] = { 1.403f, 0.344f, 0.714f, 1.770f }; -static float TEST_YCbCrToRGB_02[4] = { 1.402525f, 0.343730f, 0.714401f, 1.769905f }; +#define TEST_FP_TYPE float + +static TEST_FP_TYPE TEST_YCbCrToRGB_01[4] = { 1.403f, 0.344f, 0.714f, 1.770f }; +static TEST_FP_TYPE TEST_YCbCrToRGB_02[4] = { 1.402525f, 0.343730f, 0.714401f, 1.769905f }; +static TEST_FP_TYPE TEST_YCbCrToRGB_03[4] = { 1.402524948120117L, 0.3437300026416779L, 0.7144010066986084L, 1.769904971122742L }; static INT16 TEST_YCbCr_01[3] = { +115, +1720, -2145 }; static BYTE TEST_RGB_01[3] = { 37, 161, 227 }; /* incorrect red */ @@ -2163,14 +2166,14 @@ static BYTE TEST_RGB_02[3] = { 21, 140, 221 }; /* incorrect green */ static INT16 TEST_YCbCr_03[3] = { -504, +1896, -2168 }; static BYTE TEST_RGB_03[3] = { 17, 140, 217 }; /* incorrect blue */ -int test_YCbCr_fp(float coeffs[4], INT16 YCbCr[3], BYTE RGB[3]) +int test_YCbCr_fp(TEST_FP_TYPE coeffs[4], INT16 YCbCr[3], BYTE RGB[3]) { INT16 R, G, B; - float Y, Cb, Cr; + TEST_FP_TYPE Y, Cb, Cr; - Y = (float) (YCbCr[0] + 4096); - Cb = (float) (YCbCr[1]); - Cr = (float) (YCbCr[2]); + Y = (TEST_FP_TYPE) (YCbCr[0] + 4096); + Cb = (TEST_FP_TYPE) (YCbCr[1]); + Cr = (TEST_FP_TYPE) (YCbCr[2]); R = ((INT16) (((Cr * coeffs[0]) + Y + 16.0f)) >> 5); G 
= ((INT16) ((Y - (Cb * coeffs[1]) - (Cr * coeffs[2]) + 16.0f)) >> 5); @@ -2196,10 +2199,10 @@ int test_YCbCr_fp(float coeffs[4], INT16 YCbCr[3], BYTE RGB[3]) printf("G: A: %3d E: %3d %s\n", G, RGB[1], (G == RGB[1]) ? "" : "***"); printf("B: A: %3d E: %3d %s\n", B, RGB[2], (B == RGB[2]) ? "" : "***"); printf("Y: %+5d Cb: %+5d Cr: %+5d\n", YCbCr[0], YCbCr[1], YCbCr[2]); - printf("[0]: %20.16f\n", coeffs[0]); - printf("[1]: %20.16f\n", coeffs[1]); - printf("[2]: %20.16f\n", coeffs[2]); - printf("[3]: %20.16f\n", coeffs[3]); + //printf("[0]: %20.20lf\n", coeffs[0]); + //printf("[1]: %20.20lf\n", coeffs[1]); + //printf("[2]: %20.20lf\n", coeffs[2]); + //printf("[3]: %20.20lf\n", coeffs[3]); printf("--------------------------------\n"); return 0; @@ -2221,6 +2224,13 @@ int test_YCbCr_pixels() test_YCbCr_fp(TEST_YCbCrToRGB_02, TEST_YCbCr_03, TEST_RGB_03); } + if (0) + { + test_YCbCr_fp(TEST_YCbCrToRGB_03, TEST_YCbCr_01, TEST_RGB_01); + test_YCbCr_fp(TEST_YCbCrToRGB_03, TEST_YCbCr_02, TEST_RGB_02); + test_YCbCr_fp(TEST_YCbCrToRGB_03, TEST_YCbCr_03, TEST_RGB_03); + } + return 0; } From a427a46ba5865bf9d0d43d00ead24394787f6c08 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Moreau?= Date: Mon, 8 Sep 2014 16:24:43 -0400 Subject: [PATCH 22/31] libfreerdp-primitives: start porting tests to Windows --- cmake/ConfigOptions.cmake | 2 +- libfreerdp/primitives/CMakeLists.txt | 2 +- .../primitives/test/TestPrimitivesSet.c | 4 +-- libfreerdp/primitives/test/measure.h | 24 ++++++++++++----- libfreerdp/primitives/test/prim_test.c | 7 +++++ libfreerdp/primitives/test/prim_test.h | 27 ++++++++++++------- 6 files changed, 46 insertions(+), 20 deletions(-) diff --git a/cmake/ConfigOptions.cmake b/cmake/ConfigOptions.cmake index d2e56eade..2fcec230f 100644 --- a/cmake/ConfigOptions.cmake +++ b/cmake/ConfigOptions.cmake @@ -1,5 +1,5 @@ -if((CMAKE_SYSTEM_PROCESSOR MATCHES "i386|i686|x86") AND (CMAKE_SIZEOF_VOID_P EQUAL 4)) +if((CMAKE_SYSTEM_PROCESSOR MATCHES "i386|i686|x86|AMD64") 
AND (CMAKE_SIZEOF_VOID_P EQUAL 4)) set(TARGET_ARCH "x86") elseif((CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64") AND (CMAKE_SIZEOF_VOID_P EQUAL 8)) set(TARGET_ARCH "x64") diff --git a/libfreerdp/primitives/CMakeLists.txt b/libfreerdp/primitives/CMakeLists.txt index 2c4ef7414..cf95a4116 100644 --- a/libfreerdp/primitives/CMakeLists.txt +++ b/libfreerdp/primitives/CMakeLists.txt @@ -100,7 +100,7 @@ endif() set_property(TARGET ${MODULE_NAME} PROPERTY FOLDER "FreeRDP/libfreerdp") -if(BUILD_TESTING AND ((NOT WIN32) AND (NOT APPLE))) +if(BUILD_TESTING AND NOT WIN32 AND NOT APPLE) add_subdirectory(test) endif() diff --git a/libfreerdp/primitives/test/TestPrimitivesSet.c b/libfreerdp/primitives/test/TestPrimitivesSet.c index 3d689eeff..2111d65c3 100644 --- a/libfreerdp/primitives/test/TestPrimitivesSet.c +++ b/libfreerdp/primitives/test/TestPrimitivesSet.c @@ -243,7 +243,7 @@ int test_set32u_func(void) } /* ------------------------------------------------------------------------- */ -static inline void memset32u_naive( +static INLINE void memset32u_naive( UINT32 val, UINT32 *dst, size_t count) @@ -275,7 +275,7 @@ int test_set32u_speed(void) } /* ------------------------------------------------------------------------- */ -static inline void memset32s_naive( +static INLINE void memset32s_naive( INT32 val, INT32 *dst, size_t count) diff --git a/libfreerdp/primitives/test/measure.h b/libfreerdp/primitives/test/measure.h index ba2909c00..2eb8ae80e 100644 --- a/libfreerdp/primitives/test/measure.h +++ b/libfreerdp/primitives/test/measure.h @@ -22,10 +22,6 @@ * Define GOOGLE_PROFILER if you want gperftools included. 
*/ -#ifdef _GNUC_ -# pragma once -#endif - #ifndef __MEASURE_H_INCLUDED__ #define __MEASURE_H_INCLUDED__ @@ -35,9 +31,21 @@ #include #endif -#include -#include -#include +#include + +#ifdef _WIN32 + +#define PROFILER_START(_prefix_) +#define PROFILER_STOP + +#define MEASURE_LOOP_START(_prefix_, _count_) +#define MEASURE_LOOP_STOP +#define MEASURE_GET_RESULTS(_result_) +#define MEASURE_SHOW_RESULTS(_result_) +#define MEASURE_SHOW_RESULTS_SCALED(_scale_, _label_) +#define MEASURE_TIMED(_label_, _init_iter_, _test_time_, _result_, _call_) + +#else #ifdef GOOGLE_PROFILER #include @@ -122,4 +130,6 @@ extern void _floatprint(float t, char *output); MEASURE_SHOW_RESULTS(_result_); \ } +#endif + #endif // __MEASURE_H_INCLUDED__ diff --git a/libfreerdp/primitives/test/prim_test.c b/libfreerdp/primitives/test/prim_test.c index a19b5f64b..b9757ac05 100644 --- a/libfreerdp/primitives/test/prim_test.c +++ b/libfreerdp/primitives/test/prim_test.c @@ -18,9 +18,11 @@ #include "prim_test.h" +#ifndef _WIN32 #include #include #include +#endif #include #include @@ -83,6 +85,10 @@ void get_random_data(void *buffer, size_t size) } /* ------------------------------------------------------------------------- */ + +#ifdef _WIN32 +float _delta_time(const struct timespec *t0, const struct timespec *t1) { return 0.0f; } +#else float _delta_time(const struct timespec *t0, const struct timespec *t1) { INT64 secs = (INT64) (t1->tv_sec) - (INT64) (t0->tv_sec); @@ -98,6 +104,7 @@ float _delta_time(const struct timespec *t0, const struct timespec *t1) retval = (double) secs + (double) nsecs / (double) 1000000000.0; return (retval < 0.0) ? 
0.0 : (float) retval; } +#endif /* ------------------------------------------------------------------------- */ void _floatprint(float t, char *output) diff --git a/libfreerdp/primitives/test/prim_test.h b/libfreerdp/primitives/test/prim_test.h index 42f8777c9..37db6a9b6 100644 --- a/libfreerdp/primitives/test/prim_test.h +++ b/libfreerdp/primitives/test/prim_test.h @@ -13,10 +13,6 @@ * this code may be covered by patents by HP, Microsoft, or other parties. */ -#ifdef __GNUC__ -# pragma once -#endif - #ifndef __PRIMTEST_H_INCLUDED__ #define __PRIMTEST_H_INCLUDED__ @@ -34,7 +30,11 @@ #include #endif +#ifdef _WIN32 +#define ALIGN(x) x +#else #define ALIGN(x) x DECLSPEC_ALIGN(MEMORY_ALLOCATION_ALIGNMENT) +#endif #define ABS(_x_) ((_x_) < 0 ? (-(_x_)) : (_x_)) #define MAX_TEST_SIZE 4096 @@ -112,7 +112,7 @@ extern int test_or_32u_speed(void); int size = size_array[s]; \ _prework_; \ iter = iterations/size; \ - sprintf(label, "%s-%-4d", oplabel, size); \ + sprintf_s(label, "%s-%-4d", oplabel, size); \ MEASURE_TIMED(label, iter, test_time, resultNormal[s], \ _funcNormal_); \ } \ @@ -128,7 +128,7 @@ extern int test_or_32u_speed(void); int size = size_array[s]; \ _prework_; \ iter = iterations/size; \ - sprintf(label, "%s-%s-%-4d", SIMD_TYPE, oplabel, size); \ + sprintf_s(label, "%s-%s-%-4d", SIMD_TYPE, oplabel, size); \ MEASURE_TIMED(label, iter, test_time, resultOpt[s], \ _funcOpt_); \ } \ @@ -147,7 +147,7 @@ extern int test_or_32u_speed(void); int size = size_array[s]; \ _prework_; \ iter = iterations/size; \ - sprintf(label, "IPP-%s-%-4d", oplabel, size); \ + sprintf_s(label, "IPP-%s-%-4d", oplabel, size); \ MEASURE_TIMED(label, iter, test_time, resultIPP[s], \ _funcIPP_); \ } \ @@ -158,6 +158,14 @@ extern int test_or_32u_speed(void); #define PRIM_NOP do {} while (0) /* ------------------------------------------------------------------------- */ + +#ifdef _WIN32 +#define STD_SPEED_TEST( \ + _name_, _srctype_, _dsttype_, _prework_, \ + _doNormal_, _funcNormal_, \ + 
_doOpt_, _funcOpt_, _flagOpt_, _flagExt_, \ + _doIPP_, _funcIPP_) +#else #define STD_SPEED_TEST( \ _name_, _srctype_, _dsttype_, _prework_, \ _doNormal_, _funcNormal_, \ @@ -210,7 +218,7 @@ static void _name_( \ _floatprint(resultOpt[s], sSN); \ if (resultNormal[s] > 0.0) \ { \ - sprintf(sSNp, "%d%%", \ + sprintf_s(sSNp, "%d%%", \ (int) (resultOpt[s] / resultNormal[s] * 100.0 + 0.5)); \ } \ } \ @@ -219,7 +227,7 @@ static void _name_( \ _floatprint(resultIPP[s], sIPP); \ if (resultNormal[s] > 0.0) \ { \ - sprintf(sIPPp, "%d%%", \ + sprintf_s(sIPPp, "%d%%", \ (int) (resultIPP[s] / resultNormal[s] * 100.0 + 0.5)); \ } \ } \ @@ -228,5 +236,6 @@ static void _name_( \ } \ free(resultNormal); free(resultOpt); free(resultIPP); \ } +#endif #endif // !__PRIMTEST_H_INCLUDED__ From 782872541396e00a09b479a780d26af3123f6390 Mon Sep 17 00:00:00 2001 From: erbth Date: Tue, 9 Sep 2014 00:13:18 +0200 Subject: [PATCH 23/31] YUV data conversion of H.264 implementation (egfx): only convert invalid areas SIMD SSSE3 conversion in primitives compiling all primitives sources with optimization and cleanup after last merge --- channels/drdynvc/client/dvcman.c | 9 +- include/freerdp/codec/h264.h | 6 +- libfreerdp/codec/CMakeLists.txt | 18 --- libfreerdp/codec/h264.c | 129 +++++----------- .../codec/test/Makefile.TestOpenH264ASM32 | 17 --- .../codec/test/Makefile.TestOpenH264ASM64 | 17 --- .../codec/test/Makefile.TestOpenH264SSSE3 | 14 -- libfreerdp/codec/test/TestOpenH264 | Bin 15584 -> 0 bytes libfreerdp/codec/test/TestOpenH264ASM.c | 92 ------------ libfreerdp/codec/test/TestOpenH264ASM.h | 7 - libfreerdp/primitives/CMakeLists.txt | 15 +- libfreerdp/primitives/prim_YUV.c | 138 +++++++++++------- libfreerdp/primitives/prim_YUV.h | 1 + .../prim_YUV_opt.c} | 97 ++++++------ winpr/libwinpr/utils/collections/StreamPool.c | 2 - 15 files changed, 199 insertions(+), 363 deletions(-) delete mode 100644 libfreerdp/codec/test/Makefile.TestOpenH264ASM32 delete mode 100644 
libfreerdp/codec/test/Makefile.TestOpenH264ASM64 delete mode 100644 libfreerdp/codec/test/Makefile.TestOpenH264SSSE3 delete mode 100755 libfreerdp/codec/test/TestOpenH264 delete mode 100644 libfreerdp/codec/test/TestOpenH264ASM.c delete mode 100644 libfreerdp/codec/test/TestOpenH264ASM.h rename libfreerdp/{codec/h264_ssse3.c => primitives/prim_YUV_opt.c} (80%) diff --git a/channels/drdynvc/client/dvcman.c b/channels/drdynvc/client/dvcman.c index 001717e14..f9e4873b8 100644 --- a/channels/drdynvc/client/dvcman.c +++ b/channels/drdynvc/client/dvcman.c @@ -486,7 +486,6 @@ int dvcman_receive_channel_data(IWTSVirtualChannelManager* pChannelMgr, UINT32 C int status = 0; DVCMAN_CHANNEL* channel; UINT32 dataSize = Stream_GetRemainingLength(data); - wStream* s; channel = (DVCMAN_CHANNEL*) dvcman_find_channel_by_id(pChannelMgr, ChannelId); @@ -499,7 +498,7 @@ int dvcman_receive_channel_data(IWTSVirtualChannelManager* pChannelMgr, UINT32 C if (channel->dvc_data) { /* Fragmented data */ - if (Stream_GetPosition(channel->dvc_data) + dataSize > (UINT32) Stream_Length(channel->dvc_data)) + if (Stream_GetPosition(channel->dvc_data) + dataSize > (UINT32) Stream_Capacity(channel->dvc_data)) { CLOG_ERR("data exceeding declared length!"); Stream_Release(channel->dvc_data); @@ -513,11 +512,9 @@ int dvcman_receive_channel_data(IWTSVirtualChannelManager* pChannelMgr, UINT32 C { Stream_SealLength(channel->dvc_data); Stream_SetPosition(channel->dvc_data, 0); - s=channel->dvc_data; + status = channel->channel_callback->OnDataReceived(channel->channel_callback, channel->dvc_data); + Stream_Release(channel->dvc_data); channel->dvc_data = NULL; - - status = channel->channel_callback->OnDataReceived(channel->channel_callback, s); - Stream_Release(s); } } else diff --git a/include/freerdp/codec/h264.h b/include/freerdp/codec/h264.h index d29a9e243..969914709 100644 --- a/include/freerdp/codec/h264.h +++ b/include/freerdp/codec/h264.h @@ -29,8 +29,7 @@ typedef struct _H264_CONTEXT H264_CONTEXT; 
typedef BOOL (*pfnH264SubsystemInit)(H264_CONTEXT* h264); typedef void (*pfnH264SubsystemUninit)(H264_CONTEXT* h264); -typedef int (*pfnH264SubsystemDecompress)(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize, - BYTE* pDstData, DWORD DstFormat, int nDstStep, int nXDst, int nYDst, int nWidth, int nHeight); +typedef int (*pfnH264SubsystemDecompress)(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize); struct _H264_CONTEXT_SUBSYSTEM { @@ -50,6 +49,9 @@ struct _H264_CONTEXT UINT32 width; UINT32 height; //int scanline; + + BYTE* pYUVData[3]; + int iStride[3]; /* <<<<<<< HEAD diff --git a/libfreerdp/codec/CMakeLists.txt b/libfreerdp/codec/CMakeLists.txt index f8ac3faa5..75999d262 100644 --- a/libfreerdp/codec/CMakeLists.txt +++ b/libfreerdp/codec/CMakeLists.txt @@ -101,24 +101,6 @@ if(WITH_LIBAVCODEC) set(FREERDP_LIBAVCODEC_LIBS ${LIBAVCODEC_LIB} ${LIBAVUTIL_LIB}) endif() -if(WITH_LIBAVCODEC OR WITH_OPENH264) - if(WITH_H264_SSSE3) - add_definitions(-DWITH_H264_SSSE3) - set(${MODULE_PREFIX}_SRCS - ${${MODULE_PREFIX}_SRCS} - h264_ssse3.c) - - if(CMAKE_COMPILER_IS_GNUCC) - set(OPTIMIZATION "${OPTIMIZATION} -msse2 -mssse3") - endif() - - if(MSVC) - set(OPTIMIZATION "${OPTIMIZATION} /arch:SSE2") - endif() - - set_property(SOURCE h264_ssse3.c PROPERTY COMPILE_FLAGS ${OPTIMIZATION}) - endif() -endif() add_complex_library(MODULE ${MODULE_NAME} TYPE "OBJECT" MONOLITHIC ${MONOLITHIC_BUILD} diff --git a/libfreerdp/codec/h264.c b/libfreerdp/codec/h264.c index 77527a4de..5f8f688ab 100644 --- a/libfreerdp/codec/h264.c +++ b/libfreerdp/codec/h264.c @@ -28,39 +28,14 @@ #include #include -#ifdef WITH_LIBAVCODEC -int h264_prepare_rgb_buffer(H264_CONTEXT* h264, int width, int height) -{ - UINT32 size; +#include - h264->width = width; - h264->height = height; - h264->scanline = h264->width * 4; - size = h264->scanline * h264->height; - - if (size > h264->size) - { - h264->size = size; - - if (!h264->data) - h264->data = (BYTE*) _aligned_malloc(h264->size, 16); - else - h264->data = 
(BYTE*) _aligned_realloc(h264->data, h264->size, 16); - } - - if (!h264->data) - return -1; - - return 1; -} -#endif /** * Dummy subsystem */ -static int dummy_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize, - BYTE* pDstData, DWORD DstFormat, int nDstStep, int nXDst, int nYDst, int nWidth, int nHeight) +static int dummy_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize) { return -1; } @@ -107,13 +82,9 @@ static void openh264_trace_callback(H264_CONTEXT* h264, int level, const char* m static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize) { - int srcStep[3]; - prim_size_t roi; - BYTE* pYUVData[3]; DECODING_STATE state; SBufferInfo sBufferInfo; SSysMEMBuffer* pSystemBuffer; - primitives_t* prims = primitives_get(); H264_CONTEXT_OPENH264* sys = (H264_CONTEXT_OPENH264*) h264->pSystemData; struct timeval T1,T2; @@ -147,7 +118,7 @@ static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSiz */ if (sBufferInfo.iBufferStatus != 1) - state = (*sys->pDecoder)->DecodeFrame2(sys->pDecoder, NULL, 0, pYUVData, &sBufferInfo); + state = (*sys->pDecoder)->DecodeFrame2(sys->pDecoder, NULL, 0, h264->pYUVData, &sBufferInfo); gettimeofday(&T2,NULL); printf("OpenH264: decoding took: %u sec %u usec\n",(unsigned int)(T2.tv_sec-T1.tv_sec),(unsigned int)(T2.tv_usec-T1.tv_usec)); @@ -164,17 +135,19 @@ static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSiz if (state != 0) return -1; - if (!h264->pYUVData[0] || !h264->pYUVData[1] || !h264->pYUVData[2]) - return -1; - if (sBufferInfo.iBufferStatus != 1) - return -1; + return -2; if (pSystemBuffer->iFormat != videoFormatI420) return -1; + if (!h264->pYUVData[0] || !h264->pYUVData[1] || !h264->pYUVData[2]) + return -1; + h264->iStride[0] = pSystemBuffer->iStride[0]; h264->iStride[1] = pSystemBuffer->iStride[1]; + h264->iStride[2] = pSystemBuffer->iStride[1]; + h264->width = pSystemBuffer->iWidth; h264->height = pSystemBuffer->iHeight; @@ -305,16 
+278,11 @@ struct _H264_CONTEXT_LIBAVCODEC }; typedef struct _H264_CONTEXT_LIBAVCODEC H264_CONTEXT_LIBAVCODEC; -static int libavcodec_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize, - BYTE* pDstData, DWORD DstFormat, int nDstStep, int nXDst, int nYDst, int nWidth, int nHeight) +static int libavcodec_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize) { int status; - int srcStep[3]; int gotFrame = 0; AVPacket packet; - prim_size_t roi; - const BYTE* pSrc[3]; - primitives_t* prims = primitives_get(); H264_CONTEXT_LIBAVCODEC* sys = (H264_CONTEXT_LIBAVCODEC*) h264->pSystemData; struct timeval T1,T2; @@ -346,22 +314,19 @@ static int libavcodec_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcS if (gotFrame) { - if (h264_prepare_rgb_buffer(h264, sys->videoFrame->width, sys->videoFrame->height) < 0) - return -1; + h264->pYUVData[0] = sys->videoFrame->data[0]; + h264->pYUVData[1] = sys->videoFrame->data[1]; + h264->pYUVData[2] = sys->videoFrame->data[2]; - roi.width = h264->width; - roi.height = h264->height; + h264->iStride[0] = sys->videoFrame->linesize[0]; + h264->iStride[1] = sys->videoFrame->linesize[1]; + h264->iStride[2] = sys->videoFrame->linesize[2]; - pSrc[0] = sys->videoFrame->data[0]; - pSrc[1] = sys->videoFrame->data[1]; - pSrc[2] = sys->videoFrame->data[2]; - - srcStep[0] = sys->videoFrame->linesize[0]; - srcStep[1] = sys->videoFrame->linesize[1]; - srcStep[2] = sys->videoFrame->linesize[2]; - - prims->YUV420ToRGB_8u_P3AC4R(pSrc, srcStep, h264->data, h264->scanline, &roi); + h264->width = sys->videoFrame->width; + h264->height = sys->videoFrame->height; } + else + return -2; return 1; } @@ -482,6 +447,8 @@ int h264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize, int* iStride; int ret, i, cx, cy; int UncompressedSize; + primitives_t *prims = primitives_get(); + prim_size_t roi; struct timeval T1,T2; @@ -489,24 +456,24 @@ int h264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize, return -1; #if 0 - 
printf("h264_decompress: pSrcData=%p, SrcSize=%u, pDstData=%p, nDstStep=%d, nXDst=%d, nYDst=%d, nWidth=%d, nHeight=%d)\n", - pSrcData, SrcSize, *ppDstData, nDstStep, nXDst, nYDst, nWidth, nHeight); + printf("h264_decompress: pSrcData=%p, SrcSize=%u, pDstData=%p, nDstStep=%d, nDstHeight=%d, numRegionRects=%d\n", + pSrcData, SrcSize, *ppDstData, nDstStep, nDstHeight, numRegionRects); #endif if (!(pDstData = *ppDstData)) return -1; -<<<<<<< HEAD - if (h264->subsystem->Decompress(h264, pSrcData, SrcSize, - pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight)) - return -1; + if ((ret = h264->subsystem->Decompress(h264, pSrcData, SrcSize)) < 0) + return ret; UncompressedSize = h264->width * h264->height * 4; if (UncompressedSize > (nDstStep * nDstHeight)) return -1; + pYUVData = h264->pYUVData; + iStride = h264->iStride; gettimeofday(&T1,NULL); for (i = 0; i < numRegionRects; i++){ @@ -517,32 +484,18 @@ int h264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize, pDstPoint = pDstData + rect->top * nDstStep + rect->left * 4; pYUVPoint[0] = pYUVData[0] + rect->top * iStride[0] + rect->left; - ret = rect->top/2 * iStride[1] + rect->left/2; - pYUVPoint[1] = pYUVData[1] + ret; - pYUVPoint[2] = pYUVData[2] + ret; + pYUVPoint[1] = pYUVData[1] + rect->top/2 * iStride[1] + rect->left/2; + pYUVPoint[2] = pYUVData[2] + rect->top/2 * iStride[2] + rect->left/2; #if 0 printf("regionRect: x: %d, y: %d, cx: %d, cy: %d\n", rect->left, rect->top, cx, cy); #endif -#ifdef WITH_H264_SSSE3 - freerdp_image_yuv420p_to_xrgb_ssse3(pDstPoint, pYUVPoint, cx, cy, iStride, nDstStep); -#else -/* roi.width = h264->width; - roi.height = h264->height; + roi.width = cx; + roi.height = cy; - pSrc[0] = sys->videoFrame->data[0]; - pSrc[1] = sys->videoFrame->data[1]; - pSrc[2] = sys->videoFrame->data[2]; - - srcStep[0] = sys->videoFrame->linesize[0]; - srcStep[1] = sys->videoFrame->linesize[1]; - srcStep[2] = sys->videoFrame->linesize[2]; - - prims->YUV420ToRGB_8u_P3AC4R(pSrc, srcStep, 
h264->data, h264->scanline, &roi) - */ -#endif + prims->YUV420ToRGB_8u_P3AC4R((const BYTE**) pYUVPoint, iStride, pDstPoint, nDstStep, &roi); } gettimeofday(&T2,NULL); printf("converting took %u sec %u usec\n",(unsigned int)(T2.tv_sec-T1.tv_sec),(unsigned int)(T2.tv_usec-T1.tv_usec)); @@ -582,24 +535,12 @@ H264_CONTEXT* h264_context_new(BOOL Compressor) h264 = (H264_CONTEXT*) calloc(1, sizeof(H264_CONTEXT)); -#ifdef WITH_H264_SSSE3 - if(freerdp_check_ssse3()){ - printf("SSSE3 seems to be not supported on this system, try without WITH_H264_SSSE3 ..."); - return NULL; - } -#endif - if (h264) { h264->Compressor = Compressor; h264->subsystem = &g_Subsystem_dummy; -#ifdef WITH_LIBAVCODEC - if (h264_prepare_rgb_buffer(h264, 256, 256) < 0) - return NULL; -#endif - if (!h264_context_init(h264)) { free(h264); @@ -614,10 +555,6 @@ void h264_context_free(H264_CONTEXT* h264) { if (h264) { -#ifdef WITH_LIBAVCODEC - _aligned_free(h264->data); -#endif - h264->subsystem->Uninit(h264); free(h264); diff --git a/libfreerdp/codec/test/Makefile.TestOpenH264ASM32 b/libfreerdp/codec/test/Makefile.TestOpenH264ASM32 deleted file mode 100644 index 2a0308db4..000000000 --- a/libfreerdp/codec/test/Makefile.TestOpenH264ASM32 +++ /dev/null @@ -1,17 +0,0 @@ -TestOpenH264ASM: TestOpenH264ASM.c.o h264.c.o h264_ssse3.asm.o h264.asm.o - gcc -o TestOpenH264ASM TestOpenH264ASM.c.o h264.c.o h264_ssse3.asm.o h264.asm.o -lwinpr - -h264_ssse3.asm.o: ../h264_ssse3_x32.asm - nasm -f elf32 -o h264_ssse3.asm.o ../h264_ssse3_x32.asm - -h264.asm.o: ../h264_x32.asm - nasm -f elf32 -o h264.asm.o ../h264_x32.asm - -TestOpenH264ASM.c.o: TestOpenH264ASM.c - gcc -c -o TestOpenH264ASM.c.o TestOpenH264ASM.c - -h264.c.o: ../h264.c - gcc -c -o h264.c.o ../h264.c - -clean: - rm -f TestOpenH264ASM TestOpenH264ASM.c.o h264_ssse3.asm.o h264.c.o h264.asm.o diff --git a/libfreerdp/codec/test/Makefile.TestOpenH264ASM64 b/libfreerdp/codec/test/Makefile.TestOpenH264ASM64 deleted file mode 100644 index 53e208b69..000000000 --- 
a/libfreerdp/codec/test/Makefile.TestOpenH264ASM64 +++ /dev/null @@ -1,17 +0,0 @@ -TestOpenH264ASM: TestOpenH264ASM.c.o h264.c.o h264_ssse3.asm.o h264.asm.o - gcc -o TestOpenH264ASM TestOpenH264ASM.c.o h264.c.o h264_ssse3.asm.o h264.asm.o -lwinpr - -h264_ssse3.asm.o: ../h264_ssse3_x64.asm - nasm -f elf64 -o h264_ssse3.asm.o ../h264_ssse3_x64.asm - -h264.asm.o: ../h264_x64.asm - nasm -f elf64 -o h264.asm.o ../h264_x64.asm - -TestOpenH264ASM.c.o: TestOpenH264ASM.c - gcc -c -o TestOpenH264ASM.c.o TestOpenH264ASM.c - -h264.c.o: ../h264.c - gcc -c -o h264.c.o ../h264.c - -clean: - rm -f TestOpenH264ASM TestOpenH264ASM.c.o h264_ssse3.asm.o h264.c.o h264.asm.o diff --git a/libfreerdp/codec/test/Makefile.TestOpenH264SSSE3 b/libfreerdp/codec/test/Makefile.TestOpenH264SSSE3 deleted file mode 100644 index 7709e9423..000000000 --- a/libfreerdp/codec/test/Makefile.TestOpenH264SSSE3 +++ /dev/null @@ -1,14 +0,0 @@ -TestOpenH264: TestOpenH264ASM.c.o h264.c.o h264_ssse3.c.o - gcc -o TestOpenH264 TestOpenH264ASM.c.o h264.c.o h264_ssse3.c.o -lwinpr - -h264_ssse3.c.o: ../h264_ssse3.c - gcc -c -O3 -o h264_ssse3.c.o ../h264_ssse3.c -mssse3 - -TestOpenH264ASM.c.o: TestOpenH264ASM.c - gcc -c -o TestOpenH264ASM.c.o TestOpenH264ASM.c - -h264.c.o: ../h264.c - gcc -c -o h264.c.o ../h264.c - -clean: - rm -f TestOpenH264 TestOpenH264ASM.c.o h264_ssse3.asm.o h264.c.o h264.asm.o diff --git a/libfreerdp/codec/test/TestOpenH264 b/libfreerdp/codec/test/TestOpenH264 deleted file mode 100755 index c92bd5af2190f0d681727a24c74e78bfb62ea1c6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 15584 zcmcIre{@vUoxd{^AVkargc=d$iIbM>mJldlRID>F4_|bm6o}OpoP=Zo(Ik_d9}-$3 zamRS_c8Jnd_fWT-0%H;=e>8gs-6{Ai-pNxW#48*4d*#IWZW6+|2iiHR>pFflg(zc z*hC-`a5*`LVk0Hvm@qBlTp?#eoI`1=0IW_UA>vsEg&d3^J*1fO8w*a6G~&1C7^I9D zQ#e3+o>VKmXbw{hZ%WzkbQ0`3<%68tR+E zE%RE+mdsnSs3hnwS;+k_fI%wa9d}o9)3QEhf}*^qGcmmQZ3Q;?u`z82S2xURvKkLzk&WdCcpGJ~jg4pV<1 zA7mgO^CW@zN?cUlNw}yySa{ku3Q9EOw~u 
z3a-qc0H7-~dO5h5v%y0_8cy_+b zf=|xk=iV&x4`#vN%Yxqp`vvSwva6^H5@VrE;5#z#+q2+S;8W47rMw*Hzk$hVYzpg| z>>M+W)YBZdGtYJDIK#ZKslKT>KvThjk_F7`-O%W7@&-fIfsohBymvnAUFF+Q9}M{d z%NweLL0^!0%hyzRYkdI$uc=tx;BWG+sb1gUGkD{3svGJzH2G@1bpfA`l?PV_{58H{ z&>vXgs}6+&zLf!A(AN}FTbQ@5zNtRD1oMI&VqR&lvAUtbUqc0{p+YQSb#)El;6_&G zZ}v5@=5Q#;Y7CKJD9`|?(bpLCg;-rpgFonF!4QfPV0D`U^&ubI;0uN78-4z|+Um`y z3o!0Q*t62Of zHvS|gDWWa>&xmOrvamOyOY<0o@XUHlS5~$X?M|WJQN?4LtLTzJ_6M3PY0i^jRLIlZ zC`=eE(ccIcOBlyaC3QHC_aJOTxkSrp4BbiAIBoT^c5=~8;=;QP>qNzy}F;3?Z zO--5T;>5qt}CQVdv`j13YQzlk& z`Y6%Vgb637-z1uvE>X(q14L7kB@|BYC7PD-L;4f=|lzBM)f$M_S~>@X9siJ+ZD?PNwyA$9U9p@d_u4-vK)#+qY9) zYtae+d5yxf_E^Z8?BkWb-5mD(6b)6W&D()Ui;U)L`pHe!bqDt#Kdv;gZa=Y-1NPZ% zgzX_p%6E_2Ag}2MwC>ZlYnDDuKdc?O6o&p@lGWOehNS*SLci^{1l(casg=--KLSVB z9iH$3Aj`Rf)c#uJw%n^B5V|(y!qm$B19RY=elpp&F7ZRCBp$|X`*t$1T6=V87xLoL z@#|6VDCu03`iHQo>D_y&Q|_bgvwtI&i1-%N;U3^6 zS^Ts??J=n11{K>!EwZ2e@(~smJB0 z?$h6-WXD9P?SWdSW^0exweIdwD=b(JXweE`&C+Y%%{r`*i(M|)$K@)Aw3?%u+DSU1vb9N8{p4n)=WT>4*J`mla@e;YcG(a-9>G|HewVpdeMefKal z`$1Vix$V1SL~_O`QUfWmUO|zVeK%S`9pI^$zR02K-7b9;*~C)WBri}S#HqWAE;INL z^_3dZ?8kt!?=Il9VV5^~*@4h<=umxSh$6?{6g0rK$q7KnbU2miKOxh>D`jVR@)7pP zwn%H7h4S`ac|8Wu3t%I6v`|m{Gb6o+(jOb?7^VMUr28rTmXRKy^lVfcuR!h5Lo{f= zzwRK7VZHTYr1c`(Jc;uLuX?ogB8{O(Chb8xCI@X%^`aq&2~E+@=zY$P)}fAyc!zqz zz8l`F7ZJrHu{K>DMd}1kjX(?I=cC-HI#M5d3Uz4di^SWyM|0uGp?3m5b`q;5_n+v(f88~o`@-0B(EQajl14x{i z-1i1ADrPGMy^yL${M`s+Hu25GVCg{js1=YNdHb|&4qq}|MFErXXf8Szo zsV-L&h5|?1$p-Hd73gg3WS7r(B=iIcMb%MCzhp|+A)Ru_X*kqRxxf4#-Jj_rlX_<_ zqUem<4DSYrb7XJXW!~wd5bkUze#*O-3Dea$GLO?NOzX2T3*Swh=seibPRb?v4CN@) zT^OoI_O7}thi{DKKqcwm#%ZpL%Y12w5bHT&i9Bf!VB~o}<@t}C)lS9Kq7YU`d(;t# zu)zhI}qaNo@N7Vu?ihs2*%Iv>V=V~>*QB|R=dijc~4n)=G z!FVxRxvTplD;8Nk`nwEfyhk0TG9-#&3`Pnt)#SonR2`01%KZ0 z!HXyywHLJ~s`l^IotzwqsKcm~;ak<8*tfkUHv`xgRf)-n<`^^>dTn?Bs)#XsP{Jq{ zvXR5;Id)h*i|Y& zHI{lj+D_S`lNa2o9uJCKU!xvv-Z;2WTDfz&158@WuxpzBIt>66w{WaK_t{@@=Gb4M zu8kzKzY-HGy!y5QIVYyLPAjpEErHrA*o4`C#oZD;*f0ZssT zlQQ;7Zy7tO{aq=ucR_`3u(8Hduam<{Z^bGU9@eqx)GG!&y4ugB#KI1)X^C$g+DAX2 
z_R|Cz{{{I#gD}!M%0e^X#b~s0$mr(OyI$knQXLiRS!AdH!&gX8A!!m)VdRa2_~App ziT2a4$M{QUlF9biUdkghbwB+;i*E(A;3ziKPh*d5{8n;(*LAN;f5%nn(%*NjDc48% z4}nUL{`zW7|8lmEm8d_Hn<>!&NCQydjh>|`S!*(#!I9R>Oaq4{kWtaWz( zgryUa-Hf&iHX`gq&T_<4|6O@S8;FWlxN@yEB<|_XPWiV4`~DCDt=pSKkw?iJ86I70W84o z&IgkfH76=+VjjKLQQ&$F>ps(JlI4-tEp=|Y|2hRuROp{-`YeYB<;gAYK9$pcw8DPR zsq*fV)(Xo&`%#bXb||hrw6p?$9j;$_0l%$1mL5&t;=tmy#ZgGMW}E;@i7N+@ARdP& zd_pkTVZ`>?ZpM~(F1_7@`f7i}^*d@74+@;?w!4$0ytoNs9{nx#hOac+T1Vjmj?SZB z^L6&mjwQZCZHeENbuN1oRdCT2V3tP?A*o6OvqX6ff=W#;N?+*aGSZ6;Oud~;m ztg}~tMKWi(%=^GjPB>-geGWe83m@US0J-5h`=!%prd@6D6zQ{+tF1h8(8#S>itiU+Pa4xVMj3+?o`3r(;Q^$8z%@K$WqNU3T|~>ouxFmoft5hj0x+>r|aRFp^7ZKmx<) zZl4}Rc|NPNpY8_mIkAo%06GXH!EGf#eGKYDPNB92(b9v5?CrmVb@j1N&bjPcE#YQQ zO&HXPy5;sJM=2MD67q%LCqPoJZZhO1ORD`CRb!9I2jowl!;4FBIB=d^JBMt~aCjJc zg9CIs6~6x*YUalBNOwUw_AgrGV8P879;tPn@3Gh)vtr&lXZLqen?TIJlPVBES>gkO zvy%qZS(kmdxXykIb?|4@kK(j1$c+e5C#8PLsY{3x!OVZwc6MqKK6TkgaOjVIXWeZ~8U*}ruTIYJe z<)ul-6wV#2qfla>{TTeEq0!*zr|~$TbCtB+$^dO~pq9Mmz4+>IBF3Y)*&wAW5<7N?0PlI?u{w<}+A5wzh=4O8& zM7Wo~IpQEAzI#Y%lh3HkjGDMeu==&PY9jQE>*kmHmIEHUYaHu@Ai zCl5D-lugw^rRYB6KTk3A1PP6b^3|&=Tx*tVN_9vns=Y-iT6mvw!)+AhG2g?!z~<1# z`lb!aJ$D)Y&kr>=&)?MG-!OlD&|6envT@;(MQqc0Him0Jh|}EM$W|`SV@0*Fo5vO| zMw+p^T^^+pkL49trQ-HA)HmW`df*nNzP1KgLyTSt_?uSwYC^XtEd+g2*=$5Lf~?^* z*`iS6Z-SE{a&eq`%Hxj&(J^@-c}h?$gTXOjZLH0LKP&!h_yd{tSIW}=+F;0h9kZnW zJ!MJ%Yhmesu#WM9%ZiJ#W8~i0#qo*Y<>AUV#7tzHHYOI{Ol^g1UL(E zDdM$=S0Zjh+>E#jaR*|C;o6V50P!%<2=bY>W{Yxp@G{7+ElM+uO2tpCOHAv=q}|LHmGqNolfpUT@}OcIZ-@|o3-}kn=VL_LQOEc{0sjQ}PP6_G$N2P1A?JKDd4rk1*5FgS z9tS@c{Q1Op=NCMfpUXT*jsRDHFUtY3EWhAM>$3d9Cv0lI(wU>?&)qhmJiqkuiSGQe z$RtmGvn@X-zifGasVjf(vU~-S&{zgbMxcD$s0Rg)yQn&3pn6&hpb)rD;CHr7czmKa z-?<_(=}GGowoc016#DS|ojPewKoKFG}zFDah|dt~E&h zT>&L$3c7zz6;DySP>|)5ZdO^HKpFe|QqoVcVslc&u?agm}faHvSS!doDK=TF!TQ&$?tSBGx}h%lS8LOd|t#8^9=lJBL2FFmxy?Ui0>6~ zoroV6@#7-?7ZLxvhh*|20VCchQ+Pq`3pBG5>_U7cu4U!(Le(1twV^3ovOZjo&w%P{ zSqV`atMRJ5q;_)?>=|(=U~sVh2C-(R5nk{DzJ_X25UJ*d5G&ztPDY;V=Z(gWoUS-0|C5d=NdBoU=74zy}Gf!23h-I5VsPZi(wIO+*wJD 
zzp)Wt3uIR3-%Qi^roJo34YN8yAqYT@Zvz~gUnG4a5;TUS{w%8#6y|dQD4ZI*$@xJd z3y?7DFSR;3YpKw8aBA$9sjq=WbBfHr%IXBgSd{=`oErN^AXm`RjvF+lcE8k@^OPbc zmZe$r1u#tREBeV@&Aty_^(A zOwU3n$oOfb&HD0t(qXZXtu=o%VbUWT6rROp)|czB=Q_?#`$mtDO#gQRBYj!_Ro6K| zq5lhg+Si(c^jl)PkTC1Z?=R(dnN&V=$jpB?m}Y%VadL4@;SqZ*BY)X`2A8oI64XXw zxTN+S^83|H`AnMjuy&Ka+?Vvv0f>+(pGnj23G!d&PhSvF82Bso{{&g;ds1Jn!*QWM zOvVw!a7g(h+u;;==KSS%P5ln8mXbH9vGT}r3})!deb9i=-z61AEcIkOlA+&GRK+Db ziVOwmr$LS7FX;4mW$ZqFQwxjvbV*-$HT-ATtM zGr69m<9ST36X`fsQaS&d7`Ab~=OnfbD}tP_P2#}wQ{VNc;UzM7_((&m`&P(a|49qiCW{9V4_!3;sBW9)zUuetuAssJF{r*YE zuVHdLr{iPa&8Fk>*WMV49*EwS&t_fW90QS}ahHSdc+)@-vv67P^?PQyih+|IXNH|` z3%of4|Bk>jf8TE4_Ghx4VxA%YC_IeIf~PZ?_52)g%C|4W{?EDms(@B;!rgV=&!(a#J2<@8|8gEQH#47>mZBs(t@IJt!%2|Etp zm;^;~YZm;+9KU>i`)L;WUuVG& zXTeW$JfA%;%0=rTg>wS`*J(~p^Jj%feGQAE!jbgbLf}--Qwp7&af4vHfltM*TF%3A zT&(8u={AV+Wx+QCS1uQ4PXR9&r@!swcJf)N@Mpfz{f)q9&v0^tKf6SVp4m>dUXwa^ zAjDapz_*L>EAc-GJM`Ht1)bmVyj0*-B47S&5xl7=IGvA>`zRsKt^-c>Rx#blA^xlt zsc&;!N{z*S9S z$2>SH5ejT(#)F~SaAPB#wJ=fqgh!g_8ecGU56*MYTPWA+3V2U%s1VgO)Hk#HEARD& z{N7b}EMptIB(j1YIhEH6${lWO=17ohd68qYm-M~-i4sro7k9i;R;+SWs9yDMHywZB zRV31PcHa`zW;7lMfq|@*@@F4M)u^$ z=J3OYALZH3#4zK~iPLF5U6NHACuoclF6r{(FiTA^Y@Dx=M_DqY43D{Ycn3#M^t^O1 z2j^`}y(r7UAUbQ4Zq9t1CY@tAVVvDb=kRko>Cj~danjks+s0;i8bq)1`2w}g-ulMs z4L)y;zj-rN@uG#L&D2L)0vp!af_x`A}IW}~m>AusQQqL2D$Oz{GFz0gl}$l9RayRo{dmJVAP=aZ;O`2$#K z%qV$Ap@nhti6vfCIA7oFZNhL4^Nt+G(Jt}+R-Bv4C>R}cGR{t=GmL{&>6meDUc3?b@heeGe!eT+=>G!h9cxDb diff --git a/libfreerdp/codec/test/TestOpenH264ASM.c b/libfreerdp/codec/test/TestOpenH264ASM.c deleted file mode 100644 index 040b1650d..000000000 --- a/libfreerdp/codec/test/TestOpenH264ASM.c +++ /dev/null @@ -1,92 +0,0 @@ -#include -#include -#include - -#include - -#include "TestOpenH264ASM.h" - -#define WIDTH 1920 -#define HEIGHT 1080 - -#define SSSE3 1 - - -int main(void){ - int i,j,k; - int ret; - unsigned char *pDstData_c,*pDstData_asm,*pSrcData[3]; - int nSrcStep[2]; - -#if SSSE3 - if(freerdp_check_ssse3()){ - fprintf(stderr,"ssse3 not 
supported!\n"); - return EXIT_FAILURE; - } -#endif - - struct timeval t1,t2,t3; - - pSrcData[0]=malloc(1984*HEIGHT*sizeof(char)); - pSrcData[1]=malloc(1984*HEIGHT/4*sizeof(char)); - pSrcData[2]=malloc(1984*HEIGHT/4*sizeof(char)); - pDstData_asm=_aligned_malloc(WIDTH*(HEIGHT+1)*4*sizeof(char),16); - pDstData_c=malloc(WIDTH*(HEIGHT+1)*4*sizeof(char)); - - memset(pDstData_asm,0xFF,WIDTH*(HEIGHT+1)*4*sizeof(char)); - memset(pDstData_c,0xFF,WIDTH*(HEIGHT+1)*4*sizeof(char)); - - for(i=0;iwidth & 0x01; + last_line = roi->height & 0x01; + + nWidth = (roi->width + 1) & ~0x0001; + nHeight = (roi->height + 1) & ~0x0001; - halfWidth = roi->width / 2; - halfHeight = roi->height / 2; + halfWidth = nWidth / 2; + halfHeight = nHeight / 2; - srcPad[0] = (srcStep[0] - roi->width); + srcPad[0] = (srcStep[0] - nWidth); srcPad[1] = (srcStep[1] - halfWidth); srcPad[2] = (srcStep[2] - halfWidth); - dstPad = (dstStep - (roi->width * 4)); + dstPad = (dstStep - (nWidth * 4)); - for (y = 0; y < halfHeight; y++) + for (y = 0; y < halfHeight; ) { - for (x = 0; x < halfWidth; x++) + y++; + if (y == halfHeight) + last_line = last_line << 1; + + for (x = 0; x < halfWidth; ) { + x++; + if (x == halfWidth) + last_column = last_column << 1; + U = *pU++; V = *pV++; @@ -105,32 +121,41 @@ pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], int srcStep[3], /* 2nd pixel */ - Y = *pY++; - Yp = Y << 8; + if (!(last_column & 0x02)) + { + Y = *pY++; + Yp = Y << 8; - R = (Yp + Vp403) >> 8; - G = (Yp - Up48 - Vp120) >> 8; - B = (Yp + Up475) >> 8; + R = (Yp + Vp403) >> 8; + G = (Yp - Up48 - Vp120) >> 8; + B = (Yp + Up475) >> 8; - if (R < 0) - R = 0; - else if (R > 255) - R = 255; + if (R < 0) + R = 0; + else if (R > 255) + R = 255; - if (G < 0) - G = 0; - else if (G > 255) - G = 255; + if (G < 0) + G = 0; + else if (G > 255) + G = 255; - if (B < 0) - B = 0; - else if (B > 255) - B = 255; + if (B < 0) + B = 0; + else if (B > 255) + B = 255; - *pRGB++ = (BYTE) B; - *pRGB++ = (BYTE) G; - *pRGB++ = (BYTE) 
R; - *pRGB++ = 0xFF; + *pRGB++ = (BYTE) B; + *pRGB++ = (BYTE) G; + *pRGB++ = (BYTE) R; + *pRGB++ = 0xFF; + } + else + { + pY++; + pRGB += 4; + last_column = last_column >> 1; + } } pY += srcPad[0]; @@ -138,8 +163,12 @@ pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], int srcStep[3], pV -= halfWidth; pRGB += dstPad; - for (x = 0; x < halfWidth; x++) + for (x = 0; x < halfWidth; ) { + x++; + if (x == halfWidth) + last_column = last_column << 1; + U = *pU++; V = *pV++; @@ -183,32 +212,41 @@ pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], int srcStep[3], /* 4th pixel */ - Y = *pY++; - Yp = Y << 8; + if(!(last_column & 0x02)) + { + Y = *pY++; + Yp = Y << 8; - R = (Yp + Vp403) >> 8; - G = (Yp - Up48 - Vp120) >> 8; - B = (Yp + Up475) >> 8; + R = (Yp + Vp403) >> 8; + G = (Yp - Up48 - Vp120) >> 8; + B = (Yp + Up475) >> 8; - if (R < 0) - R = 0; - else if (R > 255) - R = 255; + if (R < 0) + R = 0; + else if (R > 255) + R = 255; - if (G < 0) - G = 0; - else if (G > 255) - G = 255; + if (G < 0) + G = 0; + else if (G > 255) + G = 255; - if (B < 0) - B = 0; - else if (B > 255) - B = 255; + if (B < 0) + B = 0; + else if (B > 255) + B = 255; - *pRGB++ = (BYTE) B; - *pRGB++ = (BYTE) G; - *pRGB++ = (BYTE) R; - *pRGB++ = 0xFF; + *pRGB++ = (BYTE) B; + *pRGB++ = (BYTE) G; + *pRGB++ = (BYTE) R; + *pRGB++ = 0xFF; + } + else + { + pY++; + pRGB += 4; + last_column = last_column >> 1; + } } pY += srcPad[0]; @@ -223,6 +261,8 @@ pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], int srcStep[3], void primitives_init_YUV(primitives_t* prims) { prims->YUV420ToRGB_8u_P3AC4R = general_YUV420ToRGB_8u_P3AC4R; + + primitives_init_YUV_opt(prims); } void primitives_deinit_YUV(primitives_t* prims) diff --git a/libfreerdp/primitives/prim_YUV.h b/libfreerdp/primitives/prim_YUV.h index 12f796b61..99428ada6 100644 --- a/libfreerdp/primitives/prim_YUV.h +++ b/libfreerdp/primitives/prim_YUV.h @@ -22,6 +22,7 @@ pstatus_t general_yCbCrToRGB_16s8u_P3AC4R(const INT16* pSrc[3], 
int srcStep, BYTE* pDst, int dstStep, const prim_size_t* roi); void primitives_init_YUV(primitives_t* prims); +void primitives_init_YUV_opt(primitives_t* prims); void primitives_deinit_YUV(primitives_t* prims); #endif /* FREERDP_PRIMITIVES_YUV_H */ diff --git a/libfreerdp/codec/h264_ssse3.c b/libfreerdp/primitives/prim_YUV_opt.c similarity index 80% rename from libfreerdp/codec/h264_ssse3.c rename to libfreerdp/primitives/prim_YUV_opt.c index 1774856b4..4b5cea145 100644 --- a/libfreerdp/codec/h264_ssse3.c +++ b/libfreerdp/primitives/prim_YUV_opt.c @@ -1,32 +1,32 @@ /** function for converting YUV420p data to the RGB format (but without any special upconverting) * It's completely written in nasm-x86-assembly for intel processors supporting SSSE3 and higher. - * The target scanline (6th parameter) must be a multiple of 16. - * iStride[0] must be (target scanline) / 4 or bigger and iStride[1] the next multiple of four - * of the half of iStride[0] or bigger + * The target dstStep (6th parameter) must be a multiple of 16. 
+ * srcStep[0] must be (target dstStep) / 4 or bigger and srcStep[1] the next multiple of four + * of the half of srcStep[0] or bigger */ #include -#include -//#include -#include +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif #include #include - -int freerdp_check_ssse3() -{ - if(IsProcessorFeaturePresentEx(PF_EX_SSSE3)&&IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) - return 0; - - return 1; -} +#include +#include -int freerdp_image_yuv420p_to_xrgb_ssse3(BYTE *pDstData,BYTE **pSrcData,int nWidth,int nHeight,int *iStride,int scanline) +#ifdef WITH_SSE2 + +#include +#include + +pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep, + BYTE *pDst, int dstStep, const prim_size_t *roi) { char last_line,last_column; - int i,VaddDst,VaddY,VaddUV; + int i,nWidth,nHeight,VaddDst,VaddY,VaddU,VaddV; BYTE *UData,*VData,*YData; @@ -37,9 +37,12 @@ int freerdp_image_yuv420p_to_xrgb_ssse3(BYTE *pDstData,BYTE **pSrcData,int nWidt buffer=_aligned_malloc(4*16,16); - YData=pSrcData[0]; - UData=pSrcData[1]; - VData=pSrcData[2]; + YData=(BYTE *)pSrc[0]; + UData=(BYTE *)pSrc[1]; + VData=(BYTE *)pSrc[2]; + + nWidth=roi->width; + nHeight=roi->height; if((last_column=nWidth&3)){ @@ -48,7 +51,7 @@ int freerdp_image_yuv420p_to_xrgb_ssse3(BYTE *pDstData,BYTE **pSrcData,int nWidt case 2: r7=_mm_set_epi32(0,0,0xFFFFFFFF,0xFFFFFFFF); break; case 3: r7=_mm_set_epi32(0,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF); break; } - _mm_store_si128(buffer+48,r7); + _mm_store_si128(buffer+3,r7); last_column=1; } @@ -61,10 +64,10 @@ int freerdp_image_yuv420p_to_xrgb_ssse3(BYTE *pDstData,BYTE **pSrcData,int nWidt nHeight=nHeight>>1; - VaddDst=(scanline<<1)-(nWidth<<4); - VaddY=(iStride[0]<<1)-(nWidth<<2); - VaddUV=iStride[1]-(((nWidth<<1)+2)&0xFFFC); - + VaddDst=(dstStep<<1)-(nWidth<<4); + VaddY=(srcStep[0]<<1)-(nWidth<<2); + VaddU=srcStep[1]-(((nWidth<<1)+2)&0xFFFC); + VaddV=srcStep[2]-(((nWidth<<1)+2)&0xFFFC); while(nHeight-- >0){ @@ -129,7 +132,7 @@ int 
freerdp_image_yuv420p_to_xrgb_ssse3(BYTE *pDstData,BYTE **pSrcData,int nWidt r1=_mm_add_epi32(r1,r6); r7=_mm_add_epi32(r7,r6); - _mm_store_si128(buffer+16,r7); + _mm_store_si128(buffer+1,r7); /* Now we've prepared U-data. Preparing V-data is actually the same, just with other coefficients */ r2=_mm_cvtsi32_si128(*(UINT32 *)VData); @@ -153,7 +156,7 @@ int freerdp_image_yuv420p_to_xrgb_ssse3(BYTE *pDstData,BYTE **pSrcData,int nWidt r2=_mm_add_epi32(r2,r6); r7=_mm_add_epi32(r7,r6); - _mm_store_si128(buffer+32,r7); + _mm_store_si128(buffer+2,r7); @@ -170,8 +173,8 @@ int freerdp_image_yuv420p_to_xrgb_ssse3(BYTE *pDstData,BYTE **pSrcData,int nWidt _mm_store_si128(buffer,r4); }else{ - r1=_mm_load_si128(buffer+16); - r2=_mm_load_si128(buffer+32); + r1=_mm_load_si128(buffer+1); + r2=_mm_load_si128(buffer+2); r0=_mm_load_si128(buffer); } @@ -220,17 +223,17 @@ int freerdp_image_yuv420p_to_xrgb_ssse3(BYTE *pDstData,BYTE **pSrcData,int nWidt if(last_column&0x02){ - r6=_mm_load_si128(buffer+48); + r6=_mm_load_si128(buffer+3); r4=_mm_and_si128(r4,r6); - r5=_mm_lddqu_si128((__m128i *)pDstData); + r5=_mm_lddqu_si128((__m128i *)pDst); r6=_mm_andnot_si128(r6,r5); r4=_mm_or_si128(r4,r6); } - _mm_storeu_si128((__m128i *)pDstData,r4); + _mm_storeu_si128((__m128i *)pDst,r4); //Y data processing in secound line if(!(last_line&0x02)){ - r4=_mm_cvtsi32_si128(*(UINT32 *)(YData+iStride[0])); + r4=_mm_cvtsi32_si128(*(UINT32 *)(YData+srcStep[0])); r7=_mm_set_epi32(0x80800380,0x80800280,0x80800180,0x80800080); r4=_mm_shuffle_epi8(r4,r7); @@ -271,28 +274,40 @@ int freerdp_image_yuv420p_to_xrgb_ssse3(BYTE *pDstData,BYTE **pSrcData,int nWidt if(last_column&0x02){ - r6=_mm_load_si128(buffer+48); + r6=_mm_load_si128(buffer+3); r4=_mm_and_si128(r4,r6); - r5=_mm_lddqu_si128((__m128i *)(pDstData+scanline)); + r5=_mm_lddqu_si128((__m128i *)(pDst+dstStep)); r6=_mm_andnot_si128(r6,r5); r4=_mm_or_si128(r4,r6); last_column=last_column>>1; } - _mm_storeu_si128((__m128i *)(pDstData+scanline),r4); + 
_mm_storeu_si128((__m128i *)(pDst+dstStep),r4); } - pDstData+=16; + pDst+=16; YData+=4; }while(iYUV420ToRGB_8u_P3AC4R=ssse3_YUV420ToRGB_8u_P3AC4R; + } +#endif +} diff --git a/winpr/libwinpr/utils/collections/StreamPool.c b/winpr/libwinpr/utils/collections/StreamPool.c index 696ecd971..c95875fbe 100644 --- a/winpr/libwinpr/utils/collections/StreamPool.c +++ b/winpr/libwinpr/utils/collections/StreamPool.c @@ -155,8 +155,6 @@ wStream* StreamPool_Take(wStreamPool* pool, size_t size) Stream_SetPosition(s, 0); Stream_EnsureCapacity(s, size); - - Stream_SetLength(s,size); } s->pool = pool; From 2d6a59e34ba87225f8b86bdfbd464ca5c7f81382 Mon Sep 17 00:00:00 2001 From: erbth Date: Tue, 9 Sep 2014 12:34:08 +0200 Subject: [PATCH 24/31] added some commits, I didn't understand my own code anymore --- libfreerdp/primitives/prim_YUV_opt.c | 105 ++++++++++++++++++++++++--- 1 file changed, 94 insertions(+), 11 deletions(-) diff --git a/libfreerdp/primitives/prim_YUV_opt.c b/libfreerdp/primitives/prim_YUV_opt.c index 4b5cea145..a8010b9d3 100644 --- a/libfreerdp/primitives/prim_YUV_opt.c +++ b/libfreerdp/primitives/prim_YUV_opt.c @@ -26,6 +26,9 @@ pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep, BYTE *pDst, int dstStep, const prim_size_t *roi) { char last_line,last_column; +/* last_line: if the last (U,V doubled) line should be skipped, set to 10B + * last_column: if it's the last column in a line, set to 10B (for handling line-endings not multiple by four) */ + int i,nWidth,nHeight,VaddDst,VaddY,VaddU,VaddV; BYTE *UData,*VData,*YData; @@ -88,25 +91,29 @@ pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep, * B = clip(( 256 * C + 475 * D + 128) >> 8); */ if(!(i&0x01)){ -/* Y-, U- and V-data is stored in different arrays. - * We start with processing U-data. 
- * - * at first we fetch four U-values from its array and shuffle them like this: - * 0d0d 0c0c 0b0b 0a0a - * we've done two things: converting the values to signed words and duplicating - * each value, because always two pixel "share" the same U- (and V-) data - */ + + /* Y-, U- and V-data is stored in different arrays. + * We start with processing U-data. + * + * at first we fetch four U-values from its array and shuffle them like this: + * 0d0d 0c0c 0b0b 0a0a + * we've done two things: converting the values to signed words and duplicating + * each value, because always two pixel "share" the same U- (and V-) data */ r0=_mm_cvtsi32_si128(*(UINT32 *)UData); r5=_mm_set_epi32(0x80038003,0x80028002,0x80018001,0x80008000); r0=_mm_shuffle_epi8(r0,r5); UData+=4; + /* then we subtract 128 from each value, so we get D */ r3=_mm_set_epi16(128,128,128,128,128,128,128,128); r0=_mm_subs_epi16(r0,r3); + /* we need to do two things with our D, so let's store it for later use */ r2=r0; + /* now we can multiply our D with 48 and unpack it to xmm4:xmm0 + * this is what we need to get G data later on */ r4=r0; r7=_mm_set_epi16(48,48,48,48,48,48,48,48); r0=_mm_mullo_epi16(r0,r7); @@ -116,11 +123,16 @@ pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep, r4=_mm_unpackhi_epi16(r7,r4); + /* to complete this step, add (?) 128 to each value (rounding ?!) + * yeah, add. in the end this will be subtracted from something, + * because it's part of G: 256*C - (48*D + 120*E - 128), 48*D-128 ! + * by the way, our values have become signed dwords during multiplication! 
*/ r6=_mm_set_epi32(128,128,128,128); r0=_mm_sub_epi32(r0,r6); r4=_mm_sub_epi32(r4,r6); + /* to get B data, we need to prepare a secound value, D*475+128 */ r1=r2; r7=_mm_set_epi16(475,475,475,475,475,475,475,475); r1=_mm_mullo_epi16(r1,r7); @@ -132,9 +144,13 @@ pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep, r1=_mm_add_epi32(r1,r6); r7=_mm_add_epi32(r7,r6); + /* so we got something like this: xmm7:xmm1 + * this pair contains values for 16 pixel: + * aabbccdd + * aabbccdd, but we can only work on four pixel at once, so we need to save upper values */ _mm_store_si128(buffer+1,r7); -/* Now we've prepared U-data. Preparing V-data is actually the same, just with other coefficients */ + /* Now we've prepared U-data. Preparing V-data is actually the same, just with other coefficients */ r2=_mm_cvtsi32_si128(*(UINT32 *)VData); r2=_mm_shuffle_epi8(r2,r5); @@ -145,6 +161,7 @@ pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep, r5=r2; + /* this is also known as E*403+128, we need it to convert R data */ r3=r2; r7=_mm_set_epi16(403,403,403,403,403,403,403,403); r2=_mm_mullo_epi16(r2,r7); @@ -156,10 +173,12 @@ pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep, r2=_mm_add_epi32(r2,r6); r7=_mm_add_epi32(r7,r6); + /* and preserve upper four values for future ... */ _mm_store_si128(buffer+2,r7); + /* doing this step: E*120 */ r3=r5; r7=_mm_set_epi16(120,120,120,120,120,120,120,120); r3=_mm_mullo_epi16(r3,r7); @@ -168,11 +187,17 @@ pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep, r3=_mm_unpacklo_epi16(r3,r5); r7=_mm_unpackhi_epi16(r7,r5); + /* now we complete what we've begun above: + * (48*D-128) + (120*E) = (48*D +120*E -128) */ r0=_mm_add_epi32(r0,r3); r4=_mm_add_epi32(r4,r7); + /* and store to memory ! */ _mm_store_si128(buffer,r4); }else{ + /* maybe you've wondered about the conditional above ? + * Well, we prepared UV data for eight pixel in each line, but can only process four + * per loop. 
So we need to load the upper four pixel data from memory each secound loop! */ r1=_mm_load_si128(buffer+1); r2=_mm_load_si128(buffer+2); r0=_mm_load_si128(buffer); @@ -181,7 +206,9 @@ pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep, if(++i==nWidth) last_column=last_column<<1; - //processing Y data + /* We didn't produce any output yet, so let's do so! + * Ok, fetch four pixel from the Y-data array and shuffle them like this: + * 00d0 00c0 00b0 00a0, to get signed dwords and multiply by 256 */ r4=_mm_cvtsi32_si128(*(UINT32 *)YData); r7=_mm_set_epi32(0x80800380,0x80800280,0x80800180,0x80800080); r4=_mm_shuffle_epi8(r4,r7); @@ -189,50 +216,91 @@ pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep, r5=r4; r6=r4; + /* no we can perform the "real" conversion itself and produce output! */ r4=_mm_add_epi32(r4,r2); r5=_mm_sub_epi32(r5,r0); r6=_mm_add_epi32(r6,r1); + /* in the end, we only need bytes for RGB values. + * So, what do we do? right! shifting left makes values bigger and thats always good. + * before we had dwords of data, and by shifting left and treating the result + * as packed words, we get not only signed words, but do also divide by 256 + * imagine, data is now ordered this way: ddx0 ccx0 bbx0 aax0, and x is the least + * significant byte, that we don't need anymore, because we've done some rounding */ r4=_mm_slli_epi32(r4,8); r5=_mm_slli_epi32(r5,8); r6=_mm_slli_epi32(r6,8); + /* one thing we still have to face is the clip() function ... + * we have still signed words, and there are those min/max instructions in SSE2 ... + * the max instruction takes always the bigger of the two operands and stores it in the first one, + * and it operates with signs ! 
+ * if we feed it with our values and zeros, it takes the zeros if our values are smaller than + * zero and otherwise our values */ r7=_mm_set_epi32(0,0,0,0); r4=_mm_max_epi16(r4,r7); r5=_mm_max_epi16(r5,r7); r6=_mm_max_epi16(r6,r7); + /* the same thing just completely different can be used to limit our values to 255, + * but now using the min instruction and 255s */ r7=_mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000); r4=_mm_min_epi16(r4,r7); r5=_mm_min_epi16(r5,r7); r6=_mm_min_epi16(r6,r7); + /* Now we got our bytes. + * the moment has come to assemble the three channels R,G and B to the xrgb dwords + * on Red channel we just have to and each futural dword with 00FF0000H */ //r7=_mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000); r4=_mm_and_si128(r4,r7); + /* on Green channel we have to shuffle somehow, so we get something like this: + * 00d0 00c0 00b0 00a0 */ r7=_mm_set_epi32(0x80800E80,0x80800A80,0x80800680,0x80800280); r5=_mm_shuffle_epi8(r5,r7); + /* and on Blue channel that one: + * 000d 000c 000b 000a */ r7=_mm_set_epi32(0x8080800E,0x8080800A,0x80808006,0x80808002); r6=_mm_shuffle_epi8(r6,r7); + /* and at last we or it together and get this one: + * xrgb xrgb xrgb xrgb */ r4=_mm_or_si128(r4,r5); r4=_mm_or_si128(r4,r6); + /* Only thing to do know is writing data to memory, but this gets a bit more + * complicated if the width is not a multiple of four and it is the last column in line. */ if(last_column&0x02){ + /* let's say, we need to only convert six pixel in width + * Ok, the first 4 pixel will be converted just like every 4 pixel else, but + * if it's the last loop in line, last_column is shifted left by one (curious? have a look above), + * and we land here. Through initialisation a mask was prepared. 
In this case it looks like + * 0000FFFFH 0000FFFFH 0000FFFFH 0000FFFFH */ r6=_mm_load_si128(buffer+3); + /* we and our output data with this mask to get only the valid pixel */ r4=_mm_and_si128(r4,r6); + /* then we fetch memory from the destination array ... */ r5=_mm_lddqu_si128((__m128i *)pDst); + /* ... and and it with the inverse mask. We get only those pixel, which should not be updated */ r6=_mm_andnot_si128(r6,r5); + /* we only have to or the two values together and write it back to the destination array, + * and only the pixel that should be updated really get changed. */ r4=_mm_or_si128(r4,r6); } _mm_storeu_si128((__m128i *)pDst,r4); - //Y data processing in secound line + if(!(last_line&0x02)){ + /* Because UV data is the same for two lines, we can process the secound line just here, + * in the same loop. Only thing we need to do is to add some offsets to the Y- and destination + * pointer. These offsets are iStride[0] and the target scanline. + * But if we don't need to process the secound line, like if we are in the last line of processing nine lines, + * we just skip all this. 
*/ r4=_mm_cvtsi32_si128(*(UINT32 *)(YData+srcStep[0])); r7=_mm_set_epi32(0x80800380,0x80800280,0x80800180,0x80800080); r4=_mm_shuffle_epi8(r4,r7); @@ -280,18 +348,33 @@ pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep, r6=_mm_andnot_si128(r6,r5); r4=_mm_or_si128(r4,r6); + /* only thing is, we should shift [rbp-42] back here, because we have processed the last column, + * and this "special condition" can be released */ last_column=last_column>>1; } _mm_storeu_si128((__m128i *)(pDst+dstStep),r4); } + /* after all we have to increase the destination- and Y-data pointer by four pixel */ pDst+=16; YData+=4; }while(i Date: Tue, 9 Sep 2014 13:44:57 -0400 Subject: [PATCH 25/31] libfreerdp-primitives: update YCbCr test code --- .../primitives/test/TestPrimitivesYCbCr.c | 183 ++++++++++++------ 1 file changed, 119 insertions(+), 64 deletions(-) diff --git a/libfreerdp/primitives/test/TestPrimitivesYCbCr.c b/libfreerdp/primitives/test/TestPrimitivesYCbCr.c index a56533a55..2cbd8b69e 100644 --- a/libfreerdp/primitives/test/TestPrimitivesYCbCr.c +++ b/libfreerdp/primitives/test/TestPrimitivesYCbCr.c @@ -2,6 +2,7 @@ #include "prim_test.h" #include +#include #ifdef HAVE_CONFIG_H #include "config.h" @@ -2075,78 +2076,98 @@ static UINT32 TEST_XRGB_IMAGE[4096] = 0xFF169ff8, 0xFF159ef7, 0xFF149df7, 0xFF139cf6, 0xFF129bf5, 0xFF129bf5, 0xFF129bf5, 0xFF129bf5 }; -static int test_memcmp_offset(const BYTE* mem1, const BYTE* mem2, int size) +static int test_bmp_cmp_offset(const BYTE* mem1, const BYTE* mem2, int size, int channel) { int index = 0; + size /= 4; + mem1 += channel; + mem2 += channel; + while ((index < size) && (*mem1 == *mem2)) { - mem1++; - mem2++; + mem1 += 4; + mem2 += 4; index++; } return (index == size) ? 
1 : -index; } -static int test_memcmp_count(const BYTE* mem1, const BYTE* mem2, int size) +static int test_bmp_cmp_count(const BYTE* mem1, const BYTE* mem2, int size, int channel) { int count = 0; int index = 0; + size /= 4; + mem1 += channel; + mem2 += channel; + for (index = 0; index < size; index++) { if (*mem1 != *mem2) count++; - mem1++; - mem2++; + mem1 += 4; + mem2 += 4; } return count; } -static void test_fill_bitmap_red_channel(BYTE* data, int width, int height, BYTE value) +static int test_bmp_cmp_dump(const BYTE* actual, const BYTE* expected, int size, int channel) { - int i, j; - UINT32* pixel; + UINT32 pixel; + int count = 0; + int index = 0; + BYTE R, G, B; + BYTE eR, eG, eB; + INT16 Y, Cb, Cr; - for (i = 0; i < height; i++) + size /= 4; + actual += channel; + expected += channel; + + for (index = 0; index < size; index++) { - for (j = 0; j < width; j++) + if (*actual != *expected) { - pixel = (UINT32*) &data[((i * width) + j) * 4]; - *pixel = ((*pixel & 0xFF00FFFF) | (value << 16)); + pixel = *((UINT32*) &actual[-channel]); + GetRGB32(R, G, B, pixel); + + pixel = *((UINT32*) &expected[-channel]); + GetRGB32(eR, eG, eB, pixel); + + Y = TEST_Y_COMPONENT[index]; + Cb = TEST_CB_COMPONENT[index]; + Cr = TEST_CR_COMPONENT[index]; + + printf("Idx: %d Y: %+5d Cb: %+5d Cr: %+5d Actual: R: %3d G: %3d B: %3d Expected: R: %3d G: %3d B: %3d\n", + index, Y, Cb, Cr, R, G, B, eR, eG, eB); + + count++; } + + actual += 4; + expected += 4; } + + return count; } -static void test_fill_bitmap_green_channel(BYTE* data, int width, int height, BYTE value) +static void test_fill_bitmap_channel(BYTE* data, int width, int height, BYTE value, int nChannel) { - int i, j; - UINT32* pixel; + int x, y; + BYTE* pChannel; - for (i = 0; i < height; i++) + pChannel = data + nChannel; + + for (y = 0; y < height; y++) { - for (j = 0; j < width; j++) + for (x = 0; x < width; x++) { - pixel = (UINT32*) &data[((i * width) + j) * 4]; - *pixel = ((*pixel & 0xFFFF00FF) | (value << 8)); - } - } 
-} - -static void test_fill_bitmap_blue_channel(BYTE* data, int width, int height, BYTE value) -{ - int i, j; - UINT32* pixel; - - for (i = 0; i < height; i++) - { - for (j = 0; j < width; j++) - { - pixel = (UINT32*) &data[((i * width) + j) * 4]; - *pixel = ((*pixel & 0xFFFFFF00) | (value)); + *pChannel = value; + pChannel += 4; } } } @@ -2170,14 +2191,36 @@ int test_YCbCr_fp(TEST_FP_TYPE coeffs[4], INT16 YCbCr[3], BYTE RGB[3]) { INT16 R, G, B; TEST_FP_TYPE Y, Cb, Cr; + TEST_FP_TYPE fR, fG, fB; Y = (TEST_FP_TYPE) (YCbCr[0] + 4096); Cb = (TEST_FP_TYPE) (YCbCr[1]); Cr = (TEST_FP_TYPE) (YCbCr[2]); +#if 1 + fR = ((Cr * coeffs[0]) + Y + 16.0f); + fG = (Y - (Cb * coeffs[1]) - (Cr * coeffs[2]) + 16.0f); + fB = ((Cb * coeffs[3]) + Y + 16.0f); + + printf("fR: %f fG: %f fB: %f\n", fR, fG, fB); + + R = (INT16) fR; + G = (INT16) fG; + B = (INT16) fB; + + printf("iR: %d iG: %d iB: %d\n", R, G, B); + + R >>= 5; + G >>= 5; + B >>= 5; + + printf("R5: %d G5: %d B5: %d\n", R, G, B); + +#else R = ((INT16) (((Cr * coeffs[0]) + Y + 16.0f)) >> 5); G = ((INT16) ((Y - (Cb * coeffs[1]) - (Cr * coeffs[2]) + 16.0f)) >> 5); B = ((INT16) (((Cb * coeffs[3]) + Y + 16.0f)) >> 5); +#endif if (R < 0) R = 0; @@ -2203,7 +2246,7 @@ int test_YCbCr_fp(TEST_FP_TYPE coeffs[4], INT16 YCbCr[3], BYTE RGB[3]) //printf("[1]: %20.20lf\n", coeffs[1]); //printf("[2]: %20.20lf\n", coeffs[2]); //printf("[3]: %20.20lf\n", coeffs[3]); - printf("--------------------------------\n"); + printf("--------------------------------\n\n"); return 0; } @@ -2236,16 +2279,17 @@ int test_YCbCr_pixels() int TestPrimitivesYCbCr(int argc, char* argv[]) { - int cmp; - int cnt; int size; + int cmp[3]; + int cnt[3]; + float err[3]; BYTE* actual; BYTE* expected; INT16* pYCbCr[3]; const primitives_t* prims = primitives_get(); static const prim_size_t roi_64x64 = { 64, 64 }; - return test_YCbCr_pixels(); + //return test_YCbCr_pixels(); expected = (BYTE*) TEST_XRGB_IMAGE; @@ -2289,41 +2333,52 @@ int TestPrimitivesYCbCr(int argc, char* 
argv[]) _aligned_free(pSrcDst[2]); } - if (1) + if (0) { - test_fill_bitmap_red_channel(actual, 64, 64, 0); - test_fill_bitmap_red_channel(expected, 64, 64, 0); - } - - if (1) - { - test_fill_bitmap_green_channel(actual, 64, 64, 0); - test_fill_bitmap_green_channel(expected, 64, 64, 0); + test_fill_bitmap_channel(actual, 64, 64, 0, 2); /* red */ + test_fill_bitmap_channel(expected, 64, 64, 0, 2); /* red */ } if (0) { - test_fill_bitmap_blue_channel(actual, 64, 64, 0); - test_fill_bitmap_blue_channel(expected, 64, 64, 0); + test_fill_bitmap_channel(actual, 64, 64, 0, 1); /* green */ + test_fill_bitmap_channel(expected, 64, 64, 0, 1); /* green */ } - cmp = test_memcmp_offset(actual, expected, size); - cnt = test_memcmp_count(actual, expected, size); - - if (cmp <= 0) + if (0) { - cmp *= -1; - float rate = ((float) cnt) / ((float) size) * 100.0f; - - printf("YCbCr to RGB conversion failure\n"); - - printf("Actual, Expected (offset: %d diff: %d/%d = %d%%):\n", - cmp, cnt, size, (int) rate); - - winpr_HexDump(&actual[cmp], 16); - winpr_HexDump(&expected[cmp], 16); + test_fill_bitmap_channel(actual, 64, 64, 0, 0); /* blue */ + test_fill_bitmap_channel(expected, 64, 64, 0, 0); /* blue */ } + cmp[2] = test_bmp_cmp_offset(actual, expected, size, 2); /* red */ + cnt[2] = test_bmp_cmp_count(actual, expected, size, 2); /* red */ + err[2] = ((float) cnt[2]) / ((float) size / 4) * 100.0f; + + cmp[1] = test_bmp_cmp_offset(actual, expected, size, 1); /* green */ + cnt[1] = test_bmp_cmp_count(actual, expected, size, 1); /* green */ + err[1] = ((float) cnt[1]) / ((float) size / 4) * 100.0f; + + cmp[0] = test_bmp_cmp_offset(actual, expected, size, 0); /* blue */ + cnt[0] = test_bmp_cmp_count(actual, expected, size, 0); /* blue */ + err[0] = ((float) cnt[0]) / ((float) size / 4) * 100.0f; + + if (0) + { + printf("Red Error Dump:\n"); + test_bmp_cmp_dump(actual, expected, size, 2); /* red */ + + printf("Green Error Dump:\n"); + test_bmp_cmp_dump(actual, expected, size, 1); /* green */ 
+ + printf("Blue Error Dump:\n"); + test_bmp_cmp_dump(actual, expected, size, 0); /* blue */ + } + + printf("R: diff: %d (%f%%)\n", cnt[2], err[2]); + printf("G: diff: %d (%f%%)\n", cnt[1], err[1]); + printf("B: diff: %d (%f%%)\n", cnt[0], err[0]); + _aligned_free(actual); return 0; From 372d4076d45c7a6b07d15fc953aae6ae29b61d04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Moreau?= Date: Tue, 9 Sep 2014 14:36:04 -0400 Subject: [PATCH 26/31] libfreerdp-codec: fix progressive decoding --- libfreerdp/codec/progressive.c | 47 ++++++++++------- .../codec/test/TestFreeRDPCodecProgressive.c | 50 ++++++++++--------- 2 files changed, 56 insertions(+), 41 deletions(-) diff --git a/libfreerdp/codec/progressive.c b/libfreerdp/codec/progressive.c index 69092d161..a8d042fda 100644 --- a/libfreerdp/codec/progressive.c +++ b/libfreerdp/codec/progressive.c @@ -897,10 +897,36 @@ INT16 progressive_rfx_srl_read(RFX_PROGRESSIVE_UPGRADE_STATE* state, UINT32 numB return sign ? -mag : mag; } +int progressive_rfx_upgrade_state_finish(RFX_PROGRESSIVE_UPGRADE_STATE* state) +{ + int pad; + wBitStream* srl; + wBitStream* raw; + + srl = state->srl; + raw = state->raw; + + /* Read trailing bits from RAW/SRL bit streams */ + + pad = (raw->position % 8) ? (8 - (raw->position % 8)) : 0; + + if (pad) + BitStream_Shift(raw, pad); + + pad = (srl->position % 8) ? (8 - (srl->position % 8)) : 0; + + if (pad) + BitStream_Shift(srl, pad); + + if (BitStream_GetRemainingLength(srl) == 8) + BitStream_Shift(srl, 8); + + return 1; +} + int progressive_rfx_upgrade_block(RFX_PROGRESSIVE_UPGRADE_STATE* state, INT16* buffer, INT16* sign, int length, UINT32 shift, UINT32 bitPos, UINT32 numBits) { - int pad; int index; INT16 input; wBitStream* srl; @@ -923,21 +949,6 @@ int progressive_rfx_upgrade_block(RFX_PROGRESSIVE_UPGRADE_STATE* state, INT16* b buffer[index] += (input << shift); } - /* This is the last band, read padding bits from RAW and SRL bit streams */ - - pad = (raw->position % 8) ? 
(8 - (raw->position % 8)) : 0; - - if (pad) - BitStream_Shift(raw, pad); - - pad = (srl->position % 8) ? (8 - (srl->position % 8)) : 0; - - if (pad) - BitStream_Shift(srl, pad); - - if (BitStream_GetRemainingLength(srl) == 8) - BitStream_Shift(srl, 8); - return 1; } @@ -966,10 +977,11 @@ int progressive_rfx_upgrade_block(RFX_PROGRESSIVE_UPGRADE_STATE* state, INT16* b /* sign == 0, read from srl */ input = progressive_rfx_srl_read(state, numBits); + + sign[index] = input; } buffer[index] += (input << shift); - sign[index] = input; } return 1; @@ -1014,6 +1026,7 @@ int progressive_rfx_upgrade_component(PROGRESSIVE_CONTEXT* progressive, RFX_COMP state.nonLL = FALSE; progressive_rfx_upgrade_block(&state, ¤t[4015], &sign[4015], 81, shift->LL3, bitPos->LL3, numBits->LL3); /* LL3 */ + progressive_rfx_upgrade_state_finish(&state); aRawLen = (state.raw->position + 7) / 8; aSrlLen = (state.srl->position + 7) / 8; diff --git a/libfreerdp/codec/test/TestFreeRDPCodecProgressive.c b/libfreerdp/codec/test/TestFreeRDPCodecProgressive.c index 03533d2c1..a577ed09f 100644 --- a/libfreerdp/codec/test/TestFreeRDPCodecProgressive.c +++ b/libfreerdp/codec/test/TestFreeRDPCodecProgressive.c @@ -194,22 +194,22 @@ static int test_image_fill_quarter(BYTE* pDstData, int nDstStep, int nWidth, int case 1: x = nWidth / 2; y = nHeight / 2; - width = nWidth; - height = nHeight; + width = nWidth / 2; + height = nHeight /2; break; case 2: - x = nWidth / 2; - y = 0; - width = nWidth; - height = nHeight / 2; - break; - - case 3: x = 0; y = nHeight / 2; width = nWidth / 2; - height = nHeight; + height = nHeight /2; + break; + + case 3: + x = nWidth / 2; + y = 0; + width = nWidth / 2; + height = nHeight /2; break; } @@ -878,18 +878,18 @@ int test_progressive_decode(PROGRESSIVE_CONTEXT* progressive, EGFX_SAMPLE_FILE f break; case 2: - clippingRect.left = g_Width / 2; - clippingRect.top = 0; - clippingRect.right = g_Width; - clippingRect.bottom = g_Height / 2; - break; - - case 3: clippingRect.left = 0; 
clippingRect.top = g_Height / 2; clippingRect.right = g_Width / 2; clippingRect.bottom = g_Height; break; + + case 3: + clippingRect.left = g_Width / 2; + clippingRect.top = 0; + clippingRect.right = g_Width; + clippingRect.bottom = g_Height / 2; + break; } for (index = 0; index < region->numTiles; index++) @@ -925,6 +925,7 @@ int test_progressive_decode(PROGRESSIVE_CONTEXT* progressive, EGFX_SAMPLE_FILE f if (cmp <= 0) { +#if 0 float rate = ((float) cnt) / ((float) size) * 100.0f; cmp *= -1; @@ -936,6 +937,7 @@ int test_progressive_decode(PROGRESSIVE_CONTEXT* progressive, EGFX_SAMPLE_FILE f winpr_HexDump(&g_DstData[cmp], 16); winpr_HexDump(&bitmaps[pass].buffer[cmp], 16); +#endif } //WLog_Image(progressive->log, WLOG_TRACE, g_DstData, g_Width, g_Height, 32); @@ -966,7 +968,7 @@ int test_progressive_ms_sample(char* ms_sample_path) if (status < 0) return -1; - count = 1; + count = 4; progressive = progressive_context_new(FALSE); @@ -978,7 +980,7 @@ int test_progressive_ms_sample(char* ms_sample_path) if (1) { - printf("Sample Image 1\n"); + printf("\nSample Image 1\n"); test_image_fill(g_DstData, g_DstStep, 0, 0, g_Width, g_Height, 0xFF000000); test_progressive_decode(progressive, files[0][0], bitmaps[0][0], 0, count); test_progressive_decode(progressive, files[0][1], bitmaps[0][1], 1, count); @@ -986,11 +988,11 @@ int test_progressive_ms_sample(char* ms_sample_path) test_progressive_decode(progressive, files[0][3], bitmaps[0][3], 3, count); } - /* image 2 (incorrect) */ + /* image 2 */ - if (0) + if (1) { - printf("Sample Image 2\n"); + printf("\nSample Image 2\n"); test_image_fill(g_DstData, g_DstStep, 0, 0, g_Width, g_Height, 0xFF000000); test_progressive_decode(progressive, files[1][0], bitmaps[1][0], 0, count); test_progressive_decode(progressive, files[1][1], bitmaps[1][1], 1, count); @@ -1000,9 +1002,9 @@ int test_progressive_ms_sample(char* ms_sample_path) /* image 3 */ - if (0) + if (1) { - printf("Sample Image 3\n"); + printf("\nSample Image 3\n"); 
test_image_fill(g_DstData, g_DstStep, 0, 0, g_Width, g_Height, 0xFF000000); test_progressive_decode(progressive, files[2][0], bitmaps[2][0], 0, count); test_progressive_decode(progressive, files[2][1], bitmaps[2][1], 1, count); From 5c5eedc85b7fd62b29664dbd768873dd998969e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Moreau?= Date: Tue, 9 Sep 2014 17:34:02 -0400 Subject: [PATCH 27/31] libfreerdp-codec: allow error margin of 1 on YCbCr to RGB color decoding --- .../codec/test/TestFreeRDPCodecProgressive.c | 55 ++++------- .../primitives/test/TestPrimitivesYCbCr.c | 95 ++++++++++--------- 2 files changed, 69 insertions(+), 81 deletions(-) diff --git a/libfreerdp/codec/test/TestFreeRDPCodecProgressive.c b/libfreerdp/codec/test/TestFreeRDPCodecProgressive.c index a577ed09f..3167704ce 100644 --- a/libfreerdp/codec/test/TestFreeRDPCodecProgressive.c +++ b/libfreerdp/codec/test/TestFreeRDPCodecProgressive.c @@ -220,6 +220,8 @@ static int test_image_fill_quarter(BYTE* pDstData, int nDstStep, int nWidth, int static int test_image_fill_unused_quarters(BYTE* pDstData, int nDstStep, int nWidth, int nHeight, UINT32 color, int quarter) { + return 1; + if (quarter == 0) { test_image_fill_quarter(pDstData, nDstStep, nWidth, nHeight, color, 1); @@ -799,29 +801,21 @@ int test_progressive_load_bitmaps(char* ms_sample_path, EGFX_SAMPLE_FILE bitmaps return 1; } -static int test_memcmp_offset(const BYTE* mem1, const BYTE* mem2, int size) -{ - int index = 0; - - while ((index < size) && (*mem1 == *mem2)) - { - mem1++; - mem2++; - index++; - } - - return (index == size) ? 1 : -index; -} - -static int test_memcmp_count(const BYTE* mem1, const BYTE* mem2, int size) +static int test_memcmp_count(const BYTE* mem1, const BYTE* mem2, int size, int margin) { + int error; int count = 0; int index = 0; for (index = 0; index < size; index++) { if (*mem1 != *mem2) - count++; + { + error = (*mem1 > *mem2) ? 
*mem1 - *mem2 : *mem2 - *mem1; + + if (error > margin) + count++; + } mem1++; mem2++; @@ -832,7 +826,6 @@ static int test_memcmp_count(const BYTE* mem1, const BYTE* mem2, int size) int test_progressive_decode(PROGRESSIVE_CONTEXT* progressive, EGFX_SAMPLE_FILE files[4], EGFX_SAMPLE_FILE bitmaps[4], int quarter, int count) { - int cmp; int cnt; int pass; int size; @@ -920,24 +913,13 @@ int test_progressive_decode(PROGRESSIVE_CONTEXT* progressive, EGFX_SAMPLE_FILE f } size = bitmaps[pass].size; - cmp = test_memcmp_offset(g_DstData, bitmaps[pass].buffer, size); - cnt = test_memcmp_count(g_DstData, bitmaps[pass].buffer, size); + cnt = test_memcmp_count(g_DstData, bitmaps[pass].buffer, size, 1); - if (cmp <= 0) + if (cnt) { -#if 0 float rate = ((float) cnt) / ((float) size) * 100.0f; - - cmp *= -1; - printf("Progressive RemoteFX decompression failure\n"); - - printf("Actual, Expected (offset: %d diff: %d/%d = %.3f%%):\n", - cmp, cnt, size, rate); - - winpr_HexDump(&g_DstData[cmp], 16); - winpr_HexDump(&bitmaps[pass].buffer[cmp], 16); -#endif + printf("Actual, Expected (%d/%d = %.3f%%):\n", cnt, size, rate); } //WLog_Image(progressive->log, WLOG_TRACE, g_DstData, g_Width, g_Height, 32); @@ -958,6 +940,9 @@ int test_progressive_ms_sample(char* ms_sample_path) g_Height = 1080; g_DstStep = g_Width * 4; + ZeroMemory(files, sizeof(files)); + ZeroMemory(bitmaps, sizeof(bitmaps)); + status = test_progressive_load_files(ms_sample_path, files); if (status < 0) @@ -990,9 +975,9 @@ int test_progressive_ms_sample(char* ms_sample_path) /* image 2 */ - if (1) + if (0) { - printf("\nSample Image 2\n"); + printf("\nSample Image 2\n"); /* sample data is in incorrect order */ test_image_fill(g_DstData, g_DstStep, 0, 0, g_Width, g_Height, 0xFF000000); test_progressive_decode(progressive, files[1][0], bitmaps[1][0], 0, count); test_progressive_decode(progressive, files[1][1], bitmaps[1][1], 1, count); @@ -1002,9 +987,9 @@ int test_progressive_ms_sample(char* ms_sample_path) /* image 3 */ - 
if (1) + if (0) { - printf("\nSample Image 3\n"); + printf("\nSample Image 3\n"); /* sample data is in incorrect order */ test_image_fill(g_DstData, g_DstStep, 0, 0, g_Width, g_Height, 0xFF000000); test_progressive_decode(progressive, files[2][0], bitmaps[2][0], 0, count); test_progressive_decode(progressive, files[2][1], bitmaps[2][1], 1, count); diff --git a/libfreerdp/primitives/test/TestPrimitivesYCbCr.c b/libfreerdp/primitives/test/TestPrimitivesYCbCr.c index 2cbd8b69e..17fba910d 100644 --- a/libfreerdp/primitives/test/TestPrimitivesYCbCr.c +++ b/libfreerdp/primitives/test/TestPrimitivesYCbCr.c @@ -2076,26 +2076,9 @@ static UINT32 TEST_XRGB_IMAGE[4096] = 0xFF169ff8, 0xFF159ef7, 0xFF149df7, 0xFF139cf6, 0xFF129bf5, 0xFF129bf5, 0xFF129bf5, 0xFF129bf5 }; -static int test_bmp_cmp_offset(const BYTE* mem1, const BYTE* mem2, int size, int channel) -{ - int index = 0; - - size /= 4; - mem1 += channel; - mem2 += channel; - - while ((index < size) && (*mem1 == *mem2)) - { - mem1 += 4; - mem2 += 4; - index++; - } - - return (index == size) ? 1 : -index; -} - -static int test_bmp_cmp_count(const BYTE* mem1, const BYTE* mem2, int size, int channel) +static int test_bmp_cmp_count(const BYTE* mem1, const BYTE* mem2, int size, int channel, int margin) { + int error; int count = 0; int index = 0; @@ -2106,7 +2089,12 @@ static int test_bmp_cmp_count(const BYTE* mem1, const BYTE* mem2, int size, int for (index = 0; index < size; index++) { if (*mem1 != *mem2) - count++; + { + error = (*mem1 > *mem2) ? 
*mem1 - *mem2 : *mem2 - *mem1; + + if (error > margin) + count++; + } mem1 += 4; mem2 += 4; @@ -2115,8 +2103,10 @@ static int test_bmp_cmp_count(const BYTE* mem1, const BYTE* mem2, int size, int return count; } -static int test_bmp_cmp_dump(const BYTE* actual, const BYTE* expected, int size, int channel) +static int test_bmp_cmp_dump(const BYTE* actual, const BYTE* expected, int size, int channel, int margin) { + int x, y; + int error[3]; UINT32 pixel; int count = 0; int index = 0; @@ -2142,10 +2132,19 @@ static int test_bmp_cmp_dump(const BYTE* actual, const BYTE* expected, int size, Cb = TEST_CB_COMPONENT[index]; Cr = TEST_CR_COMPONENT[index]; - printf("Idx: %d Y: %+5d Cb: %+5d Cr: %+5d Actual: R: %3d G: %3d B: %3d Expected: R: %3d G: %3d B: %3d\n", - index, Y, Cb, Cr, R, G, B, eR, eG, eB); + x = index % 64; + y = (index - x) / 64; - count++; + error[0] = (R > eR) ? R - eR : eR - R; + error[1] = (G > eG) ? G - eG : eG - G; + error[2] = (B > eB) ? B - eB : eB - B; + + if ((error[0] > margin) || (error[1] > margin) || (error[2] > margin)) + { + printf("(%2d,%2d) Y: %+5d Cb: %+5d Cr: %+5d R: %03d/%03d G: %03d/%03d B: %03d/%03d %d %d %d\n", + x, y, Y, Cb, Cr, R, eR, G, eG, B, eB, R - eR, G - eG, B - eB); + count++; + } } actual += 4; @@ -2178,36 +2177,43 @@ static TEST_FP_TYPE TEST_YCbCrToRGB_01[4] = { 1.403f, 0.344f, static TEST_FP_TYPE TEST_YCbCrToRGB_02[4] = { 1.402525f, 0.343730f, 0.714401f, 1.769905f }; static TEST_FP_TYPE TEST_YCbCrToRGB_03[4] = { 1.402524948120117L, 0.3437300026416779L, 0.7144010066986084L, 1.769904971122742L }; -static INT16 TEST_YCbCr_01[3] = { +115, +1720, -2145 }; -static BYTE TEST_RGB_01[3] = { 37, 161, 227 }; /* incorrect red */ +static INT16 TEST_YCbCr_01[3] = { +3443, -1863, +272 }; +static BYTE TEST_RGB_01[3] = { 247, 249, 132 }; -static INT16 TEST_YCbCr_02[3] = { -450, +1938, -2126 }; -static BYTE TEST_RGB_02[3] = { 21, 140, 221 }; /* incorrect green */ +static INT16 TEST_YCbCr_02[3] = { +1086, +1584, -2268 }; +static BYTE 
TEST_RGB_02[3] = { 62, 195, 249 }; -static INT16 TEST_YCbCr_03[3] = { -504, +1896, -2168 }; -static BYTE TEST_RGB_03[3] = { 17, 140, 217 }; /* incorrect blue */ +static INT16 TEST_YCbCr_03[3] = { -576, +2002, -2179 }; +static BYTE TEST_RGB_03[3] = { 15, 137, 221 }; int test_YCbCr_fp(TEST_FP_TYPE coeffs[4], INT16 YCbCr[3], BYTE RGB[3]) { INT16 R, G, B; TEST_FP_TYPE Y, Cb, Cr; TEST_FP_TYPE fR, fG, fB; + TEST_FP_TYPE fR1, fR2; Y = (TEST_FP_TYPE) (YCbCr[0] + 4096); Cb = (TEST_FP_TYPE) (YCbCr[1]); Cr = (TEST_FP_TYPE) (YCbCr[2]); #if 1 + fR1 = Cr * coeffs[0]; + fR2 = fR1 + Y + 16.0f; + fR = ((Cr * coeffs[0]) + Y + 16.0f); fG = (Y - (Cb * coeffs[1]) - (Cr * coeffs[2]) + 16.0f); fB = ((Cb * coeffs[3]) + Y + 16.0f); - printf("fR: %f fG: %f fB: %f\n", fR, fG, fB); + printf("fR: %f fG: %f fB: %f fY: %f\n", fR, fG, fB, Y); R = (INT16) fR; G = (INT16) fG; B = (INT16) fB; + printf("mR: %d mG: %d mB: %d\n", + (R - 16) % 32, (G - 16) % 32, (B - 16) % 32); + printf("iR: %d iG: %d iB: %d\n", R, G, B); R >>= 5; @@ -2280,11 +2286,11 @@ int test_YCbCr_pixels() int TestPrimitivesYCbCr(int argc, char* argv[]) { int size; - int cmp[3]; int cnt[3]; float err[3]; BYTE* actual; BYTE* expected; + int margin = 1; INT16* pYCbCr[3]; const primitives_t* prims = primitives_get(); static const prim_size_t roi_64x64 = { 64, 64 }; @@ -2351,33 +2357,30 @@ int TestPrimitivesYCbCr(int argc, char* argv[]) test_fill_bitmap_channel(expected, 64, 64, 0, 0); /* blue */ } - cmp[2] = test_bmp_cmp_offset(actual, expected, size, 2); /* red */ - cnt[2] = test_bmp_cmp_count(actual, expected, size, 2); /* red */ + cnt[2] = test_bmp_cmp_count(actual, expected, size, 2, margin); /* red */ err[2] = ((float) cnt[2]) / ((float) size / 4) * 100.0f; - cmp[1] = test_bmp_cmp_offset(actual, expected, size, 1); /* green */ - cnt[1] = test_bmp_cmp_count(actual, expected, size, 1); /* green */ + cnt[1] = test_bmp_cmp_count(actual, expected, size, 1, margin); /* green */ err[1] = ((float) cnt[1]) / ((float) size / 4) * 100.0f; - 
cmp[0] = test_bmp_cmp_offset(actual, expected, size, 0); /* blue */ - cnt[0] = test_bmp_cmp_count(actual, expected, size, 0); /* blue */ + cnt[0] = test_bmp_cmp_count(actual, expected, size, 0, margin); /* blue */ err[0] = ((float) cnt[0]) / ((float) size / 4) * 100.0f; - if (0) + if (cnt[0] || cnt[1] || cnt[2]) { printf("Red Error Dump:\n"); - test_bmp_cmp_dump(actual, expected, size, 2); /* red */ + test_bmp_cmp_dump(actual, expected, size, 2, margin); /* red */ printf("Green Error Dump:\n"); - test_bmp_cmp_dump(actual, expected, size, 1); /* green */ + test_bmp_cmp_dump(actual, expected, size, 1, margin); /* green */ printf("Blue Error Dump:\n"); - test_bmp_cmp_dump(actual, expected, size, 0); /* blue */ - } + test_bmp_cmp_dump(actual, expected, size, 0, margin); /* blue */ - printf("R: diff: %d (%f%%)\n", cnt[2], err[2]); - printf("G: diff: %d (%f%%)\n", cnt[1], err[1]); - printf("B: diff: %d (%f%%)\n", cnt[0], err[0]); + printf("R: diff: %d (%f%%)\n", cnt[2], err[2]); + printf("G: diff: %d (%f%%)\n", cnt[1], err[1]); + printf("B: diff: %d (%f%%)\n", cnt[0], err[0]); + } _aligned_free(actual); From bcf1266f517f07212e737fd24bba548a93157a37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Moreau?= Date: Tue, 9 Sep 2014 19:15:07 -0400 Subject: [PATCH 28/31] libfreerdp-primitives: integrate H264 SSE3 color converter --- include/freerdp/codec/h264.h | 19 -- libfreerdp/codec/h264.c | 55 ++-- libfreerdp/primitives/prim_YUV.c | 39 +-- libfreerdp/primitives/prim_YUV_opt.c | 380 +++++++++++++-------------- 4 files changed, 225 insertions(+), 268 deletions(-) diff --git a/include/freerdp/codec/h264.h b/include/freerdp/codec/h264.h index 969914709..e539cb0b3 100644 --- a/include/freerdp/codec/h264.h +++ b/include/freerdp/codec/h264.h @@ -44,31 +44,12 @@ struct _H264_CONTEXT { BOOL Compressor; - //BYTE* data; - //UINT32 size; UINT32 width; UINT32 height; - //int scanline; - BYTE* pYUVData[3]; int iStride[3]; - -/* -<<<<<<< HEAD -#ifdef WITH_OPENH264 - ISVCDecoder* 
pDecoder; BYTE* pYUVData[3]; - int iStride[2]; -#endif -#ifdef WITH_LIBAVCODEC - AVCodec* codec; - AVCodecContext* codecContext; - AVCodecParserContext* codecParser; - AVFrame* videoFrame; -#endif -======= -*/ void* pSystemData; H264_CONTEXT_SUBSYSTEM* subsystem; }; diff --git a/libfreerdp/codec/h264.c b/libfreerdp/codec/h264.c index 5f8f688ab..cf5d2be58 100644 --- a/libfreerdp/codec/h264.c +++ b/libfreerdp/codec/h264.c @@ -28,9 +28,6 @@ #include #include -#include - - /** * Dummy subsystem */ @@ -87,8 +84,6 @@ static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSiz SSysMEMBuffer* pSystemBuffer; H264_CONTEXT_OPENH264* sys = (H264_CONTEXT_OPENH264*) h264->pSystemData; - struct timeval T1,T2; - if (!sys->pDecoder) return -1; @@ -102,7 +97,6 @@ static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSiz ZeroMemory(&sBufferInfo, sizeof(sBufferInfo)); - gettimeofday(&T1,NULL); state = (*sys->pDecoder)->DecodeFrame2( sys->pDecoder, pSrcData, @@ -119,9 +113,6 @@ static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSiz if (sBufferInfo.iBufferStatus != 1) state = (*sys->pDecoder)->DecodeFrame2(sys->pDecoder, NULL, 0, h264->pYUVData, &sBufferInfo); - - gettimeofday(&T2,NULL); - printf("OpenH264: decoding took: %u sec %u usec\n",(unsigned int)(T2.tv_sec-T1.tv_sec),(unsigned int)(T2.tv_usec-T1.tv_usec)); pSystemBuffer = &sBufferInfo.UsrData.sSystemBuffer; @@ -285,18 +276,12 @@ static int libavcodec_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcS AVPacket packet; H264_CONTEXT_LIBAVCODEC* sys = (H264_CONTEXT_LIBAVCODEC*) h264->pSystemData; - struct timeval T1,T2; - av_init_packet(&packet); packet.data = pSrcData; packet.size = SrcSize; - gettimeofday(&T1,NULL); status = avcodec_decode_video2(sys->codecContext, sys->videoFrame, &gotFrame, &packet); - gettimeofday(&T2,NULL); - - printf("libavcodec: decoding took: %u sec %u usec\n",(unsigned int)(T2.tv_sec-T1.tv_sec),(unsigned 
int)(T2.tv_usec-T1.tv_usec)); if (status < 0) { @@ -437,20 +422,18 @@ static H264_CONTEXT_SUBSYSTEM g_Subsystem_libavcodec = int h264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize, BYTE** ppDstData, DWORD DstFormat, int nDstStep, int nDstHeight, RDPGFX_RECT16* regionRects, int numRegionRects) { + int index; + int status; + int* iStride; BYTE* pDstData; BYTE* pDstPoint; - + prim_size_t roi; BYTE** pYUVData; + int width, height; BYTE* pYUVPoint[3]; - RDPGFX_RECT16* rect; - int* iStride; - int ret, i, cx, cy; int UncompressedSize; primitives_t *prims = primitives_get(); - prim_size_t roi; - - struct timeval T1,T2; if (!h264) return -1; @@ -463,23 +446,23 @@ int h264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize, if (!(pDstData = *ppDstData)) return -1; - - if ((ret = h264->subsystem->Decompress(h264, pSrcData, SrcSize)) < 0) - return ret; - + if ((status = h264->subsystem->Decompress(h264, pSrcData, SrcSize)) < 0) + return status; UncompressedSize = h264->width * h264->height * 4; + if (UncompressedSize > (nDstStep * nDstHeight)) return -1; pYUVData = h264->pYUVData; iStride = h264->iStride; - gettimeofday(&T1,NULL); - for (i = 0; i < numRegionRects; i++){ - rect = &(regionRects[i]); - cx = rect->right - rect->left; - cy = rect->bottom - rect->top; + for (index = 0; index < numRegionRects; index++) + { + rect = &(regionRects[index]); + + width = rect->right - rect->left; + height = rect->bottom - rect->top; pDstPoint = pDstData + rect->top * nDstStep + rect->left * 4; pYUVPoint[0] = pYUVData[0] + rect->top * iStride[0] + rect->left; @@ -488,17 +471,15 @@ int h264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize, pYUVPoint[2] = pYUVData[2] + rect->top/2 * iStride[2] + rect->left/2; #if 0 - printf("regionRect: x: %d, y: %d, cx: %d, cy: %d\n", - rect->left, rect->top, cx, cy); + printf("regionRect: x: %d y: %d width: %d height: %d\n", + rect->left, rect->top, width, height); #endif - roi.width = cx; - roi.height = cy; + 
roi.width = width; + roi.height = height; prims->YUV420ToRGB_8u_P3AC4R((const BYTE**) pYUVPoint, iStride, pDstPoint, nDstStep, &roi); } - gettimeofday(&T2,NULL); - printf("converting took %u sec %u usec\n",(unsigned int)(T2.tv_sec-T1.tv_sec),(unsigned int)(T2.tv_usec-T1.tv_usec)); return 1; } diff --git a/libfreerdp/primitives/prim_YUV.c b/libfreerdp/primitives/prim_YUV.c index 0425c9e8f..24ff1a49a 100644 --- a/libfreerdp/primitives/prim_YUV.c +++ b/libfreerdp/primitives/prim_YUV.c @@ -27,6 +27,16 @@ #include "prim_internal.h" #include "prim_YUV.h" +/** + * | R | ( | 256 0 403 | | Y | ) + * | G | = ( | 256 -48 -120 | | U - 128 | ) >> 8 + * | B | ( | 256 475 0 | | V - 128 | ) + * + * | Y | ( | 54 183 18 | | R | ) | 0 | + * | U | = ( | -29 -99 128 | | G | ) >> 8 + | 128 | + * | V | ( | 128 -116 -12 | | B | ) | 128 | + */ + pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], int srcStep[3], BYTE* pDst, int dstStep, const prim_size_t* roi) { @@ -45,14 +55,14 @@ pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], int srcStep[3], int Vp403, Vp120; BYTE* pRGB = pDst; int nWidth, nHeight; - int last_line, last_column; + int lastRow, lastCol; pY = pSrc[0]; pU = pSrc[1]; pV = pSrc[2]; - last_column = roi->width & 0x01; - last_line = roi->height & 0x01; + lastCol = roi->width & 0x01; + lastRow = roi->height & 0x01; nWidth = (roi->width + 1) & ~0x0001; nHeight = (roi->height + 1) & ~0x0001; @@ -68,15 +78,13 @@ pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], int srcStep[3], for (y = 0; y < halfHeight; ) { - y++; - if (y == halfHeight) - last_line = last_line << 1; + if (++y == halfHeight) + lastRow <<= 1; for (x = 0; x < halfWidth; ) { - x++; - if (x == halfWidth) - last_column = last_column << 1; + if (++x == halfWidth) + lastCol <<= 1; U = *pU++; V = *pV++; @@ -121,7 +129,7 @@ pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], int srcStep[3], /* 2nd pixel */ - if (!(last_column & 0x02)) + if (!(lastCol & 0x02)) { Y = *pY++; Yp = Y << 
8; @@ -154,7 +162,7 @@ pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], int srcStep[3], { pY++; pRGB += 4; - last_column = last_column >> 1; + lastCol >>= 1; } } @@ -165,9 +173,8 @@ pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], int srcStep[3], for (x = 0; x < halfWidth; ) { - x++; - if (x == halfWidth) - last_column = last_column << 1; + if (++x == halfWidth) + lastCol <<= 1; U = *pU++; V = *pV++; @@ -212,7 +219,7 @@ pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], int srcStep[3], /* 4th pixel */ - if(!(last_column & 0x02)) + if (!(lastCol & 0x02)) { Y = *pY++; Yp = Y << 8; @@ -245,7 +252,7 @@ pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], int srcStep[3], { pY++; pRGB += 4; - last_column = last_column >> 1; + lastCol >>= 1; } } diff --git a/libfreerdp/primitives/prim_YUV_opt.c b/libfreerdp/primitives/prim_YUV_opt.c index a8010b9d3..eaf7bf6d7 100644 --- a/libfreerdp/primitives/prim_YUV_opt.c +++ b/libfreerdp/primitives/prim_YUV_opt.c @@ -25,73 +25,68 @@ pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep, BYTE *pDst, int dstStep, const prim_size_t *roi) { - char last_line,last_column; -/* last_line: if the last (U,V doubled) line should be skipped, set to 10B - * last_column: if it's the last column in a line, set to 10B (for handling line-endings not multiple by four) */ - - int i,nWidth,nHeight,VaddDst,VaddY,VaddU,VaddV; - + int lastRow, lastCol; BYTE *UData,*VData,*YData; - + int i,nWidth,nHeight,VaddDst,VaddY,VaddU,VaddV; __m128i r0,r1,r2,r3,r4,r5,r6,r7; __m128i *buffer; + /* last_line: if the last (U,V doubled) line should be skipped, set to 10B + * last_column: if it's the last column in a line, set to 10B (for handling line-endings not multiple by four) */ + + buffer = _aligned_malloc(4 * 16, 16); - buffer=_aligned_malloc(4*16,16); + YData = (BYTE*) pSrc[0]; + UData = (BYTE*) pSrc[1]; + VData = (BYTE*) pSrc[2]; + nWidth = roi->width; + nHeight = roi->height; - YData=(BYTE *)pSrc[0]; - 
UData=(BYTE *)pSrc[1]; - VData=(BYTE *)pSrc[2]; - - nWidth=roi->width; - nHeight=roi->height; - - - if((last_column=nWidth&3)){ - switch(last_column){ - case 1: r7=_mm_set_epi32(0,0,0,0xFFFFFFFF); break; - case 2: r7=_mm_set_epi32(0,0,0xFFFFFFFF,0xFFFFFFFF); break; - case 3: r7=_mm_set_epi32(0,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF); break; + if ((lastCol = (nWidth & 3))) + { + switch (lastCol) + { + case 1: + r7 = _mm_set_epi32(0,0,0,0xFFFFFFFF); + break; + + case 2: + r7 = _mm_set_epi32(0,0,0xFFFFFFFF,0xFFFFFFFF); + break; + + case 3: + r7 = _mm_set_epi32(0,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF); + break; } + _mm_store_si128(buffer+3,r7); - last_column=1; + lastCol = 1; } - nWidth+=3; - nWidth=nWidth>>2; + nWidth += 3; + nWidth = nWidth >> 2; - - last_line=nHeight&1; + lastRow = nHeight & 1; nHeight++; - nHeight=nHeight>>1; + nHeight = nHeight >> 1; + VaddDst = (dstStep << 1) - (nWidth << 4); + VaddY = (srcStep[0] << 1) - (nWidth << 2); + VaddU = srcStep[1] - (((nWidth << 1) + 2) & 0xFFFC); + VaddV = srcStep[2] - (((nWidth << 1) + 2) & 0xFFFC); - VaddDst=(dstStep<<1)-(nWidth<<4); - VaddY=(srcStep[0]<<1)-(nWidth<<2); - VaddU=srcStep[1]-(((nWidth<<1)+2)&0xFFFC); - VaddV=srcStep[2]-(((nWidth<<1)+2)&0xFFFC); - - - while(nHeight-- >0){ - if(nHeight==0){ - last_line=last_line<<1; - } + while (nHeight-- > 0) + { + if (nHeight == 0) + lastRow <<= 1; + + i = 0; - i=0; - do{ -/* - * Well, in the end it should look like this: - * C = Y; - * D = U - 128; - * E = V - 128; - * - * R = clip(( 256 * C + 403 * E + 128) >> 8); - * G = clip(( 256 * C - 48 * D - 120 * E + 128) >> 8); - * B = clip(( 256 * C + 475 * D + 128) >> 8); - */ - if(!(i&0x01)){ - + do + { + if (!(i & 0x01)) + { /* Y-, U- and V-data is stored in different arrays. * We start with processing U-data. 
* @@ -99,50 +94,48 @@ pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep, * 0d0d 0c0c 0b0b 0a0a * we've done two things: converting the values to signed words and duplicating * each value, because always two pixel "share" the same U- (and V-) data */ - r0=_mm_cvtsi32_si128(*(UINT32 *)UData); - r5=_mm_set_epi32(0x80038003,0x80028002,0x80018001,0x80008000); - r0=_mm_shuffle_epi8(r0,r5); + r0 = _mm_cvtsi32_si128(*(UINT32 *)UData); + r5 = _mm_set_epi32(0x80038003,0x80028002,0x80018001,0x80008000); + r0 = _mm_shuffle_epi8(r0,r5); - UData+=4; + UData += 4; /* then we subtract 128 from each value, so we get D */ - r3=_mm_set_epi16(128,128,128,128,128,128,128,128); - r0=_mm_subs_epi16(r0,r3); + r3 = _mm_set_epi16(128,128,128,128,128,128,128,128); + r0 = _mm_subs_epi16(r0,r3); /* we need to do two things with our D, so let's store it for later use */ - r2=r0; + r2 = r0; /* now we can multiply our D with 48 and unpack it to xmm4:xmm0 * this is what we need to get G data later on */ - r4=r0; - r7=_mm_set_epi16(48,48,48,48,48,48,48,48); - r0=_mm_mullo_epi16(r0,r7); - r4=_mm_mulhi_epi16(r4,r7); - r7=r0; - r0=_mm_unpacklo_epi16(r0,r4); - r4=_mm_unpackhi_epi16(r7,r4); - + r4 = r0; + r7 = _mm_set_epi16(48,48,48,48,48,48,48,48); + r0 = _mm_mullo_epi16(r0,r7); + r4 = _mm_mulhi_epi16(r4,r7); + r7 = r0; + r0 = _mm_unpacklo_epi16(r0,r4); + r4 = _mm_unpackhi_epi16(r7,r4); /* to complete this step, add (?) 128 to each value (rounding ?!) * yeah, add. in the end this will be subtracted from something, * because it's part of G: 256*C - (48*D + 120*E - 128), 48*D-128 ! * by the way, our values have become signed dwords during multiplication! 
*/ - r6=_mm_set_epi32(128,128,128,128); - r0=_mm_sub_epi32(r0,r6); - r4=_mm_sub_epi32(r4,r6); - + r6 = _mm_set_epi32(128,128,128,128); + r0 = _mm_sub_epi32(r0,r6); + r4 = _mm_sub_epi32(r4,r6); /* to get B data, we need to prepare a secound value, D*475+128 */ - r1=r2; - r7=_mm_set_epi16(475,475,475,475,475,475,475,475); - r1=_mm_mullo_epi16(r1,r7); - r2=_mm_mulhi_epi16(r2,r7); - r7=r1; - r1=_mm_unpacklo_epi16(r1,r2); - r7=_mm_unpackhi_epi16(r7,r2); + r1 = r2; + r7 = _mm_set_epi16(475,475,475,475,475,475,475,475); + r1 = _mm_mullo_epi16(r1,r7); + r2 = _mm_mulhi_epi16(r2,r7); + r7 = r1; + r1 = _mm_unpacklo_epi16(r1,r2); + r7 = _mm_unpackhi_epi16(r7,r2); - r1=_mm_add_epi32(r1,r6); - r7=_mm_add_epi32(r7,r6); + r1 = _mm_add_epi32(r1,r6); + r7 = _mm_add_epi32(r7,r6); /* so we got something like this: xmm7:xmm1 * this pair contains values for 16 pixel: @@ -151,76 +144,74 @@ pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep, _mm_store_si128(buffer+1,r7); /* Now we've prepared U-data. Preparing V-data is actually the same, just with other coefficients */ - r2=_mm_cvtsi32_si128(*(UINT32 *)VData); - r2=_mm_shuffle_epi8(r2,r5); + r2 = _mm_cvtsi32_si128(*(UINT32 *)VData); + r2 = _mm_shuffle_epi8(r2,r5); - VData+=4; + VData += 4; - r2=_mm_subs_epi16(r2,r3); - - r5=r2; + r2 = _mm_subs_epi16(r2,r3); + r5 = r2; /* this is also known as E*403+128, we need it to convert R data */ - r3=r2; - r7=_mm_set_epi16(403,403,403,403,403,403,403,403); - r2=_mm_mullo_epi16(r2,r7); - r3=_mm_mulhi_epi16(r3,r7); - r7=r2; - r2=_mm_unpacklo_epi16(r2,r3); - r7=_mm_unpackhi_epi16(r7,r3); + r3 = r2; + r7 = _mm_set_epi16(403,403,403,403,403,403,403,403); + r2 = _mm_mullo_epi16(r2,r7); + r3 = _mm_mulhi_epi16(r3,r7); + r7 = r2; + r2 = _mm_unpacklo_epi16(r2,r3); + r7 = _mm_unpackhi_epi16(r7,r3); - r2=_mm_add_epi32(r2,r6); - r7=_mm_add_epi32(r7,r6); + r2 = _mm_add_epi32(r2,r6); + r7 = _mm_add_epi32(r7,r6); /* and preserve upper four values for future ... 
*/ _mm_store_si128(buffer+2,r7); - - /* doing this step: E*120 */ - r3=r5; - r7=_mm_set_epi16(120,120,120,120,120,120,120,120); - r3=_mm_mullo_epi16(r3,r7); - r5=_mm_mulhi_epi16(r5,r7); - r7=r3; - r3=_mm_unpacklo_epi16(r3,r5); - r7=_mm_unpackhi_epi16(r7,r5); + r3 = r5; + r7 = _mm_set_epi16(120,120,120,120,120,120,120,120); + r3 = _mm_mullo_epi16(r3,r7); + r5 = _mm_mulhi_epi16(r5,r7); + r7 = r3; + r3 = _mm_unpacklo_epi16(r3,r5); + r7 = _mm_unpackhi_epi16(r7,r5); /* now we complete what we've begun above: * (48*D-128) + (120*E) = (48*D +120*E -128) */ - r0=_mm_add_epi32(r0,r3); - r4=_mm_add_epi32(r4,r7); + r0 = _mm_add_epi32(r0,r3); + r4 = _mm_add_epi32(r4,r7); /* and store to memory ! */ _mm_store_si128(buffer,r4); - }else{ + } + else + { /* maybe you've wondered about the conditional above ? * Well, we prepared UV data for eight pixel in each line, but can only process four * per loop. So we need to load the upper four pixel data from memory each secound loop! */ - r1=_mm_load_si128(buffer+1); - r2=_mm_load_si128(buffer+2); - r0=_mm_load_si128(buffer); + r1 = _mm_load_si128(buffer+1); + r2 = _mm_load_si128(buffer+2); + r0 = _mm_load_si128(buffer); } - if(++i==nWidth) - last_column=last_column<<1; + if (++i == nWidth) + lastCol <<= 1; /* We didn't produce any output yet, so let's do so! * Ok, fetch four pixel from the Y-data array and shuffle them like this: * 00d0 00c0 00b0 00a0, to get signed dwords and multiply by 256 */ - r4=_mm_cvtsi32_si128(*(UINT32 *)YData); - r7=_mm_set_epi32(0x80800380,0x80800280,0x80800180,0x80800080); - r4=_mm_shuffle_epi8(r4,r7); + r4 = _mm_cvtsi32_si128(*(UINT32 *)YData); + r7 = _mm_set_epi32(0x80800380,0x80800280,0x80800180,0x80800080); + r4 = _mm_shuffle_epi8(r4,r7); - r5=r4; - r6=r4; + r5 = r4; + r6 = r4; /* no we can perform the "real" conversion itself and produce output! 
*/ - r4=_mm_add_epi32(r4,r2); - r5=_mm_sub_epi32(r5,r0); - r6=_mm_add_epi32(r6,r1); - + r4 = _mm_add_epi32(r4,r2); + r5 = _mm_sub_epi32(r5,r0); + r6 = _mm_add_epi32(r6,r1); /* in the end, we only need bytes for RGB values. * So, what do we do? right! shifting left makes values bigger and thats always good. @@ -228,9 +219,9 @@ pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep, * as packed words, we get not only signed words, but do also divide by 256 * imagine, data is now ordered this way: ddx0 ccx0 bbx0 aax0, and x is the least * significant byte, that we don't need anymore, because we've done some rounding */ - r4=_mm_slli_epi32(r4,8); - r5=_mm_slli_epi32(r5,8); - r6=_mm_slli_epi32(r6,8); + r4 = _mm_slli_epi32(r4,8); + r5 = _mm_slli_epi32(r5,8); + r6 = _mm_slli_epi32(r6,8); /* one thing we still have to face is the clip() function ... * we have still signed words, and there are those min/max instructions in SSE2 ... @@ -238,128 +229,125 @@ pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep, * and it operates with signs ! * if we feed it with our values and zeros, it takes the zeros if our values are smaller than * zero and otherwise our values */ - r7=_mm_set_epi32(0,0,0,0); - r4=_mm_max_epi16(r4,r7); - r5=_mm_max_epi16(r5,r7); - r6=_mm_max_epi16(r6,r7); + r7 = _mm_set_epi32(0,0,0,0); + r4 = _mm_max_epi16(r4,r7); + r5 = _mm_max_epi16(r5,r7); + r6 = _mm_max_epi16(r6,r7); /* the same thing just completely different can be used to limit our values to 255, * but now using the min instruction and 255s */ - r7=_mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000); - r4=_mm_min_epi16(r4,r7); - r5=_mm_min_epi16(r5,r7); - r6=_mm_min_epi16(r6,r7); + r7 = _mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000); + r4 = _mm_min_epi16(r4,r7); + r5 = _mm_min_epi16(r5,r7); + r6 = _mm_min_epi16(r6,r7); /* Now we got our bytes. 
* the moment has come to assemble the three channels R,G and B to the xrgb dwords * on Red channel we just have to and each futural dword with 00FF0000H */ //r7=_mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000); - r4=_mm_and_si128(r4,r7); + r4 = _mm_and_si128(r4,r7); /* on Green channel we have to shuffle somehow, so we get something like this: * 00d0 00c0 00b0 00a0 */ - r7=_mm_set_epi32(0x80800E80,0x80800A80,0x80800680,0x80800280); - r5=_mm_shuffle_epi8(r5,r7); + r7 = _mm_set_epi32(0x80800E80,0x80800A80,0x80800680,0x80800280); + r5 = _mm_shuffle_epi8(r5,r7); /* and on Blue channel that one: * 000d 000c 000b 000a */ - r7=_mm_set_epi32(0x8080800E,0x8080800A,0x80808006,0x80808002); - r6=_mm_shuffle_epi8(r6,r7); - + r7 = _mm_set_epi32(0x8080800E,0x8080800A,0x80808006,0x80808002); + r6 = _mm_shuffle_epi8(r6,r7); /* and at last we or it together and get this one: * xrgb xrgb xrgb xrgb */ - r4=_mm_or_si128(r4,r5); - r4=_mm_or_si128(r4,r6); - + r4 = _mm_or_si128(r4,r5); + r4 = _mm_or_si128(r4,r6); /* Only thing to do know is writing data to memory, but this gets a bit more * complicated if the width is not a multiple of four and it is the last column in line. */ - if(last_column&0x02){ + if (lastCol & 0x02) + { /* let's say, we need to only convert six pixel in width * Ok, the first 4 pixel will be converted just like every 4 pixel else, but * if it's the last loop in line, last_column is shifted left by one (curious? have a look above), * and we land here. Through initialisation a mask was prepared. In this case it looks like * 0000FFFFH 0000FFFFH 0000FFFFH 0000FFFFH */ - r6=_mm_load_si128(buffer+3); + r6 = _mm_load_si128(buffer+3); /* we and our output data with this mask to get only the valid pixel */ - r4=_mm_and_si128(r4,r6); + r4 = _mm_and_si128(r4,r6); /* then we fetch memory from the destination array ... */ - r5=_mm_lddqu_si128((__m128i *)pDst); + r5 = _mm_lddqu_si128((__m128i *)pDst); /* ... and and it with the inverse mask. 
We get only those pixel, which should not be updated */ - r6=_mm_andnot_si128(r6,r5); + r6 = _mm_andnot_si128(r6,r5); /* we only have to or the two values together and write it back to the destination array, * and only the pixel that should be updated really get changed. */ - r4=_mm_or_si128(r4,r6); + r4 = _mm_or_si128(r4,r6); } _mm_storeu_si128((__m128i *)pDst,r4); - - if(!(last_line&0x02)){ + if (!(lastRow & 0x02)) + { /* Because UV data is the same for two lines, we can process the secound line just here, * in the same loop. Only thing we need to do is to add some offsets to the Y- and destination * pointer. These offsets are iStride[0] and the target scanline. * But if we don't need to process the secound line, like if we are in the last line of processing nine lines, * we just skip all this. */ - r4=_mm_cvtsi32_si128(*(UINT32 *)(YData+srcStep[0])); - r7=_mm_set_epi32(0x80800380,0x80800280,0x80800180,0x80800080); - r4=_mm_shuffle_epi8(r4,r7); + r4 = _mm_cvtsi32_si128(*(UINT32 *)(YData+srcStep[0])); + r7 = _mm_set_epi32(0x80800380,0x80800280,0x80800180,0x80800080); + r4 = _mm_shuffle_epi8(r4,r7); - r5=r4; - r6=r4; + r5 = r4; + r6 = r4; - r4=_mm_add_epi32(r4,r2); - r5=_mm_sub_epi32(r5,r0); - r6=_mm_add_epi32(r6,r1); + r4 = _mm_add_epi32(r4,r2); + r5 = _mm_sub_epi32(r5,r0); + r6 = _mm_add_epi32(r6,r1); + r4 = _mm_slli_epi32(r4,8); + r5 = _mm_slli_epi32(r5,8); + r6 = _mm_slli_epi32(r6,8); - r4=_mm_slli_epi32(r4,8); - r5=_mm_slli_epi32(r5,8); - r6=_mm_slli_epi32(r6,8); + r7 = _mm_set_epi32(0,0,0,0); + r4 = _mm_max_epi16(r4,r7); + r5 = _mm_max_epi16(r5,r7); + r6 = _mm_max_epi16(r6,r7); - r7=_mm_set_epi32(0,0,0,0); - r4=_mm_max_epi16(r4,r7); - r5=_mm_max_epi16(r5,r7); - r6=_mm_max_epi16(r6,r7); + r7 = _mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000); + r4 = _mm_min_epi16(r4,r7); + r5 = _mm_min_epi16(r5,r7); + r6 = _mm_min_epi16(r6,r7); - r7=_mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000); - r4=_mm_min_epi16(r4,r7); - r5=_mm_min_epi16(r5,r7); - 
r6=_mm_min_epi16(r6,r7); + r7 = _mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000); + r4 = _mm_and_si128(r4,r7); - r7=_mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000); - r4=_mm_and_si128(r4,r7); + r7 = _mm_set_epi32(0x80800E80,0x80800A80,0x80800680,0x80800280); + r5 = _mm_shuffle_epi8(r5,r7); - r7=_mm_set_epi32(0x80800E80,0x80800A80,0x80800680,0x80800280); - r5=_mm_shuffle_epi8(r5,r7); + r7 = _mm_set_epi32(0x8080800E,0x8080800A,0x80808006,0x80808002); + r6 = _mm_shuffle_epi8(r6,r7); - r7=_mm_set_epi32(0x8080800E,0x8080800A,0x80808006,0x80808002); - r6=_mm_shuffle_epi8(r6,r7); + r4 = _mm_or_si128(r4,r5); + r4 = _mm_or_si128(r4,r6); - - r4=_mm_or_si128(r4,r5); - r4=_mm_or_si128(r4,r6); - - - if(last_column&0x02){ - r6=_mm_load_si128(buffer+3); - r4=_mm_and_si128(r4,r6); - r5=_mm_lddqu_si128((__m128i *)(pDst+dstStep)); - r6=_mm_andnot_si128(r6,r5); - r4=_mm_or_si128(r4,r6); + if (lastCol & 0x02) + { + r6 = _mm_load_si128(buffer+3); + r4 = _mm_and_si128(r4,r6); + r5 = _mm_lddqu_si128((__m128i *)(pDst+dstStep)); + r6 = _mm_andnot_si128(r6,r5); + r4 = _mm_or_si128(r4,r6); /* only thing is, we should shift [rbp-42] back here, because we have processed the last column, * and this "special condition" can be released */ - last_column=last_column>>1; + lastCol >>= 1; } _mm_storeu_si128((__m128i *)(pDst+dstStep),r4); } /* after all we have to increase the destination- and Y-data pointer by four pixel */ - pDst+=16; - YData+=4; - - }while(iYUV420ToRGB_8u_P3AC4R=ssse3_YUV420ToRGB_8u_P3AC4R; + prims->YUV420ToRGB_8u_P3AC4R = ssse3_YUV420ToRGB_8u_P3AC4R; } #endif } From 3d4fea7d8eb5859deb21b1740f59c0a223561fac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Moreau?= Date: Tue, 9 Sep 2014 19:18:07 -0400 Subject: [PATCH 29/31] libfreerdp-primitives: fix YUV420 color conversion matrix --- libfreerdp/primitives/prim_YUV_opt.c | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/libfreerdp/primitives/prim_YUV_opt.c 
b/libfreerdp/primitives/prim_YUV_opt.c index eaf7bf6d7..7b80a4522 100644 --- a/libfreerdp/primitives/prim_YUV_opt.c +++ b/libfreerdp/primitives/prim_YUV_opt.c @@ -117,15 +117,7 @@ pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep, r0 = _mm_unpacklo_epi16(r0,r4); r4 = _mm_unpackhi_epi16(r7,r4); - /* to complete this step, add (?) 128 to each value (rounding ?!) - * yeah, add. in the end this will be subtracted from something, - * because it's part of G: 256*C - (48*D + 120*E - 128), 48*D-128 ! - * by the way, our values have become signed dwords during multiplication! */ - r6 = _mm_set_epi32(128,128,128,128); - r0 = _mm_sub_epi32(r0,r6); - r4 = _mm_sub_epi32(r4,r6); - - /* to get B data, we need to prepare a secound value, D*475+128 */ + /* to get B data, we need to prepare a second value, D*475 */ r1 = r2; r7 = _mm_set_epi16(475,475,475,475,475,475,475,475); r1 = _mm_mullo_epi16(r1,r7); @@ -134,9 +126,6 @@ pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep, r1 = _mm_unpacklo_epi16(r1,r2); r7 = _mm_unpackhi_epi16(r7,r2); - r1 = _mm_add_epi32(r1,r6); - r7 = _mm_add_epi32(r7,r6); - /* so we got something like this: xmm7:xmm1 * this pair contains values for 16 pixel: * aabbccdd @@ -153,7 +142,7 @@ pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep, r5 = r2; - /* this is also known as E*403+128, we need it to convert R data */ + /* this is also known as E*403, we need it to convert R data */ r3 = r2; r7 = _mm_set_epi16(403,403,403,403,403,403,403,403); r2 = _mm_mullo_epi16(r2,r7); @@ -162,9 +151,6 @@ pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep, r2 = _mm_unpacklo_epi16(r2,r3); r7 = _mm_unpackhi_epi16(r7,r3); - r2 = _mm_add_epi32(r2,r6); - r7 = _mm_add_epi32(r7,r6); - /* and preserve upper four values for future ... 
*/ _mm_store_si128(buffer+2,r7); @@ -178,7 +164,7 @@ pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep, r7 = _mm_unpackhi_epi16(r7,r5); /* now we complete what we've begun above: - * (48*D-128) + (120*E) = (48*D +120*E -128) */ + * (48*D) + (120*E) = (48*D +120*E) */ r0 = _mm_add_epi32(r0,r3); r4 = _mm_add_epi32(r4,r7); From c71e4e18a194819684287d11288f1c2abececc39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Moreau?= Date: Wed, 10 Sep 2014 00:42:41 -0400 Subject: [PATCH 30/31] libfreerdp-core: refactor codec context management --- client/Windows/wf_graphics.c | 33 +++- client/Windows/wf_interface.c | 1 + client/Windows/wf_interface.h | 1 + client/X11/xf_client.c | 35 ++-- client/X11/xf_gdi.c | 8 +- client/X11/xf_gfx.c | 83 +++++---- client/X11/xf_graphics.c | 48 ++++-- client/X11/xfreerdp.h | 6 +- include/freerdp/codec/bitmap.h | 18 -- include/freerdp/codec/clear.h | 3 +- include/freerdp/codec/interleaved.h | 46 +++++ .../freerdp}/codec/planar.h | 23 ++- include/freerdp/codec/progressive.h | 3 +- include/freerdp/codec/rfx.h | 11 +- include/freerdp/codecs.h | 63 +++++++ include/freerdp/freerdp.h | 5 +- include/freerdp/gdi/gdi.h | 3 +- include/freerdp/types.h | 15 ++ include/freerdp/update.h | 15 -- libfreerdp/codec/CMakeLists.txt | 5 +- .../codec/{bitmap_encode.c => bitmap.c} | 1 + .../codec/{bitmap_decode.c => interleaved.c} | 136 +++++++++------ libfreerdp/codec/planar.c | 3 +- .../codec/test/TestFreeRDPCodecPlanar.c | 85 +++------- libfreerdp/core/CMakeLists.txt | 1 + libfreerdp/core/codecs.c | 157 ++++++++++++++++++ libfreerdp/core/freerdp.c | 2 + libfreerdp/gdi/gdi.c | 90 +++++----- libfreerdp/gdi/graphics.c | 50 ++++-- libfreerdp/primitives/test/prim_test.h | 10 +- 30 files changed, 659 insertions(+), 301 deletions(-) create mode 100644 include/freerdp/codec/interleaved.h rename {libfreerdp => include/freerdp}/codec/planar.h (73%) create mode 100644 include/freerdp/codecs.h rename libfreerdp/codec/{bitmap_encode.c => 
bitmap.c} (99%) rename libfreerdp/codec/{bitmap_decode.c => interleaved.c} (71%) create mode 100644 libfreerdp/core/codecs.c diff --git a/client/Windows/wf_graphics.c b/client/Windows/wf_graphics.c index c33f14a82..e0adc70ac 100644 --- a/client/Windows/wf_graphics.c +++ b/client/Windows/wf_graphics.c @@ -142,8 +142,9 @@ void wf_Bitmap_Paint(wfContext* wfc, rdpBitmap* bitmap) } void wf_Bitmap_Decompress(wfContext* wfc, rdpBitmap* bitmap, - BYTE* data, int width, int height, int bpp, int length, BOOL compressed, int codec_id) + BYTE* data, int width, int height, int bpp, int length, BOOL compressed, int codecId) { + int status; UINT16 size; size = width * height * (bpp / 8); @@ -155,13 +156,35 @@ void wf_Bitmap_Decompress(wfContext* wfc, rdpBitmap* bitmap, if (compressed) { - BOOL status; + BYTE* pDstData; + UINT32 SrcSize; - status = bitmap_decompress(data, bitmap->data, width, height, length, bpp, bpp); + SrcSize = (UINT32) length; + pDstData = bitmap->data; - if (status != TRUE) + if (bpp < 32) { - DEBUG_WARN( "Bitmap Decompression Failed\n"); + freerdp_client_codecs_prepare(wfc->codecs, FREERDP_CODEC_INTERLEAVED); + + status = interleaved_decompress(wfc->codecs->interleaved, data, SrcSize, bpp, + &pDstData, PIXEL_FORMAT_XRGB32_VF, width * 4, 0, 0, width, height); + + if (status < 0) + { + DEBUG_WARN("wf_Bitmap_Decompress: Bitmap Decompression Failed\n"); + } + } + else + { + freerdp_client_codecs_prepare(wfc->codecs, FREERDP_CODEC_PLANAR); + + status = planar_decompress(wfc->codecs->planar, data, SrcSize, &pDstData, + PIXEL_FORMAT_XRGB32_VF, width * 4, 0, 0, width, height); + + if (status < 0) + { + DEBUG_WARN("wf_Bitmap_Decompress: Bitmap Decompression Failed\n"); + } } } else diff --git a/client/Windows/wf_interface.c b/client/Windows/wf_interface.c index 3fb115d43..8821ae37d 100644 --- a/client/Windows/wf_interface.c +++ b/client/Windows/wf_interface.c @@ -190,6 +190,7 @@ BOOL wf_pre_connect(freerdp* instance) context = instance->context; wfc = (wfContext*) 
instance->context; wfc->instance = instance; + wfc->codecs = instance->context->codecs; settings = instance->settings; diff --git a/client/Windows/wf_interface.h b/client/Windows/wf_interface.h index b9aa4056c..ff291e0f8 100644 --- a/client/Windows/wf_interface.h +++ b/client/Windows/wf_interface.h @@ -97,6 +97,7 @@ struct wf_context HGDI_DC hdc; UINT16 srcBpp; UINT16 dstBpp; + rdpCodecs* codecs; freerdp* instance; wfBitmap* primary; wfBitmap* drawing; diff --git a/client/X11/xf_client.c b/client/X11/xf_client.c index d43ed6359..e1f3d996e 100644 --- a/client/X11/xf_client.c +++ b/client/X11/xf_client.c @@ -695,17 +695,22 @@ static void xf_post_disconnect(freerdp *instance) * @return TRUE if successful. FALSE otherwise. * Can exit with error code XF_EXIT_PARSE_ARGUMENTS if there is an error in the parameters. */ -BOOL xf_pre_connect(freerdp *instance) +BOOL xf_pre_connect(freerdp* instance) { - rdpChannels *channels; - rdpSettings *settings; - xfContext *xfc = (xfContext *) instance->context; + rdpChannels* channels; + rdpSettings* settings; + xfContext* xfc = (xfContext*) instance->context; + + xfc->codecs = instance->context->codecs; xfc->settings = instance->settings; xfc->instance = instance; + settings = instance->settings; channels = instance->context->channels; + settings->OsMajorType = OSMAJORTYPE_UNIX; settings->OsMinorType = OSMINORTYPE_NATIVE_XSERVER; + ZeroMemory(settings->OrderSupport, 32); settings->OrderSupport[NEG_DSTBLT_INDEX] = TRUE; settings->OrderSupport[NEG_PATBLT_INDEX] = TRUE; @@ -861,12 +866,12 @@ BOOL xf_post_connect(freerdp *instance) if (settings->RemoteFxCodec) { - xfc->rfx = rfx_context_new(FALSE); + xfc->codecs->rfx = rfx_context_new(FALSE); } if (settings->NSCodec) { - xfc->nsc = nsc_context_new(); + xfc->codecs->nsc = nsc_context_new(); } } @@ -1103,22 +1108,22 @@ void xf_window_free(xfContext *xfc) context->rail = NULL; } - if (xfc->rfx) + if (xfc->codecs->rfx) { - rfx_context_free(xfc->rfx); - xfc->rfx = NULL; + 
rfx_context_free(xfc->codecs->rfx); + xfc->codecs->rfx = NULL; } - if (xfc->nsc) + if (xfc->codecs->nsc) { - nsc_context_free(xfc->nsc); - xfc->nsc = NULL; + nsc_context_free(xfc->codecs->nsc); + xfc->codecs->nsc = NULL; } - if (xfc->clear) + if (xfc->codecs->clear) { - clear_context_free(xfc->clear); - xfc->clear = NULL; + clear_context_free(xfc->codecs->clear); + xfc->codecs->clear = NULL; } if (xfc->clrconv) diff --git a/client/X11/xf_gdi.c b/client/X11/xf_gdi.c index c18846370..3ae2db472 100644 --- a/client/X11/xf_gdi.c +++ b/client/X11/xf_gdi.c @@ -1033,7 +1033,7 @@ void xf_gdi_surface_bits(rdpContext* context, SURFACE_BITS_COMMAND* surface_bits if (surface_bits_command->codecID == RDP_CODEC_ID_REMOTEFX) { - message = rfx_process_message(xfc->rfx, + message = rfx_process_message(xfc->codecs->rfx, surface_bits_command->bitmapData, surface_bits_command->bitmapDataLength); XSetFunction(xfc->display, xfc->gc, GXcopy); @@ -1070,11 +1070,11 @@ void xf_gdi_surface_bits(rdpContext* context, SURFACE_BITS_COMMAND* surface_bits } XSetClipMask(xfc->display, xfc->gc, None); - rfx_message_free(xfc->rfx, message); + rfx_message_free(xfc->codecs->rfx, message); } else if (surface_bits_command->codecID == RDP_CODEC_ID_NSCODEC) { - nsc_process_message(xfc->nsc, surface_bits_command->bpp, surface_bits_command->width, surface_bits_command->height, + nsc_process_message(xfc->codecs->nsc, surface_bits_command->bpp, surface_bits_command->width, surface_bits_command->height, surface_bits_command->bitmapData, surface_bits_command->bitmapDataLength); XSetFunction(xfc->display, xfc->gc, GXcopy); @@ -1083,7 +1083,7 @@ void xf_gdi_surface_bits(rdpContext* context, SURFACE_BITS_COMMAND* surface_bits xfc->bmp_codec_nsc = (BYTE*) realloc(xfc->bmp_codec_nsc, surface_bits_command->width * surface_bits_command->height * 4); - freerdp_image_flip(xfc->nsc->BitmapData, xfc->bmp_codec_nsc, + freerdp_image_flip(xfc->codecs->nsc->BitmapData, xfc->bmp_codec_nsc, surface_bits_command->width, 
surface_bits_command->height, 32); image = XCreateImage(xfc->display, xfc->visual, 24, ZPixmap, 0, diff --git a/client/X11/xf_gfx.c b/client/X11/xf_gfx.c index c54ff6201..476679364 100644 --- a/client/X11/xf_gfx.c +++ b/client/X11/xf_gfx.c @@ -27,53 +27,53 @@ int xf_ResetGraphics(RdpgfxClientContext* context, RDPGFX_RESET_GRAPHICS_PDU* re { xfContext* xfc = (xfContext*) context->custom; - if (xfc->rfx) + if (xfc->codecs->rfx) { - rfx_context_free(xfc->rfx); - xfc->rfx = NULL; + rfx_context_free(xfc->codecs->rfx); + xfc->codecs->rfx = NULL; } - xfc->rfx = rfx_context_new(FALSE); + xfc->codecs->rfx = rfx_context_new(FALSE); - xfc->rfx->width = resetGraphics->width; - xfc->rfx->height = resetGraphics->height; - rfx_context_set_pixel_format(xfc->rfx, RDP_PIXEL_FORMAT_B8G8R8A8); + xfc->codecs->rfx->width = resetGraphics->width; + xfc->codecs->rfx->height = resetGraphics->height; + rfx_context_set_pixel_format(xfc->codecs->rfx, RDP_PIXEL_FORMAT_B8G8R8A8); - if (xfc->nsc) + if (xfc->codecs->nsc) { - nsc_context_free(xfc->nsc); - xfc->nsc = NULL; + nsc_context_free(xfc->codecs->nsc); + xfc->codecs->nsc = NULL; } - xfc->nsc = nsc_context_new(); + xfc->codecs->nsc = nsc_context_new(); - xfc->nsc->width = resetGraphics->width; - xfc->nsc->height = resetGraphics->height; - nsc_context_set_pixel_format(xfc->nsc, RDP_PIXEL_FORMAT_B8G8R8A8); + xfc->codecs->nsc->width = resetGraphics->width; + xfc->codecs->nsc->height = resetGraphics->height; + nsc_context_set_pixel_format(xfc->codecs->nsc, RDP_PIXEL_FORMAT_B8G8R8A8); - if (xfc->clear) + if (xfc->codecs->clear) { - clear_context_free(xfc->clear); - xfc->clear = NULL; + clear_context_free(xfc->codecs->clear); + xfc->codecs->clear = NULL; } - xfc->clear = clear_context_new(FALSE); + xfc->codecs->clear = clear_context_new(FALSE); - if (xfc->h264) + if (xfc->codecs->h264) { - h264_context_free(xfc->h264); - xfc->h264 = NULL; + h264_context_free(xfc->codecs->h264); + xfc->codecs->h264 = NULL; } - xfc->h264 = h264_context_new(FALSE); + 
xfc->codecs->h264 = h264_context_new(FALSE); - if (xfc->progressive) + if (xfc->codecs->progressive) { - progressive_context_free(xfc->progressive); - xfc->progressive = NULL; + progressive_context_free(xfc->codecs->progressive); + xfc->codecs->progressive = NULL; } - xfc->progressive = progressive_context_new(TRUE); + xfc->codecs->progressive = progressive_context_new(TRUE); region16_init(&(xfc->invalidRegion)); @@ -216,12 +216,14 @@ int xf_SurfaceCommand_RemoteFX(xfContext* xfc, RdpgfxClientContext* context, RDP REGION16 clippingRects; RECTANGLE_16 clippingRect; + freerdp_client_codecs_prepare(xfc->codecs, FREERDP_CODEC_REMOTEFX); + surface = (xfGfxSurface*) context->GetSurfaceData(context, cmd->surfaceId); if (!surface) return -1; - message = rfx_process_message(xfc->rfx, cmd->data, cmd->length); + message = rfx_process_message(xfc->codecs->rfx, cmd->data, cmd->length); if (!message) return -1; @@ -270,7 +272,7 @@ int xf_SurfaceCommand_RemoteFX(xfContext* xfc, RdpgfxClientContext* context, RDP region16_uninit(&updateRegion); } - rfx_message_free(xfc->rfx, message); + rfx_message_free(xfc->codecs->rfx, message); if (!xfc->inGfxFrame) xf_OutputUpdate(xfc); @@ -285,6 +287,8 @@ int xf_SurfaceCommand_ClearCodec(xfContext* xfc, RdpgfxClientContext* context, R xfGfxSurface* surface; RECTANGLE_16 invalidRect; + freerdp_client_codecs_prepare(xfc->codecs, FREERDP_CODEC_CLEARCODEC); + surface = (xfGfxSurface*) context->GetSurfaceData(context, cmd->surfaceId); if (!surface) @@ -292,7 +296,7 @@ int xf_SurfaceCommand_ClearCodec(xfContext* xfc, RdpgfxClientContext* context, R DstData = surface->data; - status = clear_decompress(xfc->clear, cmd->data, cmd->length, &DstData, + status = clear_decompress(xfc->codecs->clear, cmd->data, cmd->length, &DstData, PIXEL_FORMAT_XRGB32, surface->scanline, cmd->left, cmd->top, cmd->width, cmd->height); if (status < 0) @@ -322,6 +326,8 @@ int xf_SurfaceCommand_Planar(xfContext* xfc, RdpgfxClientContext* context, RDPGF xfGfxSurface* surface; 
RECTANGLE_16 invalidRect; + freerdp_client_codecs_prepare(xfc->codecs, FREERDP_CODEC_PLANAR); + surface = (xfGfxSurface*) context->GetSurfaceData(context, cmd->surfaceId); if (!surface) @@ -329,7 +335,7 @@ int xf_SurfaceCommand_Planar(xfContext* xfc, RdpgfxClientContext* context, RDPGF DstData = surface->data; - status = planar_decompress(NULL, cmd->data, cmd->length, &DstData, + status = planar_decompress(xfc->codecs->planar, cmd->data, cmd->length, &DstData, PIXEL_FORMAT_XRGB32, surface->scanline, cmd->left, cmd->top, cmd->width, cmd->height); invalidRect.left = cmd->left; @@ -355,8 +361,9 @@ int xf_SurfaceCommand_H264(xfContext* xfc, RdpgfxClientContext* context, RDPGFX_ RDPGFX_H264_METABLOCK* meta; RDPGFX_H264_BITMAP_STREAM* bs; + freerdp_client_codecs_prepare(xfc->codecs, FREERDP_CODEC_H264); - h264 = xfc->h264; + h264 = xfc->codecs->h264; bs = (RDPGFX_H264_BITMAP_STREAM*) cmd->extra; @@ -372,7 +379,7 @@ int xf_SurfaceCommand_H264(xfContext* xfc, RdpgfxClientContext* context, RDPGFX_ DstData = surface->data; - status = h264_decompress(xfc->h264, bs->data, bs->length, &DstData, + status = h264_decompress(xfc->codecs->h264, bs->data, bs->length, &DstData, PIXEL_FORMAT_XRGB32, surface->scanline , surface->height, meta->regionRects, meta->numRegionRects); if (status < 0) @@ -398,6 +405,8 @@ int xf_SurfaceCommand_Alpha(xfContext* xfc, RdpgfxClientContext* context, RDPGFX xfGfxSurface* surface; RECTANGLE_16 invalidRect; + freerdp_client_codecs_prepare(xfc->codecs, FREERDP_CODEC_ALPHACODEC); + surface = (xfGfxSurface*) context->GetSurfaceData(context, cmd->surfaceId); if (!surface) @@ -442,16 +451,18 @@ int xf_SurfaceCommand_Progressive(xfContext* xfc, RdpgfxClientContext* context, RFX_PROGRESSIVE_TILE* tile; PROGRESSIVE_BLOCK_REGION* region; + freerdp_client_codecs_prepare(xfc->codecs, FREERDP_CODEC_PROGRESSIVE); + surface = (xfGfxSurface*) context->GetSurfaceData(context, cmd->surfaceId); if (!surface) return -1; - 
progressive_create_surface_context(xfc->progressive, cmd->surfaceId, surface->width, surface->height); + progressive_create_surface_context(xfc->codecs->progressive, cmd->surfaceId, surface->width, surface->height); DstData = surface->data; - status = progressive_decompress(xfc->progressive, cmd->data, cmd->length, &DstData, + status = progressive_decompress(xfc->codecs->progressive, cmd->data, cmd->length, &DstData, PIXEL_FORMAT_XRGB32, surface->scanline, cmd->left, cmd->top, cmd->width, cmd->height, cmd->surfaceId); if (status < 0) @@ -460,7 +471,7 @@ int xf_SurfaceCommand_Progressive(xfContext* xfc, RdpgfxClientContext* context, return -1; } - region = &(xfc->progressive->region); + region = &(xfc->codecs->progressive->region); region16_init(&clippingRects); @@ -607,7 +618,7 @@ int xf_DeleteSurface(RdpgfxClientContext* context, RDPGFX_DELETE_SURFACE_PDU* de context->SetSurfaceData(context, deleteSurface->surfaceId, NULL); - progressive_delete_surface_context(xfc->progressive, deleteSurface->surfaceId); + progressive_delete_surface_context(xfc->codecs->progressive, deleteSurface->surfaceId); return 1; } diff --git a/client/X11/xf_graphics.c b/client/X11/xf_graphics.c index 330977684..9fc2cc7d4 100644 --- a/client/X11/xf_graphics.c +++ b/client/X11/xf_graphics.c @@ -120,14 +120,14 @@ void xf_Bitmap_Paint(rdpContext* context, rdpBitmap* bitmap) void xf_Bitmap_Decompress(rdpContext* context, rdpBitmap* bitmap, BYTE* data, int width, int height, int bpp, int length, - BOOL compressed, int codec_id) + BOOL compressed, int codecId) { + int status; UINT16 size; BYTE* src; BYTE* dst; int yindex; int xindex; - BOOL status; RFX_MESSAGE* msg; xfContext* xfc = (xfContext*) context; @@ -138,19 +138,21 @@ void xf_Bitmap_Decompress(rdpContext* context, rdpBitmap* bitmap, else bitmap->data = (BYTE*) _aligned_realloc(bitmap->data, size, 16); - switch (codec_id) + switch (codecId) { case RDP_CODEC_ID_NSCODEC: - DEBUG_WARN( "xf_Bitmap_Decompress: nsc not done\n"); + 
freerdp_client_codecs_prepare(xfc->codecs, FREERDP_CODEC_NSCODEC); + DEBUG_WARN("xf_Bitmap_Decompress: nsc not done\n"); break; case RDP_CODEC_ID_REMOTEFX: - rfx_context_set_pixel_format(xfc->rfx, RDP_PIXEL_FORMAT_B8G8R8A8); - msg = rfx_process_message(xfc->rfx, data, length); + freerdp_client_codecs_prepare(xfc->codecs, FREERDP_CODEC_REMOTEFX); + rfx_context_set_pixel_format(xfc->codecs->rfx, RDP_PIXEL_FORMAT_B8G8R8A8); + msg = rfx_process_message(xfc->codecs->rfx, data, length); if (!msg) { - DEBUG_WARN( "xf_Bitmap_Decompress: rfx Decompression Failed\n"); + DEBUG_WARN("xf_Bitmap_Decompress: rfx Decompression Failed\n"); } else { @@ -166,7 +168,7 @@ void xf_Bitmap_Decompress(rdpContext* context, rdpBitmap* bitmap, src++; } } - rfx_message_free(xfc->rfx, msg); + rfx_message_free(xfc->codecs->rfx, msg); } break; @@ -180,11 +182,35 @@ void xf_Bitmap_Decompress(rdpContext* context, rdpBitmap* bitmap, default: if (compressed) { - status = bitmap_decompress(data, bitmap->data, width, height, length, bpp, bpp); + BYTE* pDstData; + UINT32 SrcSize; - if (!status) + SrcSize = (UINT32) length; + pDstData = bitmap->data; + + if (bpp < 32) { - DEBUG_WARN( "xf_Bitmap_Decompress: Bitmap Decompression Failed\n"); + freerdp_client_codecs_prepare(xfc->codecs, FREERDP_CODEC_INTERLEAVED); + + status = interleaved_decompress(xfc->codecs->interleaved, data, SrcSize, bpp, + &pDstData, PIXEL_FORMAT_XRGB32_VF, width * 4, 0, 0, width, height); + + if (status < 0) + { + DEBUG_WARN("xf_Bitmap_Decompress: Bitmap Decompression Failed\n"); + } + } + else + { + freerdp_client_codecs_prepare(xfc->codecs, FREERDP_CODEC_PLANAR); + + status = planar_decompress(xfc->codecs->planar, data, SrcSize, &pDstData, + PIXEL_FORMAT_XRGB32_VF, width * 4, 0, 0, width, height); + + if (status < 0) + { + DEBUG_WARN("gdi_Bitmap_Decompress: Bitmap Decompression Failed\n"); + } } } else diff --git a/client/X11/xfreerdp.h b/client/X11/xfreerdp.h index a2d89b0df..ab1eef974 100644 --- a/client/X11/xfreerdp.h +++ 
b/client/X11/xfreerdp.h @@ -75,6 +75,7 @@ struct xf_context freerdp* instance; rdpSettings* settings; + rdpCodecs* codecs; GC gc; int bpp; @@ -152,11 +153,6 @@ struct xf_context VIRTUAL_SCREEN vscreen; BYTE* bmp_codec_none; BYTE* bmp_codec_nsc; - RFX_CONTEXT* rfx; - NSC_CONTEXT* nsc; - CLEAR_CONTEXT* clear; - H264_CONTEXT* h264; - PROGRESSIVE_CONTEXT* progressive; void* xv_context; void* clipboard_context; diff --git a/include/freerdp/codec/bitmap.h b/include/freerdp/codec/bitmap.h index d36917cad..507829253 100644 --- a/include/freerdp/codec/bitmap.h +++ b/include/freerdp/codec/bitmap.h @@ -32,27 +32,9 @@ extern "C" { #endif -FREERDP_API BOOL bitmap_decompress(BYTE* srcData, BYTE* dstData, int width, int height, int size, int srcBpp, int dstBpp); - FREERDP_API int freerdp_bitmap_compress(char* in_data, int width, int height, wStream* s, int bpp, int byte_limit, int start_line, wStream* temp_s, int e); -#define PLANAR_FORMAT_HEADER_CS (1 << 3) -#define PLANAR_FORMAT_HEADER_RLE (1 << 4) -#define PLANAR_FORMAT_HEADER_NA (1 << 5) -#define PLANAR_FORMAT_HEADER_CLL_MASK 0x07 - -typedef struct _BITMAP_PLANAR_CONTEXT BITMAP_PLANAR_CONTEXT; - -FREERDP_API BYTE* freerdp_bitmap_compress_planar(BITMAP_PLANAR_CONTEXT* context, BYTE* data, UINT32 format, - int width, int height, int scanline, BYTE* dstData, int* dstSize); - -FREERDP_API BITMAP_PLANAR_CONTEXT* freerdp_bitmap_planar_context_new(DWORD flags, int maxWidth, int maxHeight); -FREERDP_API void freerdp_bitmap_planar_context_free(BITMAP_PLANAR_CONTEXT* context); - -FREERDP_API int planar_decompress(BITMAP_PLANAR_CONTEXT* planar, BYTE* pSrcData, UINT32 SrcSize, - BYTE** ppDstData, DWORD DstFormat, int nDstStep, int nXDst, int nYDst, int nWidth, int nHeight); - #ifdef __cplusplus } #endif diff --git a/include/freerdp/codec/clear.h b/include/freerdp/codec/clear.h index 857975b9f..e49d1d572 100644 --- a/include/freerdp/codec/clear.h +++ b/include/freerdp/codec/clear.h @@ -20,6 +20,8 @@ #ifndef FREERDP_CODEC_CLEAR_H #define 
FREERDP_CODEC_CLEAR_H +typedef struct _CLEAR_CONTEXT CLEAR_CONTEXT; + #include #include @@ -59,7 +61,6 @@ struct _CLEAR_CONTEXT UINT32 ShortVBarStorageCursor; CLEAR_VBAR_ENTRY ShortVBarStorage[16384]; }; -typedef struct _CLEAR_CONTEXT CLEAR_CONTEXT; #ifdef __cplusplus extern "C" { diff --git a/include/freerdp/codec/interleaved.h b/include/freerdp/codec/interleaved.h new file mode 100644 index 000000000..5f6662b6a --- /dev/null +++ b/include/freerdp/codec/interleaved.h @@ -0,0 +1,46 @@ +/** + * FreeRDP: A Remote Desktop Protocol Implementation + * Interleaved RLE Bitmap Codec + * + * Copyright 2014 Marc-Andre Moreau + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef FREERDP_CODEC_INTERLEAVED_H +#define FREERDP_CODEC_INTERLEAVED_H + +typedef struct _BITMAP_INTERLEAVED_CONTEXT BITMAP_INTERLEAVED_CONTEXT; + +#include +#include + +#include +#include + +struct _BITMAP_INTERLEAVED_CONTEXT +{ + BOOL Compressor; + + UINT32 FlipSize; + BYTE* FlipBuffer; +}; + +int interleaved_decompress(BITMAP_INTERLEAVED_CONTEXT* interleaved, BYTE* pSrcData, UINT32 SrcSize, int bpp, + BYTE** ppDstData, DWORD DstFormat, int nDstStep, int nXDst, int nYDst, int nWidth, int nHeight); + +FREERDP_API BITMAP_INTERLEAVED_CONTEXT* bitmap_interleaved_context_new(BOOL Compressor); +FREERDP_API void bitmap_interleaved_context_free(BITMAP_INTERLEAVED_CONTEXT* interleaved); + +#endif /* FREERDP_CODEC_INTERLEAVED_H */ + diff --git a/libfreerdp/codec/planar.h b/include/freerdp/codec/planar.h similarity index 73% rename from libfreerdp/codec/planar.h rename to include/freerdp/codec/planar.h index a8e34c87a..a06f2db3d 100644 --- a/libfreerdp/codec/planar.h +++ b/include/freerdp/codec/planar.h @@ -17,14 +17,21 @@ * limitations under the License. 
*/ -#ifndef FREERDP_CODEC_PLANAR_PRIVATE_H -#define FREERDP_CODEC_PLANAR_PRIVATE_H +#ifndef FREERDP_CODEC_PLANAR_H +#define FREERDP_CODEC_PLANAR_H #include +typedef struct _BITMAP_PLANAR_CONTEXT BITMAP_PLANAR_CONTEXT; + #include #include +#define PLANAR_FORMAT_HEADER_CS (1 << 3) +#define PLANAR_FORMAT_HEADER_RLE (1 << 4) +#define PLANAR_FORMAT_HEADER_NA (1 << 5) +#define PLANAR_FORMAT_HEADER_CLL_MASK 0x07 + #define PLANAR_CONTROL_BYTE(_nRunLength, _cRawBytes) \ (_nRunLength & 0x0F) | ((_cRawBytes & 0x0F) << 4) @@ -92,4 +99,14 @@ FREERDP_API BYTE* freerdp_bitmap_planar_compress_plane_rle(BYTE* plane, int widt FREERDP_API BYTE* freerdp_bitmap_planar_delta_encode_plane(BYTE* inPlane, int width, int height, BYTE* outPlane); FREERDP_API int freerdp_bitmap_planar_delta_encode_planes(BYTE* inPlanes[4], int width, int height, BYTE* outPlanes[4]); -#endif /* FREERDP_CODEC_PLANAR_PRIVATE_H */ +FREERDP_API BYTE* freerdp_bitmap_compress_planar(BITMAP_PLANAR_CONTEXT* context, BYTE* data, UINT32 format, + int width, int height, int scanline, BYTE* dstData, int* dstSize); + +FREERDP_API BITMAP_PLANAR_CONTEXT* freerdp_bitmap_planar_context_new(DWORD flags, int maxWidth, int maxHeight); +FREERDP_API void freerdp_bitmap_planar_context_free(BITMAP_PLANAR_CONTEXT* context); + +FREERDP_API int planar_decompress(BITMAP_PLANAR_CONTEXT* planar, BYTE* pSrcData, UINT32 SrcSize, + BYTE** ppDstData, DWORD DstFormat, int nDstStep, int nXDst, int nYDst, int nWidth, int nHeight); + +#endif /* FREERDP_CODEC_PLANAR_H */ + diff --git a/include/freerdp/codec/progressive.h b/include/freerdp/codec/progressive.h index e18310ed8..be702a158 100644 --- a/include/freerdp/codec/progressive.h +++ b/include/freerdp/codec/progressive.h @@ -20,6 +20,8 @@ #ifndef FREERDP_CODEC_PROGRESSIVE_H #define FREERDP_CODEC_PROGRESSIVE_H +typedef struct _PROGRESSIVE_CONTEXT PROGRESSIVE_CONTEXT; + #include #include @@ -301,7 +303,6 @@ struct _PROGRESSIVE_CONTEXT wHashTable* SurfaceContexts; }; -typedef struct 
_PROGRESSIVE_CONTEXT PROGRESSIVE_CONTEXT; #ifdef __cplusplus extern "C" { diff --git a/include/freerdp/codec/rfx.h b/include/freerdp/codec/rfx.h index 08480bec2..2a68d14d7 100644 --- a/include/freerdp/codec/rfx.h +++ b/include/freerdp/codec/rfx.h @@ -20,6 +20,12 @@ #ifndef FREERDP_CODEC_REMOTEFX_H #define FREERDP_CODEC_REMOTEFX_H +typedef enum _RLGR_MODE RLGR_MODE; +typedef struct _RFX_RECT RFX_RECT; +typedef struct _RFX_TILE RFX_TILE; +typedef struct _RFX_MESSAGE RFX_MESSAGE; +typedef struct _RFX_CONTEXT RFX_CONTEXT; + #include #include #include @@ -36,7 +42,6 @@ enum _RLGR_MODE RLGR1, RLGR3 }; -typedef enum _RLGR_MODE RLGR_MODE; struct _RFX_RECT { @@ -45,7 +50,6 @@ struct _RFX_RECT UINT16 width; UINT16 height; }; -typedef struct _RFX_RECT RFX_RECT; struct _RFX_TILE { @@ -69,7 +73,6 @@ struct _RFX_TILE BYTE* CrData; BYTE* YCbCrData; }; -typedef struct _RFX_TILE RFX_TILE; struct _RFX_MESSAGE { @@ -99,7 +102,6 @@ struct _RFX_MESSAGE BOOL freeArray; }; -typedef struct _RFX_MESSAGE RFX_MESSAGE; typedef struct _RFX_CONTEXT_PRIV RFX_CONTEXT_PRIV; @@ -150,7 +152,6 @@ struct _RFX_CONTEXT /* private definitions */ RFX_CONTEXT_PRIV* priv; }; -typedef struct _RFX_CONTEXT RFX_CONTEXT; FREERDP_API RFX_CONTEXT* rfx_context_new(BOOL encoder); FREERDP_API void rfx_context_free(RFX_CONTEXT* context); diff --git a/include/freerdp/codecs.h b/include/freerdp/codecs.h new file mode 100644 index 000000000..15b311415 --- /dev/null +++ b/include/freerdp/codecs.h @@ -0,0 +1,63 @@ +/** + * FreeRDP: A Remote Desktop Protocol Implementation + * RDP Codecs + * + * Copyright 2014 Marc-Andre Moreau + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef FREERDP_CODECS_H +#define FREERDP_CODECS_H + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +#define FREERDP_CODEC_INTERLEAVED 0x00000001 +#define FREERDP_CODEC_PLANAR 0x00000002 +#define FREERDP_CODEC_NSCODEC 0x00000004 +#define FREERDP_CODEC_REMOTEFX 0x00000008 +#define FREERDP_CODEC_CLEARCODEC 0x00000010 +#define FREERDP_CODEC_ALPHACODEC 0x00000020 +#define FREERDP_CODEC_PROGRESSIVE 0x00000040 +#define FREERDP_CODEC_H264 0x00000080 + +struct rdp_codecs +{ + rdpContext* context; + + RFX_CONTEXT* rfx; + NSC_CONTEXT* nsc; + H264_CONTEXT* h264; + CLEAR_CONTEXT* clear; + PROGRESSIVE_CONTEXT* progressive; + BITMAP_PLANAR_CONTEXT* planar; + BITMAP_INTERLEAVED_CONTEXT* interleaved; +}; + +FREERDP_API int freerdp_client_codecs_prepare(rdpCodecs* codecs, UINT32 flags); + +FREERDP_API rdpCodecs* codecs_new(rdpContext* context); +FREERDP_API void codecs_free(rdpCodecs* codecs); + +#endif /* FREERDP_CODECS_H */ + diff --git a/include/freerdp/freerdp.h b/include/freerdp/freerdp.h index b306fd0a6..bd5846db8 100644 --- a/include/freerdp/freerdp.h +++ b/include/freerdp/freerdp.h @@ -27,6 +27,7 @@ typedef struct rdp_cache rdpCache; typedef struct rdp_channels rdpChannels; typedef struct rdp_graphics rdpGraphics; typedef struct rdp_metrics rdpMetrics; +typedef struct rdp_codecs rdpCodecs; typedef struct rdp_freerdp freerdp; typedef struct rdp_context rdpContext; @@ -40,6 +41,7 @@ typedef RDP_CLIENT_ENTRY_POINTS_V1 RDP_CLIENT_ENTRY_POINTS; #include #include #include +#include #include #include 
#include @@ -120,7 +122,8 @@ struct rdp_context ALIGN64 rdpUpdate* update; /* 39 */ ALIGN64 rdpSettings* settings; /* 40 */ ALIGN64 rdpMetrics* metrics; /* 41 */ - UINT64 paddingC[64 - 42]; /* 42 */ + ALIGN64 rdpCodecs* codecs; /* 42 */ + UINT64 paddingC[64 - 43]; /* 43 */ UINT64 paddingD[96 - 64]; /* 64 */ UINT64 paddingE[128 - 96]; /* 96 */ diff --git a/include/freerdp/gdi/gdi.h b/include/freerdp/gdi/gdi.h index 9352278bd..8d574b815 100644 --- a/include/freerdp/gdi/gdi.h +++ b/include/freerdp/gdi/gdi.h @@ -279,6 +279,7 @@ struct rdp_gdi int cursor_x; int cursor_y; int bytesPerPixel; + rdpCodecs* codecs; HGDI_DC hdc; HCLRCONV clrconv; @@ -286,8 +287,6 @@ struct rdp_gdi gdiBitmap* drawing; BYTE* primary_buffer; GDI_COLOR textColor; - void* rfx_context; - void* nsc_context; gdiBitmap* tile; gdiBitmap* image; }; diff --git a/include/freerdp/types.h b/include/freerdp/types.h index 3d26e0bf9..a2ccb9c01 100644 --- a/include/freerdp/types.h +++ b/include/freerdp/types.h @@ -32,6 +32,21 @@ #define MAX(x,y) (((x) > (y)) ? 
(x) : (y)) #endif +struct _PALETTE_ENTRY +{ + BYTE red; + BYTE green; + BYTE blue; +}; +typedef struct _PALETTE_ENTRY PALETTE_ENTRY; + +struct rdp_palette +{ + UINT32 count; + PALETTE_ENTRY entries[256]; +}; +typedef struct rdp_palette rdpPalette; + #include struct _RDP_PLUGIN_DATA diff --git a/include/freerdp/update.h b/include/freerdp/update.h index b311e07e5..8428ab6a9 100644 --- a/include/freerdp/update.h +++ b/include/freerdp/update.h @@ -73,14 +73,6 @@ typedef struct _BITMAP_UPDATE BITMAP_UPDATE; /* Palette Updates */ -struct _PALETTE_ENTRY -{ - BYTE red; - BYTE green; - BYTE blue; -}; -typedef struct _PALETTE_ENTRY PALETTE_ENTRY; - struct _PALETTE_UPDATE { UINT32 number; @@ -88,13 +80,6 @@ struct _PALETTE_UPDATE }; typedef struct _PALETTE_UPDATE PALETTE_UPDATE; -struct rdp_palette -{ - UINT32 count; - PALETTE_ENTRY entries[256]; -}; -typedef struct rdp_palette rdpPalette; - /* Play Sound (System Beep) Updates */ struct _PLAY_SOUND_UPDATE diff --git a/libfreerdp/codec/CMakeLists.txt b/libfreerdp/codec/CMakeLists.txt index 75999d262..bab5714f6 100644 --- a/libfreerdp/codec/CMakeLists.txt +++ b/libfreerdp/codec/CMakeLists.txt @@ -23,10 +23,9 @@ set(${MODULE_PREFIX}_SRCS color.c audio.c planar.c - planar.h + bitmap.c + interleaved.c progressive.c - bitmap_decode.c - bitmap_encode.c rfx_bitstream.h rfx_constants.h rfx_decode.c diff --git a/libfreerdp/codec/bitmap_encode.c b/libfreerdp/codec/bitmap.c similarity index 99% rename from libfreerdp/codec/bitmap_encode.c rename to libfreerdp/codec/bitmap.c index 9db6f1a14..ccb104ed2 100644 --- a/libfreerdp/codec/bitmap_encode.c +++ b/libfreerdp/codec/bitmap.c @@ -22,6 +22,7 @@ #endif #include +#include #define GETPIXEL16(d, x, y, w) (*(((unsigned short*)d) + ((y) * (w) + (x)))) #define GETPIXEL32(d, x, y, w) (*(((unsigned int*)d) + ((y) * (w) + (x)))) diff --git a/libfreerdp/codec/bitmap_decode.c b/libfreerdp/codec/interleaved.c similarity index 71% rename from libfreerdp/codec/bitmap_decode.c rename to 
libfreerdp/codec/interleaved.c index ee6e672e6..68a224b90 100644 --- a/libfreerdp/codec/bitmap_decode.c +++ b/libfreerdp/codec/interleaved.c @@ -1,8 +1,8 @@ /** * FreeRDP: A Remote Desktop Protocol Implementation - * Bitmap Decompression + * Interleaved RLE Bitmap Codec * - * Copyright 2011 Jay Sorg + * Copyright 2014 Marc-Andre Moreau * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,14 +21,7 @@ #include "config.h" #endif -#include -#include - -#include "planar.h" - -#include - -#include +#include /* RLE Compressed Bitmap Stream (RLE_BITMAP_STREAM) @@ -242,57 +235,104 @@ static INLINE UINT32 ExtractRunLength(UINT32 code, BYTE* pbOrderHdr, UINT32* adv #define RLEEXTRA #include "include/bitmap.c" -/** - * bitmap decompression routine - */ -BOOL bitmap_decompress(BYTE* srcData, BYTE* dstData, int width, int height, int size, int srcBpp, int dstBpp) +int interleaved_decompress(BITMAP_INTERLEAVED_CONTEXT* interleaved, BYTE* pSrcData, UINT32 SrcSize, int bpp, + BYTE** ppDstData, DWORD DstFormat, int nDstStep, int nXDst, int nYDst, int nWidth, int nHeight) { - int status; - BYTE* TmpBfr; + BOOL vFlip; + int scanline; BYTE* pDstData; + UINT32 BufferSize; + int dstBitsPerPixel; + int dstBytesPerPixel; - if (srcBpp == 16 && dstBpp == 16) - { - TmpBfr = (BYTE*) _aligned_malloc(width * height * 2, 16); - RleDecompress16to16(srcData, size, TmpBfr, width * 2, width, height); - freerdp_bitmap_flip(TmpBfr, dstData, width * 2, height); - _aligned_free(TmpBfr); - } - else if (srcBpp == 32 && dstBpp == 32) - { - pDstData = dstData; + pDstData = *ppDstData; + dstBitsPerPixel = FREERDP_PIXEL_FORMAT_DEPTH(DstFormat); + dstBytesPerPixel = (FREERDP_PIXEL_FORMAT_BPP(DstFormat) / 8); + vFlip = FREERDP_PIXEL_FORMAT_FLIP(DstFormat) ? 
TRUE : FALSE; - status = planar_decompress(NULL, srcData, size, &pDstData, - PIXEL_FORMAT_XRGB32_VF, width * 4, 0, 0, width, height); + if (!interleaved) + return -1; - if (status < 0) - return FALSE; - } - else if (srcBpp == 15 && dstBpp == 15) + if (bpp == 24) { - TmpBfr = (BYTE*) _aligned_malloc(width * height * 2, 16); - RleDecompress16to16(srcData, size, TmpBfr, width * 2, width, height); - freerdp_bitmap_flip(TmpBfr, dstData, width * 2, height); - _aligned_free(TmpBfr); + scanline = nWidth * 3; + BufferSize = scanline * nHeight; + + if (BufferSize > interleaved->FlipSize) + { + interleaved->FlipBuffer = _aligned_realloc(interleaved->FlipBuffer, BufferSize, 16); + interleaved->FlipSize = BufferSize; + } + + if (!interleaved->FlipBuffer) + return -1; + + RleDecompress24to24(pSrcData, SrcSize, interleaved->FlipBuffer, scanline, nWidth, nHeight); + freerdp_bitmap_flip(interleaved->FlipBuffer, pDstData, scanline, nHeight); } - else if (srcBpp == 8 && dstBpp == 8) + else if ((bpp == 16) || (bpp == 15)) { - TmpBfr = (BYTE*) _aligned_malloc(width * height, 16); - RleDecompress8to8(srcData, size, TmpBfr, width, width, height); - freerdp_bitmap_flip(TmpBfr, dstData, width, height); - _aligned_free(TmpBfr); + scanline = nWidth * 2; + BufferSize = scanline * nHeight; + + if (BufferSize > interleaved->FlipSize) + { + interleaved->FlipBuffer = _aligned_realloc(interleaved->FlipBuffer, BufferSize, 16); + interleaved->FlipSize = BufferSize; + } + + if (!interleaved->FlipBuffer) + return -1; + + RleDecompress16to16(pSrcData, SrcSize, interleaved->FlipBuffer, scanline, nWidth, nHeight); + freerdp_bitmap_flip(interleaved->FlipBuffer, pDstData, scanline, nHeight); } - else if (srcBpp == 24 && dstBpp == 24) + else if (bpp == 8) { - TmpBfr = (BYTE*) _aligned_malloc(width * height * 3, 16); - RleDecompress24to24(srcData, size, TmpBfr, width * 3, width, height); - freerdp_bitmap_flip(TmpBfr, dstData, width * 3, height); - _aligned_free(TmpBfr); + scanline = nWidth; + BufferSize = 
scanline * nHeight; + + if (BufferSize > interleaved->FlipSize) + { + interleaved->FlipBuffer = _aligned_realloc(interleaved->FlipBuffer, BufferSize, 16); + interleaved->FlipSize = BufferSize; + } + + if (!interleaved->FlipBuffer) + return -1; + + RleDecompress8to8(pSrcData, SrcSize, interleaved->FlipBuffer, scanline, nWidth, nHeight); + freerdp_bitmap_flip(interleaved->FlipBuffer, pDstData, scanline, nHeight); } else { - return FALSE; + return -1; } - return TRUE; + return 1; +} + +BITMAP_INTERLEAVED_CONTEXT* bitmap_interleaved_context_new(BOOL Compressor) +{ + BITMAP_INTERLEAVED_CONTEXT* interleaved; + + interleaved = (BITMAP_INTERLEAVED_CONTEXT*) calloc(1, sizeof(BITMAP_INTERLEAVED_CONTEXT*)); + + if (interleaved) + { + interleaved->FlipSize = 64 * 64 * 3; + interleaved->FlipBuffer = _aligned_malloc(interleaved->FlipSize, 16); + } + + return interleaved; +} + +void bitmap_interleaved_context_free(BITMAP_INTERLEAVED_CONTEXT* interleaved) +{ + if (!interleaved) + return; + + _aligned_free(interleaved->FlipBuffer); + + free(interleaved); } diff --git a/libfreerdp/codec/planar.c b/libfreerdp/codec/planar.c index 37ce3ed7e..7c08cc0eb 100644 --- a/libfreerdp/codec/planar.c +++ b/libfreerdp/codec/planar.c @@ -27,8 +27,7 @@ #include #include #include - -#include "planar.h" +#include static int planar_skip_plane_rle(const BYTE* pSrcData, UINT32 SrcSize, int nWidth, int nHeight) { diff --git a/libfreerdp/codec/test/TestFreeRDPCodecPlanar.c b/libfreerdp/codec/test/TestFreeRDPCodecPlanar.c index 976b655a6..8d10e9cca 100644 --- a/libfreerdp/codec/test/TestFreeRDPCodecPlanar.c +++ b/libfreerdp/codec/test/TestFreeRDPCodecPlanar.c @@ -5,6 +5,7 @@ #include #include #include +#include /** * Experimental Case 01: 64x64 (32bpp) @@ -2864,16 +2865,6 @@ const BYTE TEST_RDP6_SCANLINES_DELTA_2C_ENCODED_UNSIGNED[3][6] = { 0x01, 0x67, 0x8B, 0xA3, 0x78, 0xAF } }; -#include "../planar.h" - -static unsigned long next = 1; - -static int simple_rand(void) -{ - next = next * 1103515245 + 12345; 
- return ((unsigned int) (next / 65536) % 32768); -} - static void fill_bitmap_alpha_channel(BYTE* data, int width, int height, BYTE value) { int i, j; @@ -3095,9 +3086,10 @@ int test_individual_planes_encoding_rle() int TestFreeRDPCodecPlanar(int argc, char* argv[]) { - int i, j; + int i; int dstSize; UINT32 format; + BYTE* pDstData; HCLRCONV clrconv; DWORD planarFlags; BYTE* srcBitmap32; @@ -3105,7 +3097,6 @@ int TestFreeRDPCodecPlanar(int argc, char* argv[]) int width, height; BYTE* blackBitmap; BYTE* whiteBitmap; - BYTE* randomBitmap; BYTE* compressedBitmap; BYTE* decompressedBitmap; BITMAP_PLANAR_CONTEXT* planar; @@ -3147,7 +3138,10 @@ int TestFreeRDPCodecPlanar(int argc, char* argv[]) decompressedBitmap = (BYTE*) malloc(width * height * 4); ZeroMemory(decompressedBitmap, width * height * 4); - if (!bitmap_decompress(compressedBitmap, decompressedBitmap, width, height, dstSize, 32, 32)) + pDstData = decompressedBitmap; + + if (planar_decompress(planar, compressedBitmap, dstSize, &pDstData, + PIXEL_FORMAT_XRGB32, width * 4, 0, 0, width, height) < 0) { printf("failed to decompress white bitmap: width: %d height: %d\n", width, height); return -1; @@ -3187,7 +3181,10 @@ int TestFreeRDPCodecPlanar(int argc, char* argv[]) decompressedBitmap = (BYTE*) malloc(width * height * 4); ZeroMemory(decompressedBitmap, width * height * 4); - if (!bitmap_decompress(compressedBitmap, decompressedBitmap, width, height, dstSize, 32, 32)) + pDstData = decompressedBitmap; + + if (planar_decompress(planar, compressedBitmap, dstSize, &pDstData, + PIXEL_FORMAT_XRGB32, width * 4, 0, 0, width, height) < 0) { printf("failed to decompress black bitmap: width: %d height: %d\n", width, height); return -1; @@ -3213,50 +3210,7 @@ int TestFreeRDPCodecPlanar(int argc, char* argv[]) free(decompressedBitmap); } - for (i = 4; i < 64; i += 4) - { - width = i; - height = i; - - randomBitmap = (BYTE*) malloc(width * height * 4); - - for (j = 0; j < width * height * 4; j++) - { - randomBitmap[j] = 
(BYTE) (simple_rand() % 256); - } - - fill_bitmap_alpha_channel(randomBitmap, width, height, 0x00); - - compressedBitmap = freerdp_bitmap_compress_planar(planar, randomBitmap, format, width, height, width * 4, NULL, &dstSize); - - decompressedBitmap = (BYTE*) malloc(width * height * 4); - ZeroMemory(decompressedBitmap, width * height * 4); - - if (!bitmap_decompress(compressedBitmap, decompressedBitmap, width, height, dstSize, 32, 32)) - { - printf("failed to decompress random bitmap: width: %d height: %d\n", width, height); - return -1; - } - else - { - printf("success decompressing random bitmap: width: %d height: %d\n", width, height); - } - - if (memcmp(decompressedBitmap, randomBitmap, width * height * 4) != 0) - { - printf("random bitmap\n"); - winpr_HexDump(randomBitmap, width * height * 4); - - printf("decompressed bitmap\n"); - winpr_HexDump(decompressedBitmap, width * height * 4); - - printf("error decompressed random bitmap corrupted: width: %d height: %d\n", width, height); - return -1; - } - - free(compressedBitmap); - free(decompressedBitmap); - } + return 0; /* Experimental Case 01 */ @@ -3269,7 +3223,10 @@ int TestFreeRDPCodecPlanar(int argc, char* argv[]) decompressedBitmap = (BYTE*) malloc(width * height * 4); ZeroMemory(decompressedBitmap, width * height * 4); - if (!bitmap_decompress(compressedBitmap, decompressedBitmap, width, height, dstSize, 32, 32)) + pDstData = decompressedBitmap; + + if (planar_decompress(planar, compressedBitmap, dstSize, &pDstData, + PIXEL_FORMAT_XRGB32, width * 4, 0, 0, width, height) < 0) { printf("failed to decompress experimental bitmap 01: width: %d height: %d\n", width, height); return -1; @@ -3310,7 +3267,10 @@ int TestFreeRDPCodecPlanar(int argc, char* argv[]) decompressedBitmap = (BYTE*) malloc(width * height * 4); ZeroMemory(decompressedBitmap, width * height * 4); - if (!bitmap_decompress(compressedBitmap, decompressedBitmap, width, height, dstSize, 32, 32)) + pDstData = decompressedBitmap; + + if 
(planar_decompress(planar, compressedBitmap, dstSize, &pDstData, + PIXEL_FORMAT_XRGB32, width * 4, 0, 0, width, height) < 0) { printf("failed to decompress experimental bitmap 02: width: %d height: %d\n", width, height); return -1; @@ -3357,7 +3317,10 @@ int TestFreeRDPCodecPlanar(int argc, char* argv[]) decompressedBitmap = (BYTE*) malloc(width * height * 4); ZeroMemory(decompressedBitmap, width * height * 4); - if (!bitmap_decompress(compressedBitmap, decompressedBitmap, width, height, dstSize, 32, 32)) + pDstData = decompressedBitmap; + + if (planar_decompress(planar, compressedBitmap, dstSize, &pDstData, + PIXEL_FORMAT_XRGB32, width * 4, 0, 0, width, height) < 0) { printf("failed to decompress experimental bitmap 03: width: %d height: %d\n", width, height); return -1; diff --git a/libfreerdp/core/CMakeLists.txt b/libfreerdp/core/CMakeLists.txt index 6504599d8..20961a213 100644 --- a/libfreerdp/core/CMakeLists.txt +++ b/libfreerdp/core/CMakeLists.txt @@ -79,6 +79,7 @@ set(${MODULE_PREFIX}_SRCS client.h server.c server.h + codecs.c metrics.c capabilities.c capabilities.h diff --git a/libfreerdp/core/codecs.c b/libfreerdp/core/codecs.c new file mode 100644 index 000000000..7aaf1367a --- /dev/null +++ b/libfreerdp/core/codecs.c @@ -0,0 +1,157 @@ +/** + * FreeRDP: A Remote Desktop Protocol Implementation + * RDP Codecs + * + * Copyright 2014 Marc-Andre Moreau + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "rdp.h" + +#include + +int freerdp_client_codecs_prepare(rdpCodecs* codecs, UINT32 flags) +{ + if (flags & FREERDP_CODEC_INTERLEAVED) + { + if (!codecs->interleaved) + { + codecs->interleaved = bitmap_interleaved_context_new(FALSE); + } + } + + if (flags & FREERDP_CODEC_PLANAR) + { + if (!codecs->planar) + { + codecs->planar = freerdp_bitmap_planar_context_new(FALSE, 64, 64); + } + } + + if (flags & FREERDP_CODEC_NSCODEC) + { + if (!codecs->nsc) + { + codecs->nsc = nsc_context_new(); + } + } + + if (flags & FREERDP_CODEC_REMOTEFX) + { + if (!codecs->rfx) + { + codecs->rfx = rfx_context_new(FALSE); + } + } + + if (flags & FREERDP_CODEC_CLEARCODEC) + { + if (!codecs->clear) + { + codecs->clear = clear_context_new(FALSE); + } + } + + if (flags & FREERDP_CODEC_ALPHACODEC) + { + + } + + if (flags & FREERDP_CODEC_PROGRESSIVE) + { + if (!codecs->progressive) + { + codecs->progressive = progressive_context_new(FALSE); + } + } + + if (flags & FREERDP_CODEC_H264) + { + if (!codecs->h264) + { + codecs->h264 = h264_context_new(FALSE); + } + } + + return 1; +} + +rdpCodecs* codecs_new(rdpContext* context) +{ + rdpCodecs* codecs; + + codecs = (rdpCodecs*) calloc(1, sizeof(rdpCodecs)); + + if (codecs) + { + codecs->context = context; + } + + return codecs; +} + +void codecs_free(rdpCodecs* codecs) +{ + if (!codecs) + return; + + if (codecs->rfx) + { + rfx_context_free(codecs->rfx); + codecs->rfx = NULL; + } + + if (codecs->nsc) + { + nsc_context_free(codecs->nsc); + codecs->nsc = NULL; + } + + if (codecs->h264) + { + h264_context_free(codecs->h264); + codecs->h264 = NULL; + } + + if (codecs->clear) + { + clear_context_free(codecs->clear); + codecs->clear = NULL; + } + + if (codecs->progressive) + { + progressive_context_free(codecs->progressive); + codecs->progressive = NULL; + } + + if (codecs->planar) + { + freerdp_bitmap_planar_context_free(codecs->planar); + codecs->planar = NULL; + } + + if 
(codecs->interleaved) + { + bitmap_interleaved_context_free(codecs->interleaved); + codecs->interleaved = NULL; + } + + free(codecs); +} + diff --git a/libfreerdp/core/freerdp.c b/libfreerdp/core/freerdp.c index eeec2a7ae..65a201e2b 100644 --- a/libfreerdp/core/freerdp.c +++ b/libfreerdp/core/freerdp.c @@ -410,6 +410,7 @@ int freerdp_context_new(freerdp* instance) PubSub_AddEventTypes(context->pubSub, FreeRDP_Events, sizeof(FreeRDP_Events) / sizeof(wEventType)); context->metrics = metrics_new(context); + context->codecs = codecs_new(context); rdp = rdp_new(context); instance->input = rdp->input; @@ -465,6 +466,7 @@ void freerdp_context_free(freerdp* instance) PubSub_Free(instance->context->pubSub); metrics_free(instance->context->metrics); + codecs_free(instance->context->codecs); free(instance->context); instance->context = NULL; diff --git a/libfreerdp/gdi/gdi.c b/libfreerdp/gdi/gdi.c index 140bad8e4..24f96a924 100644 --- a/libfreerdp/gdi/gdi.c +++ b/libfreerdp/gdi/gdi.c @@ -793,55 +793,49 @@ void gdi_surface_frame_marker(rdpContext* context, SURFACE_FRAME_MARKER* surface int tilenum = 0; -void gdi_surface_bits(rdpContext* context, SURFACE_BITS_COMMAND* surface_bits_command) +void gdi_surface_bits(rdpContext* context, SURFACE_BITS_COMMAND* cmd) { int i, j; int tx, ty; char* tile_bitmap; RFX_MESSAGE* message; rdpGdi* gdi = context->gdi; - RFX_CONTEXT* rfx_context = (RFX_CONTEXT*) gdi->rfx_context; - NSC_CONTEXT* nsc_context = (NSC_CONTEXT*) gdi->nsc_context; DEBUG_GDI("destLeft %d destTop %d destRight %d destBottom %d " "bpp %d codecID %d width %d height %d length %d", - surface_bits_command->destLeft, surface_bits_command->destTop, - surface_bits_command->destRight, surface_bits_command->destBottom, - surface_bits_command->bpp, surface_bits_command->codecID, - surface_bits_command->width, surface_bits_command->height, - surface_bits_command->bitmapDataLength); + cmd->destLeft, cmd->destTop, + cmd->destRight, cmd->destBottom, + cmd->bpp, cmd->codecID, + 
cmd->width, cmd->height, + cmd->bitmapDataLength); tile_bitmap = (char*) _aligned_malloc(32, 16); if (!tile_bitmap) return; - if (surface_bits_command->codecID == RDP_CODEC_ID_REMOTEFX) + if (cmd->codecID == RDP_CODEC_ID_REMOTEFX) { - message = rfx_process_message(rfx_context, - surface_bits_command->bitmapData, surface_bits_command->bitmapDataLength); + freerdp_client_codecs_prepare(gdi->codecs, FREERDP_CODEC_REMOTEFX); + + message = rfx_process_message(gdi->codecs->rfx, + cmd->bitmapData, cmd->bitmapDataLength); DEBUG_GDI("num_rects %d num_tiles %d", message->numRects, message->numTiles); /* blit each tile */ for (i = 0; i < message->numTiles; i++) { - tx = message->tiles[i]->x + surface_bits_command->destLeft; - ty = message->tiles[i]->y + surface_bits_command->destTop; + tx = message->tiles[i]->x + cmd->destLeft; + ty = message->tiles[i]->y + cmd->destTop; freerdp_image_convert(message->tiles[i]->data, gdi->tile->bitmap->data, 64, 64, 32, 32, gdi->clrconv); -#ifdef DUMP_REMOTEFX_TILES - sprintf(tile_bitmap, "/tmp/rfx/tile_%d.bmp", tilenum++); - winpr_bitmap_write(tile_bitmap, gdi->tile->bitmap->data, 64, 64, 32); -#endif - - for (j = 0; j < message->numRects; j++) { gdi_SetClipRgn(gdi->primary->hdc, - surface_bits_command->destLeft + message->rects[j].x, - surface_bits_command->destTop + message->rects[j].y, + cmd->destLeft + message->rects[j].x, + cmd->destTop + message->rects[j].y, message->rects[j].width, message->rects[j].height); gdi_BitBlt(gdi->primary->hdc, tx, ty, 64, 64, gdi->tile->hdc, 0, 0, GDI_SRCCOPY); @@ -849,43 +843,45 @@ void gdi_surface_bits(rdpContext* context, SURFACE_BITS_COMMAND* surface_bits_co } gdi_SetNullClipRgn(gdi->primary->hdc); - rfx_message_free(rfx_context, message); + rfx_message_free(gdi->codecs->rfx, message); } - else if (surface_bits_command->codecID == RDP_CODEC_ID_NSCODEC) + else if (cmd->codecID == RDP_CODEC_ID_NSCODEC) { - nsc_process_message(nsc_context, surface_bits_command->bpp, surface_bits_command->width, 
surface_bits_command->height, - surface_bits_command->bitmapData, surface_bits_command->bitmapDataLength); - gdi->image->bitmap->width = surface_bits_command->width; - gdi->image->bitmap->height = surface_bits_command->height; - gdi->image->bitmap->bitsPerPixel = surface_bits_command->bpp; + freerdp_client_codecs_prepare(gdi->codecs, FREERDP_CODEC_NSCODEC); + + nsc_process_message(gdi->codecs->nsc, cmd->bpp, cmd->width, cmd->height, + cmd->bitmapData, cmd->bitmapDataLength); + gdi->image->bitmap->width = cmd->width; + gdi->image->bitmap->height = cmd->height; + gdi->image->bitmap->bitsPerPixel = cmd->bpp; gdi->image->bitmap->bytesPerPixel = gdi->image->bitmap->bitsPerPixel / 8; gdi->image->bitmap->data = (BYTE*) _aligned_realloc(gdi->image->bitmap->data, gdi->image->bitmap->width * gdi->image->bitmap->height * 4, 16); - freerdp_image_convert(nsc_context->BitmapData, gdi->image->bitmap->data, - surface_bits_command->width, surface_bits_command->height, - surface_bits_command->bpp, gdi->dstBpp, gdi->clrconv); + freerdp_image_convert(gdi->codecs->nsc->BitmapData, gdi->image->bitmap->data, + cmd->width, cmd->height, + cmd->bpp, gdi->dstBpp, gdi->clrconv); freerdp_image_flip(gdi->image->bitmap->data, gdi->image->bitmap->data, gdi->image->bitmap->width, gdi->image->bitmap->height, gdi->dstBpp); - gdi_BitBlt(gdi->primary->hdc, surface_bits_command->destLeft, surface_bits_command->destTop, surface_bits_command->width, surface_bits_command->height, gdi->image->hdc, 0, 0, GDI_SRCCOPY); + gdi_BitBlt(gdi->primary->hdc, cmd->destLeft, cmd->destTop, cmd->width, cmd->height, gdi->image->hdc, 0, 0, GDI_SRCCOPY); } - else if (surface_bits_command->codecID == RDP_CODEC_ID_NONE) + else if (cmd->codecID == RDP_CODEC_ID_NONE) { - gdi->image->bitmap->width = surface_bits_command->width; - gdi->image->bitmap->height = surface_bits_command->height; - gdi->image->bitmap->bitsPerPixel = surface_bits_command->bpp; + gdi->image->bitmap->width = cmd->width; + gdi->image->bitmap->height = 
cmd->height; + gdi->image->bitmap->bitsPerPixel = cmd->bpp; gdi->image->bitmap->bytesPerPixel = gdi->image->bitmap->bitsPerPixel / 8; gdi->image->bitmap->data = (BYTE*) _aligned_realloc(gdi->image->bitmap->data, gdi->image->bitmap->width * gdi->image->bitmap->height * 4, 16); - if ((surface_bits_command->bpp != 32) || (gdi->clrconv->alpha == TRUE)) + if ((cmd->bpp != 32) || (gdi->clrconv->alpha)) { BYTE* temp_image; - freerdp_image_convert(surface_bits_command->bitmapData, gdi->image->bitmap->data, + freerdp_image_convert(cmd->bitmapData, gdi->image->bitmap->data, gdi->image->bitmap->width, gdi->image->bitmap->height, gdi->image->bitmap->bitsPerPixel, 32, gdi->clrconv); - surface_bits_command->bpp = 32; - surface_bits_command->bitmapData = gdi->image->bitmap->data; + cmd->bpp = 32; + cmd->bitmapData = gdi->image->bitmap->data; temp_image = (BYTE*) _aligned_malloc(gdi->image->bitmap->width * gdi->image->bitmap->height * 4, 16); freerdp_image_flip(gdi->image->bitmap->data, temp_image, gdi->image->bitmap->width, gdi->image->bitmap->height, 32); @@ -894,16 +890,16 @@ void gdi_surface_bits(rdpContext* context, SURFACE_BITS_COMMAND* surface_bits_co } else { - freerdp_image_flip(surface_bits_command->bitmapData, gdi->image->bitmap->data, + freerdp_image_flip(cmd->bitmapData, gdi->image->bitmap->data, gdi->image->bitmap->width, gdi->image->bitmap->height, 32); } - gdi_BitBlt(gdi->primary->hdc, surface_bits_command->destLeft, surface_bits_command->destTop, - surface_bits_command->width, surface_bits_command->height, gdi->image->hdc, 0, 0, GDI_SRCCOPY); + gdi_BitBlt(gdi->primary->hdc, cmd->destLeft, cmd->destTop, + cmd->width, cmd->height, gdi->image->hdc, 0, 0, GDI_SRCCOPY); } else { - DEBUG_WARN( "Unsupported codecID %d\n", surface_bits_command->codecID); + DEBUG_WARN( "Unsupported codecID %d\n", cmd->codecID); } if (tile_bitmap) @@ -1020,6 +1016,7 @@ int gdi_init(freerdp* instance, UINT32 flags, BYTE* buffer) instance->context->gdi = gdi; cache = instance->context->cache; 
+ gdi->codecs = instance->context->codecs; gdi->width = instance->settings->DesktopWidth; gdi->height = instance->settings->DesktopHeight; gdi->srcBpp = instance->settings->ColorDepth; @@ -1103,9 +1100,6 @@ int gdi_init(freerdp* instance, UINT32 flags, BYTE* buffer) gdi_register_graphics(instance->context->graphics); - gdi->rfx_context = rfx_context_new(FALSE); - gdi->nsc_context = nsc_context_new(); - return 0; } @@ -1119,8 +1113,6 @@ void gdi_free(freerdp* instance) gdi_bitmap_free_ex(gdi->tile); gdi_bitmap_free_ex(gdi->image); gdi_DeleteDC(gdi->hdc); - rfx_context_free((RFX_CONTEXT*) gdi->rfx_context); - nsc_context_free((NSC_CONTEXT*) gdi->nsc_context); free(gdi->clrconv->palette); free(gdi->clrconv); free(gdi); diff --git a/libfreerdp/gdi/graphics.c b/libfreerdp/gdi/graphics.c index f68e62c11..2a8e0617c 100644 --- a/libfreerdp/gdi/graphics.c +++ b/libfreerdp/gdi/graphics.c @@ -98,7 +98,7 @@ void gdi_Bitmap_Decompress(rdpContext* context, rdpBitmap* bitmap, BYTE* data, int width, int height, int bpp, int length, BOOL compressed, int codecId) { - BOOL status; + int status; UINT16 size; BYTE* src; BYTE* dst; @@ -107,6 +107,8 @@ void gdi_Bitmap_Decompress(rdpContext* context, rdpBitmap* bitmap, rdpGdi* gdi; RFX_MESSAGE* msg; + gdi = context->gdi; + size = width * height * ((bpp + 7) / 8); if (!bitmap->data) @@ -117,15 +119,16 @@ void gdi_Bitmap_Decompress(rdpContext* context, rdpBitmap* bitmap, switch (codecId) { case RDP_CODEC_ID_NSCODEC: - gdi = context->gdi; - nsc_process_message(gdi->nsc_context, bpp, width, height, data, length); - freerdp_image_flip(((NSC_CONTEXT*) gdi->nsc_context)->BitmapData, bitmap->data, width, height, bpp); + freerdp_client_codecs_prepare(gdi->codecs, FREERDP_CODEC_NSCODEC); + nsc_process_message(gdi->codecs->nsc, bpp, width, height, data, length); + freerdp_image_flip(gdi->codecs->nsc->BitmapData, bitmap->data, width, height, bpp); break; case RDP_CODEC_ID_REMOTEFX: - gdi = context->gdi; - 
rfx_context_set_pixel_format(gdi->rfx_context, RDP_PIXEL_FORMAT_B8G8R8A8); - msg = rfx_process_message(gdi->rfx_context, data, length); + freerdp_client_codecs_prepare(gdi->codecs, FREERDP_CODEC_REMOTEFX); + rfx_context_set_pixel_format(gdi->codecs->rfx, RDP_PIXEL_FORMAT_B8G8R8A8); + msg = rfx_process_message(gdi->codecs->rfx, data, length); + if (!msg) { DEBUG_WARN( "gdi_Bitmap_Decompress: rfx Decompression Failed\n"); @@ -136,6 +139,7 @@ void gdi_Bitmap_Decompress(rdpContext* context, rdpBitmap* bitmap, { src = msg->tiles[0]->data + yindex * 64 * 4; dst = bitmap->data + yindex * width * 3; + for (xindex = 0; xindex < width; xindex++) { *(dst++) = *(src++); @@ -144,7 +148,7 @@ void gdi_Bitmap_Decompress(rdpContext* context, rdpBitmap* bitmap, src++; } } - rfx_message_free(gdi->rfx_context, msg); + rfx_message_free(gdi->codecs->rfx, msg); } break; case RDP_CODEC_ID_JPEG: @@ -158,11 +162,35 @@ void gdi_Bitmap_Decompress(rdpContext* context, rdpBitmap* bitmap, default: if (compressed) { - status = bitmap_decompress(data, bitmap->data, width, height, length, bpp, bpp); + BYTE* pDstData; + UINT32 SrcSize; - if (!status) + SrcSize = (UINT32) length; + pDstData = bitmap->data; + + if (bpp < 32) { - DEBUG_WARN( "gdi_Bitmap_Decompress: Bitmap Decompression Failed\n"); + freerdp_client_codecs_prepare(gdi->codecs, FREERDP_CODEC_INTERLEAVED); + + status = interleaved_decompress(gdi->codecs->interleaved, data, SrcSize, bpp, + &pDstData, PIXEL_FORMAT_XRGB32_VF, width * 4, 0, 0, width, height); + + if (status < 0) + { + DEBUG_WARN("gdi_Bitmap_Decompress: Bitmap Decompression Failed\n"); + } + } + else + { + freerdp_client_codecs_prepare(gdi->codecs, FREERDP_CODEC_PLANAR); + + status = planar_decompress(gdi->codecs->planar, data, SrcSize, &pDstData, + PIXEL_FORMAT_XRGB32_VF, width * 4, 0, 0, width, height); + + if (status < 0) + { + DEBUG_WARN("gdi_Bitmap_Decompress: Bitmap Decompression Failed\n"); + } } } else diff --git a/libfreerdp/primitives/test/prim_test.h 
b/libfreerdp/primitives/test/prim_test.h index 37db6a9b6..e535b4710 100644 --- a/libfreerdp/primitives/test/prim_test.h +++ b/libfreerdp/primitives/test/prim_test.h @@ -112,7 +112,7 @@ extern int test_or_32u_speed(void); int size = size_array[s]; \ _prework_; \ iter = iterations/size; \ - sprintf_s(label, "%s-%-4d", oplabel, size); \ + sprintf(label, "%s-%-4d", oplabel, size); \ MEASURE_TIMED(label, iter, test_time, resultNormal[s], \ _funcNormal_); \ } \ @@ -128,7 +128,7 @@ extern int test_or_32u_speed(void); int size = size_array[s]; \ _prework_; \ iter = iterations/size; \ - sprintf_s(label, "%s-%s-%-4d", SIMD_TYPE, oplabel, size); \ + sprintf(label, "%s-%s-%-4d", SIMD_TYPE, oplabel, size); \ MEASURE_TIMED(label, iter, test_time, resultOpt[s], \ _funcOpt_); \ } \ @@ -147,7 +147,7 @@ extern int test_or_32u_speed(void); int size = size_array[s]; \ _prework_; \ iter = iterations/size; \ - sprintf_s(label, "IPP-%s-%-4d", oplabel, size); \ + sprintf(label, "IPP-%s-%-4d", oplabel, size); \ MEASURE_TIMED(label, iter, test_time, resultIPP[s], \ _funcIPP_); \ } \ @@ -218,7 +218,7 @@ static void _name_( \ _floatprint(resultOpt[s], sSN); \ if (resultNormal[s] > 0.0) \ { \ - sprintf_s(sSNp, "%d%%", \ + sprintf(sSNp, "%d%%", \ (int) (resultOpt[s] / resultNormal[s] * 100.0 + 0.5)); \ } \ } \ @@ -227,7 +227,7 @@ static void _name_( \ _floatprint(resultIPP[s], sIPP); \ if (resultNormal[s] > 0.0) \ { \ - sprintf_s(sIPPp, "%d%%", \ + sprintf(sIPPp, "%d%%", \ (int) (resultIPP[s] / resultNormal[s] * 100.0 + 0.5)); \ } \ } \ From 89e5fef11f807976da88ce74483d4395dd0b082c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Moreau?= Date: Wed, 10 Sep 2014 11:38:38 -0400 Subject: [PATCH 31/31] wfreerdp: fix build on Windows --- client/Windows/wf_gdi.c | 13 ++++++------- client/Windows/wf_graphics.c | 2 +- client/Windows/wf_interface.c | 6 ------ client/Windows/wf_interface.h | 2 -- include/freerdp/codec/interleaved.h | 2 +- libfreerdp/codec/interleaved.c | 2 +- 
libfreerdp/primitives/CMakeLists.txt | 7 +------ 7 files changed, 10 insertions(+), 24 deletions(-) diff --git a/client/Windows/wf_gdi.c b/client/Windows/wf_gdi.c index 183401632..d02c36970 100644 --- a/client/Windows/wf_gdi.c +++ b/client/Windows/wf_gdi.c @@ -568,15 +568,13 @@ void wf_gdi_surface_bits(wfContext* wfc, SURFACE_BITS_COMMAND* surface_bits_comm RFX_MESSAGE* message; BITMAPINFO bitmap_info; - RFX_CONTEXT* rfx_context = (RFX_CONTEXT*) wfc->rfx_context; - NSC_CONTEXT* nsc_context = (NSC_CONTEXT*) wfc->nsc_context; - tile_bitmap = (char*) malloc(32); ZeroMemory(tile_bitmap, 32); if (surface_bits_command->codecID == RDP_CODEC_ID_REMOTEFX) { - message = rfx_process_message(rfx_context, surface_bits_command->bitmapData, surface_bits_command->bitmapDataLength); + freerdp_client_codecs_prepare(wfc->codecs, FREERDP_CODEC_REMOTEFX); + message = rfx_process_message(wfc->codecs->rfx, surface_bits_command->bitmapData, surface_bits_command->bitmapDataLength); /* blit each tile */ for (i = 0; i < message->numTiles; i++) @@ -607,11 +605,12 @@ void wf_gdi_surface_bits(wfContext* wfc, SURFACE_BITS_COMMAND* surface_bits_comm wf_invalidate_region(wfc, tx, ty, message->rects[i].width, message->rects[i].height); } - rfx_message_free(rfx_context, message); + rfx_message_free(wfc->codecs->rfx, message); } else if (surface_bits_command->codecID == RDP_CODEC_ID_NSCODEC) { - nsc_process_message(nsc_context, surface_bits_command->bpp, surface_bits_command->width, surface_bits_command->height, + freerdp_client_codecs_prepare(wfc->codecs, FREERDP_CODEC_NSCODEC); + nsc_process_message(wfc->codecs->nsc, surface_bits_command->bpp, surface_bits_command->width, surface_bits_command->height, surface_bits_command->bitmapData, surface_bits_command->bitmapDataLength); ZeroMemory(&bitmap_info, sizeof(bitmap_info)); bitmap_info.bmiHeader.biSize = sizeof(BITMAPINFOHEADER); @@ -622,7 +621,7 @@ void wf_gdi_surface_bits(wfContext* wfc, SURFACE_BITS_COMMAND* surface_bits_comm 
bitmap_info.bmiHeader.biCompression = BI_RGB; SetDIBitsToDevice(wfc->primary->hdc, surface_bits_command->destLeft, surface_bits_command->destTop, surface_bits_command->width, surface_bits_command->height, 0, 0, 0, surface_bits_command->height, - nsc_context->BitmapData, &bitmap_info, DIB_RGB_COLORS); + wfc->codecs->nsc->BitmapData, &bitmap_info, DIB_RGB_COLORS); wf_invalidate_region(wfc, surface_bits_command->destLeft, surface_bits_command->destTop, surface_bits_command->width, surface_bits_command->height); } diff --git a/client/Windows/wf_graphics.c b/client/Windows/wf_graphics.c index e0adc70ac..039f1975d 100644 --- a/client/Windows/wf_graphics.c +++ b/client/Windows/wf_graphics.c @@ -23,7 +23,7 @@ #include -#include +#include #include "wf_gdi.h" #include "wf_graphics.h" diff --git a/client/Windows/wf_interface.c b/client/Windows/wf_interface.c index 8821ae37d..6ed2df709 100644 --- a/client/Windows/wf_interface.c +++ b/client/Windows/wf_interface.c @@ -376,12 +376,6 @@ BOOL wf_post_connect(freerdp* instance) if (settings->RemoteFxCodec) { wfc->tile = wf_image_new(wfc, 64, 64, 32, NULL); - wfc->rfx_context = rfx_context_new(FALSE); - } - - if (settings->NSCodec) - { - wfc->nsc_context = nsc_context_new(); } } diff --git a/client/Windows/wf_interface.h b/client/Windows/wf_interface.h index ff291e0f8..feb0846f7 100644 --- a/client/Windows/wf_interface.h +++ b/client/Windows/wf_interface.h @@ -111,8 +111,6 @@ struct wf_context wfBitmap* tile; DWORD mainThreadId; DWORD keyboardThreadId; - RFX_CONTEXT* rfx_context; - NSC_CONTEXT* nsc_context; BOOL sw_gdi; diff --git a/include/freerdp/codec/interleaved.h b/include/freerdp/codec/interleaved.h index 5f6662b6a..d46abac8e 100644 --- a/include/freerdp/codec/interleaved.h +++ b/include/freerdp/codec/interleaved.h @@ -36,7 +36,7 @@ struct _BITMAP_INTERLEAVED_CONTEXT BYTE* FlipBuffer; }; -int interleaved_decompress(BITMAP_INTERLEAVED_CONTEXT* interleaved, BYTE* pSrcData, UINT32 SrcSize, int bpp, +FREERDP_API int 
interleaved_decompress(BITMAP_INTERLEAVED_CONTEXT* interleaved, BYTE* pSrcData, UINT32 SrcSize, int bpp, BYTE** ppDstData, DWORD DstFormat, int nDstStep, int nXDst, int nYDst, int nWidth, int nHeight); FREERDP_API BITMAP_INTERLEAVED_CONTEXT* bitmap_interleaved_context_new(BOOL Compressor); diff --git a/libfreerdp/codec/interleaved.c b/libfreerdp/codec/interleaved.c index 68a224b90..05525156f 100644 --- a/libfreerdp/codec/interleaved.c +++ b/libfreerdp/codec/interleaved.c @@ -316,7 +316,7 @@ BITMAP_INTERLEAVED_CONTEXT* bitmap_interleaved_context_new(BOOL Compressor) { BITMAP_INTERLEAVED_CONTEXT* interleaved; - interleaved = (BITMAP_INTERLEAVED_CONTEXT*) calloc(1, sizeof(BITMAP_INTERLEAVED_CONTEXT*)); + interleaved = (BITMAP_INTERLEAVED_CONTEXT*) calloc(1, sizeof(BITMAP_INTERLEAVED_CONTEXT)); if (interleaved) { diff --git a/libfreerdp/primitives/CMakeLists.txt b/libfreerdp/primitives/CMakeLists.txt index 8830e76b8..0cf492670 100644 --- a/libfreerdp/primitives/CMakeLists.txt +++ b/libfreerdp/primitives/CMakeLists.txt @@ -60,7 +60,7 @@ if(WITH_SSE2) endif() if(MSVC) - set(OPTIMIZATION "${OPTIMIZATION} /arch:SSE2 /O2") + set(OPTIMIZATION "${OPTIMIZATION} /arch:SSE2") endif() elseif(WITH_NEON) if(CMAKE_COMPILER_IS_GNUCC) @@ -76,11 +76,6 @@ if(CMAKE_COMPILER_IS_GNUCC) set_property(SOURCE ${${MODULE_PREFIX}_SRCS} PROPERTY COMPILE_FLAGS "-O2") endif() -if(MSVC) - set_property(SOURCE ${${MODULE_PREFIX}_SRCS} PROPERTY COMPILE_FLAGS "/O2") -endif() - - set(${MODULE_PREFIX}_SRCS ${${MODULE_PREFIX}_SRCS} ${${MODULE_PREFIX}_OPT_SRCS}) add_complex_library(MODULE ${MODULE_NAME} TYPE "OBJECT"