diff --git a/cunit/test_nsc.c b/cunit/test_nsc.c
index 4f26468d4..97ea25eb6 100644
--- a/cunit/test_nsc.c
+++ b/cunit/test_nsc.c
@@ -363,7 +363,12 @@ void test_nsc_encode(void)
 
 	enc_stream = stream_new(65536);
 	stream_clear(enc_stream);
-	nsc_compose_message(context, enc_stream, rgb_data, 64, 64, 64 * 3);
+
+	for (i = 0; i < 30000; i++)
+	{
+		stream_set_pos(enc_stream, 0);
+		nsc_compose_message(context, enc_stream, rgb_data, 64, 64, 64 * 3);
+	}
 	/*freerdp_hexdump(stream_get_head(enc_stream), stream_get_length(enc_stream));*/
 	nsc_process_message(context, 32, 64, 64, stream_get_head(enc_stream), stream_get_length(enc_stream));
 	/*freerdp_hexdump(context->bmpdata, 64 * 64 * 4);*/
diff --git a/libfreerdp-codec/CMakeLists.txt b/libfreerdp-codec/CMakeLists.txt
index 5ed1b0c79..4d0fbda5b 100644
--- a/libfreerdp-codec/CMakeLists.txt
+++ b/libfreerdp-codec/CMakeLists.txt
@@ -48,6 +48,8 @@ if(WITH_SSE2)
 	set(FREERDP_CODEC_SRCS ${FREERDP_CODEC_SRCS}
 		rfx_sse2.c
 		rfx_sse2.h
+		nsc_sse2.c
+		nsc_sse2.h
 	)
-	set_property(SOURCE rfx_sse2.c PROPERTY COMPILE_FLAGS "-msse2")
+	set_property(SOURCE rfx_sse2.c nsc_sse2.c PROPERTY COMPILE_FLAGS "-msse2")
 endif()
diff --git a/libfreerdp-codec/nsc.c b/libfreerdp-codec/nsc.c
index ad3a453af..69db2e2ac 100644
--- a/libfreerdp-codec/nsc.c
+++ b/libfreerdp-codec/nsc.c
@@ -28,6 +28,10 @@
 #include "nsc_types.h"
 #include "nsc_encode.h"
 
+#ifdef WITH_SSE2
+#include "nsc_sse2.h"
+#endif
+
 #ifndef NSC_INIT_SIMD
 #define NSC_INIT_SIMD(_nsc_context) do { } while (0)
 #endif
diff --git a/libfreerdp-codec/nsc_sse2.c b/libfreerdp-codec/nsc_sse2.c
new file mode 100644
index 000000000..1dc198228
--- /dev/null
+++ b/libfreerdp-codec/nsc_sse2.c
@@ -0,0 +1,389 @@
+/**
+ * FreeRDP: A Remote Desktop Protocol client.
+ * NSCodec Library - SSE2 Optimizations
+ *
+ * Copyright 2012 Vic Lee
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <xmmintrin.h>
+#include <emmintrin.h>
+
+#include "nsc_types.h"
+#include "nsc_sse2.h"
+
+/**
+ * Convert the source bitmap into the four 8-bit planes of the encoder:
+ * plane_buf[0] = Y, plane_buf[1] = Co, plane_buf[2] = Cg, plane_buf[3] = A.
+ * Eight pixels are converted per inner-loop iteration and the source rows
+ * are read bottom-up.
+ *
+ * NOTE(review): every iteration reads eight source pixels, so a width that
+ * is not a multiple of 8 reads past the end of the source row; confirm
+ * that callers provide enough padding after bmpdata.
+ *
+ * @param context   encoder context (width, height, pixel_format, palette,
+ *                  nsc_stream levels and the destination plane buffers)
+ * @param bmpdata   source pixels laid out as context->pixel_format
+ * @param rowstride distance in bytes between two consecutive source rows
+ */
+static void nsc_encode_argb_to_aycocg_sse2(NSC_CONTEXT* context, uint8* bmpdata, int rowstride)
+{
+	uint16 x;
+	uint16 y;
+	uint16 rw;
+	uint8 ccl;
+	uint8* src;
+	uint8* yplane;
+	uint8* coplane;
+	uint8* cgplane;
+	uint8* aplane;
+	__m128i r_val;
+	__m128i g_val;
+	__m128i b_val;
+	__m128i a_val;
+	__m128i y_val;
+	__m128i co_val;
+	__m128i cg_val;
+	uint32 tempWidth;
+
+	tempWidth = ROUND_UP_TO(context->width, 8);
+	/* Y/Co/Cg rows are tempWidth bytes apart only when chroma subsampling is enabled. */
+	rw = (context->nsc_stream.ChromaSubSamplingLevel > 0 ? tempWidth : context->width);
+	ccl = context->nsc_stream.ColorLossLevel;
+	yplane = context->priv->plane_buf[0];
+	coplane = context->priv->plane_buf[1];
+	cgplane = context->priv->plane_buf[2];
+	aplane = context->priv->plane_buf[3];
+
+	for (y = 0; y < context->height; y++)
+	{
+		src = bmpdata + (context->height - 1 - y) * rowstride;
+		yplane = context->priv->plane_buf[0] + y * rw;
+		coplane = context->priv->plane_buf[1] + y * rw;
+		cgplane = context->priv->plane_buf[2] + y * rw;
+		aplane = context->priv->plane_buf[3] + y * context->width;
+		for (x = 0; x < context->width; x += 8)
+		{
+			switch (context->pixel_format)
+			{
+				case RDP_PIXEL_FORMAT_B8G8R8A8:
+					b_val = _mm_set_epi16(*(src + 28), *(src + 24), *(src + 20), *(src + 16), *(src + 12), *(src + 8), *(src + 4), *src);
+					g_val = _mm_set_epi16(*(src + 29), *(src + 25), *(src + 21), *(src + 17), *(src + 13), *(src + 9), *(src + 5), *(src + 1));
+					r_val = _mm_set_epi16(*(src + 30), *(src + 26), *(src + 22), *(src + 18), *(src + 14), *(src + 10), *(src + 6), *(src + 2));
+					a_val = _mm_set_epi16(*(src + 31), *(src + 27), *(src + 23), *(src + 19), *(src + 15), *(src + 11), *(src + 7), *(src + 3));
+					src += 32;
+					break;
+				case RDP_PIXEL_FORMAT_R8G8B8A8:
+					r_val = _mm_set_epi16(*(src + 28), *(src + 24), *(src + 20), *(src + 16), *(src + 12), *(src + 8), *(src + 4), *src);
+					g_val = _mm_set_epi16(*(src + 29), *(src + 25), *(src + 21), *(src + 17), *(src + 13), *(src + 9), *(src + 5), *(src + 1));
+					b_val = _mm_set_epi16(*(src + 30), *(src + 26), *(src + 22), *(src + 18), *(src + 14), *(src + 10), *(src + 6), *(src + 2));
+					a_val = _mm_set_epi16(*(src + 31), *(src + 27), *(src + 23), *(src + 19), *(src + 15), *(src + 11), *(src + 7), *(src + 3));
+					src += 32;
+					break;
+				case RDP_PIXEL_FORMAT_B8G8R8:
+					b_val = _mm_set_epi16(*(src + 21), *(src + 18), *(src + 15), *(src + 12), *(src + 9), *(src + 6), *(src + 3), *src);
+					g_val = _mm_set_epi16(*(src + 22), *(src + 19), *(src + 16), *(src + 13), *(src + 10), *(src + 7), *(src + 4), *(src + 1));
+					r_val = _mm_set_epi16(*(src + 23), *(src + 20), *(src + 17), *(src + 14), *(src + 11), *(src + 8), *(src + 5), *(src + 2));
+					a_val = _mm_set1_epi16(0xFF);
+					src += 24;
+					break;
+				case RDP_PIXEL_FORMAT_R8G8B8:
+					r_val = _mm_set_epi16(*(src + 21), *(src + 18), *(src + 15), *(src + 12), *(src + 9), *(src + 6), *(src + 3), *src);
+					g_val = _mm_set_epi16(*(src + 22), *(src + 19), *(src + 16), *(src + 13), *(src + 10), *(src + 7), *(src + 4), *(src + 1));
+					b_val = _mm_set_epi16(*(src + 23), *(src + 20), *(src + 17), *(src + 14), *(src + 11), *(src + 8), *(src + 5), *(src + 2));
+					a_val = _mm_set1_epi16(0xFF);
+					src += 24;
+					break;
+				case RDP_PIXEL_FORMAT_B5G6R5_LE:
+					b_val = _mm_set_epi16(
+						(((*(src + 15)) & 0xF8) | ((*(src + 15)) >> 5)),
+						(((*(src + 13)) & 0xF8) | ((*(src + 13)) >> 5)),
+						(((*(src + 11)) & 0xF8) | ((*(src + 11)) >> 5)),
+						(((*(src + 9)) & 0xF8) | ((*(src + 9)) >> 5)),
+						(((*(src + 7)) & 0xF8) | ((*(src + 7)) >> 5)),
+						(((*(src + 5)) & 0xF8) | ((*(src + 5)) >> 5)),
+						(((*(src + 3)) & 0xF8) | ((*(src + 3)) >> 5)),
+						(((*(src + 1)) & 0xF8) | ((*(src + 1)) >> 5)));
+					g_val = _mm_set_epi16(
+						((((*(src + 15)) & 0x07) << 5) | (((*(src + 14)) & 0xE0) >> 3)),
+						((((*(src + 13)) & 0x07) << 5) | (((*(src + 12)) & 0xE0) >> 3)),
+						((((*(src + 11)) & 0x07) << 5) | (((*(src + 10)) & 0xE0) >> 3)),
+						((((*(src + 9)) & 0x07) << 5) | (((*(src + 8)) & 0xE0) >> 3)),
+						((((*(src + 7)) & 0x07) << 5) | (((*(src + 6)) & 0xE0) >> 3)),
+						((((*(src + 5)) & 0x07) << 5) | (((*(src + 4)) & 0xE0) >> 3)),
+						((((*(src + 3)) & 0x07) << 5) | (((*(src + 2)) & 0xE0) >> 3)),
+						((((*(src + 1)) & 0x07) << 5) | (((*src) & 0xE0) >> 3)));
+					r_val = _mm_set_epi16(
+						((((*(src + 14)) & 0x1F) << 3) | (((*(src + 14)) >> 2) & 0x07)),
+						((((*(src + 12)) & 0x1F) << 3) | (((*(src + 12)) >> 2) & 0x07)),
+						((((*(src + 10)) & 0x1F) << 3) | (((*(src + 10)) >> 2) & 0x07)),
+						((((*(src + 8)) & 0x1F) << 3) | (((*(src + 8)) >> 2) & 0x07)),
+						((((*(src + 6)) & 0x1F) << 3) | (((*(src + 6)) >> 2) & 0x07)),
+						((((*(src + 4)) & 0x1F) << 3) | (((*(src + 4)) >> 2) & 0x07)),
+						((((*(src + 2)) & 0x1F) << 3) | (((*(src + 2)) >> 2) & 0x07)),
+						((((*src) & 0x1F) << 3) | (((*src) >> 2) & 0x07)));
+					a_val = _mm_set1_epi16(0xFF);
+					src += 16;
+					break;
+				case RDP_PIXEL_FORMAT_R5G6B5_LE:
+					r_val = _mm_set_epi16(
+						(((*(src + 15)) & 0xF8) | ((*(src + 15)) >> 5)),
+						(((*(src + 13)) & 0xF8) | ((*(src + 13)) >> 5)),
+						(((*(src + 11)) & 0xF8) | ((*(src + 11)) >> 5)),
+						(((*(src + 9)) & 0xF8) | ((*(src + 9)) >> 5)),
+						(((*(src + 7)) & 0xF8) | ((*(src + 7)) >> 5)),
+						(((*(src + 5)) & 0xF8) | ((*(src + 5)) >> 5)),
+						(((*(src + 3)) & 0xF8) | ((*(src + 3)) >> 5)),
+						(((*(src + 1)) & 0xF8) | ((*(src + 1)) >> 5)));
+					g_val = _mm_set_epi16(
+						((((*(src + 15)) & 0x07) << 5) | (((*(src + 14)) & 0xE0) >> 3)),
+						((((*(src + 13)) & 0x07) << 5) | (((*(src + 12)) & 0xE0) >> 3)),
+						((((*(src + 11)) & 0x07) << 5) | (((*(src + 10)) & 0xE0) >> 3)),
+						((((*(src + 9)) & 0x07) << 5) | (((*(src + 8)) & 0xE0) >> 3)),
+						((((*(src + 7)) & 0x07) << 5) | (((*(src + 6)) & 0xE0) >> 3)),
+						((((*(src + 5)) & 0x07) << 5) | (((*(src + 4)) & 0xE0) >> 3)),
+						((((*(src + 3)) & 0x07) << 5) | (((*(src + 2)) & 0xE0) >> 3)),
+						((((*(src + 1)) & 0x07) << 5) | (((*src) & 0xE0) >> 3)));
+					b_val = _mm_set_epi16(
+						((((*(src + 14)) & 0x1F) << 3) | (((*(src + 14)) >> 2) & 0x07)),
+						((((*(src + 12)) & 0x1F) << 3) | (((*(src + 12)) >> 2) & 0x07)),
+						((((*(src + 10)) & 0x1F) << 3) | (((*(src + 10)) >> 2) & 0x07)),
+						((((*(src + 8)) & 0x1F) << 3) | (((*(src + 8)) >> 2) & 0x07)),
+						((((*(src + 6)) & 0x1F) << 3) | (((*(src + 6)) >> 2) & 0x07)),
+						((((*(src + 4)) & 0x1F) << 3) | (((*(src + 4)) >> 2) & 0x07)),
+						((((*(src + 2)) & 0x1F) << 3) | (((*(src + 2)) >> 2) & 0x07)),
+						((((*src) & 0x1F) << 3) | (((*src) >> 2) & 0x07)));
+					a_val = _mm_set1_epi16(0xFF);
+					src += 16;
+					break;
+				case RDP_PIXEL_FORMAT_P4_PLANER:
+					{
+						int shift;
+						uint8 idx[8];
+
+						for (shift = 7; shift >= 0; shift--)
+						{
+							idx[shift] = ((*src) >> shift) & 1;
+							idx[shift] |= (((*(src + 1)) >> shift) & 1) << 1;
+							idx[shift] |= (((*(src + 2)) >> shift) & 1) << 2;
+							idx[shift] |= (((*(src + 3)) >> shift) & 1) << 3;
+							idx[shift] *= 3;
+						}
+						r_val = _mm_set_epi16(
+							context->palette[idx[0]],
+							context->palette[idx[1]],
+							context->palette[idx[2]],
+							context->palette[idx[3]],
+							context->palette[idx[4]],
+							context->palette[idx[5]],
+							context->palette[idx[6]],
+							context->palette[idx[7]]);
+						g_val = _mm_set_epi16(
+							context->palette[idx[0] + 1],
+							context->palette[idx[1] + 1],
+							context->palette[idx[2] + 1],
+							context->palette[idx[3] + 1],
+							context->palette[idx[4] + 1],
+							context->palette[idx[5] + 1],
+							context->palette[idx[6] + 1],
+							context->palette[idx[7] + 1]);
+						b_val = _mm_set_epi16(
+							context->palette[idx[0] + 2],
+							context->palette[idx[1] + 2],
+							context->palette[idx[2] + 2],
+							context->palette[idx[3] + 2],
+							context->palette[idx[4] + 2],
+							context->palette[idx[5] + 2],
+							context->palette[idx[6] + 2],
+							context->palette[idx[7] + 2]);
+						src += 4;
+					}
+					a_val = _mm_set1_epi16(0xFF);
+					break;
+				case RDP_PIXEL_FORMAT_P8:
+					{
+						r_val = _mm_set_epi16(
+							context->palette[(*(src + 7)) * 3],
+							context->palette[(*(src + 6)) * 3],
+							context->palette[(*(src + 5)) * 3],
+							context->palette[(*(src + 4)) * 3],
+							context->palette[(*(src + 3)) * 3],
+							context->palette[(*(src + 2)) * 3],
+							context->palette[(*(src + 1)) * 3],
+							context->palette[(*src) * 3]);
+						g_val = _mm_set_epi16(
+							context->palette[(*(src + 7)) * 3 + 1],
+							context->palette[(*(src + 6)) * 3 + 1],
+							context->palette[(*(src + 5)) * 3 + 1],
+							context->palette[(*(src + 4)) * 3 + 1],
+							context->palette[(*(src + 3)) * 3 + 1],
+							context->palette[(*(src + 2)) * 3 + 1],
+							context->palette[(*(src + 1)) * 3 + 1],
+							context->palette[(*src) * 3 + 1]);
+						b_val = _mm_set_epi16(
+							context->palette[(*(src + 7)) * 3 + 2],
+							context->palette[(*(src + 6)) * 3 + 2],
+							context->palette[(*(src + 5)) * 3 + 2],
+							context->palette[(*(src + 4)) * 3 + 2],
+							context->palette[(*(src + 3)) * 3 + 2],
+							context->palette[(*(src + 2)) * 3 + 2],
+							context->palette[(*(src + 1)) * 3 + 2],
+							context->palette[(*src) * 3 + 2]);
+						src += 8;
+					}
+					a_val = _mm_set1_epi16(0xFF);
+					break;
+				default:
+					r_val = g_val = b_val = a_val = _mm_set1_epi16(0);
+					break;
+			}
+
+			/* Y = R/4 + G/2 + B/4, Co = (R - B) >> ccl, Cg = (G - R/2 - B/2) >> ccl */
+			y_val = _mm_srai_epi16(r_val, 2);
+			y_val = _mm_add_epi16(y_val, _mm_srai_epi16(g_val, 1));
+			y_val = _mm_add_epi16(y_val, _mm_srai_epi16(b_val, 2));
+			co_val = _mm_sub_epi16(r_val, b_val);
+			co_val = _mm_srai_epi16(co_val, ccl);
+			cg_val = _mm_sub_epi16(g_val, _mm_srai_epi16(r_val, 1));
+			cg_val = _mm_sub_epi16(cg_val, _mm_srai_epi16(b_val, 1));
+			cg_val = _mm_srai_epi16(cg_val, ccl);
+
+			/* Each packed register holds 8 result bytes; store only those 8, not all 16. */
+			y_val = _mm_packus_epi16(y_val, y_val);
+			_mm_storel_epi64((__m128i*) yplane, y_val);
+			co_val = _mm_packs_epi16(co_val, co_val);
+			_mm_storel_epi64((__m128i*) coplane, co_val);
+			cg_val = _mm_packs_epi16(cg_val, cg_val);
+			_mm_storel_epi64((__m128i*) cgplane, cg_val);
+			a_val = _mm_packus_epi16(a_val, a_val);
+			_mm_storel_epi64((__m128i*) aplane, a_val);
+			yplane += 8;
+			coplane += 8;
+			cgplane += 8;
+			aplane += 8;
+		}
+		/* Odd width with subsampling: duplicate the last pixel of the row into the next column. */
+		if (context->nsc_stream.ChromaSubSamplingLevel > 0 && (context->width % 2) == 1)
+		{
+			context->priv->plane_buf[0][y * rw + context->width] = context->priv->plane_buf[0][y * rw + context->width - 1];
+			context->priv->plane_buf[1][y * rw + context->width] = context->priv->plane_buf[1][y * rw + context->width - 1];
+			context->priv->plane_buf[2][y * rw + context->width] = context->priv->plane_buf[2][y * rw + context->width - 1];
+		}
+	}
+	/*
+	 * Odd height with subsampling: y now equals context->height, so copy
+	 * the last converted row (y - 1) into the padding row y.
+	 */
+	if (context->nsc_stream.ChromaSubSamplingLevel > 0 && (y % 2) == 1)
+	{
+		memcpy(context->priv->plane_buf[0] + y * rw, context->priv->plane_buf[0] + (y - 1) * rw, rw);
+		memcpy(context->priv->plane_buf[1] + y * rw, context->priv->plane_buf[1] + (y - 1) * rw, rw);
+		memcpy(context->priv->plane_buf[2] + y * rw, context->priv->plane_buf[2] + (y - 1) * rw, rw);
+	}
+}
+
+/**
+ * Subsample the Co and Cg planes in place by two in each direction. Each
+ * output byte comes from a 2x2 block of input bytes, averaged vertically
+ * and then horizontally with the unsigned rounding of _mm_avg_epu8/epu16.
+ * Input rows are tempWidth bytes apart, output rows tempWidth / 2.
+ *
+ * @param context encoder context whose plane_buf[1] and plane_buf[2] are
+ *                read and overwritten
+ */
+static void nsc_encode_subsampling_sse2(NSC_CONTEXT* context)
+{
+	uint16 x;
+	uint16 y;
+	uint8* co_dst;
+	uint8* cg_dst;
+	sint8* co_src0;
+	sint8* co_src1;
+	sint8* cg_src0;
+	sint8* cg_src1;
+	uint32 tempWidth;
+	uint32 tempHeight;
+	__m128i t;
+	__m128i val;
+	__m128i mask = _mm_set1_epi16(0xFF);
+
+	tempWidth = ROUND_UP_TO(context->width, 8);
+	tempHeight = ROUND_UP_TO(context->height, 2);
+
+	for (y = 0; y < tempHeight >> 1; y++)
+	{
+		co_dst = context->priv->plane_buf[1] + y * (tempWidth >> 1);
+		cg_dst = context->priv->plane_buf[2] + y * (tempWidth >> 1);
+		co_src0 = (sint8*) context->priv->plane_buf[1] + (y << 1) * tempWidth;
+		co_src1 = co_src0 + tempWidth;
+		cg_src0 = (sint8*) context->priv->plane_buf[2] + (y << 1) * tempWidth;
+		cg_src1 = cg_src0 + tempWidth;
+		for (x = 0; x < tempWidth >> 1; x += 8)
+		{
+			/* Average the two rows, then adjacent byte pairs; 16 input bytes give 8 output bytes. */
+			t = _mm_loadu_si128((__m128i*) co_src0);
+			t = _mm_avg_epu8(t, _mm_loadu_si128((__m128i*) co_src1));
+			val = _mm_and_si128(_mm_srli_si128(t, 1), mask);
+			val = _mm_avg_epu16(val, _mm_and_si128(t, mask));
+			val = _mm_packus_epi16(val, val);
+			_mm_storel_epi64((__m128i*) co_dst, val);
+			co_dst += 8;
+			co_src0 += 16;
+			co_src1 += 16;
+
+			t = _mm_loadu_si128((__m128i*) cg_src0);
+			t = _mm_avg_epu8(t, _mm_loadu_si128((__m128i*) cg_src1));
+			val = _mm_and_si128(_mm_srli_si128(t, 1), mask);
+			val = _mm_avg_epu16(val, _mm_and_si128(t, mask));
+			val = _mm_packus_epi16(val, val);
+			_mm_storel_epi64((__m128i*) cg_dst, val);
+			cg_dst += 8;
+			cg_src0 += 16;
+			cg_src1 += 16;
+		}
+	}
+}
+
+/**
+ * SSE2 encoder: convert the bitmap into planes, then subsample the chroma
+ * planes when ChromaSubSamplingLevel is greater than zero.
+ *
+ * @param context   encoder context
+ * @param bmpdata   source pixels
+ * @param rowstride distance in bytes between two consecutive source rows
+ */
+static void nsc_encode_sse2(NSC_CONTEXT* context, uint8* bmpdata, int rowstride)
+{
+	nsc_encode_argb_to_aycocg_sse2(context, bmpdata, rowstride);
+	if (context->nsc_stream.ChromaSubSamplingLevel > 0)
+	{
+		nsc_encode_subsampling_sse2(context);
+	}
+}
+
+/**
+ * Select the SSE2 encoder for this context by setting context->encode.
+ *
+ * @param context NSC context to update
+ */
+void nsc_init_sse2(NSC_CONTEXT* context)
+{
+	IF_PROFILER(context->priv->prof_nsc_encode->name = "nsc_encode_sse2");
+
+	context->encode = nsc_encode_sse2;
+}
diff --git a/libfreerdp-codec/nsc_sse2.h b/libfreerdp-codec/nsc_sse2.h
new file mode 100644
index 000000000..1c0b632b4
--- /dev/null
+++ b/libfreerdp-codec/nsc_sse2.h
@@ -0,0 +1,32 @@
+/**
+ * FreeRDP: A Remote Desktop Protocol client.
+ * NSCodec Library - SSE2 Optimizations
+ *
+ * Copyright 2012 Vic Lee
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NSC_SSE2_H
+#define __NSC_SSE2_H
+
+#include <freerdp/codec/nsc.h>
+
+/* Install the SSE2 encode routine into the given context. */
+void nsc_init_sse2(NSC_CONTEXT* context);
+
+#ifndef NSC_INIT_SIMD
+#define NSC_INIT_SIMD(_context) nsc_init_sse2(_context)
+#endif
+
+#endif /* __NSC_SSE2_H */