opencl: inline the openCL program in the source code

This commit is contained in:
David Fort
2019-11-13 08:28:14 +01:00
committed by akallabeth
parent 00e9efd189
commit 6123920a2e
8 changed files with 101 additions and 128 deletions

View File

@@ -144,7 +144,6 @@ typedef pstatus_t (*__andC_32u_t)(const UINT32* pSrc, UINT32 val, UINT32* pDst,
typedef pstatus_t (*__orC_32u_t)(const UINT32* pSrc, UINT32 val, UINT32* pDst, INT32 len);
typedef pstatus_t (*primitives_uninit_t)(void);
typedef struct
{
/* Memory-to-memory copy routines */
@@ -208,11 +207,10 @@ extern "C"
FREERDP_API void primitives_set_hints(primitive_hints hints);
FREERDP_API primitive_hints primitives_get_hints(void);
FREERDP_API primitives_t* primitives_get_generic(void);
FREERDP_API DWORD primitives_flags(primitives_t *p);
FREERDP_API BOOL primitives_init(primitives_t *p, primitive_hints hints);
FREERDP_API DWORD primitives_flags(primitives_t* p);
FREERDP_API BOOL primitives_init(primitives_t* p, primitive_hints hints);
FREERDP_API void primitives_uninit();
#ifdef __cplusplus
}
#endif

View File

@@ -293,13 +293,11 @@ if (WITH_NEON)
endif()
if (WITH_OPENCL)
freerdp_definition_add(-DOPENCL_SOURCE_PATH="${CMAKE_INSTALL_PREFIX}/${FREERDP_PLUGIN_PATH}")
set(PRIMITIVES_OPENCL_SRCS primitives/prim_YUV_opencl.c)
freerdp_include_directory_add(${OpenCL_INCLUDE_DIRS})
freerdp_library_add(OpenCL::OpenCL)
install(FILES primitives/primitives.cl DESTINATION ${FREERDP_PLUGIN_PATH})
endif()
set(PRIMITIVES_OPT_SRCS

View File

@@ -567,8 +567,8 @@ static void dump_window_state_order(wLog* log, const char* msg, const WINDOW_ORD
if (order->fieldFlags & WINDOW_ORDER_FIELD_TITLE)
DUMP_APPEND(buffer, bufferSize, " title");
if (order->fieldFlags & WINDOW_ORDER_FIELD_CLIENT_AREA_OFFSET)
DUMP_APPEND(buffer, bufferSize, " clientOffset=(%"PRId32",%"PRId32")",
state->clientOffsetX, state->clientOffsetY);
DUMP_APPEND(buffer, bufferSize, " clientOffset=(%" PRId32 ",%" PRId32 ")",
state->clientOffsetX, state->clientOffsetY);
if (order->fieldFlags & WINDOW_ORDER_FIELD_CLIENT_AREA_SIZE)
DUMP_APPEND(buffer, bufferSize, " clientAreaWidth=%" PRIu32 " clientAreaHeight=%" PRIu32 "",
state->clientAreaWidth, state->clientAreaHeight);
@@ -585,10 +585,11 @@ static void dump_window_state_order(wLog* log, const char* msg, const WINDOW_ORD
if (order->fieldFlags & WINDOW_ORDER_FIELD_ROOT_PARENT)
DUMP_APPEND(buffer, bufferSize, " rootParent=0x%" PRIx32 "", state->rootParentHandle);
if (order->fieldFlags & WINDOW_ORDER_FIELD_WND_OFFSET)
DUMP_APPEND(buffer, bufferSize, " windowOffset=(%"PRId32",%"PRId32")", state->windowOffsetX, state->windowOffsetY);
DUMP_APPEND(buffer, bufferSize, " windowOffset=(%" PRId32 ",%" PRId32 ")",
state->windowOffsetX, state->windowOffsetY);
if (order->fieldFlags & WINDOW_ORDER_FIELD_WND_CLIENT_DELTA)
DUMP_APPEND(buffer, bufferSize, " windowClientDelta=(%"PRId32",%"PRId32")",
state->windowClientDeltaX, state->windowClientDeltaY);
DUMP_APPEND(buffer, bufferSize, " windowClientDelta=(%" PRId32 ",%" PRId32 ")",
state->windowClientDeltaX, state->windowClientDeltaY);
if (order->fieldFlags & WINDOW_ORDER_FIELD_WND_SIZE)
DUMP_APPEND(buffer, bufferSize, " windowWidth=%" PRIu32 " windowHeight=%" PRIu32 "",
state->windowWidth, state->windowHeight);
@@ -607,8 +608,8 @@ static void dump_window_state_order(wLog* log, const char* msg, const WINDOW_ORD
}
if (order->fieldFlags & WINDOW_ORDER_FIELD_VIS_OFFSET)
DUMP_APPEND(buffer, bufferSize, " visibleOffset=(%"PRId32",%"PRId32")", state->visibleOffsetX,
state->visibleOffsetY);
DUMP_APPEND(buffer, bufferSize, " visibleOffset=(%" PRId32 ",%" PRId32 ")",
state->visibleOffsetX, state->visibleOffsetY);
if (order->fieldFlags & WINDOW_ORDER_FIELD_VISIBILITY)
{

View File

@@ -54,12 +54,13 @@ static pstatus_t opencl_YUV420ToRGB(const char* kernelName, const BYTE* pSrc[3],
{
cl_int ret;
int i;
cl_mem objs[3] = {NULL, NULL, NULL};
cl_mem objs[3] = { NULL, NULL, NULL };
cl_mem destObj;
cl_kernel kernel;
cl_event events[3];
size_t indexes[2];
const char *sourceNames[] = {"Y", "U", "V"};
primitives_opencl_context *cl = primitives_get_opencl_context();
const char* sourceNames[] = { "Y", "U", "V" };
primitives_opencl_context* cl = primitives_get_opencl_context();
kernel = clCreateKernel(cl->program, kernelName, &ret);
if (ret != CL_SUCCESS)
@@ -70,15 +71,16 @@ static pstatus_t opencl_YUV420ToRGB(const char* kernelName, const BYTE* pSrc[3],
for (i = 0; i < 3; i++)
{
objs[i] = clCreateBuffer(cl->context, CL_MEM_READ_ONLY, srcStep[i] * roi->height, NULL, &ret);
objs[i] =
clCreateBuffer(cl->context, CL_MEM_READ_ONLY, srcStep[i] * roi->height, NULL, &ret);
if (ret != CL_SUCCESS)
{
WLog_ERR(TAG, "unable to create %sobj", sourceNames[i]);
goto error_objs;
}
ret = clEnqueueWriteBuffer(cl->commandQueue, objs[i], CL_TRUE, 0, srcStep[i] * roi->height,
pSrc[i], 0, NULL, NULL);
ret = clEnqueueWriteBuffer(cl->commandQueue, objs[i], CL_FALSE, 0, srcStep[i] * roi->height,
pSrc[i], 0, NULL, &events[i]);
if (ret != CL_SUCCESS)
{
WLog_ERR(TAG, "unable to enqueue write command for %sobj", sourceNames[i]);
@@ -96,14 +98,14 @@ static pstatus_t opencl_YUV420ToRGB(const char* kernelName, const BYTE* pSrc[3],
/* push source + stride arguments*/
for (i = 0; i < 3; i++)
{
ret = clSetKernelArg(kernel, i * 2, sizeof(cl_mem), (void *)&objs[i]);
ret = clSetKernelArg(kernel, i * 2, sizeof(cl_mem), (void*)&objs[i]);
if (ret != CL_SUCCESS)
{
WLog_ERR(TAG, "unable to set arg for %sobj", sourceNames[i]);
goto error_set_args;
}
ret = clSetKernelArg(kernel, i * 2 + 1, sizeof(cl_int), (void *)&srcStep[i]);
ret = clSetKernelArg(kernel, i * 2 + 1, sizeof(cl_int), (void*)&srcStep[i]);
if (ret != CL_SUCCESS)
{
WLog_ERR(TAG, "unable to set arg stride for %sobj", sourceNames[i]);
@@ -111,14 +113,14 @@ static pstatus_t opencl_YUV420ToRGB(const char* kernelName, const BYTE* pSrc[3],
}
}
ret = clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *)&destObj);
ret = clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*)&destObj);
if (ret != CL_SUCCESS)
{
WLog_ERR(TAG, "unable to set arg destObj");
goto error_set_args;
}
ret = clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&dstStep);
ret = clSetKernelArg(kernel, 7, sizeof(cl_int), (void*)&dstStep);
if (ret != CL_SUCCESS)
{
WLog_ERR(TAG, "unable to set arg dstStep");
@@ -127,8 +129,7 @@ static pstatus_t opencl_YUV420ToRGB(const char* kernelName, const BYTE* pSrc[3],
indexes[0] = roi->width;
indexes[1] = roi->height;
ret = clEnqueueNDRangeKernel(cl->commandQueue, kernel, 2, NULL, indexes, NULL,
0, NULL, NULL);
ret = clEnqueueNDRangeKernel(cl->commandQueue, kernel, 2, NULL, indexes, NULL, 3, events, NULL);
if (ret != CL_SUCCESS)
{
WLog_ERR(TAG, "unable to enqueue call kernel");
@@ -136,7 +137,8 @@ static pstatus_t opencl_YUV420ToRGB(const char* kernelName, const BYTE* pSrc[3],
}
/* Transfer result to host */
ret = clEnqueueReadBuffer(cl->commandQueue, destObj, CL_TRUE, 0, roi->height * dstStep, pDst, 0, NULL, NULL);
ret = clEnqueueReadBuffer(cl->commandQueue, destObj, CL_TRUE, 0, roi->height * dstStep, pDst, 0,
NULL, NULL);
if (ret != CL_SUCCESS)
{
WLog_ERR(TAG, "unable to read back buffer");
@@ -184,18 +186,19 @@ pstatus_t primitives_uninit_opencl(void)
return PRIMITIVES_SUCCESS;
}
static const char* openclProgram =
#include "primitives.cl"
;
BOOL primitives_init_opencl_context(primitives_opencl_context* cl)
{
cl_platform_id* platform_ids = NULL;
cl_uint ndevices, nplatforms, i;
cl_kernel kernel;
cl_int ret;
char sourcePath[1000];
BOOL gotGPU = FALSE;
FILE* f;
size_t programLen;
char* programSource;
ret = clGetPlatformIDs(0, NULL, &nplatforms);
if (ret != CL_SUCCESS || nplatforms < 1)
@@ -270,45 +273,14 @@ BOOL primitives_init_opencl_context(primitives_opencl_context* cl)
return FALSE;
}
snprintf(sourcePath, sizeof(sourcePath), "%s/primitives.cl", OPENCL_SOURCE_PATH);
f = fopen(sourcePath, "r");
if (!f)
{
WLog_ERR(TAG, "openCL: unable to open source file %s", sourcePath);
goto error_source_file;
}
fseek(f, 0, SEEK_END);
programLen = ftell(f);
fseek(f, 0, SEEK_SET);
programSource = malloc(programLen);
if (!programSource)
{
WLog_ERR(TAG, "openCL: unable to allocate memory(%d bytes) for source file %s", programLen,
sourcePath);
fclose(f);
goto error_source_file;
}
if (fread(programSource, programLen, 1, f) <= 0)
{
WLog_ERR(TAG, "openCL: unable to read openCL program in %s", sourcePath);
free(programSource);
fclose(f);
goto error_source_file;
}
fclose(f);
programLen = strlen(openclProgram);
cl->program =
clCreateProgramWithSource(cl->context, 1, (const char**)&programSource, &programLen, &ret);
clCreateProgramWithSource(cl->context, 1, (const char**)&openclProgram, &programLen, &ret);
if (ret != CL_SUCCESS)
{
WLog_ERR(TAG, "openCL: unable to create command queue");
WLog_ERR(TAG, "openCL: unable to create program");
goto out_program_create;
}
free(programSource);
ret = clBuildProgram(cl->program, 1, &cl->deviceId, NULL, NULL, NULL);
if (ret != CL_SUCCESS)
@@ -343,7 +315,6 @@ BOOL primitives_init_opencl_context(primitives_opencl_context* cl)
out_program_build:
clReleaseProgram(cl->program);
error_source_file:
out_program_create:
clReleaseCommandQueue(cl->commandQueue);
clReleaseContext(cl->context);
@@ -363,11 +334,12 @@ BOOL primitives_init_opencl(primitives_t* prims)
}
static pstatus_t opencl_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], const UINT32 srcStep[3],
BYTE* pDst, UINT32 dstStep, UINT32 DstFormat, const prim_size_t* roi)
BYTE* pDst, UINT32 dstStep, UINT32 DstFormat,
const prim_size_t* roi)
{
const char *kernel_name;
const char* kernel_name;
switch(DstFormat)
switch (DstFormat)
{
case PIXEL_FORMAT_BGRA32:
case PIXEL_FORMAT_BGRX32:
@@ -394,7 +366,4 @@ static pstatus_t opencl_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], const UINT32
void primitives_init_YUV_opencl(primitives_t* prims)
{
prims->YUV420ToRGB_8u_P3AC4R = opencl_YUV420ToRGB_8u_P3AC4R;
}

View File

@@ -1448,7 +1448,6 @@ static pstatus_t ssse3_YUV420CombineToYUV444(avc444_frame_type type, const BYTE*
}
}
void primitives_init_YUV_opt(primitives_t* prims)
{
generic = primitives_get_generic();

View File

@@ -40,7 +40,6 @@
#define HAVE_CPU_OPTIMIZED_PRIMITIVES 1
#endif
#if defined(WITH_SSE2)
/* Use lddqu for unaligned; load for 16-byte aligned. */
#define LOAD_SI128(_ptr_) \

View File

@@ -46,7 +46,6 @@ primitive_hints primitives_get_hints(void)
return primitivesHints;
}
/* Singleton pointer used throughout the program when requested. */
static primitives_t pPrimitivesGeneric = { 0 };
static INIT_ONCE generic_primitives_InitOnce = INIT_ONCE_STATIC_INIT;
@@ -62,14 +61,12 @@ static INIT_ONCE gpu_primitives_InitOnce = INIT_ONCE_STATIC_INIT;
#endif
#if defined(HAVE_OPTIMIZED_PRIMITIVES)
static INIT_ONCE auto_primitives_InitOnce = INIT_ONCE_STATIC_INIT;
#endif
static primitives_t pPrimitives = { 0 };
/* ------------------------------------------------------------------------- */
static BOOL primitives_init_generic(primitives_t *prims)
static BOOL primitives_init_generic(primitives_t* prims)
{
primitives_init_add(prims);
primitives_init_andor(prims);
@@ -93,7 +90,7 @@ static BOOL CALLBACK primitives_init_generic_cb(PINIT_ONCE once, PVOID param, PV
return primitives_init_generic(&pPrimitivesGeneric);
}
static BOOL primitives_init_optimized(primitives_t *prims)
static BOOL primitives_init_optimized(primitives_t* prims)
{
primitives_init_generic(prims);
@@ -113,11 +110,12 @@ static BOOL primitives_init_optimized(primitives_t *prims)
return TRUE;
}
typedef struct {
BYTE *channels[3];
typedef struct
{
BYTE* channels[3];
UINT32 steps[3];
prim_size_t roi;
BYTE *outputBuffer;
BYTE* outputBuffer;
UINT32 outputStride;
UINT32 testedFormat;
} primitives_YUV_benchmark;
@@ -155,7 +153,7 @@ static primitives_YUV_benchmark* primitives_YUV_benchmark_init(primitives_YUV_be
for (i = 0; i < 3; i++)
{
BYTE *buf = ret->channels[i] = malloc(roi->width * roi->height);
BYTE* buf = ret->channels[i] = malloc(roi->width * roi->height);
if (!buf)
goto fail;
@@ -170,11 +168,11 @@ fail:
return ret;
}
static BOOL primitives_YUV_benchmark_run(primitives_YUV_benchmark *bench, primitives_t *prims,
UINT64 runTime, UINT32 *computations)
static BOOL primitives_YUV_benchmark_run(primitives_YUV_benchmark* bench, primitives_t* prims,
UINT64 runTime, UINT32* computations)
{
ULONGLONG dueDate = GetTickCount64() + runTime;
const BYTE *channels[3];
ULONGLONG dueDate;
const BYTE* channels[3];
int i;
*computations = 0;
@@ -182,10 +180,20 @@ static BOOL primitives_YUV_benchmark_run(primitives_YUV_benchmark *bench, primit
for (i = 0; i < 3; i++)
channels[i] = bench->channels[i];
/* do a first dry run to initialize cache and such */
pstatus_t status =
prims->YUV420ToRGB_8u_P3AC4R(channels, bench->steps, bench->outputBuffer,
bench->outputStride, bench->testedFormat, &bench->roi);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
/* let's run the benchmark */
dueDate = GetTickCount64() + runTime;
while (GetTickCount64() < dueDate)
{
pstatus_t status = prims->YUV420ToRGB_8u_P3AC4R(channels, bench->steps, bench->outputBuffer,
bench->outputStride, bench->testedFormat, &bench->roi);
pstatus_t status =
prims->YUV420ToRGB_8u_P3AC4R(channels, bench->steps, bench->outputBuffer,
bench->outputStride, bench->testedFormat, &bench->roi);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
*computations = *computations + 1;
@@ -193,10 +201,10 @@ static BOOL primitives_YUV_benchmark_run(primitives_YUV_benchmark *bench, primit
return TRUE;
}
static BOOL primitives_autodetect_best(primitives_t *prims)
static BOOL primitives_autodetect_best(primitives_t* prims)
{
BOOL ret = FALSE;
UINT64 benchDuration = 150; // 100 ms
UINT64 benchDuration = 150; /* 150 ms */
UINT32 genericCount = 0;
UINT32 bestCount;
primitives_t* genericPrims = primitives_get_generic();
@@ -258,25 +266,14 @@ static BOOL primitives_autodetect_best(primitives_t *prims)
}
#endif
WLog_DBG(TAG,
"benchmark result: generic=%" PRIu32
WLog_DBG(TAG, "primitives benchmark result:");
WLog_DBG(TAG, " * generic=%" PRIu32, genericCount);
#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES)
" optimized=%" PRIu32
WLog_DBG(TAG, " * optimized=%" PRIu32, optimizedCount);
#endif
#if defined(WITH_OPENCL)
" openCL=%" PRIu32
WLog_DBG(TAG, " * openCL=%" PRIu32, openclCount);
#endif
,
genericCount
#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES)
,
optimizedCount
#endif
#if defined(WITH_OPENCL)
,
openclCount
#endif
);
WLog_INFO(TAG, "primitives autodetect, using %s", primName);
ret = TRUE;
out:
@@ -291,6 +288,9 @@ static BOOL CALLBACK primitives_init_gpu_cb(PINIT_ONCE once, PVOID param, PVOID*
WINPR_UNUSED(param);
WINPR_UNUSED(context);
if (!primitives_init_optimized(&pPrimitivesGpu))
return FALSE;
if (!primitives_init_opencl(&pPrimitivesGpu))
return FALSE;
@@ -305,10 +305,8 @@ static BOOL CALLBACK primitives_init_cpu_cb(PINIT_ONCE once, PVOID param, PVOID*
WINPR_UNUSED(param);
WINPR_UNUSED(context);
if (!primitives_init_optimized(&pPrimitivesCpu))
return FALSE;
return TRUE;
return primitives_init_optimized(&pPrimitivesCpu);
}
#endif
@@ -323,22 +321,26 @@ static BOOL CALLBACK primitives_auto_init_cb(PINIT_ONCE once, PVOID param, PVOID
BOOL primitives_init(primitives_t* p, primitive_hints hints)
{
switch(hints)
switch (hints)
{
case PRIMITIVES_AUTODETECT:
return primitives_autodetect_best(p);
case PRIMITIVES_PURE_SOFT:
*p = pPrimitivesGeneric;
return TRUE;
#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES)
case PRIMITIVES_ONLY_CPU:
#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES)
*p = pPrimitivesCpu;
return TRUE;
#else
*p = pPrimitivesGeneric;
#endif
#if defined(WITH_OPENCL)
return TRUE;
case PRIMITIVES_ONLY_GPU:
#if defined(WITH_OPENCL)
*p = pPrimitivesGpu;
return TRUE;
#else
return FALSE;
#endif
default:
WLog_ERR(TAG, "unknown hint %d", hints);
@@ -367,7 +369,7 @@ static void setup(void)
#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES)
InitOnceExecuteOnce(&cpu_primitives_InitOnce, primitives_init_cpu_cb, NULL, NULL);
#endif
#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES)
#if defined(WITH_OPENCL)
InitOnceExecuteOnce(&gpu_primitives_InitOnce, primitives_init_gpu_cb, NULL, NULL);
#endif
InitOnceExecuteOnce(&auto_primitives_InitOnce, primitives_auto_init_cb, NULL, NULL);
@@ -388,18 +390,23 @@ primitives_t* primitives_get_generic(void)
primitives_t* primitives_get_by_type(DWORD type)
{
InitOnceExecuteOnce(&generic_primitives_InitOnce, primitives_init_generic_cb, NULL, NULL);
switch (type)
{
#if defined(WITH_OPENCL)
case PRIMITIVES_ONLY_GPU:
InitOnceExecuteOnce(&gpu_primitives_InitOnce, primitives_init_cpu_cb, NULL, NULL);
return &pPrimitivesGpu;
#if defined(WITH_OPENCL)
if (InitOnceExecuteOnce(&gpu_primitives_InitOnce, primitives_init_gpu_cb, NULL, NULL))
return &pPrimitivesGpu;
#endif
#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES)
return NULL;
case PRIMITIVES_ONLY_CPU:
InitOnceExecuteOnce(&cpu_primitives_InitOnce, primitives_init_cpu_cb, NULL, NULL);
return &pPrimitivesCpu;
#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES)
if (InitOnceExecuteOnce(&cpu_primitives_InitOnce, primitives_init_cpu_cb, NULL, NULL))
return &pPrimitivesCpu;
#endif
return NULL;
case PRIMITIVES_PURE_SOFT:
default:
return &pPrimitivesGeneric;

View File

@@ -1,22 +1,24 @@
/**
* FreeRDP: A Remote Desktop Protocol Implementation
* Optimized operations using openCL
* vi:ts=4 sw=4
*
* Copyright 2019 David Fort <contact@hardening-consulting.com>
* Copyright 2019 Rangee Gmbh
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#define STRINGIFY(x) #x
STRINGIFY(
unsigned char clamp_uc(int v, int l, int h)
{
if (v > h)
@@ -79,4 +81,4 @@ __kernel void yuv420_to_bgra_1b(
destPtr[2] = clamp_uc((y256 + (403 * V)) >> 8, 0, 255); /* R */
destPtr[3] = 0xff; /* A */
}
)