/**
 *    @file gpu_opencl_example.c
 *    @version 1.0.0
 *  
 *    @brief This example shows a little OpenCL demo program that executes on the i.MX6Q GPU
 *  
 *    BLT_DISCLAIMER
 *  
 *    @author Harald Krapfenbauer
 */


#include <CL/cl.h>
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>


/* Private globals */

/* OpenCL C source of the GPU kernel, stored as one string per source line.
   gpuInit() passes this array to clCreateProgramWithSource() and then
   JIT-compiles it with clBuildProgram().

   None of the strings contains a newline character, so the kernel text
   must stay valid when the pieces are joined back to back (e.g. no
   "//" line comments inside the kernel).

   Despite its name, calcAverage does not average anything: each work-item
   compares one element of inAmp against threshold and writes 0xffff to
   the same position of outMask when the element is greater, 0x0 otherwise.
   The element index is derived from a 2-D global ID, with dimension 0
   being the fastest-varying one.
*/
static const char* gpuKernelCode[] = {
	"kernel void calcAverage(global ushort* inAmp,", /* Argument nr. 0: input values */
	"                        global ushort* outMask,", /* Argument nr. 1: resulting mask */
	"                        const ushort threshold)", /* Argument nr. 2: comparison value */
	"{",
	/* Linear index: row (dimension 1) times row length plus column (dimension 0) */
	"    size_t idx = get_global_id(1) * get_global_size(0) + get_global_id(0);",
	"    if (inAmp[idx] > threshold) {",
	"        outMask[idx] = 0xffff;",
	"    } else {",
	"        outMask[idx] = 0x0;",
	"    }",
	"}",
};
/* OpenCL handles shared by the functions in this file.
   All four are assigned in gpuInit() and released in gpuExit(). */
static cl_context gpuContext;       /* context the buffers and the program live in */
static cl_command_queue gpuCQ;      /* queue used to launch the kernel and read results */
static cl_program gpuProgram;       /* program created from gpuKernelCode */
static cl_kernel gpuKernel;         /* handle to the "calcAverage" kernel */


int gpuInit(void)
{
	printf("%s\n", __func__);

	/* Get an OpenCL platform */
	cl_platform_id gpuPlatform;
	clGetPlatformIDs(1, &gpuPlatform, NULL);
	assert(gpuPlatform);

	/* Get a GPU device */
	cl_device_id gpuDevice;
	clGetDeviceIDs(gpuPlatform, CL_DEVICE_TYPE_GPU, 1, &gpuDevice, NULL);
	assert(gpuDevice);
	char cBuffer[1024];
	clGetDeviceInfo(gpuDevice, CL_DEVICE_NAME, sizeof(cBuffer), &cBuffer, NULL);
	printf("   CL_DEVICE_NAME: %s\n", cBuffer);
	clGetDeviceInfo(gpuDevice, CL_DRIVER_VERSION, sizeof(cBuffer), &cBuffer, NULL);
	printf("   CL_DRIVER_VERSION: %s\n", cBuffer);
	
	/* Create a context to run OpenCL enabled GPU */
	gpuContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, NULL);
	assert(gpuContext);

	/* Create a command-queue on the GPU device */
	gpuCQ = clCreateCommandQueue(gpuContext, gpuDevice, 0, NULL);
	assert(gpuCQ);

	/* Create OpenCL program with source code */
	cl_int ret = -1;
	gpuProgram = clCreateProgramWithSource(
		gpuContext, sizeof(gpuKernelCode)/sizeof(char *),
		gpuKernelCode, NULL, &ret);
	if (gpuProgram == NULL) {
		fprintf(stderr, "clCreateProgramWithSource: Error code = %d\n",ret);
		return -1;
	}

	/* Build the program (OpenCL JIT compilation) */
	ret = clBuildProgram(gpuProgram, 0, NULL, NULL, NULL, NULL);
	if (ret != CL_SUCCESS) {
		clGetProgramBuildInfo(gpuProgram, gpuDevice,
				CL_PROGRAM_BUILD_LOG, sizeof(cBuffer), cBuffer,
				NULL);
		fprintf(stderr, "clBuildProgram: Error code: %d\nBuild log: %s\n", ret,
			cBuffer);
		return -1;
	}

	/* Create a handle to the compiled OpenCL function (Kernel) */
	gpuKernel = clCreateKernel(
		gpuProgram, "calcAverage", &ret);
	if (gpuKernel == NULL) {
		fprintf(stderr, "clCreateKernel: Error code = %d\n",ret);
		return -1;
	}

	printf("GPU initialized\n");

	return 0;
}


/**
 * Compute a threshold mask of an amplitude image on the GPU.
 *
 * For each of the xRes * yRes elements, the kernel writes 0xffff to outMask
 * when the corresponding amplitude is greater than threshold, 0x0 otherwise.
 * The call blocks until the result has been copied into outMask.
 *
 * gpuInit() must have succeeded before this function is called.
 *
 * @param amplitudes  input values, xRes * yRes elements; used directly as
 *                    backing store of the input buffer (CL_MEM_USE_HOST_PTR)
 * @param xRes        number of elements per row (work size of dimension 0)
 * @param yRes        number of rows (work size of dimension 1)
 * @param threshold   comparison value passed to the kernel
 * @param outMask     receives the mask, xRes * yRes elements
 *
 * @return 0 on success; -1 on invalid arguments (NULL pointer, zero
 *         resolution), missing initialization or any OpenCL error
 *         (a message is printed to stderr)
 */
int gpuCreateAmplitudeMask(uint16_t *amplitudes, uint16_t xRes, uint16_t yRes, uint16_t threshold, uint16_t *outMask) 
{
	cl_mem gpuOutMask = NULL;
	cl_mem gpuInAmp = NULL;
	cl_int ret;
	int rc = -1;
	int kernelQueued = 0;
	const size_t workSizes[2] = {xRes, yRes};
	/* Widen before multiplying: uint16_t operands would be multiplied
	   as int, which overflows for large resolutions */
	const size_t numPixels = (size_t)xRes * yRes;
	size_t numBytes;


	/*
	  V A L I D A T I O N
	*/

	if (amplitudes == NULL || outMask == NULL || numPixels == 0) {
		fprintf(stderr, "%s: invalid argument\n", __func__);
		return -1;
	}
	if (gpuContext == NULL || gpuCQ == NULL || gpuKernel == NULL) {
		fprintf(stderr, "%s: GPU not initialized\n", __func__);
		return -1;
	}
	if (numPixels > SIZE_MAX / sizeof(*outMask)) {
		fprintf(stderr, "%s: image too large\n", __func__);
		return -1;
	}
	numBytes = numPixels * sizeof(*outMask);

	/*
	  P R E P A R A T I O N
	*/

	/* Allocate output memory on GPU for output mask
	   Note - if xRes and yRes is static,
	   may as well be done in gpuInit()
	*/
	gpuOutMask = clCreateBuffer(gpuContext, CL_MEM_WRITE_ONLY, numBytes,
				    NULL, &ret);
	if (gpuOutMask == NULL) {
		fprintf(stderr, "clCreateBuffer: Error code = %d\n", ret);
		goto out;
	}

	/* Assign output memory to kernel argument */
	ret = clSetKernelArg(gpuKernel, 1, sizeof(cl_mem), (void *)&gpuOutMask);
	if (ret != CL_SUCCESS) {
		fprintf(stderr, "clSetKernelArg: Error code = %d\n", ret);
		goto out;
	}

	/* Create the input buffer on top of the caller's memory */
	gpuInAmp = clCreateBuffer(
		gpuContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
		numBytes, amplitudes, &ret);
	if (gpuInAmp == NULL) {
		fprintf(stderr, "clCreateBuffer: Error code = %d\n", ret);
		goto out;
	}

	/* Associate the GPU memory with kernel argument */
	ret = clSetKernelArg(gpuKernel, 0, sizeof(cl_mem), (void *)&gpuInAmp);
	if (ret != CL_SUCCESS) {
		fprintf(stderr, "clSetKernelArg: Error code = %d\n", ret);
		goto out;
	}

	/* Threshold argument - constant in kernel */
	ret = clSetKernelArg(gpuKernel, 2, sizeof(threshold),
			     (void *)&threshold);
	if (ret != CL_SUCCESS) {
		fprintf(stderr, "clSetKernelArg: Error code = %d\n", ret);
		goto out;
	}

	/*
	  E X E C U T E   O N   G P U
	*/

	/* Launch the Kernel on the GPU
	   This kernel only uses global data */
	ret = clEnqueueNDRangeKernel(gpuCQ, gpuKernel, 2, NULL, workSizes,
				     NULL, 0, NULL, NULL);
	if (ret != CL_SUCCESS) {
		fprintf(stderr, "clEnqueueNDRangeKernel: Error code = %d\n", ret);
		goto out;
	}
	kernelQueued = 1;

	/*
	  G E T   O U T P U T
	*/

	/* Copy the output in GPU memory back to CPU memory */
	ret = clEnqueueReadBuffer(gpuCQ, gpuOutMask, CL_TRUE /* Block until read finished */, 0,
				  numBytes, outMask, 0, NULL, NULL);
	if (ret != CL_SUCCESS) {
		fprintf(stderr, "clEnqueueReadBuffer: Error code = %d\n", ret);
		goto out;
	}

	rc = 0;

	/*
	  C L E A N   U P
	*/
out:
	/* If the blocking read did not complete, the kernel may still be
	   reading the caller's amplitudes buffer - wait before returning */
	if (rc != 0 && kernelQueued) {
		clFinish(gpuCQ);
	}
	if (gpuInAmp != NULL) {
		clReleaseMemObject(gpuInAmp);
	}
	if (gpuOutMask != NULL) {
		clReleaseMemObject(gpuOutMask);
	}

	return rc;
}


/**
 * Release the OpenCL objects created by gpuInit().
 *
 * Objects are released in reverse order of creation. Handles that are NULL
 * are skipped and every released handle is reset to NULL, so the function
 * may be called after a failed gpuInit() or more than once.
 */
void gpuExit(void)
{
	if (gpuKernel != NULL) {
		clReleaseKernel(gpuKernel);
		gpuKernel = NULL;
	}
	if (gpuProgram != NULL) {
		clReleaseProgram(gpuProgram);
		gpuProgram = NULL;
	}
	if (gpuCQ != NULL) {
		clReleaseCommandQueue(gpuCQ);
		gpuCQ = NULL;
	}
	if (gpuContext != NULL) {
		clReleaseContext(gpuContext);
		gpuContext = NULL;
	}
}
