CIsimpleCLDGMA.c

/*****************************************************************************

	CIsimpleDGMA.c		Source for BitFlow CI lib DirectGMA CL example program

	Oct 15,		2015	CIW/SJT

	© Copyright 2015, BitFlow, Inc. All rights reserved.

	Tabstops are 4

	$Author: steve $

	$Date: 2020/10/02 23:30:13 $

	$Id: CIsimpleCLDGMA.c,v 1.5 2020/10/02 23:30:13 steve Exp $

*****************************************************************************/

/*==========================================================================*/
/*
**	For access to command line display.
*/
#include	<stdio.h>
#include	<stdarg.h>
#include	<string.h>
/*
**	For checking for keypress
*/
#include	<sys/time.h>
#include	<sys/types.h>
#include	<unistd.h>
/*
**	For access to BitFlow camera interface library.
*/
#include	"BFciLib.h"
#include	"BFPCI2PCI.h"
/*--------------------------------------------------------------------------*/
/*
**	Bring in the CL header
*/
#include	"CL/cl.h"
/*
**	And the (AMD) extensions
*/
#include	"CL/cl_ext.h"
/*==========================================================================*/
static int sExitAns = 0;		/* program exit code */
static tCIp sCIp = NULL;		/* BitFlow device open token */
static int sNdx = 0;			/* BitFlow device index */
static int sNframes = 4;		/* number of DMA frame buffers */
static int sNdx2 = 0;			/* AMD GPU device index */

static int sLoClamp = 0;		/* lower clamp pixel value */
static int sHiClamp = 255;		/* higher clamp pixel value */

static int sMaxFrames = 0;		/* total frames to display */
static int sSkipFrames = 0;		/* skip between display */
static int sDidFrames = 0;		/* total frames handled */
static int sBGok = 0;			/* ok to be in background */

static int sPageSize = 0;		/* system dependent *?
								   /*-------------------------------------------------------------------------- */
#define	SHOW(x)	{ (void)printf x ; (void)fflush(stdout); }
#define	ERR(x)	{ SHOW(("ERR: ")); SHOW(x); }
/*
**
#define	DO_DEBUG_CLDGMA
**
*/
#ifdef	DO_DEBUG_CLDGMA
#define	DB(x)	{ SHOW(("DB: ")); SHOW(x); }
#else
#define	DB(x)
#endif

static char *sArgv0 = NULL;		/* name of executable */
static void ShowHelp(void)
{
	SHOW(("%s of " __DATE__ " at " __TIME__ "\n", sArgv0));
	SHOW(("   -h           display this message and exit\n"));
	SHOW(("\n"));
	SHOW(("   -x ndx       choose available BitFow device ndx (default 0)\n"));
	SHOW(("   -n nFrames   change number of DMA frame buffers (default 4)\n"));
	SHOW(("   -y ndx       choose available AMD device ndx (default 0)\n"));
	SHOW(("   -c lo hi     specify lo/hi clamp values (default no clamp)\n"));
	SHOW(("   -m maxFrames max frames to display (default infinite)\n"));
	SHOW(("   -s skipFr    frames to skip between display (default 0)\n"));
	SHOW(("   -b           program is backgrounded (no newline exit)\n"));
	SHOW(("\n"));
	SHOW(("  CIsimpleCLDGMA: init a VFG and display clamped pixels from GPU\n"));
	SHOW(("      display ends with newline\n"));
	SHOW(("\n"));
}

/*==========================================================================*/
#include	<time.h>
#include	<sys/timeb.h>
static tCIDOUBLE GetTime(void)
/*
**	Return fractional seconds
*/
{
	tCIDOUBLE ans = 0.0;

#ifdef _POSIX_TIMERS

	struct timespec tp;

	(void)clock_gettime(CLOCK_MONOTONIC_RAW, &tp);
	ans = (tCIDOUBLE) tp.tv_sec;
	ans += ((tCIDOUBLE) tp.tv_nsec) / 1000000000.0;

#else

	struct timeb tb;

	(void)ftime(&tb);
	ans = tb.millitm;
	ans /= 1000.0;
	ans += tb.time;

#endif

	return (ans);
}

/*--------------------------------------------------------------------------*/
static int DecodeArgs(int argc, char **argv)
/*
**	Parse the input arguments.
*/
{
	char *str;

	argv += 1;
	argc -= 1;					/* skip program name */

	while (argc-- > 0)
	{
		str = *argv++;
		if (str[0] != '-')
		{
			ERR(("Do not know '%s' arg\n", str));
			ShowHelp();
			return (1);
		}
		switch (str[1])
		{
		case 'h':
			ShowHelp();
			exit(0);
		case 'x':
			(void)sscanf(*argv, "%d", &sNdx);
			argv += 1;
			argc -= 1;
			break;
		case 'n':
			(void)sscanf(*argv, "%d", &sNframes);
			argv += 1;
			argc -= 1;
			break;
		case 'y':
			(void)sscanf(*argv, "%d", &sNdx2);
			argv += 1;
			argc -= 1;
			break;
		case 'c':
			(void)sscanf(*argv, "%d", &sLoClamp);
			argv += 1;
			argc -= 1;
			(void)sscanf(*argv, "%d", &sHiClamp);
			argv += 1;
			argc -= 1;
			if ((sLoClamp > sHiClamp) || (sHiClamp > 255) || (sLoClamp < 0))
			{
				ERR(("sorry, %d lo and %d hi makes no sense\n", sLoClamp, sHiClamp));
				sExitAns = 1;
				return (sExitAns);
			}
			break;
		case 'b':
			sBGok = 1;
			break;
		case 'm':
			(void)sscanf(*argv, "%d", &sMaxFrames);
			argv += 1;
			argc -= 1;
			break;
		case 's':
			(void)sscanf(*argv, "%d", &sSkipFrames);
			argv += 1;
			argc -= 1;
			break;
		default:
			ERR(("Do not know arg '%s'\n", str));
			ShowHelp();
			sExitAns = 1;
			return (sExitAns);
		}
	}

	return (kCIEnoErr);
}

/*--------------------------------------------------------------------------*/
static int CheckForKeyboardInput(void)
/*
**	Return 0 if no input available from stdin, 1 else
**
**	Note: the console needs a newline in order to post input.
*/
{
	fd_set exceptfds, readfds, writefds;
	struct timeval tv;
	int ans;
	char buff[1024];

	FD_ZERO(&exceptfds);
	FD_ZERO(&readfds);
	FD_ZERO(&writefds);
	FD_SET(fileno(stdin), &readfds);
	(void)memset(&tv, '\0', sizeof(struct timeval));

	ans = select(1, &readfds, &writefds, &exceptfds, &tv);

	if ((ans == 1) && FD_ISSET(fileno(stdin), &readfds))
	{
		/*
		 **   Consume the line.
		 */
		(void)fgets(buff, 1024, stdin);
		return (1);
	}

	return (0);
}

/*--------------------------------------------------------------------------*/
static cl_context sContext = NULL;
static cl_command_queue sCommandQueue = NULL;
static clEnqueueMakeBuffersResidentAMD_fn sMakeBuffersResidentAMD = NULL;
static clEnqueueWaitSignalAMD_fn sWaitSignalAMD = NULL;
static clEnqueueWriteSignalAMD_fn sWriteSignalAMD = NULL;
static cl_mem *sBufs = NULL;
static cl_bus_address_amd *sDMAaddrs = NULL;
/*
**	An OpenCL kernel to clamp the input data as it is copied to the output buff
**
**	We use OpenCL clamp()
*/
static char sRealClampCL[] =
	"__kernel void bf_clamp("
	"__global uchar *dst, __global const uchar *src, uint lo, uint hi)\n"
	"{\n" "uint tid = get_global_id(0);\n"
	/*
	 ** Fetch
	 */
	"uint tmp = src[tid];\n"
	/*
	 ** Clamp lo/hi
	 */
	"int cmp1 = clamp(tmp,lo,hi);\n"
	/*
	 ** Write
	 */
	"dst[tid] = cmp1;\n" "}\n";

static char *sClampCL[] = { sRealClampCL, NULL };

static cl_mem sTempOnGPU = NULL;
static cl_mem sOutputOnPC = NULL;
static tCIU8 *sOutRAM = NULL;

static cl_program sProgramClamp;
static cl_kernel sKernelClamp;

static size_t sGlobalWorkSize = 0;

static tCIU64 sFrameSize = 0;

static cl_device_id sDID = NULL;
/*--------------------------------------------------------------------------*/
static void TermDGMA(void)
{
	int i;
	cl_event event;
	cl_int clrc;

	DB(("TermDGMA: entry\n"));
	if (NULL != sKernelClamp)
	{
		(void)clReleaseKernel(sKernelClamp);
	}
	sKernelClamp = NULL;

	if (NULL != sProgramClamp)
	{
		(void)clReleaseProgram(sProgramClamp);
	}
	sProgramClamp = NULL;

	if (NULL != sDMAaddrs)
	{
		free(sDMAaddrs);
	}
	sDMAaddrs = NULL;

	if (NULL != sBufs)
	{
		for (i = 0; i < sNframes; i++)
		{
			if (NULL != sBufs[i])
			{
				(void)clReleaseMemObject(sBufs[i]);
			}
		}
		free(sBufs);
	}
	sBufs = NULL;

	if (NULL != sTempOnGPU)
	{
		(void)clReleaseMemObject(sTempOnGPU);
	}
	sTempOnGPU = NULL;

	if (NULL != sTempOnGPU)
	{
		(void)clReleaseMemObject(sTempOnGPU);
	}
	sTempOnGPU = NULL;

	if (NULL != sCommandQueue)
	{
		(void)clFinish(sCommandQueue);
		(void)clReleaseCommandQueue(sCommandQueue);
	}
	sCommandQueue = NULL;
	if (NULL != sContext)
	{
		(void)clReleaseContext(sContext);
	}
	sContext = NULL;
	DB(("TermDGMA: exit\n"));

	return;
}

/*--------------------------------------------------------------------------*/
/*
**	The interface clCreateCommandQueue() is now deprecated and the docs say to
**	use clCreateCommandQueueWithProperties() instead.
**
**	However some installs/upgrades end up with stale libOpenCL.* so here
**	is a token to force use of the deprecated call.  The compiler warns but
**	the linker is happy.
**	
#define	BF_FORCE_OLD_WQ 1
**
*/
#ifdef	CL_VERSION_2_0
#define	USE_OLD_WQ	0
#else
#define	USE_OLD_WQ	1
#endif

#ifdef BF_FORCE_OLD_WQ
#ifdef	USE_OLD_WQ
#undef	USE_OLD_WQ
#endif
#define	USE_OLD_WQ	1
#endif
/*--------------------------------------------------------------------------*/
static int InitDGMA(tCIU64 frameSize)	/* initialize the GPU hardware */
{
	cl_int clrc;
	cl_uint oclPlatformCnt = 0, ndx, ndx2;
	cl_platform_id *oclPlatforms = NULL;
#define MAX_PROP_SZ	4096
	char buff[MAX_PROP_SZ];
	cl_device_id did[MAX_PROP_SZ / sizeof(cl_device_id)];
	int count = -1, i;
	int rc = 1;
	size_t sz;
	tCIU64 roundSize = frameSize + (sPageSize - 1), x;
#if	USE_OLD_WQ == 0
	cl_queue_properties properties;
#endif

	DB(("InitDGMA: entry for %lld\n", frameSize));

	sFrameSize = frameSize;

	/*
	 ** Find available hardware
	 */
	clrc = clGetPlatformIDs(0, NULL, &oclPlatformCnt);
	if (CL_SUCCESS != clrc)
	{
		ERR(("InitDGMA: clGetPlatformIDs() failed (%d)\n", clrc));
		return (1);
	}
	if (0 >= oclPlatformCnt)
	{
		ERR(("InitDGMA: no OpenCL platforms available on this system\n"));
		return (1);
	}
	oclPlatforms = malloc(oclPlatformCnt * sizeof(cl_platform_id));
	if (NULL == oclPlatforms)
	{
		ERR(("InitDGMA: cannot get memory %d platformIDs\n", oclPlatformCnt));
		return (1);
	}
	
	clrc = clGetPlatformIDs(oclPlatformCnt, oclPlatforms, NULL);
	if (CL_SUCCESS != clrc)
	{
		ERR(("InitDGMA: clGetPlatformIDs failed (%d)\n", clrc));
		goto andOut;
	}
	
	for (ndx = 0; ndx < oclPlatformCnt; ndx++)
	{
		clrc = clGetPlatformInfo(oclPlatforms[ndx], CL_PLATFORM_VENDOR, MAX_PROP_SZ, buff, NULL);
		if (CL_SUCCESS != clrc)
		{
			ERR(("InitDGMA: clGetPlatformInfo %d failed (%d)\n", ndx, clrc));
			goto andOut;
		}
#define	AMD_STR	"Advanced Micro Devices, Inc."
		if (0 == strcmp(AMD_STR, buff))
		{
			goto gotAMDplatform;
		}
	}
	ERR(("InitDGMA: found no AMD devices\n"));
	goto andOut;

gotAMDplatform:
	/*
	 ** Now load the required extension
	 */
	DB(("InitDGMA: dynamic load\n"));
	sMakeBuffersResidentAMD = (clEnqueueMakeBuffersResidentAMD_fn)clGetExtensionFunctionAddressForPlatform(oclPlatforms[ndx], "clEnqueueMakeBuffersResidentAMD");
	if (NULL == sMakeBuffersResidentAMD)
	{
		ERR(("InitDGMA: cannot get AMD resident func\n"));
		goto andOut;
	}
	sWaitSignalAMD = (clEnqueueWaitSignalAMD_fn)clGetExtensionFunctionAddressForPlatform(oclPlatforms[ndx], "clEnqueueWaitSignalAMD");
	if (NULL == sWaitSignalAMD)
	{
		ERR(("InitDGMA: cannot get AMD waitSignal func\n"));
		goto andOut;
	}
	sWriteSignalAMD = (clEnqueueWriteSignalAMD_fn)clGetExtensionFunctionAddressForPlatform(oclPlatforms[ndx], "clEnqueueWriteSignalAMD");
	if (NULL == sWriteSignalAMD)
	{
		ERR(("InitDGMA: cannot get AMD writeSignal func\n"));
		goto andOut;
	}
	DB(("InitDGMA: dynamic load done\n"));
	/*
	 ** Create our context
	 */
	{
		cl_context_properties oclContextProps[] = {
			CL_CONTEXT_PLATFORM,
			(cl_context_properties) oclPlatforms[ndx],
			0
		};

		sContext = clCreateContextFromType(oclContextProps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clrc);
		if ((NULL == sContext) || (CL_SUCCESS != clrc))
		{
			ERR(("InitDGMA: clCreateContextFromType failed (%d)\n", clrc));
			goto andOut;
		}
	}

	clrc = clGetContextInfo(sContext, CL_CONTEXT_DEVICES, MAX_PROP_SZ, did, &sz);
	if (CL_SUCCESS != clrc)
	{
		ERR(("InitDGMA: clGetContextInfo failed (%d)\n", clrc));
		goto andOut;
	}
	sz /= sizeof(cl_device_id);
	for (ndx2 = 0; ndx2 < sz; ndx2++)
	{
		clrc = clGetDeviceInfo(did[ndx2], CL_DEVICE_VENDOR, MAX_PROP_SZ, buff, NULL);
		if (CL_SUCCESS != clrc)
		{
			ERR(("InitDGMA: clGetDeviceInfo %d failed (%d)\n", ndx, clrc));
			goto andOut;
		}
		if (0 == strcmp(AMD_STR, buff))
		{
			count += 1;			/* another AMD device */
			if (count == sNdx2)
			{
				goto gotThisAMDdevice;
			}
		}
	}
	ERR(("InitDGMA: found %d AMD devs in context but index is %d\n", count, sNdx2));
	goto andOut;

gotThisAMDdevice:
	/*
	 ** OK, we have the device: open a command queue
	 */
	sDID = did[ndx2];
#if	USE_OLD_WQ == 0
	properties = 0;
	sCommandQueue = clCreateCommandQueueWithProperties(sContext, did[ndx2], &properties, &clrc);
	if (NULL == sCommandQueue || CL_SUCCESS != clrc)
	{
		ERR(("InitDGMA: failed to create command queue (%d)\n", clrc));
		goto andOut;
	}
#else
	sCommandQueue = clCreateCommandQueue(sContext, did[ndx2], 0, &clrc));
	if (NULL == sCommandQueue || CL_SUCCESS != clrc)
	{
		ERR(("InitDGMA: failed to create command queue (%d)\n", clrc));
		goto andOut;
	}
#endif
	/*
	 ** Create the buffers
	 */
	sBufs = (cl_mem *) malloc(sNframes * sizeof(cl_mem));
	if (NULL == sBufs)
	{
		ERR(("InitDGMA: failed to get buff list\n"));
		goto andOut;
	}
	(void)memset(sBufs, '\0', (sNframes * sizeof(cl_mem)));
	DB(("InitDGMA: before buffers\n"));
	for (i = 0; i < sNframes; i++)
	{
		sBufs[i] = clCreateBuffer(sContext, CL_MEM_BUS_ADDRESSABLE_AMD, roundSize, NULL, &clrc);
		if (NULL == sBufs[i] || CL_SUCCESS != clrc)
		{
			ERR(("InitDGMA: failed to get %d buff (%d)\n", i, clrc));
			goto andOut;
		}
	}
	DB(("InitDGMA: buffers done\n"));
	/*
	 ** Now get the PCI bus address info
	 */
	sDMAaddrs = (cl_bus_address_amd *) malloc(sNframes * sizeof(cl_bus_address_amd));
	if (NULL == sDMAaddrs)
	{
		ERR(("InitDGMA: failed to get DMA address list\n"));
		goto andOut;
	}
	
	clrc = (*(sMakeBuffersResidentAMD)) (sCommandQueue, sNframes, sBufs, CL_TRUE, sDMAaddrs, 0, NULL, NULL);
	if (CL_SUCCESS != clrc)
	{
		ERR(("InitDGMA: failed to get physical addresses (%d)\n", clrc));
		goto andOut;
	}
	DB(("InitDGMA: buffers resident\n"));
	/*
	 ** OK, now initialize our CL program.
	 **
	 ** We start by creating a single output buffer.
	 */
	sOutRAM = (tCIU8 *) malloc(frameSize);
	if (NULL == sOutRAM)
	{
		ERR(("InitDGMA: failed to get outRAM buff\n"));
		goto andOut;
	}
	(void)memset(sOutRAM, '\0', frameSize);
	/*
	 ** We also create a temporary results buffer on the GPU.
	 */
	sTempOnGPU = clCreateBuffer(sContext, CL_MEM_WRITE_ONLY, frameSize, NULL, &clrc);
	if ((NULL == sTempOnGPU) || (CL_SUCCESS != clrc))
	{
		ERR(("InitDGMA: failed to get temporary buff\n"));
		goto andOut;
	}
	/*
	 ** CPU-side storage
	 */
	sOutputOnPC = clCreateBuffer(sContext, CL_MEM_USE_HOST_PTR, frameSize, sOutRAM, &clrc);
	if ((NULL == sOutputOnPC) || (CL_SUCCESS != clrc))
	{
		ERR(("InitDGMA: failed to get CPU buff\n"));
		goto andOut;
	}
	/*
	 ** Now create the program.
	 */
	sz = (size_t)strlen(sClampCL[0]);
	DB(("InitDGMA: now program %d\n", ((int)(sz))));
	sProgramClamp = clCreateProgramWithSource(sContext, 1, (const char **)(sClampCL), &sz, &clrc);
	if ((NULL == sProgramClamp) || (CL_SUCCESS != clrc))
	{
		ERR(("InitGDMA: failed to create progam (%d)\n", clrc));
		goto andOut;
	}
	DB(("InitDGMA: program created\n"));
	/*
	 ** Build the program
	 */
	clrc = clBuildProgram(sProgramClamp, 1, &sDID, "", NULL, NULL);
	if (CL_SUCCESS != clrc)
	{
		char *log = NULL;
		ERR(("InitGDMA: failed to build program (%d)\n", clrc));
		if (CL_BUILD_PROGRAM_FAILURE == clrc)
		{
			clGetProgramBuildInfo(sProgramClamp, sDID, CL_PROGRAM_BUILD_LOG, 0, NULL, &sz);
			log = (char *)malloc(sz + 1);
			if (NULL == log)
			{
				ERR(("InitDGMA: cannot get %ld for log\n", sz));
				goto andOut;
			}
			clGetProgramBuildInfo(sProgramClamp, sDID, CL_PROGRAM_BUILD_LOG, sz, log, NULL);
			SHOW(("InitDGMA: build log\n%s", log));
			free(log);
		}
		goto andOut;
	}
	DB(("InitDGMA: program built\n"));
	/*
	 ** Create the kernel
	 */
	sKernelClamp = clCreateKernel(sProgramClamp, "bf_clamp", &clrc);
	if ((NULL == sKernelClamp) || (CL_SUCCESS != clrc))
	{
		ERR(("InitGDMA: failed to create kernel (%d)\n", clrc));
		goto andOut;
	}
	DB(("InitDGMA: kernel done\n"));
	/*
	 ** Set the static arguments.
	 */
	clrc = clSetKernelArg(sKernelClamp, 0, sizeof(sTempOnGPU), &sTempOnGPU);
	if (CL_SUCCESS != clrc)
	{
		ERR(("InitGDMA: failed to set kernel arg dst (%d)\n", clrc));
		goto andOut;
	}
	
	clrc = clSetKernelArg(sKernelClamp, 2, sizeof(sLoClamp), &sLoClamp);
	if (CL_SUCCESS != clrc)
	{
		ERR(("InitGDMA: failed to set kernel arg lo (%d)\n", clrc));
		goto andOut;
	}
	
	clrc = clSetKernelArg(sKernelClamp, 3, sizeof(sHiClamp), &sHiClamp);
	if (CL_SUCCESS != clrc)
	{
		ERR(("InitGDMA: failed to set kernel arg lo (%d)\n", clrc));
		goto andOut;
	}
	DB(("InitDGMA: all done\n"));
	/*
	 ** For the kernel's work size
	 */
	sGlobalWorkSize = frameSize / 4;
	/*
	 ** We are done.
	 */
	rc = 0;						/* success */

andOut:
	if (NULL != oclPlatforms)
	{
		free(oclPlatforms);
	}
	/*
	 ** Do any error cleanup.
	 */
	if (0 != rc)
	{
		TermDGMA();
	}
	DB(("InitDGMA: exit with %d\n", rc));

	return (rc);
}

/*--------------------------------------------------------------------------*/
static int RunKernel(tCIU32 buffNdx)
/*
**	We have a new frame in the buffNdx buffer of the circular list.
**
**	Run the GPU program.
*/
{
	cl_int clrc;
	cl_event event, event2;

	DB(("RunKernel: entry for %d\n", buffNdx));
	/*
	 ** We need to specify the src buffer
	 */
	clrc = clSetKernelArg(sKernelClamp, 1, sizeof(sBufs[0]), sBufs + buffNdx);
	if (CL_SUCCESS != clrc)
	{
		ERR(("RunKernel: failed to set kernel arg src (%d)\n", clrc));
		return (1);
	}
	/*
	 ** Now start the clamp
	 */
	clrc = clEnqueueNDRangeKernel(sCommandQueue, sKernelClamp, 1, NULL, &sGlobalWorkSize, NULL, 0, NULL, &event);
	if (CL_SUCCESS != clrc)
	{
		ERR(("RunKernel: failed to enqueue clamp (%d)\n", clrc));
		return (2);
	}
	/*
	 ** Now enqueue a buffer copy to the PC when the clamp is done.
	 **
	 ** CL has the opportunity here to employ DMA for effecient use of PCI
	 ** bandwidth.
	 */
	clrc = clEnqueueCopyBuffer(sCommandQueue, sTempOnGPU, sOutputOnPC, 0, 0, sFrameSize, 1, &event, &event2);
	if (CL_SUCCESS != clrc)
	{
		ERR(("RunKernel: failed to enqueue copy (%d)\n", clrc));
		return (3);
	}
	/*
	 ** We wait here for completion of the copy.
	 */
	clrc = clWaitForEvents(1, &event2);
	if (CL_SUCCESS != clrc)
	{
		ERR(("RunKernel: failed to wait (%d)\n", clrc));
		return (4);
	}
	/*
	 ** Release the events
	 */
	clrc = clReleaseEvent(event);
	if (CL_SUCCESS != clrc)
	{
		ERR(("RunKernel: failed to release clamp event (%d)\n", clrc));
		return (5);
	}

	clrc = clReleaseEvent(event2);
	if (CL_SUCCESS != clrc)
	{
		ERR(("RunKernel: failed to release clamp event2 (%d)\n", clrc));
		return (6);
	}
	DB(("RunKernel: exit\n"));
	/*
	 ** We are done and the output data is in PC RAM.
	 */
	return (0);
}

/*--------------------------------------------------------------------------*/
static void InitAndGetDataUntilKeyPress(void)
/*
**	Illustrate a simple example VFG interaction sequence.
*/
{
	tCIRC circ;
	tCIDOUBLE a = -1.0, b, c, d;
	tCIU64 totalBytes = 0, totalLines = 0, *pciAddrs = NULL;
	tCIU32 i;
	tCIU32 nPtrs;
	tCIU8 **uPtrs = NULL;
	tCIU8 *p8;
	tCIU32 frameID, value, value2, value3, pixToShow;
	tCIU8 *frameP;
	tCIU32 nFrames, bitsPerPix, hROIoffset, hROIsize, vROIoffset, vROIsize, stride;
	/*
	 ** Open the ndx'th frame grabber with exclusive write permission.
	 **
	 ** We need exclusive write because GDMA is a case of user scatter-gather
	 ** DMA buffer definition.
	 */
	circ = CiVFGopen(sNdx, kCIBO_exclusiveWrAccess, &sCIp);
	if (kCIEnoErr != circ)
	{
		ERR(("CiVFGopen gave '%s'\n", CiErrStr(circ)));
		sExitAns = 1;
		return;
	}
	/*
	 ** Init the VFG with the config file specified by the DIP switches.
	 */
	circ = CiVFGinitialize(sCIp, NULL);
	if (kCIEnoErr != circ)
	{
		ERR(("CiVFGinitialize gave '%s'\n", CiErrStr(circ)));
		sExitAns = 1;
		goto andOut;
	}
	/*
	 ** Configure the VFG (temporarily) for 1 frame buffer.
	 */
	circ = CiDrvrBuffConfigure(sCIp, 1, 0, 0, 0, 0);
	if (kCIEnoErr != circ)
	{
		ERR(("CiDrvrBuffConfigure gave '%s'\n", CiErrStr(circ)));
		sExitAns = 1;
		goto andOut;
	}
	/*
	 ** Determine buffer configuration.
	 */
	circ = CiBufferInterrogate(sCIp, &nFrames, &bitsPerPix, &hROIoffset, &hROIsize, &vROIoffset, &vROIsize, &stride);
	if (kCIEnoErr != circ)
	{
		ERR(("CiBufferInterrogate gave '%s'\n", CiErrStr(circ)));
		sExitAns = 1;
		goto andOut;
	}
	/*
	 ** Now release the driver buffers.
	 */
	circ = CiDrvrBuffConfigure(sCIp, 0, 0, 0, 0, 0);
	if (kCIEnoErr != circ)
	{
		ERR(("CiDrvrBuffConfigure(2) gave '%s'\n", CiErrStr(circ)));
		sExitAns = 1;
		goto andOut;
	}
	/*
	 ** For this example we only deal w/8bpp
	 **
	 ** Use one of BitFlow's synthetic camera files and all is OK
	 */
	if (8 != bitsPerPix)
	{
		ERR(("we need 8 bitsPerPix, not %d\n", bitsPerPix));
		sExitAns = 1;
		goto andOut;
	}
	/*
	 ** We now know the frame information: initialize DMA buffers on the GPU
	 */
	if (0 != InitDGMA(stride * vROIsize))
	{
		ERR(("InitDGMA failed\n"));
		sExitAns = 1;
		goto andOut;
	}
	/*
	 ** If the clamp values are default then set for noclamp
	 */
	if ((0 > sLoClamp) || (0 > sHiClamp))
	{
		sLoClamp = 0;
		sHiClamp = (1 << bitsPerPix) - 1;
	}
	/*
	 ** OK, now we have the info to configure for DMA buffers on the GPU
	 **
	 ** Build the array of PCI addresses
	 */
	pciAddrs = (tCIU64 *) malloc(sNframes * sizeof(tCIU64));
	if (NULL == pciAddrs)
	{
		ERR(("could not get PCI address buffer\n"));
		sExitAns = 1;
		goto andOut;
	}
	for (i = 0; i < sNframes; i++)
	{
		pciAddrs[i] = sDMAaddrs[i].surface_bus_address;
		/*
		 **   Be sure to align -- extra space was allotted for this.
		 */
		pciAddrs[i] = (pciAddrs[i] + (sPageSize - 1)) & (~((tCIU64) (sPageSize - 1)));
	}
	circ = CiDGMAbuffConfigure(sCIp, sNframes, pciAddrs, stride * vROIsize, 0, 0, 0, 0);
	if (kCIEnoErr != circ)
	{
		ERR(("CiDGMAbuffConfigure(2) gave '%s'\n", CiErrStr(circ)));
		sExitAns = 1;
		goto andOut;
	}
	/*
	 ** This is a necessary step to promote the VFG state.  However, the
	 ** returned buffer pointers are not valid for access to the pixel data.
	 */
	circ = CiMapFrameBuffers(sCIp, 0, &nPtrs, &uPtrs);
	if (kCIEnoErr != circ)
	{
		ERR(("CiMapFrameBuffers gave '%s'\n", CiErrStr(circ)));
		sExitAns = 1;
		goto andOut;
	}
	/*
	 ** We show 16 8b pix per line
	 */
	pixToShow = 16;
	/*
	 ** Reset acquisition and clear all error conditions.
	 */
	circ = CiAqSWreset(sCIp);
	if (kCIEnoErr != circ)
	{
		ERR(("CiAqSWreset gave '%s'\n", CiErrStr(circ)));
		sExitAns = 1;
		goto andOut;
	}
	/*
	 ** Tell the user how to stop this cleanly.
	 */
	if (0 == sBGok)
	{
		SHOW(("Will now dump first line of frames until newline (skip %d, ndx %d)\n", sSkipFrames, sNdx));
	}
	else
	{
		SHOW(("Will now dump first line of frames until %d (skip %d, ndx %d)\n", sMaxFrames, sSkipFrames, sNdx));
	}
	/*
	 ** Start continuous acquisition.
	 */
	circ = CiAqStart(sCIp, 0);
	if (kCIEnoErr != circ)
	{
		ERR(("CiAqStart gave '%s'\n", CiErrStr(circ)));
		sExitAns = 1;
		goto andOut;
	}
	a = GetTime();
	/*
	 ** Display each acquired frame 1st line in a loop.  Stop at newline.
	 */
	while (1)
	{
		/*
		 ** Check to see if a frame is already available before waiting.
		 */
checkAgain:
		circ = CiGetOldestNotDeliveredFrame(sCIp, &frameID, &frameP);
		switch (circ)
		{
		case kCIEnoErr:
			/*
			 **   We have the frame.
			 */
			break;
		case kCIEnoNewData:
			/*
			 **   We need to wait for another frame.
			 */
			circ = CiWaitNextUndeliveredFrame(sCIp, -1);
			if (kCIEnoErr != circ)
			{
				switch (circ)
				{
				case kCIEaqAbortedErr:
					SHOW(("CiWaitNextUndeliveredFrame gave '%s'\n", CiErrStr(circ)));
					break;
				default:
					ERR(("CiWaitNextUndeliveredFrame gave '%s'\n", CiErrStr(circ)));
					sExitAns = 1;
				}
				goto andOut;
			}
			goto checkAgain;
		case kCIEaqAbortedErr:
			SHOW(("CiGetOldestNotDeliveredFrame: acqistion aborted\n"));
			goto andOut;
		default:
			ERR(("CiGetOldestNotDeliveredFrame gave '%s'\n", CiErrStr(circ)));
			sExitAns = 1;
			goto andOut;
		}
		/*
		 **   OK.  There is a new frame on the GPU.  Run the CL program.
		 */
		if (0 != RunKernel(frameID % sNframes))
		{
			ERR(("RunKernel failed at %d\n", frameID));
			sExitAns = 1;
			goto andOut;
		}
		/*
		 **   Change the frameP to the (single) output buffer of the GPU
		 */
		frameP = sOutRAM;
		/*
		 **   Allow skipping frames so display is not an issue.
		 */
		if ((0 != sSkipFrames) && (0 != (sDidFrames % (sSkipFrames + 1))))
		{
			goto skipHere;
		}
		/*
		 **   Display the frameID and the first line of frame data.
		 */
		SHOW(("frameID %9d:", frameID));
		p8 = frameP;
		for (i = 0; i < pixToShow; i++)
		{
			value = *p8++;
			SHOW((" %02X", value));
		}
		SHOW(("\n"));

skipHere:
		totalLines += vROIsize;
		totalBytes += stride * vROIsize;
		sDidFrames += 1;
		/*
		 **   Break out of this loop on newline
		 */
		if (0 == sBGok)
		{
			if (0 != CheckForKeyboardInput())
			{
				break;
			}
		}
		/*
		 **   Break out of loop if countdown hits zero
		 */
		if ((0 != sMaxFrames) && (--sMaxFrames == 0))
		{
			break;
		}
	}

andOut:
	/*
	 ** We must stop acquisition because we are about to tear down the buffers
	 */
	circ = CiAqAbort(sCIp);
	if (kCIEnoErr != circ)
	{
		ERR(("CiAqAbort gave '%s'\n", CiErrStr(circ)));
	}
	/*
	 ** Unmap the frame buffers.
	 */
	if ((NULL != uPtrs) && (kCIEnoErr != (circ = CiUnmapFrameBuffers(sCIp))))
	{
		ERR(("CiUnmapFrameBuffers gave '%s'\n", CiErrStr(circ)));
	}
	/*
	 ** Remove the buffers from the driver
	 */
	circ = CiDGMAbuffConfigure(sCIp, 0, NULL, 0, 0, 0, 0, 0);
	if (kCIEnoErr != circ)
	{
		ERR(("CiDGMAbuffConfigure(end) gave '%s'\n", CiErrStr(circ)));
		sExitAns = 1;
	}
	if (NULL != pciAddrs)
	{
		free(pciAddrs);
	}
	/*
	 ** Close the access.
	 */
	if ((NULL != sCIp) && (kCIEnoErr != (circ = CiVFGclose(sCIp))))
	{
		ERR(("CiVFGclose gave '%s'\n", CiErrStr(circ)));
	}
	/*
	 ** Tear down the GDMA stuff
	 */
	TermDGMA();
	/*
	 ** Show data rate
	 */
	c = b = GetTime() - a;
	if ((a < 0.0) || (c < 0.001))
	{
		a = 0.0;
		b = 0.0;
		c = 0.0;
		d = 0.0;
	}
	else
	{
		a = ((tCIDOUBLE) totalBytes) / c;
		b = ((tCIDOUBLE) totalLines) / c;
		d = ((tCIDOUBLE) sDidFrames) / c;
	}
	SHOW(("%d: Data rate %.1lf ln/s (%.1lf b/s) (%.1lf FPS) (%.1lf sec) after %d fr\n", sNdx, b, a, d, c, sDidFrames));

	return;
}

/*==========================================================================*/
int main(int argc, char **argv)
/*
**	Decode command line and acquire/display frames until EOF
*/
{
	sArgv0 = *argv;

	sPageSize = getpagesize();

	if (kCIEnoErr == DecodeArgs(argc, argv))
	{
		InitAndGetDataUntilKeyPress();
	}

	return (sExitAns);
}

/*==========================================================================*/
/*
	$Log: CIsimpleCLDGMA.c,v $
	Revision 1.5  2020/10/02 23:30:13  steve
	CLOCK_MONOTONIC is not always monotonic, so prefer CLOCK_MONOTONIC_RAW.

	Revision 1.4  2020/10/02 01:17:23  steve
	ftime is deprecated. Use clock_gettime.

	Revision 1.3  2017/09/29 09:14:01  steve
	Now dump all camf

	Revision 1.2  2016/07/11 20:02:57  steve
	Gn2 xx-2Y/2YE, GPUD, DGMA

	Revision 1.1  2016/03/06 23:31:44  steve
	Support CXP_usualINit and Axion

*/