/* CIsimpleGPUD.cu */

/*****************************************************************************

	CIsimpleGPUD.cu		Source for BitFlow simple GPUDirect example program

	Oct 17,		2015	CIW/SJT

	© Copyright 2015, BitFlow, Inc. All rights reserved.

	Tabstops are 4

	$Author: steve $

	$Date: 2020/10/02 23:30:13 $

	$Id: CIsimpleGPUD.cu,v 1.6 2020/10/02 23:30:13 steve Exp $

*****************************************************************************/

/*==========================================================================*/
/*
**	For access to command line display.
*/
#include	<stdio.h>
#include	<stdarg.h>
#include	<string.h>
/*
**	For checking for keypress
*/
#include	<sys/time.h>
#include	<sys/types.h>
#include	<unistd.h>
/*
**	For access to BitFlow camera interface library.
*/
#include	"BFciLib.h"
#include	"BFPCI2PCI.h"
/*--------------------------------------------------------------------------*/
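/*
**	Bring in the necessary NVidia/CUDA headers.
**
**	Building w/nvcc makes the CUDA runtime declarations available without
**	any include, but the headers are still named explicitly below; cuda.h
**	supplies the driver API (CUresult, cuPointerSetAttribute) used by
**	InitGPUD.
**
**	However, NVidia says this only works on 64-bit targets, hence the check.
*/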
#if !defined(__x86_64__) && !defined(__aarch64__)
#error	"This example only works w/x86_64 and aarch64 processors\n"
#endif
#include	<cuda.h>
#include	<cuda_runtime_api.h>
/*==========================================================================*/
/*
**	Program-wide state.  The command-line values are written by DecodeArgs()
**	and read by the acquisition and GPU setup code.
*/
static int	sExitAns=0;						/* program exit code returned by main (1 after a fatal error) */
static tCIp	sCIp=NULL;						/* BitFlow device open token filled in by CiVFGopen */
static int	sNdx=0;							/* BitFlow device index (-x) */
static int	sNframes=4;						/* number of DMA frame buffers allocated for the driver (-n) */
static int	sNdx2=0;						/* CUDA device index (-y); only used when NV_BUILD_DGPU is defined */

static int	sLibDebug=0;					/* nonzero (-D): route BitFlow SDK debug strings to LDBdisplay */

static int	sMaxFrames=0;					/* frames left before stopping (-m); 0 means no limit */
static int	sSkipFrames=0;					/* frames skipped between displayed frames (-s) */
static int	sDidFrames=0;					/* total frames handled so far */
static int	sBGok=0;						/* nonzero (-b): do not poll stdin for a newline to stop */
/*--------------------------------------------------------------------------*/
/*
**	Console output helpers.
**
**	Both take one parenthesized printf argument list, for example
**	SHOW(("x=%d\n",x));  SHOW prints and flushes stdout; ERR does the same
**	after an "ERR: " prefix.
**
**	The do { } while (0) wrapper makes each expand to exactly one statement,
**	so "if (c) SHOW((..)); else ..." parses as written.  A bare { } block
**	followed by the caller's ';' would detach the else.
*/
#define	SHOW(x)	do { (void)printf x ; (void)fflush(stdout); } while (0)
#define	ERR(x)	do { SHOW(("ERR: ")); SHOW(x); } while (0)

static char	*sArgv0=NULL;					/* argv[0]; printed in the help banner */
static void ShowHelp(void)
/*
**	Print the usage text to stdout: a banner line carrying the executable
**	name and the build date/time, followed by the fixed option summary.
*/
{
static const char * const	usageLines[] = {
	"   -h           display this message and exit\n",
	"\n",
	"   -x ndx       choose available BitFlow device ndx (default 0)\n",
	"   -n nFrames   change number of DMA frame buffers (default 4)\n",
	"   -y ndx       choose available CUDA device ndx (default 0)\n",
	"   -m maxFrames max frames to display (default infinite)\n",
	"   -s skipFr    frames to skip between display (default 0)\n",
	"   -b           program is backgrounded (no newline exit)\n",
	"   -D           show BitFlow SDK library debug info\n",
	"\n",
	"  CIsimpleGPUD: init a VFG for RDMA and display pixels from GPU\n",
	"      display ends with newline\n",
	"\n"
	};
size_t	lineNo;

	SHOW(("%s of " __DATE__ " at " __TIME__ "\n",sArgv0));
	/*
	**	None of the table entries contains a conversion, so printing them
	**	through "%s" emits exactly the same bytes.
	*/
	for (lineNo=0; lineNo<(sizeof(usageLines)/sizeof(usageLines[0])); lineNo++) {
	  SHOW(("%s",usageLines[lineNo]));
	  }
}
/*==========================================================================*/
#include	<time.h>
#include	<sys/timeb.h>
static tCIDOUBLE GetTime(void)
/*
**	Return a timestamp in fractional seconds.
**
**	With POSIX timers the value is read from CLOCK_MONOTONIC_RAW, falling
**	back to CLOCK_MONOTONIC if the raw clock cannot be read.  If neither
**	works 0.0 is returned instead of converting an unset timespec.
**
**	Without POSIX timers ftime() is used (millisecond resolution).
**
**	This file only ever uses the difference between two returned values.
*/
{
tCIDOUBLE		ans=0.0;

#ifdef _POSIX_TIMERS

struct timespec tp;

	if ((0 != clock_gettime(CLOCK_MONOTONIC_RAW,&tp)) &&
		(0 != clock_gettime(CLOCK_MONOTONIC,&tp))) {
	  return(ans);
	  }
	ans = (tCIDOUBLE)tp.tv_sec;
	ans += ((tCIDOUBLE)tp.tv_nsec) / 1000000000.0;

#else

struct timeb	tb;

	(void)ftime(&tb);
	ans = tb.millitm;
	ans /= 1000.0;
	ans += tb.time;

#endif

	return(ans);
}
/*--------------------------------------------------------------------------*/
static void LDBdisplay(char *format, ...)
/*
**	Library debug display function.
**
**	This will display progress/information/error strings from BFciLib.
**
**	format, ...		printf-style format and arguments
**
**	The expanded text is printed to stdout as "LDB: <text>\n".  Text that
**	does not fit the local buffer is truncated rather than overrunning it.
*/
{
va_list	val;
char	buff[1024];

	if (NULL == format) { return; }

	va_start(val,format);
	/*
	**	vsnprintf never writes more than sizeof(buff) bytes and always
	**	terminates the string, so an oversized library message is cut short.
	*/
	(void)vsnprintf(buff,sizeof(buff),format,val);
	va_end(val);
	/*
	**	buff now contains the library display string
	**
	**	This function simply prints it to the console.
	*/
	(void)printf("LDB: %s\n",buff);
	(void)fflush(stdout);
}
/*--------------------------------------------------------------------------*/
static int TakeIntArg(const char *opt, int *argcP, char ***argvP, int *dest)
/*
**	Consume the word following option 'opt' and store it in *dest.
**
**	opt				option text, used only in error messages
**	argcP, argvP	remaining-argument cursor; advanced by one on consumption
**	dest			receives the parsed integer
**
**	Returns 0 on success.  Returns 1, after reporting with ERR, when no word
**	is left or the word does not start with an integer.
*/
{
char	*val;

	if (*argcP <= 0) {
	  /*
	  **	**argvP would be the NULL that terminates argv.
	  */
	  ERR(("Arg '%s' needs a value\n",opt));
	  return(1);
	  }

	val = **argvP;
	*argvP += 1;	*argcP -= 1;

	if (1 != sscanf(val,"%d",dest)) {
	  ERR(("Arg '%s' needs an integer, not '%s'\n",opt,val));
	  return(1);
	  }

	return(0);
}

static int DecodeArgs(int argc, char **argv)
/*
**	Parse the input arguments into the file-scope option variables.
**
**	Returns kCIEnoErr when every argument was understood.  On an unknown
**	argument, a missing option value or a non-numeric value the help text is
**	shown, sExitAns is set to 1 and that value is returned.
**
**	'-h' shows the help text and exits the process with status 0.
*/
{
char	*str;
int		*dest;

	argv += 1;	argc -= 1;					/* skip program name */

	while (argc-- > 0) {
	  str = *argv++;
	  if (str[0] != '-') {
		ERR(("Do not know '%s' arg\n",str));
		ShowHelp();
		sExitAns = 1;
		return(sExitAns);
		}
	  /*
	  **	Options that take a value only select where it goes; the value
	  **	itself is fetched once, below.
	  */
	  dest = NULL;
	  switch (str[1]) {
		case 'h':
		  ShowHelp();
		  exit(0);
		case 'x':
		  dest = &sNdx;
		  break;
		case 'n':
		  dest = &sNframes;
		  break;
		case 'y':
		  dest = &sNdx2;
		  break;
		case 'm':
		  dest = &sMaxFrames;
		  break;
		case 's':
		  dest = &sSkipFrames;
		  break;
		case 'b':
		  sBGok = 1;
		  break;
		case 'D':
		  sLibDebug = 1;
		  break;
		default:
		  ERR(("Do not know arg '%s' arg\n",str));
		  ShowHelp();
		  sExitAns = 1;
		  return(sExitAns);
		}
	  if ((NULL != dest) && (0 != TakeIntArg(str,&argc,&argv,dest))) {
		ShowHelp();
		sExitAns = 1;
		return(sExitAns);
		}
	  }

	return(kCIEnoErr);
}
/*--------------------------------------------------------------------------*/
static int CheckForKeyboardInput(void)
/*
**	Return 0 if no input available from stdin, 1 else
**
**	Never blocks waiting for input to appear: select() is called with a
**	zero timeout.  When input is pending one line (at most sizeof(buff)-1
**	characters) is consumed so the same line is not reported twice.
**	End-of-file on stdin also makes the descriptor readable and therefore
**	also returns 1.
**
**	Note: the console needs a newline in order to post input.
*/
{
fd_set			readfds;
struct timeval	tv;
int				fd,ans;
char			buff[1024];

	fd = fileno(stdin);
	if (fd < 0) { return(0); }

	FD_ZERO(&readfds);
	FD_SET(fd,&readfds);
	tv.tv_sec = 0;
	tv.tv_usec = 0;
	/*
	**	The first select() argument is the highest descriptor plus one, so
	**	derive it from the descriptor instead of assuming stdin is 0.
	*/
	ans = select(fd+1,&readfds,NULL,NULL,&tv);

	if ((ans < 1) || !FD_ISSET(fd,&readfds)) { return(0); }
	/*
	**	Consume the line.
	*/
	(void)fgets(buff,sizeof(buff),stdin);
	return(1);
}
/*--------------------------------------------------------------------------*/
/*
**	All we need to put data into GPU.
*/
/*
**	Buffer bookkeeping shared by InitGPUD, RunGPUD and TermGPUD.
*/
static void		**sCUDAbuffs=NULL;			/* sNframes pointers as returned by cudaMalloc (NV_BUILD_DGPU) or cudaHostAlloc */
static tCIU8	**sCUDAPAbuffs=NULL;		/* per-frame addresses handed to CiGPUDbuffConfigure; page-aligned in the NV_BUILD_DGPU build */

static tCIU8	*sHostBuff=NULL;			/* host-readable view of the latest frame; set/filled by RunGPUD */

static tCIU64	sFrameSize=0;				/* frameSize passed to InitGPUD; byte count copied by RunGPUD */
/*--------------------------------------------------------------------------*/
static void TermGPUD(void)
/*
**	Release everything InitGPUD allocated and reset the CUDA device.
**
**	Every pointer is checked before it is freed and cleared afterwards, so
**	this may be called after a partial InitGPUD failure and more than once.
*/
{
int	i;

	/*
	**	This table only holds addresses inside the sCUDAbuffs allocations,
	**	so the table itself is all there is to free.
	*/
	if (NULL != sCUDAPAbuffs) { free(sCUDAPAbuffs); }
	sCUDAPAbuffs = NULL;

	if (NULL != sCUDAbuffs) {
	  for (i=0; i<sNframes; i++) {
		if (NULL == sCUDAbuffs[i]) { continue; }
#if defined(NV_BUILD_DGPU)
		(void)cudaFree(sCUDAbuffs[i]);
#else
		(void)cudaFreeHost(sCUDAbuffs[i]);
#endif
		}
	  free(sCUDAbuffs);
	  }
	sCUDAbuffs = NULL;

#if defined(NV_BUILD_DGPU)
	/*
	**	Here the output buffer is a separate cudaHostAlloc allocation.
	*/
	if (NULL != sHostBuff) { (void)cudaFreeHost(sHostBuff); }
#endif
	/*
	**	In the other build RunGPUD points sHostBuff at one of the frame
	**	buffers freed above, so it must not be freed, only forgotten.
	*/
	sHostBuff = NULL;

	(void)cudaDeviceReset();

	return;
}
/*--------------------------------------------------------------------------*/
static int InitGPUD(tCIU64	frameSize)
/*
**	Allocate the memory buffers on the GPU.
**
**	frameSize		bytes needed for one frame; remembered in sFrameSize
**
**	Returns 0 on success, 1 on failure.  After a failure past the first
**	table allocation everything obtained so far is released via TermGPUD.
**
**	NV_BUILD_DGPU build: selects CUDA device sNdx2, cudaMalloc's sNframes
**	device buffers and one pinned host buffer (sHostBuff) for read-back.
**
**	The BitFlow driver is required to page-lock the buffers on 64kb boundaries
**	so we need to keep track of the page-aligned buffer address to properly
**	access the pixel data.  Each buffer is over-allocated by 64kb-1 and the
**	address is rounded UP, which keeps the aligned frame inside the
**	allocation.
**
**	Other build: cudaHostAlloc's sNframes host buffers (with 4kb-1 of slack),
**	sets CU_POINTER_ATTRIBUTE_SYNC_MEMOPS on each and uses the returned
**	addresses unchanged.
*/
{
cudaError_t			ce;
int					i,rc=1;
unsigned long long	slack;
#if defined(NV_BUILD_DGPU)
int					nDev=0;
unsigned long long	addr;
#else
CUresult			cr;
unsigned int		flag;
#endif

	/*
	**	Zero buffers would later make the caller's "frameID % sNframes"
	**	divide by zero; a negative count would wrap in the malloc size.
	*/
	if (sNframes < 1) {
	  ERR(("InitGPUD: need at least 1 frame buffer, not %d\n",sNframes));
	  return(1);
	  }

#if defined(NV_BUILD_DGPU)
	/*
	**	We need to specify the CUDA device.
	*/
	if (cudaSuccess != (ce = cudaGetDeviceCount(&nDev))) {
	  ERR(("InitGPUD: cudaGetDeviceCount failed (%s)\n",
			cudaGetErrorString(ce)));
	  return(1);
	  }
	if ((sNdx2 < 0) || (sNdx2 >= nDev)) {
	  ERR(("InitGPUD: %d device specified but only %d exist\n",sNdx2,nDev));
	  return(1);
	  }
	if (cudaSuccess != (ce = cudaSetDevice(sNdx2))) {
	  ERR(("InitGPUD: cudaSetDevice failed for %d (%s)\n",sNdx2,
			cudaGetErrorString(ce)));
	  return(1);
	  }
	/*
	**	Extra bytes per buffer for 64kb alignment; also the low-bit mask.
	*/
	slack = 1;
	slack <<= 16;
	slack -= 1;
#else
	/*
	**	Extra bytes per buffer for 4kb page alignment.
	*/
	slack = 1;
	slack <<= 12;
	slack -= 1;
#endif

	/*
	**	Set the global
	*/
	sFrameSize = frameSize;
	/*
	**	Get the two pointer tables, zeroed so that TermGPUD can tell which
	**	entries were actually allocated.
	*/
	sCUDAbuffs = (void **)malloc(((size_t)sNframes) * sizeof(void *));
	if (NULL == sCUDAbuffs) {
	  ERR(("InitGPUD: cannot get %d for CUDA buffs\n",sNframes));
	  return(1);
	  }
	(void)memset(sCUDAbuffs,'\0',((size_t)sNframes) * sizeof(void *));

	sCUDAPAbuffs = (tCIU8 **)malloc(((size_t)sNframes) * sizeof(void *));
	if (NULL == sCUDAPAbuffs) {
	  ERR(("InitGPUD: cannot get %d for PACUDA buffs\n",sNframes));
	  goto andOut;
	  }
	(void)memset(sCUDAPAbuffs,'\0',((size_t)sNframes) * sizeof(void *));

	for (i=0; i<sNframes; i++) {
#if defined(NV_BUILD_DGPU)
	  /*
	  **	Allocate the acquisition buffer.
	  */
	  if (cudaSuccess != (ce = cudaMalloc(sCUDAbuffs+i,frameSize+slack))) {
		ERR(("InitGPUD: cudaMalloc failed on frame %d (%s)\n",i,
			cudaGetErrorString(ce)));
		goto andOut;
		}
	  /*
	  **	Now set the aligned address: the first 64kb boundary at or after
	  **	the start of the allocation.  The slack requested above
	  **	guarantees frameSize bytes still fit behind it.
	  */
	  addr = (unsigned long long)(sCUDAbuffs[i]);
	  sCUDAPAbuffs[i] = (tCIU8*)((addr + slack) & ~slack);
#else
	  /*
	  **	Allocate the acquisition buffer.
	  */
	  if (cudaSuccess != (ce = cudaHostAlloc(sCUDAbuffs+i,frameSize+slack,
			cudaHostAllocDefault))) {
		ERR(("InitGPUD: cudaHostAlloc failed on frame %d (%s)\n",i,
			cudaGetErrorString(ce)));
		goto andOut;
		}

	  flag = 1;
	  if (CUDA_SUCCESS != (cr = cuPointerSetAttribute(&flag,
			CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,(CUdeviceptr)sCUDAbuffs[i]))) {
		ERR(("InitGPUD: cuPointerSetAttribute failed on frame %d (CUresult %d)\n",
			i,(int)cr));
		goto andOut;
		}
	  /*
	  **	Now set the output addresses.
	  */
	  sCUDAPAbuffs[i] = (tCIU8*)(sCUDAbuffs[i]);
#endif
	  }

#if defined(NV_BUILD_DGPU)
	/*
	**	Allocate a single copy buffer for results.
	*/
	if (cudaSuccess != (ce = cudaHostAlloc(
	  	(void **)&sHostBuff,frameSize,cudaHostAllocDefault))) {
	  ERR(("InitGPUD: failed to alloc output buffer (%s)\n",
			cudaGetErrorString(ce)));
	  goto andOut;
	  }
#endif

	rc = 0;

andOut:
	if (0 != rc) { TermGPUD(); }
	return(rc);
}
/*--------------------------------------------------------------------------*/
static int RunGPUD(tCIU32 buffNdx)
/*
**	Make frame buffer buffNdx readable through sHostBuff.
**
**	NV_BUILD_DGPU build: copy sFrameSize bytes from the (page aligned)
**	device buffer into the single host buffer; returns 1 if the copy fails.
**
**	Other build: no copy is made, sHostBuff is simply pointed at the
**	selected buffer.
**
**	Returns 0 on success.  buffNdx is not range checked here.
*/
{
#if defined(NV_BUILD_DGPU)
cudaError_t	status;

	status = cudaMemcpy(sHostBuff,sCUDAPAbuffs[buffNdx],sFrameSize,
						cudaMemcpyDeviceToHost);
	if (cudaSuccess == status) { return(0); }

	ERR(("RunGPUD: cudaMemcpy failed buff %d\n",buffNdx));
	return(1);
#else
	sHostBuff = (tCIU8*)sCUDAbuffs[buffNdx];
	return(0);
#endif
}
/*--------------------------------------------------------------------------*/
static void InitAndGetDataUntilKeyPress(void)
/*
**	Illustrate a simple example board interaction sequence:
**
**	  open and initialize the VFG, learn the frame geometry, allocate the
**	  CUDA buffers and hand them to the BitFlow driver, then acquire
**	  continuously.  For every displayed frame the frameID and the first
**	  pixels of the first line are printed.  The loop ends on a newline from
**	  stdin (unless -b was given), when the -m countdown reaches zero, or on
**	  an acquisition error.
**
**	Fatal errors set sExitAns to 1.  Once the VFG is open every exit path
**	runs the common teardown at andOut and prints the data-rate summary.
*/
{
tCIRC			circ;
tCIDOUBLE		tStart=-1.0,tStop,elapsed;
tCIDOUBLE		bytesPerSec=0.0,linesPerSec=0.0,framesPerSec=0.0;
tCIU64			totalBytes=0,totalLines=0,frameBytes=0;
tCIU32			i;
tCIU32			nPtrs;
tCIU8			**uPtrs=NULL;
tCIU8			*p8;
tCIU32			frameID,value,pixToShow;
tCIU8			*frameP;
tCIU32	nFrames,bitsPerPix,hROIoffset,hROIsize,vROIoffset,vROIsize,stride;
	/*
	**	Open the ndx'th frame grabber with exclusive write permission.
	**
	**	We need exclusive write because GPUD is a case of non standard
	**  scatter-gather DMA buffer definition.
	*/
	if (kCIEnoErr != (circ = CiVFGopen(sNdx,kCIBO_exclusiveWrAccess,&sCIp))) {
	  ERR(("CiVFGopen gave '%s'\n",CiErrStr(circ)));
	  sExitAns = 1;
	  return;
	  }
	/*
	**	Set library debug function if '-D' was specified.
	*/
	if (kCIEnoErr != (circ = CiSetDebug(sCIp,-1,
			(0 == sLibDebug) ? NULL : LDBdisplay))) {
	  ERR(("CiSetDebug of %d gave '%s'\n",sNdx,CiErrStr(circ)));
	  }
	/*
	**	Init the board with the config file specified by the DIP switches.
	*/
	if (kCIEnoErr != (circ = CiVFGinitialize(sCIp,NULL))) {
	  ERR(("CiVFGinitialize gave '%s'\n",CiErrStr(circ)));
	  sExitAns = 1;
	  goto andOut;
	  }
	/*
	**	Temporarily configure 4 driver frame buffers.  They are only used
	**	to interrogate the geometry and are released again right after.
	*/
	if (kCIEnoErr != (circ = CiDrvrBuffConfigure(sCIp,4,0,0,0,0))) {
	  ERR(("CiDrvrBuffConfigure gave '%s'\n",CiErrStr(circ)));
	  sExitAns = 1;
	  goto andOut;
	  }
	/*
	**	Determine buffer configuration.  We only need bitsPerPix, vROIsize
	**	and stride.
	*/
	if (kCIEnoErr != (circ = CiBufferInterrogate(sCIp,&nFrames,&bitsPerPix,
			&hROIoffset,&hROIsize,&vROIoffset,&vROIsize,&stride))) {
	  ERR(("CiBufferInterrogate gave '%s'\n",CiErrStr(circ)));
	  sExitAns = 1;
	  goto andOut;
	  }
	/*
	**	Now release these driver buffers.
	*/
	if (kCIEnoErr != (circ = CiDrvrBuffConfigure(sCIp,0,0,0,0,0))) {
	  ERR(("CiDrvrBuffConfigure(2) gave '%s'\n",CiErrStr(circ)));
	  sExitAns = 1;
	  goto andOut;
	  }
	/*
	**	For this example we only deal w/8bpp
	**
	**	Use one of BitFlow's synthetic camera files and all isOK
	*/
	if (8 != bitsPerPix) {
	  ERR(("we need 8 bitsPerPix, not %d\n",bitsPerPix));
	  sExitAns = 1;
	  goto andOut;
	  }
	/*
	**	Bytes in one frame.  Widen before multiplying so the product is not
	**	formed in 32 bits.
	*/
	frameBytes = ((tCIU64)vROIsize) * ((tCIU64)stride);
	/*
	**	Set up the GPU.
	*/
	if (0 != InitGPUD(frameBytes)) {
	  ERR(("InitGPUD failed\n"));
	  sExitAns = 1;
	  goto andOut;
	  }
	/*
	**	Now give the GPU buffers to the BitFlow driver
	*/
	if (kCIEnoErr != (circ = CiGPUDbuffConfigure(sCIp,sNframes,sCUDAPAbuffs,
			frameBytes,0,0,0,0))) {
	  ERR(("CiGPUDbuffConfigure gave '%s'\n",CiErrStr(circ)));
	  sExitAns = 1;
	  goto andOut;
	  }
	/*
	**	This is a necessary step to promote the VFG state.  However, the
	**	returned buffer pointers are not valid for access to the pixel
	**	data.
	*/
	if (kCIEnoErr != (circ = CiMapFrameBuffers(sCIp,0,&nPtrs,&uPtrs))) {
	  ERR(("CiMapFrameBuffers gave '%s'\n",CiErrStr(circ)));
	  sExitAns = 1;
	  goto andOut;
	  }
	/*
	**	We show 16 8b pix per line, or the whole line if it is shorter.
	*/
	pixToShow = 16;
	if (stride < pixToShow) { pixToShow = stride; }
	/*
	**	Reset acquisition and clear all error conditions.
	*/
	if (kCIEnoErr != (circ = CiAqSWreset(sCIp))) {
	  ERR(("CiAqSWreset gave '%s'\n",CiErrStr(circ)));
	  sExitAns = 1;
	  goto andOut;
	  }
	/*
	**	Clear the first line of all frame buffers.
	**
	**	NOTE(review): this writes through the pointers returned by
	**	CiMapFrameBuffers, which the comment above describes as not valid
	**	for pixel access.  Kept as in the original; confirm against the
	**	BitFlow SDK documentation what these pointers map to under GPUD.
	*/
	for (i=0; i<nPtrs; i++) { (void)memset(uPtrs[i],'\0',stride); }
	/*
	**	Tell the user how to stop this cleanly.
	*/
	if (0 == sBGok) {
SHOW(("Will now dump first line of frames until newline (skip %d, ndx %d)\n",
		sSkipFrames,sNdx));
	  } else {
SHOW(("Will now dump first line of frames until %d (skip %d, ndx %d)\n",
		sMaxFrames,sSkipFrames,sNdx));
	  }
	/*
	**	Start continuous acquisition.
	*/
	if (kCIEnoErr != (circ = CiAqStart(sCIp,0))) {
	  ERR(("CiAqStart gave '%s'\n",CiErrStr(circ)));
	  sExitAns = 1;
	  goto andOut;
	  }
	tStart = GetTime();
	/*
	**	Display each acquired frame 1st line in a loop.  Stop at newline.
	*/
	while (1) {
		/*
		**	Check to see if a frame is already available before waiting.
		*/
checkAgain:
	  switch (circ = CiGetOldestNotDeliveredFrame(sCIp,&frameID,&frameP)) {
		case kCIEnoErr:
		  /*
		  **	We have the frame.
		  */
		  break;
		case kCIEnoNewData:
		  /*
		  **	We need to wait for another frame.
		  */
		  if (kCIEnoErr != (circ = CiWaitNextUndeliveredFrame(sCIp,-1))) {
			switch (circ) {
			  case kCIEaqAbortedErr:
				SHOW(("CiWaitNextUndeliveredFrame gave '%s'\n",CiErrStr(circ)));
				break;
			  default:
				ERR(("CiWaitNextUndeliveredFrame gave '%s'\n",CiErrStr(circ)));
				sExitAns = 1;
			  }
			goto andOut;
			}
		  goto checkAgain;
		case kCIEaqAbortedErr:
		  SHOW(("CiGetOldestNotDeliveredFrame: acqistion aborted\n"));
		  goto andOut;
		default:
		  ERR(("CiGetOldestNotDeliveredFrame gave '%s'\n",CiErrStr(circ)));
		  sExitAns = 1;
		  goto andOut;
		}
	  /*
	  **	OK.  There is a new frame on the GPU.  Move the data to host.
	  **	The buffer is selected as frameID modulo the buffer count.
	  */
	  if (0 != RunGPUD(frameID % sNframes)) {
		ERR(("RunGPUD failed at %u\n",(unsigned int)frameID));
		sExitAns = 1;
		goto andOut;
		}
	  /*
	  **	Change the frameP to the (single) host output buffer
	  */
	  frameP = sHostBuff;
	  /*
	  **	Allow skipping frames so display is not an issue.
	  */
	  if ((0 != sSkipFrames) && (0 != (sDidFrames % (sSkipFrames+1)))) {
		goto skipHere;
		}
	  /*
	  **	Display the frameID and the first line of frame data.  frameID is
	  **	unsigned, so it is printed as such.
	  */
	  SHOW(("frameID %9u:",(unsigned int)frameID));
	  p8 = frameP;
	  for (i=0; i<pixToShow; i++) {
		value = *p8++;  SHOW((" %02X",value));
		}
	  SHOW(("\n"));

skipHere:
	  totalLines += vROIsize;
	  totalBytes += frameBytes;
	  sDidFrames += 1;
	  /*
	  **	Break out of this loop on newline
	  */
	  if (0 == sBGok) { if (0 != CheckForKeyboardInput()) { break; } }
	  /*
	  **	Break out of loop if countdown hits zero
	  */
	  if ((0 != sMaxFrames) && (--sMaxFrames == 0)) { break; }
	  }

andOut:
	/*
	**	Stop the clock first so the teardown below (including the CUDA
	**	device reset) is not counted as acquisition time.
	*/
	tStop = GetTime();
	/*
	**	We must stop acquisition because we are about to tear down the buffers.
	*/
	if (kCIEnoErr != (circ = CiAqSWreset(sCIp))) {
	  ERR(("CiAqSWreset (end) gave '%s'\n",CiErrStr(circ)));
	  }
	/*
	**	Unmap the frame buffers.
	*/
	if ((NULL != uPtrs) && (kCIEnoErr != (circ = CiUnmapFrameBuffers(sCIp)))){
	  ERR(("CiUnmapFrameBuffers gave '%s'\n",CiErrStr(circ)));
	  }
	/*
	**	Remove the buffers from the driver.
	*/
	if (kCIEnoErr != (circ = CiGPUDbuffConfigure(sCIp,0,NULL,0,0,0,0,0))) {
	  ERR(("CiGPUDbuffConfigure(end) gave '%s'\n",CiErrStr(circ)));
	  sExitAns = 1;
	  }
	/*
	**	Close the access.
	*/
	if ((NULL != sCIp) && (kCIEnoErr != (circ = CiVFGclose(sCIp)))) {
	  ERR(("CiVFGclose gave '%s'\n",CiErrStr(circ)));
	  }
	/*
	**	Tear down the GPUD stuff
	*/
	TermGPUD();
	/*
	**	Show data rate.  tStart stays negative when acquisition never
	**	started; in that case, or for a run too short to measure, report
	**	zeros.
	*/
	elapsed = tStop - tStart;
	if ((tStart < 0.0) || (elapsed < 0.001)) {
	  elapsed = 0.0;
	  } else {
	  bytesPerSec = ((tCIDOUBLE)totalBytes)/elapsed;
	  linesPerSec = ((tCIDOUBLE)totalLines)/elapsed;
	  framesPerSec = ((tCIDOUBLE)sDidFrames)/elapsed;
	  }
	SHOW((
"%d: Data rate %.1lf ln/s (%.1lf b/s) (%.1lf FPS) (%.1lf sec) after %d fr\n",
			sNdx,linesPerSec,bytesPerSec,framesPerSec,elapsed,sDidFrames));

	return;
}
/*==========================================================================*/
int main(int argc, char **argv)
/*
**	Entry point: remember the program name for the help text, decode the
**	command line and, if that succeeded, run the acquire/display sequence.
**
**	The process exit status is sExitAns.
*/
{
int	parseResult;

	sArgv0 = argv[0];

	parseResult = DecodeArgs(argc,argv);
	if (kCIEnoErr != parseResult) { return(sExitAns); }

	InitAndGetDataUntilKeyPress();

	return(sExitAns);
}
/*==========================================================================*/
/*
	$Log: CIsimpleGPUD.cu,v $
	Revision 1.6  2020/10/02 23:30:13  steve
	CLOCK_MONOTONIC is not always monotonic, so prefer CLOCK_MONOTONIC_RAW.

	Revision 1.5  2020/10/02 01:17:23  steve
	ftime is deprecated. Use clock_gettime.

	Revision 1.4  2020/06/24 22:57:58  steve
	Add support for NVIDIA Jetson platform (GPUD only).

	Added missing GPUD buffer mapping step. Other GPUD bug fixes.

	Revision 1.2  2016/07/11 20:02:59  steve
	Gn2 xx-2Y/2YE, GPUD, DGMA

	Revision 1.1  2016/03/06 23:31:46  steve
	Support CXP_usualINit and Axion

*/