/*****************************************************************************
CIsimpleGPUD.cu Source for BitFlow simple GPUDirect example program
Oct 17, 2015 CIW/SJT
© Copyright 2015, BitFlow, Inc. All rights reserved.
Tabstops are 4
$Author: steve $
$Date: 2020/10/02 23:30:13 $
$Id: CIsimpleGPUD.cu,v 1.6 2020/10/02 23:30:13 steve Exp $
*****************************************************************************/
/*==========================================================================*/
/*
** For access to command line display.
*/
#include <stdio.h>
#include <stdarg.h>
#include <string.h>
/*
** For checking for keypress
*/
#include <sys/time.h>
#include <sys/types.h>
#include <unistd.h>
/*
** For access to BitFlow camera interface library.
*/
#include "BFciLib.h"
#include "BFPCI2PCI.h"
/*--------------------------------------------------------------------------*/
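/*
** Bring in the necessary NVidia/CUDA headers.
**
** Building w/nvcc already provides the CUDA runtime declarations; the
** explicit includes below additionally bring in the driver API header
** (cuda.h), which declares cuPointerSetAttribute used further down.
**
** However, NVidia says this only works w/64b
*/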
#if !defined(__x86_64__) && !defined(__aarch64__)
#error "This example only works w/x86_64 and aarch64 processors\n"
#endif
#include <cuda.h>
#include <cuda_runtime_api.h>
/*==========================================================================*/
/*
** Program-wide state. The option values are filled in by DecodeArgs and
** read by the acquisition code.
*/
static int sExitAns=0; /* value returned by main; set to 1 on error */
static tCIp sCIp=NULL; /* BitFlow device open token, filled in by CiVFGopen */
static int sNdx=0; /* '-x': BitFlow device index passed to CiVFGopen */
static int sNframes=4; /* '-n': number of GPU DMA frame buffers allocated and registered */
static int sNdx2=0; /* '-y': CUDA device index (only used when NV_BUILD_DGPU is defined) */
static int sLibDebug=0; /* '-D': nonzero routes BFciLib debug text to LDBdisplay */
static int sMaxFrames=0; /* '-m': frames left before the loop stops; 0 means no limit */
static int sSkipFrames=0; /* '-s': frames handled but not printed between printed frames */
static int sDidFrames=0; /* running count of frames handled (printed or skipped) */
static int sBGok=0; /* '-b': nonzero disables the stdin newline check */
/*--------------------------------------------------------------------------*/
/*
** Console output helpers.
**
** Both take a fully parenthesised printf argument list, e.g.
**     SHOW(("value %d\n",v));
** and flush stdout after printing. ERR prefixes the text with "ERR: ".
**
** They are wrapped in do { } while (0) so that each use, together with its
** trailing semicolon, is exactly one statement and can be used safely in an
** unbraced if/else.
*/
#define SHOW(x) do { (void)printf x ; (void)fflush(stdout); } while (0)
#define ERR(x) do { SHOW(("ERR: ")); SHOW(x); } while (0)
static char *sArgv0=NULL; /* name of executable (argv[0]), set by main */
static void ShowHelp(void)
/*
** Print the usage text: a banner holding the executable name and the build
** date/time, followed by the fixed option summary below.
*/
{
    static const char *const kUsage[] = {
        " -h display this message and exit\n",
        "\n",
        " -x ndx choose available BitFlow device ndx (default 0)\n",
        " -n nFrames change number of DMA frame buffers (default 4)\n",
        " -y ndx choose available CUDA device ndx (default 0)\n",
        " -m maxFrames max frames to display (default infinite)\n",
        " -s skipFr frames to skip between display (default 0)\n",
        " -b program is backgrounded (no newline exit)\n",
        " -D show BitFlow SDK library debug info\n",
        "\n",
        " CIsimpleGPUD: init a VFG for RDMA and display pixels from GPU\n",
        " display ends with newline\n",
        "\n"
    };
    size_t k;

    SHOW(("%s of " __DATE__ " at " __TIME__ "\n",sArgv0));
    /*
    ** Each entry is printed (and flushed) on its own, one SHOW per line.
    */
    for (k=0; k<(sizeof(kUsage)/sizeof(kUsage[0])); k++) {
        SHOW(("%s",kUsage[k]));
    }
}
/*==========================================================================*/
#include <time.h>
#include <sys/timeb.h>
static tCIDOUBLE GetTime(void)
/*
** Return a timestamp in fractional seconds.
**
** With _POSIX_TIMERS the value comes from CLOCK_MONOTONIC_RAW; otherwise it
** is built from ftime() seconds and milliseconds. Only differences between
** two calls are used by this program.
*/
{
#ifdef _POSIX_TIMERS
    struct timespec now;
    (void)clock_gettime(CLOCK_MONOTONIC_RAW, &now);
    return(((tCIDOUBLE)now.tv_sec) + (((tCIDOUBLE)now.tv_nsec) / 1000000000.0));
#else
    struct timeb now;
    tCIDOUBLE fraction;
    (void)ftime(&now);
    fraction = now.millitm;
    fraction /= 1000.0;
    return(fraction + now.time);
#endif
}
/*--------------------------------------------------------------------------*/
static void LDBdisplay(char *format, ...)
/*
** Library debug display function.
**
** This will display progress/information/error strings from BFciLib.
**
** format/... are printf-style. The text is formatted into a fixed 1024 byte
** buffer; anything longer is truncated rather than written past the end of
** the buffer.
*/
{
    va_list val;
    char buff[1024];

    if (NULL == format) { return; }
    va_start(val,format);
    /*
    ** Bounded formatting: vsnprintf always NUL-terminates within
    ** sizeof(buff).
    */
    (void)vsnprintf(buff,sizeof(buff),format,val);
    va_end(val);
    /*
    ** buff now contains the library display string
    **
    ** This function simply prints it to the console.
    */
    (void)printf("LDB: %s\n",buff);
    (void)fflush(stdout);
}
/*--------------------------------------------------------------------------*/
static int TakeIntValue(const char *opt, int *argcP, char ***argvP, int *dst)
/*
** Consume the word following option 'opt' as a decimal integer into *dst.
**
** *argcP / *argvP are the caller's remaining-argument cursor and are
** advanced past the value on success.
**
** Return 0 on success, 1 (after reporting) if the value is missing or is
** not an integer.
*/
{
    if ((*argcP <= 0) || (NULL == **argvP)) {
        ERR(("Option '%s' needs a value\n",opt));
        return(1);
    }
    if (1 != sscanf(**argvP,"%d",dst)) {
        ERR(("Option '%s' needs an integer, not '%s'\n",opt,**argvP));
        return(1);
    }
    *argvP += 1; *argcP -= 1;
    return(0);
}
static int DecodeArgs(int argc, char **argv)
/*
** Parse the input arguments into the file-scope option variables.
**
** Returns kCIEnoErr when every argument was understood. On any problem the
** usage text is shown, sExitAns is set to 1 and that value is returned.
** '-h' shows the usage text and exits the process with status 0.
*/
{
    char *str;
    int bad;

    argv += 1; argc -= 1; /* skip program name */
    while (argc-- > 0) {
        str = *argv++;
        if (str[0] != '-') {
            ERR(("Do not know '%s' arg\n",str));
            ShowHelp();
            sExitAns = 1;
            return(sExitAns);
        }
        bad = 0;
        switch (str[1]) {
        case 'h':
            ShowHelp();
            exit(0);
        case 'x':
            bad = TakeIntValue(str,&argc,&argv,&sNdx);
            break;
        case 'n':
            bad = TakeIntValue(str,&argc,&argv,&sNframes);
            break;
        case 'y':
            bad = TakeIntValue(str,&argc,&argv,&sNdx2);
            break;
        case 'b':
            sBGok = 1;
            break;
        case 'D':
            sLibDebug = 1;
            break;
        case 'm':
            bad = TakeIntValue(str,&argc,&argv,&sMaxFrames);
            break;
        case 's':
            bad = TakeIntValue(str,&argc,&argv,&sSkipFrames);
            break;
        default:
            ERR(("Do not know arg '%s' arg\n",str));
            bad = 1;
            break;
        }
        if (0 != bad) {
            ShowHelp();
            sExitAns = 1;
            return(sExitAns);
        }
    }
    /*
    ** Both of these are later used as divisors (frameID % sNframes and
    ** sDidFrames % (sSkipFrames+1)), so reject values that would divide
    ** by zero or make no sense.
    */
    if (sNframes < 1) {
        ERR(("Need at least 1 DMA frame buffer, not %d\n",sNframes));
        ShowHelp();
        sExitAns = 1;
        return(sExitAns);
    }
    if (sSkipFrames < 0) {
        ERR(("Cannot skip %d frames\n",sSkipFrames));
        ShowHelp();
        sExitAns = 1;
        return(sExitAns);
    }
    return(kCIEnoErr);
}
/*--------------------------------------------------------------------------*/
static int CheckForKeyboardInput(void)
/*
** Return 0 if no input available from stdin, 1 else
**
** The check is a zero-timeout select(), so this never blocks waiting for
** the user. When stdin is readable one line is consumed and discarded.
** Reaching end-of-file on stdin also counts as "input available".
**
** Note: the console needs a newline in order to post input.
*/
{
    fd_set readfds;
    struct timeval tv;
    char buff[1024];
    int fd,ans;

    fd = fileno(stdin);
    /*
    ** FD_SET is only defined for descriptors in [0, FD_SETSIZE).
    */
    if ((fd < 0) || (fd >= FD_SETSIZE)) { return(0); }
    FD_ZERO(&readfds);
    FD_SET(fd,&readfds);
    (void)memset(&tv,'\0',sizeof(tv));
    /*
    ** nfds must be the highest descriptor in any set plus one.
    */
    ans = select(fd+1,&readfds,NULL,NULL,&tv);
    if ((ans == 1) && FD_ISSET(fd,&readfds)) {
        /*
        ** Consume the line. A NULL result (EOF/error) still means stop.
        */
        if (NULL == fgets(buff,(int)sizeof(buff),stdin)) { buff[0] = '\0'; }
        return(1);
    }
    return(0);
}
/*--------------------------------------------------------------------------*/
/*
** All we need to put data into GPU.
*/
static void **sCUDAbuffs=NULL; /* table of sNframes raw allocations: cudaMalloc w/NV_BUILD_DGPU, else cudaHostAlloc */
static tCIU8 **sCUDAPAbuffs=NULL; /* table of per-frame addresses given to CiGPUDbuffConfigure (64kb aligned w/NV_BUILD_DGPU) */
static tCIU8 *sHostBuff=NULL; /* host-readable frame data: own pinned buffer w/NV_BUILD_DGPU, else points at the current frame buffer */
static tCIU64 sFrameSize=0; /* bytes per frame, recorded by InitGPUD and used by RunGPUD */
/*--------------------------------------------------------------------------*/
static void TermGPUD(void)
{
int i;
if (NULL != sCUDAPAbuffs) { free(sCUDAPAbuffs); }
sCUDAPAbuffs = NULL;
#if defined(NV_BUILD_DGPU)
if (NULL != sCUDAbuffs) {
for (i=0; i<sNframes; i++) {
if (NULL != sCUDAbuffs[i]) { (void)cudaFree(sCUDAbuffs[i]); }
}
free(sCUDAbuffs);
}
sCUDAbuffs = NULL;
if (NULL != sHostBuff) { cudaFreeHost(sHostBuff); }
sHostBuff = NULL;
#else
if (NULL != sCUDAbuffs) {
for (i=0; i<sNframes; i++) {
if (NULL != sCUDAbuffs[i]) { (void)cudaFreeHost(sCUDAbuffs[i]); }
}
free(sCUDAbuffs);
}
sCUDAbuffs = NULL;
#endif
(void)cudaDeviceReset();
return;
}
/*--------------------------------------------------------------------------*/
static int InitGPUD(tCIU64 frameSize)
/*
** Allocate the memory buffers on the GPU.
**
** frameSize is the number of bytes in one frame; it is recorded in
** sFrameSize. sNframes buffers are allocated and their addresses stored in
** sCUDAbuffs (raw) and sCUDAPAbuffs (the address to hand to the driver).
** With NV_BUILD_DGPU the CUDA device sNdx2 is selected first and a single
** pinned host buffer (sHostBuff) is allocated for results.
**
** The BitFlow driver is required to page-lock the buffers on 64kb boundaries
** so we need to keep track of the page-aligned buffer address to properly
** access the pixel data.
**
** Returns 0 on success, 1 on failure. When a failure happens after the
** first table was allocated, TermGPUD is called to release everything.
*/
{
#if defined(NV_BUILD_DGPU)
    const unsigned long long align = 1ULL << 16; /* 64kb */
    unsigned long long addr;
    int nDev = 0;
#else
    const unsigned long long slack = (1ULL << 12) - 1; /* 4kb - 1 */
    unsigned int flag;
#endif
    int i,rc=1;

    if (sNframes < 1) {
        ERR(("InitGPUD: need at least 1 frame buffer, not %d\n",sNframes));
        return(1);
    }
#if defined(NV_BUILD_DGPU)
    /*
    ** We need to specify the CUDA device.
    */
    if (cudaSuccess != cudaGetDeviceCount(&nDev)) {
        ERR(("InitGPUD: cudaGetDeviceCount failed\n"));
        return(1);
    }
    if ((sNdx2 < 0) || (sNdx2 >= nDev)) {
        ERR(("InitGPUD: %d device specified but only %d exist\n",sNdx2,nDev));
        return(1);
    }
    if (cudaSuccess != cudaSetDevice(sNdx2)) {
        ERR(("InitGPUD: cudaSetDevice failed for %d\n",sNdx2));
        return(1);
    }
#endif
    /*
    ** Set the global
    */
    sFrameSize = frameSize;
    /*
    ** Get the (zeroed) pointer tables, so TermGPUD can tell which entries
    ** were actually allocated.
    */
    if (NULL == (sCUDAbuffs = (void **)malloc(sNframes * sizeof(void *)))) {
        ERR(("InitGPUD: cannot get %d for CUDA buffs\n",sNframes));
        return(1);
    }
    (void)memset(sCUDAbuffs,'\0',sNframes * sizeof(void *));
    if (NULL == (sCUDAPAbuffs = (tCIU8 **)malloc(sNframes * sizeof(void *)))) {
        ERR(("InitGPUD: cannot get %d for PACUDA buffs\n",sNframes));
        goto andOut;
    }
    (void)memset(sCUDAPAbuffs,'\0',sNframes * sizeof(void *));
    for (i=0; i<sNframes; i++) {
#if defined(NV_BUILD_DGPU)
        /*
        ** Allocate the acquisition buffer with extra room for alignment.
        **
        ** One 64kb granule of slack lets the start be rounded UP to the
        ** next boundary while the whole frame stays inside the allocation.
        ** NOTE(review): the second granule is defensive, in case pinning
        ** covers whole 64kb pages past the end of the frame -- confirm
        ** against the driver's requirements.
        */
        if (cudaSuccess != cudaMalloc(sCUDAbuffs+i,frameSize + 2*align)) {
            ERR(("InitGPUD: cudaMalloc failed on frame %d\n",i));
            goto andOut;
        }
        /*
        ** Now set the aligned addresses. Rounding down instead would give
        ** an address in front of the allocation whenever cudaMalloc's
        ** result is not already on a 64kb boundary.
        */
        addr = (unsigned long long)(sCUDAbuffs[i]);
        sCUDAPAbuffs[i] = (tCIU8*)((addr + (align - 1)) & ~(align - 1));
#else
        /*
        ** Allocate the acquisition buffer, with extra for page alignment.
        */
        if (cudaSuccess != cudaHostAlloc(sCUDAbuffs+i,frameSize+slack,
                                         cudaHostAllocDefault)) {
            ERR(("InitGPUD: cudaHostAlloc failed on frame %d\n",i));
            goto andOut;
        }
        /*
        ** Request synchronous memory operations on this buffer.
        */
        flag = 1;
        if (CUDA_SUCCESS != cuPointerSetAttribute(&flag,
                CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,(CUdeviceptr)sCUDAbuffs[i])) {
            ERR(("InitGPUD: cuPointerSetAttribute failed on frame %d\n",i));
            goto andOut;
        }
        /*
        ** Now set the output addresses (used as allocated in this build).
        */
        sCUDAPAbuffs[i] = (tCIU8*)(sCUDAbuffs[i]);
#endif
    }
#if defined(NV_BUILD_DGPU)
    /*
    ** Allocate a single copy buffer for results.
    */
    if (cudaSuccess != cudaHostAlloc(
            (void **)&sHostBuff,frameSize,cudaHostAllocDefault)) {
        ERR(("InitGPUD: failed to alloc output buffer\n"));
        goto andOut;
    }
#endif
    rc = 0;
andOut:
    if (0 != rc) { TermGPUD(); }
    return(rc);
}
/*--------------------------------------------------------------------------*/
static int RunGPUD(tCIU32 buffNdx)
/*
** Make the pixels of frame buffer buffNdx readable through sHostBuff.
**
** With NV_BUILD_DGPU, sFrameSize bytes are copied from the aligned device
** buffer into the single host buffer. Otherwise sHostBuff is simply pointed
** at the frame buffer itself.
**
** Returns 0 on success, 1 if the copy failed.
*/
{
#if defined(NV_BUILD_DGPU)
    cudaError_t copyRC;

    copyRC = cudaMemcpy(sHostBuff,sCUDAPAbuffs[buffNdx],sFrameSize,
                        cudaMemcpyDeviceToHost);
    if (cudaSuccess == copyRC) { return(0); }
    ERR(("RunGPUD: cudaMemcpy failed buff %d\n",buffNdx));
    return(1);
#else
    sHostBuff = (tCIU8*)sCUDAbuffs[buffNdx];
    return(0);
#endif
}
/*--------------------------------------------------------------------------*/
static void InitAndGetDataUntilKeyPress(void)
/*
** Illustrate a simple example board interaction sequence.
*/
{
tCIRC circ;
tCIDOUBLE a=-1.0,b,c,d;
tCIU64 totalBytes=0,totalLines=0;
tCIU32 i;
tCIU32 nPtrs;
tCIU8 **uPtrs=NULL;
tCIU8 *p8;
tCIU32 frameID,value,pixToShow;
tCIU8 *frameP;
tCIU32 nFrames,bitsPerPix,hROIoffset,hROIsize,vROIoffset,vROIsize,stride;
/*
** Open the ndx'th frame grabber with exclusive write permission.
**
** We need exclusive write because GPUD is a case of non standard
** scatter-gather DMA buffer definition.
*/
if (kCIEnoErr != (circ = CiVFGopen(sNdx,kCIBO_exclusiveWrAccess,&sCIp))) {
ERR(("CiVFGopen gave '%s'\n",CiErrStr(circ)));
sExitAns = 1;
return;
}
/*
** Set library debug function if '-D' was specified.
*/
if (kCIEnoErr != (circ = CiSetDebug(sCIp,-1,
(0 == sLibDebug) ? NULL : LDBdisplay))) {
ERR(("CiSetDebug of %d gave '%s'\n",sNdx,CiErrStr(circ)));
}
/*
** Init the board with the config file specified by the DIP switches.
*/
if (kCIEnoErr != (circ = CiVFGinitialize(sCIp,NULL))) {
ERR(("CiVFGinitialize gave '%s'\n",CiErrStr(circ)));
sExitAns = 1;
goto andOut;
}
/*
** Configure the board for 4 frame buffers.
*/
if (kCIEnoErr != (circ = CiDrvrBuffConfigure(sCIp,4,0,0,0,0))) {
ERR(("CiDrvrBuffConfigure gave '%s'\n",CiErrStr(circ)));
sExitAns = 1;
goto andOut;
}
/*
** Determine buffer configuration. We only need bitsPerPix and stride.
*/
if (kCIEnoErr != (circ = CiBufferInterrogate(sCIp,&nFrames,&bitsPerPix,
&hROIoffset,&hROIsize,&vROIoffset,&vROIsize,&stride))) {
ERR(("CiBufferInterrogate gave '%s'\n",CiErrStr(circ)));
sExitAns = 1;
goto andOut;
}
/*
** Now release these driver buffers.
*/
if (kCIEnoErr != (circ = CiDrvrBuffConfigure(sCIp,0,0,0,0,0))) {
ERR(("CiDrvrBuffConfigure(2) gave '%s'\n",CiErrStr(circ)));
sExitAns = 1;
goto andOut;
}
/*
** For this example we only deal w/8bpp
**
** Use one of BitFlow's synthetic camera files and all isOK
*/
if (8 != bitsPerPix) {
ERR(("we need 8 bitsPerPix, not %d\n",bitsPerPix));
sExitAns = 1;
goto andOut;
}
/*
** Set up the GPU.
*/
if (0 != InitGPUD(vROIsize*stride)) {
ERR(("InitGPUD failed\n"));
sExitAns = 1;
goto andOut;
}
/*
** Now give the GPU buffers to the BitFlow driver
*/
if (kCIEnoErr != (circ = CiGPUDbuffConfigure(sCIp,sNframes,sCUDAPAbuffs,
(vROIsize * stride),0,0,0,0))) {
ERR(("CiGPUDbuffConfigure gave '%s'\n",CiErrStr(circ)));
sExitAns = 1;
goto andOut;
}
/*
** This is a necessary step to promote the VFG state. However, the
** returned buffer pointers are not valid for access to the pixel
** data.
*/
if (kCIEnoErr != (circ = CiMapFrameBuffers(sCIp,0,&nPtrs,&uPtrs))) {
ERR(("CiMapFrameBuffers gave '%s'\n",CiErrStr(circ)));
sExitAns = 1;
goto andOut;
}
/*
** We show 16 8b pix per line
*/
pixToShow = 16;
/*
** Reset acquisition and clear all error conditions.
*/
if (kCIEnoErr != (circ = CiAqSWreset(sCIp))) {
ERR(("CiAqSWreset gave '%s'\n",CiErrStr(circ)));
sExitAns = 1;
goto andOut;
}
/*
** Clear the first line of all frame buffers.
*/
for (i=0; i<nPtrs; i++) { (void)memset(uPtrs[i],'\0',stride); }
/*
** Tell the user how to stop this cleanly.
*/
if (0 == sBGok) {
SHOW(("Will now dump first line of frames until newline (skip %d, ndx %d)\n",
sSkipFrames,sNdx));
} else {
SHOW(("Will now dump first line of frames until %d (skip %d, ndx %d)\n",
sMaxFrames,sSkipFrames,sNdx));
}
/*
** Start continuous acquisition.
*/
if (kCIEnoErr != (circ = CiAqStart(sCIp,0))) {
ERR(("CiAqStart gave '%s'\n",CiErrStr(circ)));
sExitAns = 1;
goto andOut;
}
a = GetTime();
/*
** Display each acquired frame 1st line in a loop. Stop at newline.
*/
while (1) {
/*
** Check to see if a frame is already available before waiting.
*/
checkAgain:
switch (circ = CiGetOldestNotDeliveredFrame(sCIp,&frameID,&frameP)) {
case kCIEnoErr:
/*
** We have the frame.
*/
break;
case kCIEnoNewData:
/*
** We need to wait for another frame.
*/
if (kCIEnoErr != (circ = CiWaitNextUndeliveredFrame(sCIp,-1))) {
switch (circ) {
case kCIEaqAbortedErr:
SHOW(("CiWaitNextUndeliveredFrame gave '%s'\n",CiErrStr(circ)));
break;
default:
ERR(("CiWaitNextUndeliveredFrame gave '%s'\n",CiErrStr(circ)));
sExitAns = 1;
}
goto andOut;
}
goto checkAgain;
case kCIEaqAbortedErr:
SHOW(("CiGetOldestNotDeliveredFrame: acqistion aborted\n"));
goto andOut;
default:
ERR(("CiGetOldestNotDeliveredFrame gave '%s'\n",CiErrStr(circ)));
sExitAns = 1;
goto andOut;
}
/*
** OK. There is a new frame on the GPU. Move the data to host.
*/
if (0 != RunGPUD(frameID % sNframes)) {
ERR(("RunGPUD failed at %d\n",frameID));
sExitAns = 1;
goto andOut;
}
/*
** Change the frameP to the (single) host output buffer
*/
frameP = sHostBuff;
/*
** Allow skipping frames so display is not an issue.
*/
if ((0 != sSkipFrames) && (0 != (sDidFrames % (sSkipFrames+1)))) {
goto skipHere;
}
/*
** Display the frameID and the first line of frame data.
*/
SHOW(("frameID %9d:",frameID));
p8 = frameP;
for (i=0; i<pixToShow; i++) {
value = *p8++; SHOW((" %02X",value));
}
SHOW(("\n"));
skipHere:
totalLines += vROIsize;
totalBytes += stride * vROIsize;
sDidFrames += 1;
/*
** Break out of this loop on newline
*/
if (0 == sBGok) { if (0 != CheckForKeyboardInput()) { break; } }
/*
** Break out of loop if countdown hits zero
*/
if ((0 != sMaxFrames) && (--sMaxFrames == 0)) { break; }
}
andOut:
/*
** We must stop acquisition because we are about to tear down the buffers.
*/
if (kCIEnoErr != (circ = CiAqSWreset(sCIp))) {
ERR(("CiAqSWreset (end) gave '%s'\n",CiErrStr(circ)));
}
/*
** Unmap the frame buffers.
*/
if ((NULL != uPtrs) && (kCIEnoErr != (circ = CiUnmapFrameBuffers(sCIp)))){
ERR(("CiUnmapFrameBuffers gave '%s'\n",CiErrStr(circ)));
}
/*
** Remove the buffers from the driver.
*/
if (kCIEnoErr != (circ = CiGPUDbuffConfigure(sCIp,0,NULL,0,0,0,0,0))) {
ERR(("CiGPUDbuffConfigure(end) gave '%s'\n",CiErrStr(circ)));
sExitAns = 1;
}
/*
** Close the access.
*/
if ((NULL != sCIp) && (kCIEnoErr != (circ = CiVFGclose(sCIp)))) {
ERR(("CiVFGclose gave '%s'\n",CiErrStr(circ)));
}
/*
** Tear down the GPUD stuff
*/
TermGPUD();
/*
** Show data rate
*/
c = b = GetTime() - a;
if ((a < 0.0) || (c < 0.001)) {
a = 0.0;
b = 0.0;
c = 0.0;
d = 0.0;
} else {
a = ((tCIDOUBLE)totalBytes)/c;
b = ((tCIDOUBLE)totalLines)/c;
d = ((tCIDOUBLE)sDidFrames)/c;
}
SHOW((
"%d: Data rate %.1lf ln/s (%.1lf b/s) (%.1lf FPS) (%.1lf sec) after %d fr\n",
sNdx,b,a,d,c,sDidFrames));
return;
}
/*==========================================================================*/
int main(int argc, char **argv)
/*
** Entry point: remember the executable name, decode the command line and,
** if that went well, run the acquire/display sequence.
**
** The process exit code is whatever ended up in sExitAns.
*/
{
    int parseRC;

    sArgv0 = argv[0];
    parseRC = DecodeArgs(argc,argv);
    if (kCIEnoErr != parseRC) {
        return(sExitAns);
    }
    InitAndGetDataUntilKeyPress();
    return(sExitAns);
}
/*==========================================================================*/
/*
$Log: CIsimpleGPUD.cu,v $
Revision 1.6 2020/10/02 23:30:13 steve
CLOCK_MONOTONIC is not always monotonic, so prefer CLOCK_MONOTONIC_RAW.
Revision 1.5 2020/10/02 01:17:23 steve
ftime is deprecated. Use clock_gettime.
Revision 1.4 2020/06/24 22:57:58 steve
Add support for NVIDIA Jetson platform (GPUD only).
Added missing GPUD buffer mapping step. Other GPUD bug fixes.
Revision 1.2 2016/07/11 20:02:59 steve
Gn2 xx-2Y/2YE, GPUD, DGMA
Revision 1.1 2016/03/06 23:31:46 steve
Support CXP_usualINit and Axion
*/