CUDA issue in a simple program
I've spent a lot of time trying to figure out what is going on. The problem is that I'm not able to invoke a simple kernel from host code. I'm sure the error will be obvious to some people, and they may feel I'm wasting their time, but I'd really appreciate some help.
This is the .cpp code:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <windows.h>
#include <shrUtils.h>
#include <cutil_inline.h>
#include <cutil_gl_inline.h>
#include <cuda.h>

CUfunction reduce0;  // I've tried many ways to declare the kernel function, but...

int main(int argc, char *argv[])
{
    int i, n, sum;
    int *data;
    int *Md;

    srand(time(NULL));
    n = (int)pow((float)2, (float)atoi(argv[1]));
    data = (int *)malloc(n * sizeof(int));
    for (i = 0; i < n; i++) {
        data[i] = rand() % 10 + 1;
    }
    cudaMalloc((void**)&Md, n);

    clock_t start = clock();
    dim3 dimBlock(512, 0);
    dim3 dimGrid(1, 1);
    reduce0<<<dimGrid, dimBlock>>>(Md, Md);

    sum = 0;
    for (i = 0; i < n; i++) {
        sum = sum + data[i];
    }
    printf("Sum of the %d-element array: %d\n", n, sum);
    printf("Time elapsed: %f\n", ((double)clock() - start) / CLOCKS_PER_SEC);
    return 0;
}
And here is the .cu code:
__global__ void reduce0(int *g_idata, int *g_odata)
{
    extern __shared__ int sdata[];

    // Each thread loads one element from global to shared memory
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    sdata[tid] = g_idata[i];
    __syncthreads();

    // Do the reduction in shared memory
    for (unsigned int s = 1; s < blockDim.x; s *= 2) {
        if (tid % (2 * s) == 0) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }

    // Write the result for this block to global memory
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
So my question is: how should I invoke the kernel? At compile time the compiler doesn't recognise the "<<<" symbol, and as for reduce0(), it is only recognised if I declare it in the .cpp! Please help me get started with real CUDA things!
CUfunction is a driver API abstraction; it is not needed if you are going to use the language integration feature that enables the <<<>>> syntax for kernel invocation.
If you don't have to use the driver API (and most people don't), then move your C++ code into the .cu file and invoke the kernel the way you are doing now.
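For illustration only, here is a minimal, self-contained .cu file (the file name and the scale kernel are made up for this sketch) showing that the <<<>>> launch syntax compiles once the host code is built by nvcc:

// minimal.cu (hypothetical name) - kernel and host code in one file, built with: nvcc minimal.cu -o minimal
#include <cuda_runtime.h>

__global__ void scale(int *data, int factor)
{
    // Each thread scales one element in place
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    data[i] *= factor;
}

int main()
{
    int *d_data;
    cudaMalloc((void**)&d_data, 256 * sizeof(int));
    cudaMemset(d_data, 0, 256 * sizeof(int));

    // The <<<grid, block>>> syntax is understood here because nvcc compiles this file
    scale<<<1, 256>>>(d_data, 2);
    cudaDeviceSynchronize();

    cudaFree(d_data);
    return 0;
}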
The cudaMalloc() call allocates device memory that the CPU cannot read or write directly. You have to copy the input for the reduction into that device memory using cudaMemcpy(..., cudaMemcpyHostToDevice);
Then, after the kernel is done processing, copy the output back to host memory using cudaMemcpy(..., cudaMemcpyDeviceToHost);
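As a sketch of how those calls fit around the poster's launch (this snippet is meant to sit inside main after the data array is filled; the byte count passed to cudaMalloc and the dynamic shared-memory size in the launch are assumptions about what the final code needs):

int *d_idata, *d_odata;
size_t bytes = n * sizeof(int);                    // cudaMalloc expects a size in bytes, not elements

cudaMalloc((void**)&d_idata, bytes);
cudaMalloc((void**)&d_odata, bytes);
cudaMemcpy(d_idata, data, bytes, cudaMemcpyHostToDevice);      // host -> device

dim3 dimGrid(1, 1);
dim3 dimBlock(512, 1);
// Third launch parameter gives the size of the dynamic shared memory used by "extern __shared__ int sdata[]"
reduce0<<<dimGrid, dimBlock, dimBlock.x * sizeof(int)>>>(d_idata, d_odata);

int gpu_sum = 0;
cudaMemcpy(&gpu_sum, d_odata, sizeof(int), cudaMemcpyDeviceToHost);   // device -> host
printf("GPU sum: %d\n", gpu_sum);

cudaFree(d_idata);
cudaFree(d_odata);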
P.S. That reduction kernel is slow. I recommend opening the reduction sample in the SDK and using one of the kernels from there.
Alternatively, use the Thrust library included in CUDA 4.0. Thrust supports fast and flexible reductions.
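A minimal sketch of a Thrust-based sum (assuming the Thrust headers that ship with CUDA 4.0 or later; the data here is made up just to give a known answer):

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <thrust/functional.h>
#include <cstdio>

int main()
{
    const int n = 1 << 20;
    thrust::host_vector<int> h_data(n, 1);        // fill with ones so the expected sum is n
    thrust::device_vector<int> d_data = h_data;   // copies host -> device

    // Sum all elements on the GPU; the result comes back to the host
    int sum = thrust::reduce(d_data.begin(), d_data.end(), 0, thrust::plus<int>());
    printf("sum = %d (expected %d)\n", sum, n);
    return 0;
}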