#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define PI 3.14159265358979323846264338327950288f

// Abort with file/line and the CUDA error string when a runtime call fails.
// Unchecked failures are sticky and make every later call fail confusingly.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t cudaCheckErr_ = (call);                                \
        if (cudaCheckErr_ != cudaSuccess) {                                \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(cudaCheckErr_));                    \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// Launch geometry; N is the total number of simulated samples.
const int NbBlocs = 20000, NbThreads = 500, N = NbBlocs*NbThreads;

// Model parameters: strike K, spot S0, rate r, volatility sig, maturity T.
// R is the drift term (r - sig^2/2)*T used in the exponent of BS().
const float K = 110.0f, S0 = 100.0f, r = 0.02f, sig = 0.3f, T = 1.0f,
            R = (r - sig*sig/2)*T;

/*
 * Turns a pair of uniform samples into one simulated terminal price.
 *
 *   x  in: uniform sample; must be > 0 because it is passed to logf().
 *   y  in: second uniform sample; out: S0 * exp(R + sig*sqrt(T)*z), where
 *      z = sqrt(-2 ln x) * cos(2*PI*y) is the Box-Muller normal variate.
 *
 * Compiled for both host and device so the CPU and GPU paths share it.
 */
__host__ __device__ void BS(float *x, float *y)
{
    float z = sqrtf(-2.0f * logf(*x)) * cosf(2.0f * PI * (*y));
    *y = S0 * expf(R + sig * sqrtf(T) * z);
}

/*
 * GPU driver: one thread per sample, applying BS() to (a1[i], a2[i]).
 * Expects a 1D grid of 1D blocks; threads with index >= I do nothing, so the
 * launch may cover more than I threads. a1 and a2 must be device pointers to
 * at least I floats each.
 */
__global__ void BSgpu(float *a1, float *a2, int I)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < I)
        BS(a1 + i, a2 + i);
}

/*
 * CPU reference: applies BS() sequentially to the first I pairs of
 * (a1[i], a2[i]), overwriting a2 in place.
 */
void BScpu(float *a1, float *a2, int I)
{
    for (int i = 0; i < I; ++i)
        BS(a1 + i, a2 + i);
}

/*
 * Fills two host vectors with uniform samples, runs the simulation on the GPU
 * and on the CPU with the same inputs, then reports the mean absolute
 * GPU/CPU difference and the mean put payoff max(K - S, 0).
 */
int main()
{
    const size_t taille = (size_t)N * sizeof(float);

    float *A1 = (float *) malloc(taille);  // first uniform sample per path
    float *A2 = (float *) malloc(taille);  // second sample, then CPU results
    float *A3 = (float *) malloc(taille);  // GPU results copied back
    if (A1 == NULL || A2 == NULL || A3 == NULL) {
        fprintf(stderr, "host allocation of %zu bytes failed\n", taille);
        free(A1);  // free(NULL) is a no-op
        free(A2);
        free(A3);
        return EXIT_FAILURE;
    }
    float *B1 = NULL, *B2 = NULL;          // device copies of A1 and A2

    // rand() is seeded by srand(); srandom() only seeds random().
    srand((unsigned) time(NULL));
    for (int n = 0; n < N; ++n) {
        // The +0.5f offset keeps the sample away from 0, where logf diverges.
        A1[n] = (rand() + 0.5f) / (RAND_MAX + 1.0f);
        A2[n] = (rand() + 0.5f) / (RAND_MAX + 1.0f);
    }

    CUDA_CHECK(cudaMalloc((void **) &B1, taille));
    CUDA_CHECK(cudaMemcpy(B1, A1, taille, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMalloc((void **) &B2, taille));
    CUDA_CHECK(cudaMemcpy(B2, A2, taille, cudaMemcpyHostToDevice));

    // NbBlocs * NbThreads == N, so the grid covers every sample exactly.
    BSgpu<<<NbBlocs, NbThreads>>>(B1, B2, N);
    CUDA_CHECK(cudaGetLastError());  // launch-configuration errors
    // Blocking copy: waits for the kernel and surfaces execution errors.
    CUDA_CHECK(cudaMemcpy(A3, B2, taille, cudaMemcpyDeviceToHost));

    BScpu(A1, A2, N);                // CPU reference, overwrites A2

    // Accumulate in double: a float running sum over 10^7 terms loses most
    // of its significant digits.
    double put = 0.0, err = 0.0;
    for (int n = 0; n < N; ++n) {
        err += fabsf(A3[n] - A2[n]);
        put += fmaxf(K - A2[n], 0.0f);
    }
    printf("mean |GPU - CPU| = %g\n", err / N);
    printf("mean put payoff  = %g\n", put / N);

    CUDA_CHECK(cudaFree(B1));
    CUDA_CHECK(cudaFree(B2));
    free(A1);
    free(A2);
    free(A3);
    return 0;
}