#include <math.h>
// Single-precision pi, used for the angle term inside BS().
#define PI 3.14159265358979323846264338327950288f

// Launch geometry: number of blocks, threads per block, and their product,
// which is also the number of elements processed.
const int NbBlocs = 20000;
const int NbThreads = 500;
const int N = NbBlocs*NbThreads;

// Model parameters consumed by BS() and by the payoff accumulation in main().
const float K = 110;     // compared against simulated prices in main() (K - price)
const float S0 = 100;    // multiplies the exponential in BS()
const float r = 0.02;    // enters only through R below
const float sig = 0.3;   // scales the random term in BS()
const float T = 1.0;     // BS() uses sqrtf(T); also scales R
// Deterministic part of the exponent in BS(), precomputed once.
const float R = (r-sig*sig/2)*T;

// Combines the two inputs into one value and stores it in *y.
//
//   z  = sqrt(-2 ln(*x)) * cos(2 pi (*y))     (the Box-Muller cosine form)
//   *y = S0 * exp(R + sig * sqrt(T) * z)
//
// x : read only. logf(*x) is taken, so *x is expected to be > 0.
// y : read for the angle term, then overwritten with the result.
//
// When *x and *y are independent uniform draws in (0,1), z is a standard
// normal draw; the caller is responsible for supplying such inputs.
// Compiled for both host and device so the same arithmetic runs on each side.
__host__ __device__ void BS(float *x, float *y){
    const float radius = sqrtf(-2.0f * logf(*x));
    const float angle = 2.0f * PI * (*y);
    const float z = radius * cosf(angle);
    const float exponent = R + sig * sqrtf(T) * z;
    *y = S0 * expf(exponent);
}
// Kernel: applies BS() to elements 0 .. I-1 of a1 / a2, writing results into a2.
//
// a1, a2 : dereferenced inside the kernel, so both must be device-accessible
//          and hold at least I floats. a1 is only read; a2 is updated in place.
// I      : number of elements to process; I <= 0 does nothing.
//
// Launch layout: 1-D grid of 1-D blocks of any size; no shared memory is used.
// Each thread starts at its global index and advances by the total number of
// launched threads, so every element is processed exactly once even when the
// grid holds fewer than I threads. With a grid of at least I threads (the
// original requirement) each thread handles at most one element, as before.
__global__ void BSgpu(float *a1, float *a2, int I) {
    // Signed 64-bit indices: the 32-bit product blockDim.x*blockIdx.x can wrap
    // on very large grids, and a signed type keeps a negative I a no-op.
    const long long stride = (long long)gridDim.x * blockDim.x;
    for (long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x; i < I; i += stride)
        BS(a1 + i, a2 + i);
}
// Host-side sequential counterpart of the kernel: calls BS() on each of the
// first I element pairs of a1 / a2, in increasing index order.
//
// a1 : read only by BS().
// a2 : read and then overwritten in place by BS().
// I  : number of elements; nothing happens when I <= 0.
void BScpu(float *a1, float *a2, int I) {
    int idx = 0;
    while (idx < I) {
        BS(&a1[idx], &a2[idx]);
        ++idx;
    }
}

intt main() {
const int taille = N*sizeof(float);
fflloatt *A1 = (float *) malloc(taille); // allocate A in CPU RAM
float *A2 = (float *) malloc(taille);
float *A3 = (float *) malloc(taille);
float *B1, *B2; // will be allocated in GPU RAM
srandom(time(NULL));
fforr ( iintt n = 0 ; n < N ; ++n ){ // fill vector A with random nb
A1[n] = (rand() + 0.5f)/(RAND_MAX + 1.0f);
A2[n] = (rand() + 0.5f)/(RAND_MAX + 1.0f);
}
cudaMalloc( (voiid **) &B1, taille); // allocate B1 in GPU RAM
cudaMemcpy(B1, A1, taille, cudaMemcpyHostToDevice); // transfer A1 into B1
cudaMalloc( (voiid **) &B2, taille);
cudaMemcpy(B2, A2, taille, cudaMemcpyHostToDevice);
BSgpu<<<NbBlocs, NbThreads>>>(B1,B2,N);
cudaMemcpy(A3, B2, taille, cudaMemcpyDeviceToHost); // transfer results in A3
BScpu(A1,A2,N);
float put=0, err = 0;
for ( iintt n = 0 ; n < N ; ++n ){
err += fabs(A3[n] - A2[n]);
put += fmax(K-A2[n],0.0f);
}
return 0