#include <stdio.h>
#include <windows.h>
#include "functions.h"

//#define TEST
//#define OUTPUT
//#define RESULTS

// Results of the perturbation expansion. Copied to and from the GPU as a raw
// block of memory, so the field layout must not change.
typedef struct retval
{
	unsigned int err;	// error code; main() initialises it to 0
	DBFL ord0;			// zeroth-order value v0 (written by BarrierUPD0)
	DBFL ord1,corr1;	// ord1 = v0 + v1, corr1 = v1 (written by BarrierUPD1)
	DBFL ord2,corr2;	// ord2 = v0 + v1 + v2, corr2 = v2 (second-order stage)
} retval;


// Intermediate quantities shared between the order-0, order-1 and order-2
// stages. Copied to and from the GPU as a raw block of memory.
typedef struct exchange
{
	DBFL gm;	// gamma selected through igm in BarrierUPD0
	DBFL s0;	// stock/hbarr (BarrierUPD0)
	DBFL xstar;	// derived from log(kprice/hbarr) in BarrierUPD0
	DBFL disc;	// -integr(taumin,taumax) (BarrierUPD0)
	DBFL dt;	// (taumax-taumin)/npoint (BarrierUPD1)
	DBFL tt;	// 0.5*integs(taumin,taumax) (BarrierUPD1)
	DBFL x;		// log(s0) (BarrierUPD1)
	DBFL et;	// exp(0.5*(1-gm)*x) (BarrierUPD1)
} exchange;

// Per-outer-point data of the second-order double sum, filled by InitExtData.
typedef struct extdata
{
	DBFL tmp;	// midpoint of the i-th outer interval: taumin + dt*(2i+1)/2
	DBFL p;		// 0.5*integs(tmp,taumax)
	DBFL dtp;	// inner step: (taumax-tmp)/npoint2
	DBFL coeff;	// (0.01 - gm*0.5*sigmaq(tmp)) * dtp
} extdata;

// One cell of the [npoint x npoint2] second-order matrix, initialised by InitV2p
// and then accumulated into by the BarrierUPD2Cuda_* functions.
typedef struct v2pps
{
	DBFL v2pp;	// running partial sum for this cell (starts at 0)
	DBFL s;		// 0.5*integs(tmp1,taumax)
	DBFL coeff;	// 0.01 - gm*0.5*sigmaq(tmp1)
#ifdef TEST
	DBFL tmp1;	// inner midpoint, kept only in TEST builds
#endif
} v2pps;

// Fills one ext[] entry per thread block (entry index = blockIdx.x) from the
// shared quantities stored in exch.
//   ext     : output array, one element per launched block
//   exch    : shared data; dt and gm are read
//   npoint2 : number of inner subdivisions used to derive dtp
//   taumin, taumax : bounds of the time interval
__global__ void InitExtData(extdata* ext, exchange* exch, int npoint2, DBFL taumin, DBFL taumax)
{
	int row = blockIdx.x;
	extdata* out = &ext[row];

	// Midpoint of the row-th outer interval and the matching inner step
	DBFL tau = taumin+exch->dt*(DBFL)(2*row+1)*0.5;
	DBFL halfInteg = 0.5*integs(tau,taumax);
	DBFL step = (taumax-tau)/(DBFL)(npoint2);

	out->tmp = tau;
	out->p = halfInteg;
	out->dtp = step;
	out->coeff = (0.01-exch->gm*0.5*sigmaq(tau))*step;
}

// Initialises the cells of v2ps[] from ext[] (src) and exch.
// blockIdx.x selects the row; the y block/thread indices select the column.
// Threads whose column falls outside [0, npoint2) write nothing.
__global__ void InitV2p(extdata* src, v2pps* v2ps, int npoint2, DBFL taumax, exchange* exch)
{
	int rowBase = npoint2 * blockIdx.x;
	int col = blockDim.y * blockIdx.y + threadIdx.y;
	if (col >= npoint2) return;

	extdata* row = &src[blockIdx.x];
	// Midpoint of the col-th inner interval of this row
	DBFL tau1 = row->tmp + row->dtp*(DBFL)(2*col+1)*0.50;

	v2pps* cell = &v2ps[rowBase+col];
	cell->v2pp = 0;
	cell->s = (DBFL)0.50 * integs(tau1,taumax);
	cell->coeff = (0.01-exch->gm*0.5*sigmaq(tau1));
#ifdef TEST
	cell->tmp1 = tau1;
#endif
}


/*
	Computes the zeroth-order term P_0 and stores it in ret->ord0.
	It also fills the fields of *exch that the later stages read:
	gm, xstar, disc and s0.

	npoint, npoint2 : not used here (kept for a uniform signature)
	kprice          : strike price
	stock           : stock price at taumin
	hbarr           : barrier value
	taumin, taumax  : evaluation time and maturity
	igm             : 0 -> gm = 0; 1 -> gm = integalpha/(0.5*integs);
	                  any other value is treated as 0
	ret, exch       : output structures (must be non-null)
*/
void BarrierUPD0(
					 int npoint, int npoint2,
					 DBFL kprice, DBFL stock, DBFL hbarr, DBFL taumin, DBFL taumax, int igm,
					 retval* ret, exchange* exch)
{
	DBFL v0=0.0, gm=0.0;
	DBFL e1=0.0, e2=0.0, e3=0.0, e4=0.0;
	DBFL xstar=0.0, s0=0.0;
	DBFL sigmat=0.0, disc=0.0, d1=0.0,d2=0.0,d3=0.0,d4=0.0;

	// Start computing
	switch(igm)
	{
	case 0:
		gm = 0.0;
		break;
	case 1:
		gm = integalpha(taumin,taumax)/(0.5*integs(taumin,taumax));
#ifndef GPU
		printf("gm=\t%f \n",gm);
#endif
		break;
	default:
#ifndef GPU
		printf("Invalid igm value, reset to 0.\n");
#endif
		gm = 0.0;
		break;
	}
	(*exch).gm = gm;

// xstar=min(0,log(kprice/hbarr))
	xstar = log(kprice/hbarr);
	if (xstar > 0.0) xstar=0.0;
	// Publish xstar only after clamping, so that the higher-order terms use
	// the same value as the P_0 formula below.
	(*exch).xstar = xstar;
	sigmat = integs(taumin,taumax);
	disc = -integr(taumin,taumax);
	(*exch).disc = disc;

// Change of variable
	s0=stock/hbarr;
	(*exch).s0 = s0;

// --- Compute the zero-th order term P_0 ---

	d1=(xstar-log(s0)+(1.0-gm)*0.5*sigmat)/sqrt(sigmat);
	d2=(xstar+log(s0)+(1.0-gm)*0.5*sigmat)/sqrt(sigmat);
	d3=(xstar-log(s0)-(1.0+gm)*0.5*sigmat)/sqrt(sigmat);
	d4=(xstar+log(s0)-(1.0+gm)*0.5*sigmat)/sqrt(sigmat);

	e1=PHID(d1);
	e2=PHID(d2);
	e3=PHID(d3);
	e4=PHID(d4);

	v0 = kprice*e1-kprice*POW(s0,(1.0-gm))*e2;
	v0 += exp(gm*0.5*sigmat)*(-hbarr*s0*e3+hbarr*POW(s0,-gm)*e4);
	v0 *= exp(disc);
	(*ret).ord0 = v0;
}

/*
	Computes the first-order term P_1 with a midpoint rule over npoint
	sub-intervals of [taumin, taumax].

	Reads  : exch->gm, s0, xstar, disc and ret->ord0 (set by BarrierUPD0).
	Writes : exch->dt, tt, x, et; ret->ord1 (= v0 + v1) and ret->corr1 (= v1).

	npoint2 is not used here (kept for a uniform signature).
*/
void BarrierUPD1(
					 int npoint, int npoint2,
					 DBFL kprice, DBFL hbarr, DBFL taumin, DBFL taumax,
					 retval* ret, exchange* exch)
{
	DBFL v1p=0.0, ccaux=0.0;

	DBFL gm = (*exch).gm;
	DBFL s0 = (*exch).s0;
	DBFL xstar = (*exch).xstar;
	DBFL disc = (*exch).disc;
	DBFL v0 = (*ret).ord0;

// --- Compute the first order term P_1 ---

	DBFL dt = (taumax-taumin)/DBFL(npoint);
	(*exch).dt = dt;
	DBFL tt = 0.5*integs(taumin,taumax);
	(*exch).tt = tt;
	DBFL x = log(s0);
	(*exch).x = x;
	DBFL et = exp(0.5*(1.0-gm)*x);
	(*exch).et = et;
	DBFL dsqpi = POW(PI,0.5);

	DBFL v1=0.0;
	for(int i=1;i<=npoint;i++)
	{
		v1p=0.0;
		// Midpoint of the i-th sub-interval
		DBFL tmp = taumin+dt*DBFL(2*i-1)*0.5;
		DBFL p = 0.5*integs(tmp,taumax);

		// Function E(p,tt,a,b,gm)
		// All four v() terms must be accumulated in ccaux: it is the value
		// used below, and it must not carry anything over from the previous
		// iteration.
		ccaux = v(p,tt,x,xstar,gm) + v(p,tt,x,-xstar,gm) - v(p,tt,-x,xstar,gm);
		ccaux -= v(p,tt,-x,-xstar,gm);
		DBFL auxnew = ccaux*(-kprice*exp(-xstar*0.5*(1.0-gm))+hbarr*exp(xstar*0.5*(1.0+gm)));
		v1p += auxnew;

		// Function L(p,tt,a,b,c,gm)
		DBFL b = gm-1.0;
		DBFL c = -xstar;
		ccaux = llold(p,tt,x,b,c,gm) - llold(p,tt,-x,b,c,gm);
		auxnew = kprice*(1.0-gm)*ccaux;
		v1p += auxnew;

		b = -(gm+1.0);
		c = xstar;
		ccaux = llold(p,tt,x,b,c,gm)-llold(p,tt,-x,b,c,gm);
		auxnew = -exp(gm*p)*hbarr*ccaux;
		v1p += auxnew;

		b = (gm+1.0);
		c = -xstar;
		ccaux = llold(p,tt,x,b,c,gm)-llold(p,tt,-x,b,c,gm);
		auxnew = exp(gm*p)*hbarr*gm*ccaux;
		v1p += auxnew;

		// Function F(p,tt,a,b,c,gm)
		b = gm-1.0;
		auxnew = -kprice*(1.0-gm)*(ff(p,tt,x,b,gm)-ff(p,tt,-x,b,gm));
		v1p += auxnew;

		b = gm+1.0;
		auxnew = -exp(gm*p)*gm*hbarr*(ff(p,tt,x,b,gm)-ff(p,tt,-x,b,gm));
		v1p += auxnew;

		// Weight of this quadrature node
		v1 += (0.01-gm*0.5*sigmaq(tmp))*v1p;
	}

	v1 *= exp(disc)*et*dt/(dsqpi*2.0);
	(*ret).ord1 = v0 + v1;
	(*ret).corr1 = v1;
}


#ifdef TEST
/*
	CPU reference implementation of the second-order term P_2 (TEST builds only).
	Evaluates the double sum over i = 1..npoint and j = 1..npoint2 serially.

	istart, jstart, imax, jmax : index window used only by the OUTPUT debug prints
	npoint, npoint2            : number of outer / inner quadrature points
	kprice, hbarr              : strike price and barrier value
	taumin, taumax             : bounds of the time interval
	ret                        : reads ord0 and corr1, writes ord2 and corr2
	exch                       : reads dt, tt, x, gm, xstar, disc, et
*/
void BarrierUPD2(
				 int istart, int jstart,
				 int imax, int jmax,
				 int npoint, int npoint2,
				 DBFL kprice, DBFL hbarr, DBFL taumin, DBFL taumax,
				 retval* ret, exchange* exch)
{
// --- Compute the second order term P_2 --- 

	DBFL v2=0, tmp1, s;

	DBFL v0 = (*ret).ord0;
	DBFL v1 = (*ret).corr1;

	DBFL dt = (*exch).dt;
	DBFL tt = (*exch).tt;
	DBFL x = (*exch).x;
	DBFL gm = (*exch).gm;
	DBFL xstar = (*exch).xstar;
	DBFL disc = (*exch).disc;
	DBFL et = (*exch).et;

#ifndef GPU
	printf("computing the 2nd order term (%3.3dx%3.3d)... wait...\n",npoint,npoint2);
#endif

	for(int i=1; i<=npoint; i++)
	{
		// Per-row quantities (the same values InitExtData stores in ext[i-1])
		DBFL v2p = 0.0;
		DBFL tmp = taumin+dt*(DBFL)(2*i-1)*0.5;
		DBFL p = 0.5*integs(tmp,taumax);
		DBFL dtp = (taumax-tmp)/(DBFL)(npoint2);
#ifdef OUTPUT
		if ((i > istart) && (i <= imax))
			printf("[%3.3d]\ttmp=%f\tp=%f\tdtp=%f\n",i,tmp,p,dtp);
#endif

		for(int j=1; j<=npoint2; j++)
		{
			DBFL caux, caux2;
			DBFL v2pp = 0.0;
			// Midpoint of the j-th inner interval
			tmp1=tmp+dtp*(DBFL)(2*j-1)*0.50;
			s=0.50*integs(tmp1,taumax);

			// 01 (same terms as BarrierUPD2Cuda_01_1 / _01_2)
			caux = dll(s,p,tt,-x,-1.0+gm,-xstar,gm)-dll(s,p,tt,x,-1.0+gm,-xstar,gm);
			v2pp = caux*kprice*(1.0-gm);

			// 02
			caux = dll(s,p,tt,-x,-1.0-gm,xstar,gm)-dll(s,p,tt,x,-1.0-gm,xstar,gm);
			v2pp -= exp(gm*s)*hbarr*caux;

			// 03
			caux = dll(s,p,tt,-x,1.0+gm,-xstar,gm)-dll(s,p,tt,x,1.0+gm,-xstar,gm);
			v2pp += exp(gm*s)*gm*hbarr*caux;

			// 04
			caux = dvv(s,p,tt,-x,xstar,gm)-dvv(s,p,tt,x,xstar,gm);
			caux += dvv(s,p,tt,-x,-xstar,gm)-dvv(s,p,tt,x,-xstar,gm);
			caux2 = hbarr*exp(0.5*(1.0+gm)*xstar)-kprice*exp(-0.5*(1.0-gm)*xstar);
			v2pp += caux2*caux;

			// 06
			caux = dff(s,p,tt,-x,-(DBFL)1.0+gm,gm)-dff(s,p,tt,x,-(DBFL)1.0+gm,gm);
			v2pp -= ((DBFL)1.0-gm)*kprice*caux;

			// 07
			caux = dff(s,p,tt,-x,(DBFL)1.0+gm,gm)-dff(s,p,tt,x,(DBFL)1.0+gm,gm);
			v2pp -= exp(gm*s)*gm*hbarr*caux;
			// Scales everything accumulated so far; the terms below are added unscaled
			v2pp *= (DBFL)0.5*((DBFL)1.0-gm);

			caux=-ddll(s,p,tt,-x,-(DBFL)1.0+gm,-xstar,gm)+ddll(s,p,tt,x,-(DBFL)1.0+gm,-xstar,gm);
			v2pp=v2pp+caux*kprice*((DBFL)1.0-gm);

			caux=-ddll(s,p,tt,-x,-(DBFL)1.0-gm,xstar,gm)+ddll(s,p,tt,x,-(DBFL)1.0-gm,xstar,gm);
			v2pp=v2pp-exp(gm*s)*hbarr*caux;

			caux=-ddll(s,p,tt,-x,(DBFL)1.0+gm,-xstar,gm)+ddll(s,p,tt,x,(DBFL)1.0+gm,-xstar,gm);
			v2pp=v2pp+exp(gm*s)*gm*hbarr*caux;

			caux = -ddvv(s,p,tt,-x,xstar,gm)+ddvv(s,p,tt,x,xstar,gm);

			// NOTE(review): this pair uses dvv while the pair above uses ddvv;
			// BarrierUPD2Cuda_08 does the same. Confirm this is intended.
			caux += -dvv(s,p,tt,-x,-xstar,gm)+dvv(s,p,tt,x,-xstar,gm);

			caux2 = hbarr*exp((DBFL)0.5*((DBFL)1.0+gm)*xstar)-kprice*exp(-(DBFL)0.5*((DBFL)1.0-gm)*xstar);
			v2pp += caux2*caux;

			caux = -ddff(s,p,tt,-x,-1.0+gm,gm)+ddff(s,p,tt,x,-1.0+gm,gm);
			v2pp -= ((DBFL)1.0-gm)*kprice*caux;

			caux = -ddff(s,p,tt,-x,1.0+gm,gm)+ddff(s,p,tt,x,1.0+gm,gm);
			v2pp -= exp(gm*s)*gm*hbarr*caux;

#ifdef OUTPUT
			if ((i > istart) && (i <= imax) && (j > jstart) && (j <= jmax))
				printf("v2p[%3.3d,%3.3d]=%f\n",i,j,v2pp);
#endif
			// Inner-node weight (the value InitV2p stores as v2pps.coeff)
			v2p += (0.01-gm*(DBFL)0.5*sigmaq(tmp1))*v2pp;
		}
#ifdef OUTPUT
		if ((i>istart) && (i<=imax)) 
		{
			printf("v2p[%3.3d]\t%f\n",i,v2p);
			printf("coeff[%3.3d]\t%f\n",i,(0.01-gm*(DBFL)0.5*sigmaq(tmp))*dtp);
		}
#endif

		// Outer-node weight (the value InitExtData stores as extdata.coeff)
		v2 += v2p*(0.01-gm*(DBFL)0.5*sigmaq(tmp))*dtp;
	}

	v2 *= exp(disc)*et*dt;
	printf("v2\t%e\n",v2);
	(*ret).ord2 = v0 + v1 + v2;
	(*ret).corr2 = v2;
}
#endif


/*
	-------------------------------------------------------------------------------------
	The BarrierUPD2Cuda_xx_x functions run in parallel on the GPU.
	Each thread block is identified by its own blockIdx, and each block
	updates a different part of the output matrix[npoint,npoint2].
	The functions take as input an array[npoint] of extdata structs.
	-------------------------------------------------------------------------------------
*/


#ifdef GPU
__global__
#endif
// Adds the dll(-x, -1+gm, -xstar) contribution to one cell of dst.
// Row = xmin + blockIdx.x (rows above xmax are skipped); the 1-based column
// comes from the y block/thread indices (columns above npoint2 are skipped).
void BarrierUPD2Cuda_01_1(
						  int xmin, int xmax,
						  extdata* src, v2pps* dst, int npoint2,
						  DBFL kprice, DBFL hbarr, DBFL taumax,
						  retval* ret, exchange* exch)
{
	int row = xmin + blockIdx.x;
	int col = 1 + blockDim.y * blockIdx.y + threadIdx.y;
	if (row > xmax || col > npoint2) return;

	v2pps* cell = &dst[npoint2 * row + col - 1];

	// Shared quantities from the lower-order stages and per-row/per-cell inputs
	DBFL s = cell->s;
	DBFL p = src[row].p;
	DBFL tt = exch->tt;
	DBFL x = exch->x;
	DBFL gm = exch->gm;
	DBFL xstar = exch->xstar;

	// Accumulate this function's share of the order-two partial sum
	DBFL term = dll(s,p,tt,-x,-1.0+gm,-xstar,gm);
	cell->v2pp += term*kprice*(1.0-gm);
}

#ifdef GPU
__global__
#endif
// Adds the -dll(x, -1+gm, -xstar) contribution to one cell of dst.
// Row = xmin + blockIdx.x (rows above xmax are skipped); the 1-based column
// comes from the y block/thread indices (columns above npoint2 are skipped).
void BarrierUPD2Cuda_01_2(
						  int xmin, int xmax,
						  extdata* src, v2pps* dst, int npoint2,
						  DBFL kprice, DBFL hbarr, DBFL taumax,
						  retval* ret, exchange* exch)
{
	int row = xmin + blockIdx.x;
	int col = 1 + blockDim.y * blockIdx.y + threadIdx.y;
	if (row > xmax || col > npoint2) return;

	v2pps* cell = &dst[npoint2 * row + col - 1];

	// Shared quantities from the lower-order stages and per-row/per-cell inputs
	DBFL s = cell->s;
	DBFL p = src[row].p;
	DBFL tt = exch->tt;
	DBFL x = exch->x;
	DBFL gm = exch->gm;
	DBFL xstar = exch->xstar;

	// Accumulate this function's share of the order-two partial sum
	DBFL term = -dll(s,p,tt,x,-1.0+gm,-xstar,gm);
	cell->v2pp += term*kprice*(1.0-gm);
}

#ifdef GPU
__global__
#endif
// Subtracts the exp(gm*s)*hbarr*dll(-x, -1-gm, xstar) contribution from one
// cell of dst.
// Row = xmin + blockIdx.x (rows above xmax are skipped); the 1-based column
// comes from the y block/thread indices (columns above npoint2 are skipped).
void BarrierUPD2Cuda_02_1(
						  int xmin, int xmax,
						  extdata* src, v2pps* dst, int npoint2,
						  DBFL kprice, DBFL hbarr, DBFL taumax,
						  retval* ret, exchange* exch)
{
	int row = xmin + blockIdx.x;
	int col = 1 + blockDim.y * blockIdx.y + threadIdx.y;
	if (row > xmax || col > npoint2) return;

	v2pps* cell = &dst[npoint2 * row + col - 1];

	// Shared quantities from the lower-order stages and per-row/per-cell inputs
	DBFL s = cell->s;
	DBFL p = src[row].p;
	DBFL tt = exch->tt;
	DBFL x = exch->x;
	DBFL gm = exch->gm;
	DBFL xstar = exch->xstar;

	// Accumulate this function's share of the order-two partial sum
	DBFL term = dll(s,p,tt,-x,-1.0-gm,xstar,gm);
	cell->v2pp -= exp(gm*s)*hbarr*term;
}

#ifdef GPU
__global__
#endif
// Subtracts the exp(gm*s)*hbarr*(-dll(x, -1-gm, xstar)) contribution from one
// cell of dst.
// Row = xmin + blockIdx.x (rows above xmax are skipped); the 1-based column
// comes from the y block/thread indices (columns above npoint2 are skipped).
void BarrierUPD2Cuda_02_2(
						  int xmin, int xmax,
						  extdata* src, v2pps* dst, int npoint2,
						  DBFL kprice, DBFL hbarr, DBFL taumax,
						  retval* ret, exchange* exch)
{
	int row = xmin + blockIdx.x;
	int col = 1 + blockDim.y * blockIdx.y + threadIdx.y;
	if (row > xmax || col > npoint2) return;

	v2pps* cell = &dst[npoint2 * row + col - 1];

	// Shared quantities from the lower-order stages and per-row/per-cell inputs
	DBFL s = cell->s;
	DBFL p = src[row].p;
	DBFL tt = exch->tt;
	DBFL x = exch->x;
	DBFL gm = exch->gm;
	DBFL xstar = exch->xstar;

	// Accumulate this function's share of the order-two partial sum
	DBFL term = -dll(s,p,tt,x,-1.0-gm,xstar,gm);
	cell->v2pp -= exp(gm*s)*hbarr*term;
}

#ifdef GPU
__global__
#endif
// Adds the exp(gm*s)*gm*hbarr*dll(-x, 1+gm, -xstar) contribution to one cell
// of dst.
// Row = xmin + blockIdx.x (rows above xmax are skipped); the 1-based column
// comes from the y block/thread indices (columns above npoint2 are skipped).
void BarrierUPD2Cuda_03_1(
						  int xmin, int xmax,
						  extdata* src, v2pps* dst, int npoint2,
						  DBFL kprice, DBFL hbarr, DBFL taumax,
						  retval* ret, exchange* exch)
{
	int row = xmin + blockIdx.x;
	int col = 1 + blockDim.y * blockIdx.y + threadIdx.y;
	if (row > xmax || col > npoint2) return;

	v2pps* cell = &dst[npoint2 * row + col - 1];

	// Shared quantities from the lower-order stages and per-row/per-cell inputs
	DBFL s = cell->s;
	DBFL p = src[row].p;
	DBFL tt = exch->tt;
	DBFL x = exch->x;
	DBFL gm = exch->gm;
	DBFL xstar = exch->xstar;

	// Accumulate this function's share of the order-two partial sum
	DBFL term = dll(s,p,tt,-x,1.0+gm,-xstar,gm);
	cell->v2pp += exp(gm*s)*gm*hbarr*term;
}

#ifdef GPU
__global__
#endif
// Adds the exp(gm*s)*gm*hbarr*(-dll(x, 1+gm, -xstar)) contribution to one
// cell of dst.
// Row = xmin + blockIdx.x (rows above xmax are skipped); the 1-based column
// comes from the y block/thread indices (columns above npoint2 are skipped).
void BarrierUPD2Cuda_03_2(
						  int xmin, int xmax,
						  extdata* src, v2pps* dst, int npoint2,
						  DBFL kprice, DBFL hbarr, DBFL taumax,
						  retval* ret, exchange* exch)
{
	int row = xmin + blockIdx.x;
	int col = 1 + blockDim.y * blockIdx.y + threadIdx.y;
	if (row > xmax || col > npoint2) return;

	v2pps* cell = &dst[npoint2 * row + col - 1];

	// Shared quantities from the lower-order stages and per-row/per-cell inputs
	DBFL s = cell->s;
	DBFL p = src[row].p;
	DBFL tt = exch->tt;
	DBFL x = exch->x;
	DBFL gm = exch->gm;
	DBFL xstar = exch->xstar;

	// Accumulate this function's share of the order-two partial sum
	DBFL term = -dll(s,p,tt,x,1.0+gm,-xstar,gm);
	cell->v2pp += exp(gm*s)*gm*hbarr*term;
}

#ifdef GPU
__global__
#endif
// Adds the dvv and dff contributions to one cell of dst, then scales the
// whole running sum of that cell by 0.5*(1-gm).
// Row = xmin + blockIdx.x (rows above xmax are skipped); the 1-based column
// comes from the y block/thread indices (columns above npoint2 are skipped).
void BarrierUPD2Cuda_04(
						int xmin, int xmax,
						extdata* src, v2pps* dst, int npoint2,
						DBFL kprice, DBFL hbarr, DBFL taumax,
						retval* ret, exchange* exch)
{
	int row = xmin + blockIdx.x;
	int col = 1 + blockDim.y * blockIdx.y + threadIdx.y;
	if (row > xmax || col > npoint2) return;

	v2pps* cell = &dst[npoint2 * row + col - 1];
	DBFL acc = cell->v2pp;

	// Shared quantities from the lower-order stages and per-row/per-cell inputs
	DBFL s = cell->s;
	DBFL p = src[row].p;
	DBFL tt = exch->tt;
	DBFL x = exch->x;
	DBFL gm = exch->gm;
	DBFL xstar = exch->xstar;

	// dvv differences for +xstar and -xstar, weighted by a common factor
	DBFL diff = dvv(s,p,tt,-x,xstar,gm)-dvv(s,p,tt,x,xstar,gm);
	diff += dvv(s,p,tt,-x,-xstar,gm)-dvv(s,p,tt,x,-xstar,gm);
	DBFL weight = hbarr*exp(0.5*(1.0+gm)*xstar)-kprice*exp(-0.5*(1.0-gm)*xstar);
	acc += weight*diff;

	// dff difference with b = -1+gm
	diff = dff(s,p,tt,-x,-(DBFL)1.0+gm,gm)-dff(s,p,tt,x,-(DBFL)1.0+gm,gm);
	acc -= ((DBFL)1.0-gm)*kprice*diff;

	// dff difference with b = 1+gm
	diff = dff(s,p,tt,-x,(DBFL)1.0+gm,gm)-dff(s,p,tt,x,(DBFL)1.0+gm,gm);
	acc -= exp(gm*s)*gm*hbarr*diff;

	// Scale everything accumulated in this cell so far
	acc *= (DBFL)0.5*((DBFL)1.0-gm);

	cell->v2pp = acc;
}


#ifdef GPU
__global__
#endif
// Adds the -ddll(-x, -1+gm, -xstar) contribution to one cell of dst.
// Row = xmin + blockIdx.x (rows above xmax are skipped); the 1-based column
// comes from the y block/thread indices (columns above npoint2 are skipped).
void BarrierUPD2Cuda_05_1(
						  int xmin, int xmax,
						  extdata* src, v2pps* dst, int npoint2,
						  DBFL kprice, DBFL hbarr, DBFL taumax,
						  retval* ret, exchange* exch)
{
	int row = xmin + blockIdx.x;
	int col = 1 + blockDim.y * blockIdx.y + threadIdx.y;
	if (row > xmax || col > npoint2) return;

	v2pps* cell = &dst[npoint2 * row + col - 1];

	// Shared quantities from the lower-order stages and per-row/per-cell inputs
	DBFL s = cell->s;
	DBFL p = src[row].p;
	DBFL tt = exch->tt;
	DBFL x = exch->x;
	DBFL gm = exch->gm;
	DBFL xstar = exch->xstar;

	// Accumulate this function's share of the order-two partial sum
	DBFL term = -ddll(s,p,tt,-x,-(DBFL)1.0+gm,-xstar,gm);
	cell->v2pp += term*kprice*(1.0-gm);
}

#ifdef GPU
__global__
#endif
// Adds the ddll(x, -1+gm, -xstar) contribution to one cell of dst.
// Row = xmin + blockIdx.x (rows above xmax are skipped); the 1-based column
// comes from the y block/thread indices (columns above npoint2 are skipped).
void BarrierUPD2Cuda_05_2(
						  int xmin, int xmax,
						  extdata* src, v2pps* dst, int npoint2,
						  DBFL kprice, DBFL hbarr, DBFL taumax,
						  retval* ret, exchange* exch)
{
	int row = xmin + blockIdx.x;
	int col = 1 + blockDim.y * blockIdx.y + threadIdx.y;
	if (row > xmax || col > npoint2) return;

	v2pps* cell = &dst[npoint2 * row + col - 1];

	// Shared quantities from the lower-order stages and per-row/per-cell inputs
	DBFL s = cell->s;
	DBFL p = src[row].p;
	DBFL tt = exch->tt;
	DBFL x = exch->x;
	DBFL gm = exch->gm;
	DBFL xstar = exch->xstar;

	// Accumulate this function's share of the order-two partial sum
	DBFL term = ddll(s,p,tt,x,-(DBFL)1.0+gm,-xstar,gm);
	cell->v2pp += term*kprice*(1.0-gm);
}

#ifdef GPU
__global__
#endif
// Subtracts the exp(gm*s)*hbarr*(-ddll(-x, -1-gm, xstar)) contribution from
// one cell of dst.
// Row = xmin + blockIdx.x (rows above xmax are skipped); the 1-based column
// comes from the y block/thread indices (columns above npoint2 are skipped).
void BarrierUPD2Cuda_06_1(
						  int xmin, int xmax,
						  extdata* src, v2pps* dst, int npoint2,
						  DBFL kprice, DBFL hbarr, DBFL taumax,
						  retval* ret, exchange* exch)
{
	int row = xmin + blockIdx.x;
	int col = 1 + blockDim.y * blockIdx.y + threadIdx.y;
	if (row > xmax || col > npoint2) return;

	v2pps* cell = &dst[npoint2 * row + col - 1];

	// Shared quantities from the lower-order stages and per-row/per-cell inputs
	DBFL s = cell->s;
	DBFL p = src[row].p;
	DBFL tt = exch->tt;
	DBFL x = exch->x;
	DBFL gm = exch->gm;
	DBFL xstar = exch->xstar;

	// Accumulate this function's share of the order-two partial sum
	DBFL term = -ddll(s,p,tt,-x,-1.0-gm,xstar,gm);
	cell->v2pp -= exp(gm*s)*hbarr*term;
}

#ifdef GPU
__global__
#endif
// Subtracts the exp(gm*s)*hbarr*ddll(x, -1-gm, xstar) contribution from one
// cell of dst.
// Row = xmin + blockIdx.x (rows above xmax are skipped); the 1-based column
// comes from the y block/thread indices (columns above npoint2 are skipped).
void BarrierUPD2Cuda_06_2(
						  int xmin, int xmax,
						  extdata* src, v2pps* dst, int npoint2,
						  DBFL kprice, DBFL hbarr, DBFL taumax,
						  retval* ret, exchange* exch)
{
	int row = xmin + blockIdx.x;
	int col = 1 + blockDim.y * blockIdx.y + threadIdx.y;
	if (row > xmax || col > npoint2) return;

	v2pps* cell = &dst[npoint2 * row + col - 1];

	// Shared quantities from the lower-order stages and per-row/per-cell inputs
	DBFL s = cell->s;
	DBFL p = src[row].p;
	DBFL tt = exch->tt;
	DBFL x = exch->x;
	DBFL gm = exch->gm;
	DBFL xstar = exch->xstar;

	// Accumulate this function's share of the order-two partial sum
	DBFL term = ddll(s,p,tt,x,-1.0-gm,xstar,gm);
	cell->v2pp -= exp(gm*s)*hbarr*term;
}


#ifdef GPU
__global__
#endif
// Adds the exp(gm*s)*gm*hbarr*(-ddll(-x, 1+gm, -xstar)) contribution to one
// cell of dst.
// Row = xmin + blockIdx.x (rows above xmax are skipped); the 1-based column
// comes from the y block/thread indices (columns above npoint2 are skipped).
void BarrierUPD2Cuda_07_1(
						  int xmin, int xmax,
						  extdata* src, v2pps* dst, int npoint2,
						  DBFL kprice, DBFL hbarr, DBFL taumax,
						  retval* ret, exchange* exch)
{
	int row = xmin + blockIdx.x;
	int col = 1 + blockDim.y * blockIdx.y + threadIdx.y;
	if (row > xmax || col > npoint2) return;

	v2pps* cell = &dst[npoint2 * row + col - 1];

	// Shared quantities from the lower-order stages and per-row/per-cell inputs
	DBFL s = cell->s;
	DBFL p = src[row].p;
	DBFL tt = exch->tt;
	DBFL x = exch->x;
	DBFL gm = exch->gm;
	DBFL xstar = exch->xstar;

	// Accumulate this function's share of the order-two partial sum
	DBFL term = -ddll(s,p,tt,-x,1.0+gm,-xstar,gm);
	cell->v2pp += exp(gm*s)*gm*hbarr*term;
}

#ifdef GPU
__global__
#endif
// Adds the exp(gm*s)*gm*hbarr*ddll(x, 1+gm, -xstar) contribution to one cell
// of dst.
// Row = xmin + blockIdx.x (rows above xmax are skipped); the 1-based column
// comes from the y block/thread indices (columns above npoint2 are skipped).
void BarrierUPD2Cuda_07_2(
						  int xmin, int xmax,
						  extdata* src, v2pps* dst, int npoint2,
						  DBFL kprice, DBFL hbarr, DBFL taumax,
						  retval* ret, exchange* exch)
{
	int row = xmin + blockIdx.x;
	int col = 1 + blockDim.y * blockIdx.y + threadIdx.y;
	if (row > xmax || col > npoint2) return;

	v2pps* cell = &dst[npoint2 * row + col - 1];

	// Shared quantities from the lower-order stages and per-row/per-cell inputs
	DBFL s = cell->s;
	DBFL p = src[row].p;
	DBFL tt = exch->tt;
	DBFL x = exch->x;
	DBFL gm = exch->gm;
	DBFL xstar = exch->xstar;

	// Accumulate this function's share of the order-two partial sum
	DBFL term = ddll(s,p,tt,x,1.0+gm,-xstar,gm);
	cell->v2pp += exp(gm*s)*gm*hbarr*term;
}

#ifdef GPU
__global__
#endif
// Adds the ddvv, dvv and ddff contributions to one cell of dst.
// Row = xmin + blockIdx.x (rows above xmax are skipped); the 1-based column
// comes from the y block/thread indices (columns above npoint2 are skipped).
void BarrierUPD2Cuda_08(
						int xmin, int xmax,
						extdata* src, v2pps* dst, int npoint2,
						DBFL kprice, DBFL hbarr, DBFL taumax,
						retval* ret, exchange* exch)
{
	int row = xmin + blockIdx.x;
	int col = 1 + blockDim.y * blockIdx.y + threadIdx.y;
	if (row > xmax || col > npoint2) return;

	v2pps* cell = &dst[npoint2 * row + col - 1];
	DBFL acc = cell->v2pp;

	// Shared quantities from the lower-order stages and per-row/per-cell inputs
	DBFL s = cell->s;
	DBFL p = src[row].p;
	DBFL tt = exch->tt;
	DBFL x = exch->x;
	DBFL gm = exch->gm;
	DBFL xstar = exch->xstar;

	// Common weight of the two vv-type differences below
	DBFL weight = hbarr*exp(0.5*(1.0+gm)*xstar)-kprice*exp(-0.5*(1.0-gm)*xstar);

	// ddvv difference at +xstar
	DBFL diff = -ddvv(s,p,tt,-x,xstar,gm)+ddvv(s,p,tt,x,xstar,gm);
	acc += weight*diff;

	// dvv difference at -xstar
	diff = -dvv(s,p,tt,-x,-xstar,gm)+dvv(s,p,tt,x,-xstar,gm);
	acc += weight*diff;

	// ddff difference with b = -1+gm
	diff = -ddff(s,p,tt,-x,-1+gm,gm)+ddff(s,p,tt,x,-1+gm,gm);
	acc -= (1.0-gm)*kprice*diff;

	// ddff difference with b = 1+gm
	diff = -ddff(s,p,tt,-x,(DBFL)1.0+gm,gm)+ddff(s,p,tt,x,(DBFL)1.0+gm,gm);
	acc -= exp(gm*s)*gm*hbarr*diff;

	cell->v2pp = acc;
}


// Debug helper: prints msg followed by the time elapsed since ct_start.
//   msg      : label to print (never modified, so it is taken as const and
//              string literals can be passed without a deprecated conversion)
//   ct_start : GetTickCount() value sampled when the timed section started
// The elapsed time is printed in seconds (GetTickCount() counts milliseconds).
void PrintTime(const char* msg,DWORD ct_start)
{
	DWORD tm = GetTickCount() - ct_start;
	printf("%s\tElapsed time: %.4f\n",msg,tm/1000.f);
}


/*
	Entry point. Command-line arguments (all optional, positional):
	  1 kprice   2 stock   3 hbarr   4 taumin   5 taumax   6 igm
	  7 npoint   8 npoint2 9 blocksize  10 hstep (0 = npoint)
	  11 runC    12 istart 13 jstart    14 itot   15 jtot

	Computes the zeroth- and first-order terms on the CPU; in GPU builds it
	then computes the second-order term on the device, and in TEST builds
	(with runC == 1) also with the CPU reference implementation.
	Returns 1 on invalid arguments, 0 otherwise.
*/
int main (int argc, char *argv[])
{
	// Algorithm parameters
	double kprice = (argc >= 2) ? atof(argv[1]) : 101.0;
	double stock =  (argc >= 3) ? atof(argv[2]) : 100.0;
	double hbarr =  (argc >= 4) ? atof(argv[3]) : 101.0;
	double taumin = (argc >= 5) ? atof(argv[4]) : 0.0;
	double taumax = (argc >= 6) ? atof(argv[5]) : 0.5;
	int igm = (argc >= 7) ? atoi(argv[6]) : 1;

	int npoint = (argc >= 8) ? atoi(argv[7]) : 300;
	int npoint2 = (argc >= 9) ? atoi(argv[8]) : 100;

	// CUDA computation parameters
	int blocksize = (argc >= 10) ? atoi(argv[9]) : 64;
	int hstep = (argc >= 11) ? atoi(argv[10]) : 0;
	if (hstep == 0) hstep = npoint;

	// Reject sizes that would cause a division by zero, an empty launch
	// configuration or a non-terminating row loop further down.
	if (npoint <= 0 || npoint2 <= 0 || blocksize <= 0 || hstep <= 0)
	{
		fprintf(stderr,"Invalid arguments: npoint, npoint2, blocksize and hstep must be positive.\n");
		return 1;
	}

	// Debugging parameters
	int runC = (argc >= 12) ? atoi(argv[11]) : 0;
	int istart = (argc >= 13) ? atoi(argv[12]) : 0;
	int jstart = (argc >= 14) ? atoi(argv[13]) : 0;
	int itot = (argc >= 15) ? atoi(argv[14]) : 0;
	int jtot = (argc >= 16) ? atoi(argv[15]) : 0;

/*
	Note: the function ND2 computes the bivariate normal probabilities

	ND2(DH,DK,RHO) calculates the probability that X > DH and Y > DK,
	RHO is the correlation coefficient between X and Y
	Note: Prob( X < DH, Y < DK ) = ND2( -DH, -DK, RHO )

	Assign the following quantities:
	kprice: strike price
	stock : stock price at time t=taumin
	hbarr : barrier value
	taumin: time at which the option is evaluated (t1)
	taumax: maturity time (t2)
*/

	// Initialization of result struct
	retval *ret = new retval;
	(*ret).err = 0;
	(*ret).ord0 = 0;
	(*ret).ord1 = 0; (*ret).corr1 = 0;
	(*ret).ord2 = 0; (*ret).corr2 = 0;

	// Initialization of exchange struct (every field is written by
	// BarrierUPD0 / BarrierUPD1 below)
	exchange *exch = new exchange;

	float elapsed_time_ms = 0.0f;
	DWORD timeStart = GetTickCount();
	BarrierUPD0(npoint, npoint2, kprice, stock, hbarr, taumin, taumax, igm, ret, exch);
	BarrierUPD1(npoint, npoint2, kprice, hbarr, taumin, taumax, ret, exch);
	//printf("--- RESULTS ---\n");
	//printf("zero-th order\t%.15f\n",(*ret).ord0);
	//printf("first order\t%.15f (%.15e)\n",(*ret).ord1,(*ret).corr1);
#ifdef OUTPUT
	PrintTime("P0+P1: ",timeStart);
#endif
	DWORD timeEnd = GetTickCount() - timeStart;

#ifdef GPU
	/* Printing GPU information

	int device = 0;
	cudaSetDevice(device);

	cudaDeviceProp properties;
	cudaGetDeviceProperties(&properties, device);
	unsigned int gpumem = properties.totalGlobalMem;
	int thrblock = properties.maxThreadsPerBlock; // Max number of threads per block
	int blockx = properties.maxThreadsDim[0];
	int blocky = properties.maxThreadsDim[1];

	printf("\n");
	printf("3DFD running on: %s\n", properties.name);
	printf("Total GPU memory: %.2f MB\n", gpumem/(1024.f*1024.f));
	printf("Max number of threads per block: %d\n", thrblock);
	printf("Max block dimension: %d x %d x %d\n", blockx, blocky, properties.maxThreadsDim[2]);
	printf("Max grid size: %d x %d x %d\n", properties.maxGridSize[0], properties.maxGridSize[1], properties.maxGridSize[2]);
	printf("\n");

	*/

	// Upper bound applied to the requested block size
	int blocky = 512;

	// Allocating resources
	cudaEvent_t start, stop;
	cudaEventCreate(&start);
	cudaEventCreate(&stop);

	int size_ret = sizeof(retval);
	int size_exch = sizeof(exchange);
	int size_ext = sizeof(extdata);

	extdata *ext = new extdata[npoint];
	extdata *gpu_ext = 0;
	v2pps *v2p = new v2pps[npoint*npoint2];
	v2pps *gpu_v2p = 0;
	DBFL *tot = new DBFL[npoint];  for (int i=0; i<npoint; i++) tot[i] = 0;
	retval *gpu_ret = 0;
	exchange *gpu_exch = 0;

	cudaMalloc((void**)&gpu_ret, size_ret); // Structure contaning the results
	cudaMalloc((void**)&gpu_exch, size_exch); // Structure containing the common data used in the functions
	cudaMalloc((void**)&gpu_ext, npoint*size_ext); // Matrix containing the order two input data
	cudaMalloc((void**)&gpu_v2p, npoint*npoint2*sizeof(v2pps)); // Matrix containing the order two partial sums
	if (0==gpu_ret || 0==gpu_exch || 0==gpu_ext || 0==gpu_v2p)
	{
		printf("Unable to allocate GPU memory\n");
		exit(1);
	}

	// Start computing
	if (blocksize > blocky) blocksize = blocky;
	dim3 grid_dim(npoint,1+npoint2/blocksize);
	dim3 grid_small_dim(hstep,1+npoint2/blocksize);
	dim3 block_dim(1,blocksize);
	cudaEventRecord(start,0);
	cudaMemcpy(gpu_ret, ret, size_ret, cudaMemcpyHostToDevice);
	cudaMemcpy(gpu_exch, exch, size_exch, cudaMemcpyHostToDevice);
#ifdef OUTPUT
	printf("Computing 2nd order with [%4.4dx%4.4d] points.\n",npoint,npoint2);
	printf("Using blocksize=%3.3d and hstep=%3.3d.\n",blocksize,hstep);
	printf("Please wait...\n\n");
#endif

	DWORD ct_start = GetTickCount();
	InitExtData<<<npoint,1>>>(gpu_ext, gpu_exch, npoint2, taumin, taumax);
	InitV2p<<<grid_dim,block_dim>>>(gpu_ext, gpu_v2p, npoint2, taumax, gpu_exch);

	cudaError_t gpuerr = cudaSuccess;
	DWORD tm = 0;

	// Rows are processed in chunks of hstep, starting from the last chunk
	// and walking back towards row 0.
	//int xmin = 0;
	//while (xmin < npoint)
	int xmin = hstep * int(npoint / hstep);
	if (xmin == npoint) xmin -= hstep;
	while (xmin >= 0)
	{
		int xmax = xmin + hstep - 1;
		if (xmax >= npoint) xmax = npoint -1;
#ifdef OUTPUT
		tm = GetTickCount() - ct_start;
		printf("Current interval: [%4.4d - %4.4d]\tElapsed time: %.4f\n",xmin,xmax,tm/1000.f);
		bool last = (xmin+hstep) >= npoint;
#endif
#ifdef OUTPUT
		if (last) PrintTime("\tBarrierUPD2Cuda_01_1",ct_start);
#endif
		BarrierUPD2Cuda_01_1<<<grid_small_dim,block_dim>>>(xmin, xmax, gpu_ext, gpu_v2p, npoint2, kprice, hbarr, taumax, gpu_ret, gpu_exch);
#ifdef OUTPUT
		if (last) PrintTime("\tBarrierUPD2Cuda_01_2",ct_start);
#endif
		BarrierUPD2Cuda_01_2<<<grid_small_dim,block_dim>>>(xmin, xmax, gpu_ext, gpu_v2p, npoint2, kprice, hbarr, taumax, gpu_ret, gpu_exch);
#ifdef OUTPUT
		if (last) PrintTime("\tBarrierUPD2Cuda_02_1",ct_start);
#endif
		BarrierUPD2Cuda_02_1<<<grid_small_dim,block_dim>>>(xmin, xmax, gpu_ext, gpu_v2p, npoint2, kprice, hbarr, taumax, gpu_ret, gpu_exch);
#ifdef OUTPUT
		if (last) PrintTime("\tBarrierUPD2Cuda_02_2",ct_start);
#endif
		BarrierUPD2Cuda_02_2<<<grid_small_dim,block_dim>>>(xmin, xmax, gpu_ext, gpu_v2p, npoint2, kprice, hbarr, taumax, gpu_ret, gpu_exch);
#ifdef OUTPUT
		if (last) PrintTime("\tBarrierUPD2Cuda_03_1",ct_start);
#endif
		BarrierUPD2Cuda_03_1<<<grid_small_dim,block_dim>>>(xmin, xmax, gpu_ext, gpu_v2p, npoint2, kprice, hbarr, taumax, gpu_ret, gpu_exch);
#ifdef OUTPUT
		if (last) PrintTime("\tBarrierUPD2Cuda_03_2",ct_start);
#endif
		BarrierUPD2Cuda_03_2<<<grid_small_dim,block_dim>>>(xmin, xmax, gpu_ext, gpu_v2p, npoint2, kprice, hbarr, taumax, gpu_ret, gpu_exch);
#ifdef OUTPUT
		if (last) PrintTime("\tBarrierUPD2Cuda_04",ct_start);
#endif
		BarrierUPD2Cuda_04<<<grid_small_dim,block_dim>>>(xmin, xmax, gpu_ext, gpu_v2p, npoint2, kprice, hbarr, taumax, gpu_ret, gpu_exch);
#ifdef OUTPUT
		if (last) PrintTime("\tBarrierUPD2Cuda_05_1",ct_start);
#endif
		BarrierUPD2Cuda_05_1<<<grid_small_dim,block_dim>>>(xmin, xmax, gpu_ext, gpu_v2p, npoint2, kprice, hbarr, taumax, gpu_ret, gpu_exch);
#ifdef OUTPUT
		if (last) PrintTime("\tBarrierUPD2Cuda_05_2",ct_start);
#endif
		BarrierUPD2Cuda_05_2<<<grid_small_dim,block_dim>>>(xmin, xmax, gpu_ext, gpu_v2p, npoint2, kprice, hbarr, taumax, gpu_ret, gpu_exch);
#ifdef OUTPUT
		if (last) PrintTime("\tBarrierUPD2Cuda_06_1",ct_start);
#endif
		BarrierUPD2Cuda_06_1<<<grid_small_dim,block_dim>>>(xmin, xmax, gpu_ext, gpu_v2p, npoint2, kprice, hbarr, taumax, gpu_ret, gpu_exch);
#ifdef OUTPUT
		if (last) PrintTime("\tBarrierUPD2Cuda_06_2",ct_start);
#endif
		BarrierUPD2Cuda_06_2<<<grid_small_dim,block_dim>>>(xmin, xmax, gpu_ext, gpu_v2p, npoint2, kprice, hbarr, taumax, gpu_ret, gpu_exch);
#ifdef OUTPUT
		if (last) PrintTime("\tBarrierUPD2Cuda_07_1",ct_start);
#endif
		BarrierUPD2Cuda_07_1<<<grid_small_dim,block_dim>>>(xmin, xmax, gpu_ext, gpu_v2p, npoint2, kprice, hbarr, taumax, gpu_ret, gpu_exch);
#ifdef OUTPUT
		if (last) PrintTime("\tBarrierUPD2Cuda_07_2",ct_start);
#endif
		BarrierUPD2Cuda_07_2<<<grid_small_dim,block_dim>>>(xmin, xmax, gpu_ext, gpu_v2p, npoint2, kprice, hbarr, taumax, gpu_ret, gpu_exch);
#ifdef OUTPUT
		if (last) PrintTime("\tBarrierUPD2Cuda_08",ct_start);
#endif
		BarrierUPD2Cuda_08<<<grid_small_dim,block_dim>>>(xmin, xmax, gpu_ext, gpu_v2p, npoint2, kprice, hbarr, taumax, gpu_ret, gpu_exch);

		//xmin += hstep;
		xmin -= hstep;
	}

#ifdef OUTPUT
	tm = GetTickCount() - ct_start;
	printf("Synchronizing threads\t\tElapsed time: %.4f\n",tm/1000.f);
#endif
	// Always collect launch and execution errors, not only in OUTPUT builds;
	// otherwise a failed kernel would silently produce garbage results.
	gpuerr = cudaGetLastError();
	if (gpuerr == cudaSuccess) gpuerr = cudaThreadSynchronize();

#ifdef OUTPUT
	if (gpuerr == cudaSuccess)
	{
		tm = GetTickCount() - ct_start;
		printf("Copying memory...\t\tElapsed time: %.4f\n",tm/1000.f);
	}
#endif
	// Copy the results back, stopping at the first failing transfer
	if (gpuerr == cudaSuccess) gpuerr = cudaMemcpy(ret, gpu_ret, size_ret, cudaMemcpyDeviceToHost);
	if (gpuerr == cudaSuccess) gpuerr = cudaMemcpy(exch, gpu_exch, size_exch, cudaMemcpyDeviceToHost);
	if (gpuerr == cudaSuccess) gpuerr = cudaMemcpy(ext, gpu_ext, npoint*size_ext, cudaMemcpyDeviceToHost);
	if (gpuerr == cudaSuccess) gpuerr = cudaMemcpy(v2p, gpu_v2p, npoint*npoint2*sizeof(v2pps), cudaMemcpyDeviceToHost);

	if (gpuerr == cudaSuccess)
	{
		// Computing v2p[i] = sum v2p[i,j] when j=0..npoint2-1
#ifdef OUTPUT
		tm = GetTickCount() - ct_start;
		printf("Summing matrix elements (CPU).. Elapsed time: %.4f\n",tm/1000.f);
#endif
		for (int i=0; i<npoint; i++)
			for (int j=0; j<npoint2; j++)
			{
				DBFL v = v2p[npoint2*i+j].v2pp;
				DBFL coeff = v2p[npoint2*i+j].coeff;
				// NaN cells are skipped so they do not poison the row sum
				if (!isnan(v))
					tot[i] += v*coeff;
			}
		// Computing v2 = sum v2p[i] when i=0..npoint-1
		DBFL v2 = 0;
		for (int i=0; i<npoint; i++)
			v2 += tot[i]*ext[i].coeff;
		// Last computation
		v2 *= exp((*exch).disc)*(*exch).et*(*exch).dt;


		// Storage of the results obtained
		DBFL v0 = (*ret).ord0;
		DBFL v1 = (*ret).corr1;
		(*ret).ord2 = v0+v1+v2;
		(*ret).corr2 = v2;

		cudaEventRecord(stop,0);
		cudaThreadSynchronize();
		cudaEventElapsedTime(&elapsed_time_ms, start, stop);

		// DEBUG
		int imax = istart + itot - 1; if (imax > npoint) imax = npoint;
		int jmax = jstart + jtot - 1; if (jmax > npoint2) jmax = npoint2;
		printf("--- CUDA ---\n");
		for (int i=istart; i<imax; i++)
		{
			printf("[%3.3d]\ttmp=%f\tp=%f\tdtp=%f\n",i+1,(*(ext+i)).tmp,(*(ext+i)).p,(*(ext+i)).dtp);
			for (int j=jstart; j<jmax; j++) {
				v2pps cell = v2p[npoint2*i+j];
				printf("v2pp[%3.3d,%3.3d]=%f\tcoeff[%3.3d,%3.3d]=%f\n",i+1,j+1,cell.v2pp,i+1,j+1,cell.coeff);
			}
			printf("v2p[%3.3d]\t%f\n",i+1,tot[i]);
			printf("coeff[%3.3d]\t%f\n",i+1,ext[i].coeff);
		}
		//printf("v2\t%e\n",v2);
		printf("elapsed time CUDA\t%.4fsec\n",(elapsed_time_ms)/1000.f);
	}
	//printf("CUDA response: %s\n\n", cudaGetErrorString(cudaGetLastError()));
	printf("CUDA response: %s\n\n", cudaGetErrorString(gpuerr));

	// Free resources
	cudaEventDestroy(start);
	cudaEventDestroy(stop);
	if (gpu_ret) cudaFree(gpu_ret);
	if (gpu_exch) cudaFree(gpu_exch);
	if (gpu_ext) cudaFree(gpu_ext);
	if (gpu_v2p) cudaFree(gpu_v2p);
	delete[] ext;
	delete[] v2p;
	delete[] tot;

#endif

	if (runC == 1)
	{
	#ifdef TEST
		printf("\n--- C++ ---\n");
		int imax = istart + itot - 1;
		int jmax = jstart + jtot - 1;
		DWORD t_start = GetTickCount();
		BarrierUPD2(istart, jstart, imax, jmax, npoint, npoint2, kprice, hbarr, taumin, taumax, ret, exch);
		elapsed_time_ms = GetTickCount() - t_start;
		printf("elapsed time C++\t%.4fsec\n",(elapsed_time_ms)/1000.f);
	#else
		printf("C++ code is not compiled inside EXE, skipping...\n");
	#endif
	}

	printf("--- RESULTS ---\n");
	printf("zero order\t%.15f\n",(*ret).ord0);
	printf("first order\t%.15f (%.15e)\n",(*ret).ord1,(*ret).corr1);
	printf("second order\t%.15f (%.15e)\n",(*ret).ord2,(*ret).corr2);
	printf("elapsed time\t%.2fsec\n\n",(timeEnd+elapsed_time_ms)/1000.f);

#ifdef RESULTS
	printf("--- EXCHANGE ---\n");
	printf("gm\t%f\n",(*exch).gm);
	printf("s0\t%f\n",(*exch).s0);
	printf("xstar\t%f\n",(*exch).xstar);
	printf("disc\t%f\n",(*exch).disc);
	printf("dt\t%f\n",(*exch).dt);
	printf("tt\t%f\n",(*exch).tt);
	printf("x\t%f\n",(*exch).x);
	printf("et\t%f\n",(*exch).et);

	printf("--- RETVAL ---\n");
	printf("ord0\t%f\n",(*ret).ord0);
	printf("ord1\t%f\n",(*ret).ord1);
	printf("corr1\t%e\n",(*ret).corr1);
	printf("ord2\t%f\n",(*ret).ord2);
	printf("corr2\t%e\n",(*ret).corr2);
#endif

	// Release the host-side result structures
	delete ret;
	delete exch;

#ifdef GPU
	// exit
	cudaThreadExit();
	exit(EXIT_SUCCESS);
#else
	return 0;
#endif
}



