// $Id: DcbSliceGridKernel.cu 839 2009-10-24 15:18:26Z cr333 $

#include <math.h>
#include "CudaHelperCommon.cuh"
#include "DcbGrid.cuh"
#include "GpuTiledImages.hpp"

// Global 2D texture reference through which the bilateral grid is sampled.
// The Run* host functions in this file bind a grid to it and set its filter
// mode to cudaFilterModeLinear before launching the slicing kernels;
// sliceGrid4D() and sliceGrid5D() then read it with tex2D().
// Each texel is a float2; the slicing functions return its .x divided by its .y.
// Only one grid is bound at a time, so code that slices several grids rebinds
// the texture between passes.
texture<float2, 2, cudaReadModeElementType> gpuGridTex;

// Slices the grid bound to gpuGridTex at the 4D position (cX, cY, cL, cR) and
// returns the normalised value, i.e. the interpolated .x divided by the
// interpolated .y of the float2 texels.
//
//   cX, cY     - spatial grid coordinates of the sample
//   cL, cR     - colour coordinates of the left and right image pixel
//   dX, dY     - tile coordinates of the grid to slice (forwarded to index3D)
//   dw, dh, dc - grid dimensions (forwarded to index3D)
float __device__ sliceGrid4D(
	const float cX,
	const float cY,
	const float cL,
	const float cR,
	const unsigned int dX,
	const unsigned int dY,
	const unsigned int dw,
	const unsigned int dh,
	const unsigned int dc)
{
	// Quadrilinear interpolation built from 4 bilinear lookups
	// ---------------------------------------------------------
	// The colour coordinates cL and cR are passed to the texture unfloored, so
	// each tex2D() fetch is already interpolated along both colour axes. The
	// two spatial axes are interpolated manually from the 4 cells surrounding
	// (cX, cY):
	//
	// TL--------TR                  TL - top left
	// |          |                  TR - top right
	// |      x <-|--- (cX, cY)
	// |          |
	// |          |                  BL - bottom left
	// BL--------BR                  BR - bottom right
	//
	// TL is the cell (round(cX) - 1, round(cY) - 1), which follows the linear
	// filtering rule of the CUDA Programming Guide (E.2 Linear Filtering).
	// Two earlier variants, with TL at (round(cX), round(cY)) and at
	// (floor(cX), floor(cY)), did not work correctly.

	// integer cells on either side of the sample, clamped to the grid
	// NOTE(review): the lower clamp uses dh + 1 while the right clamp uses dw - 1 -- confirm the asymmetry is intended.
	const int          cellLeft   = max(0,      int(cX - 0.5f));
	const unsigned int cellRight  = min(dw - 1, int(cX + 0.5f));
	const int          cellTop    = max(0,      int(cY - 0.5f));
	const unsigned int cellBottom = min(dh + 1, int(cY + 0.5f));

	// texture coordinates of the two columns and two rows involved
	const float uLeft   = index3D(cL, dc, cellLeft,   dw, dX);
	const float uRight  = index3D(cL, dc, cellRight,  dw, dX);
	const float vTop    = index3D(cR, dc, cellTop,    dh, dY);
	const float vBottom = index3D(cR, dc, cellBottom, dh, dY);

	// the 4 bilinear lookups
	const float2 topLeft     = tex2D(gpuGridTex, uLeft,  vTop);
	const float2 bottomLeft  = tex2D(gpuGridTex, uLeft,  vBottom);
	const float2 topRight    = tex2D(gpuGridTex, uRight, vTop);
	const float2 bottomRight = tex2D(gpuGridTex, uRight, vBottom);

	// fractional position of the sample between the cells
	const float fracX = cX - 0.5f - int(cX - 0.5f);
	const float fracY = cY - 0.5f - int(cY - 0.5f);

	// blend horizontally first, then vertically
	const float2 top    = lerp(topLeft, topRight, fracX);
	const float2 bottom = lerp(bottomLeft, bottomRight, fracX);
	const float2 sliced = lerp(top, bottom, fracY);

	return sliced.x / sliced.y;
}

// Slices the grid bound to gpuGridTex for one disparity d and stores the
// normalised costs in slice d of the cost volume (rows h * d ... h * d + h - 1).
//
// Launch layout used by RunSliceGridKernel: one thread per pixel, with
//   dim3 sliceBlock(min(256, w), 1, 1);
//   dim3 sliceGrid((w + sliceBlock.x - 1) / sliceBlock.x, h, 1);
//
//   gpuCost          - cost volume, addressed as floats
//   gpuImg1, gpuImg2 - left and right image, addressed as one 32-bit word per pixel
//   w, h             - image size in pixels
//   d                - disparity to slice
//   dX, dY           - tile coordinates of the grid for this disparity
//   dw, dh, dc       - grid dimensions
//   sigmaS, sigmaC   - divisors mapping pixel position / colour to grid coordinates
void __global__ SliceGridKernel(
	const cudaPitchedPtr gpuCost,
	const cudaPitchedPtr gpuImg1,
	const cudaPitchedPtr gpuImg2,
	const unsigned int w,
	const unsigned int h,
	const unsigned int d,
	const unsigned int dX,
	const unsigned int dY,
	const unsigned int dw,
	const unsigned int dh,
	const unsigned int dc,
	const float sigmaS,
	const float sigmaC)
{
	const int x = blockDim.x * blockIdx.x + threadIdx.x;
	const int y = blockDim.y * blockIdx.y + threadIdx.y;

	// skip threads outside the image, and pixels whose right-image
	// counterpart at x - d would lie left of the image
	if(x < d || x >= w || y >= h)
		return;

	// fetch both pixels; pitch >> 2 turns the pitch into a count of 4-byte elements
	const unsigned int leftPixel  = ((unsigned int*)gpuImg1.ptr)[(gpuImg1.pitch >> 2) * y + x];
	const unsigned int rightPixel = ((unsigned int*)gpuImg2.ptr)[(gpuImg2.pitch >> 2) * y + x - d];

	// position of this pixel pair in the bilateral grid
	const float cX = x / sigmaS;
	const float cY = y / sigmaS;
	const float cL = getColour1(leftPixel) / sigmaC;
	const float cR = getColour1(rightPixel) / sigmaC;

	// slice and write to cost space
	float* const costs = (float*)gpuCost.ptr;
	costs[(gpuCost.pitch >> 2) * (h * d + y) + x] = sliceGrid4D(cX, cY, cL, cR, dX, dY, dw, dh, dc);
}

// Slices the grid c.gpuGrid into the cost volume c.gpuCost, launching
// SliceGridKernel once per disparity level and then waiting for completion.
// NOTE(review): the texture is left bound on return, whereas the other Run*
// functions in this file unbind it -- confirm whether that is intended.
void RunSliceGridKernel(Config& c)
{
	// sample the grid through a texture to get fast bilinear interpolation
	c.gpuGrid->bindTexture<cudaReadModeElementType>(gpuGridTex);
	gpuGridTex.filterMode = cudaFilterModeLinear;

	// one thread per pixel: blocks span up to 256 pixels of one image row
	dim3 sliceBlock(min(256, c.w), 1, 1);
	dim3 sliceGrid(((c.w + sliceBlock.x - 1) / sliceBlock.x), c.h, 1);

	RECORD_KERNEL_LAUNCH("Slice DCB grid kernel", sliceGrid, sliceBlock);

	// the buffers are the same for every disparity
	const cudaPitchedPtr cost = toPitchedPtr(c.gpuCost);
	const cudaPitchedPtr img1 = toPitchedPtr(c.gpuImg1);
	const cudaPitchedPtr img2 = toPitchedPtr(c.gpuImg2);

	// queue one launch per disparity; the grid for disparity d is tile (d % tile_x, d / tile_x)
	for(unsigned int d = 0; d < c.numDisps; d++)
	{
		const unsigned int tileX = d % c.tile_x;
		const unsigned int tileY = d / c.tile_x;

		SliceGridKernel<<<sliceGrid, sliceBlock>>>(
			cost, img1, img2,
			c.w, c.h, d, tileX, tileY,
			c.dw, c.dh, c.dc, c.sigmaS, c.sigmaC);
	}

	CHECK_KERNEL_ERROR("Slice DCB grid kernel");
	CUDA_CALL(cudaThreadSynchronize());
}


//---- 3. Colour-blind multi-frame DCB grid -------------------------------------------------------

// Weighted variant of the slicing kernel: slices the grid bound to gpuGridTex
// for one disparity d and either stores (Init == true) or accumulates
// (Init == false) weight * cost in slice d of the cost volume.
//
// Launch layout used by RunMultiSliceGridKernel: one thread per pixel, with
//   dim3 sliceBlock(min(256, w), 1, 1);
//   dim3 sliceGrid((w + sliceBlock.x - 1) / sliceBlock.x, h, 1);
//
//   gpuCost          - cost volume, addressed as floats
//   gpuImg1, gpuImg2 - left and right image, addressed as one 32-bit word per pixel
//   w, h             - image size in pixels
//   d                - disparity to slice
//   dX, dY           - tile coordinates of the grid for this disparity
//   dw, dh, dc       - grid dimensions
//   sigmaS, sigmaC   - divisors mapping pixel position / colour to grid coordinates
//   weight           - factor applied to the sliced cost
template <bool Init> void __global__ SliceGridKernel(
	const cudaPitchedPtr gpuCost,
	const cudaPitchedPtr gpuImg1,
	const cudaPitchedPtr gpuImg2,
	const unsigned int w,
	const unsigned int h,
	const unsigned int d,
	const unsigned int dX,
	const unsigned int dY,
	const unsigned int dw,
	const unsigned int dh,
	const unsigned int dc,
	const float sigmaS,
	const float sigmaC,
	const float weight)
{
	const int x = blockDim.x * blockIdx.x + threadIdx.x;
	const int y = blockDim.y * blockIdx.y + threadIdx.y;

	// skip threads outside the image, and pixels whose right-image
	// counterpart at x - d would lie left of the image
	if(x < d || x >= w || y >= h)
		return;

	// fetch both pixels; pitch >> 2 turns the pitch into a count of 4-byte elements
	const unsigned int leftPixel  = ((unsigned int*)gpuImg1.ptr)[(gpuImg1.pitch >> 2) * y + x];
	const unsigned int rightPixel = ((unsigned int*)gpuImg2.ptr)[(gpuImg2.pitch >> 2) * y + x - d];

	// position of this pixel pair in the bilateral grid
	const float cX = x / sigmaS;
	const float cY = y / sigmaS;
	const float cL = getColour1(leftPixel) / sigmaC;
	const float cR = getColour1(rightPixel) / sigmaC;

	const float newCost = sliceGrid4D(cX, cY, cL, cR, dX, dY, dw, dh, dc);

	// first pass overwrites the cost entry, later passes add to it
	float* const target = (float*)gpuCost.ptr + ((gpuCost.pitch >> 2) * (h * d + y) + x);
	if(Init)
	{
		*target = weight * newCost;
	}
	else
	{
		*target += weight * newCost;
	}
}

// Slices numGrids grids into the cost volume c.gpuCost as a normalised weighted
// sum: grid i contributes with weights[i] / sum(weights).
//
//   c         - configuration holding the images, cost volume and grid dimensions
//   grids     - array of numGrids grids; grids[numGrids - 1] is sliced first and
//               grids[0] last
//   numGrids  - number of grids; nothing is done when it is 0
//   weighting - weight falloff over the grid index i:
//                 1: uniform
//                 2: Gaussian, exp(-0.5 * i^2 / wa^2)
//                 3: exponential, wa^i
//                 4: exponential, q^i with q derived from wa and wb
//               any other value prints a warning and falls back to uniform
//   wa, wb    - falloff parameters, see above (wb is only used by mode 4)
void RunMultiSliceGridKernel(Config& c, Cuda::DeviceMemoryReference2D<float2>** grids, unsigned int numGrids, const int weighting, const float wa, const float wb)
{
	// Without grids there is nothing to slice. Bailing out here also keeps
	// grids[numGrids - 1] below from being indexed with an underflowed value.
	if(numGrids == 0)
		return;

	// compute block & grid dimensions
	dim3 sliceBlock(min(256, c.w), 1, 1);
	dim3 sliceGrid(((c.w + sliceBlock.x - 1) / sliceBlock.x), c.h, 1);

	RECORD_KERNEL_LAUNCH("Multi-Slice DCB grid kernel", sliceGrid, sliceBlock);

	// precompute all weights
	float* weights = new float[numGrids];
	float sum = 0.0f;
	switch(weighting)
	{
		default: // unknown mode: do not leave the weights uninitialised
			printf("Unknown weighting mode %d; using uniform weights.\n", weighting);
			// fall through

		case 1: // uniform
		{
			for(unsigned int i = 0; i < numGrids; i++) { weights[i] = 1.0f; sum += weights[i]; }
			break;
		}

		case 2: // simple Gaussian falloff
		{
			for(unsigned int i = 0; i < numGrids; i++) { weights[i] = expf(- 0.5f * i * i / (wa * wa)); sum += weights[i]; }
			break;
		}

		case 3: // simple exponential falloff
		{
			const float q = wa;
			for(unsigned int i = 0; i < numGrids; i++) { weights[i] = powf(q, (float)i); sum += weights[i]; }
			break;
		}

		case 4: // falloff following Paris' video stream paper
		{
			// Adaptive scaling of e and c according to "Scale" paragraph (page 7).
			// The locals are named eScale/cScale so that they do not shadow the Config parameter c.
			// NOTE(review): for wa / wb > 0 the argument of logf is negative, which yields NaN -- confirm the intended formula.
			const float eScale = wa * wa / 2.0f;
			const float cScale = -2.0f * logf(1 - expf(wa / wb)) / (wa * wa);
			const float q = 1 - expf(- cScale * eScale);
			printf("e = %f; c = %f; q = %f\n", eScale, cScale, q);
			for(unsigned int i = 0; i < numGrids; i++) { weights[i] = powf(q, (float)i); sum += weights[i]; }
			break;
		}
	};

	// normalise weights
	for(unsigned int i = 0; i < numGrids; i++)
	{
		weights[i] /= sum;
	}

	// SliceGridKernel has to be run for every disparity level in every frame.
	// Since there is only one global texture in use, we have to do one frame at a time,
	// unbinding & rebinding the texture in between. That is done in temporal order,
	// with the current, most recent frame processed last. The first pass
	// initialises the cost space (Init = true), all later passes add to it.
	for(int grid = (int)numGrids - 1; grid >= 0; grid--)
	{
		const bool initialise = (grid == (int)numGrids - 1);

		// bind to texture for fast bilinear interpolation
		grids[grid]->bindTexture<cudaReadModeElementType>(gpuGridTex);
		gpuGridTex.filterMode = cudaFilterModeLinear;

		// slice all disparity levels of this grid
		for(unsigned int d = 0; d < c.numDisps; d++)
		{
			if(initialise)
			{
				SliceGridKernel<true><<<sliceGrid, sliceBlock>>>(toPitchedPtr(c.gpuCost), toPitchedPtr(c.gpuImg1), toPitchedPtr(c.gpuImg2), c.w, c.h, d, d % c.tile_x, d / c.tile_x, c.dw, c.dh, c.dc, c.sigmaS, c.sigmaC, weights[grid]);
			}
			else
			{
				SliceGridKernel<false><<<sliceGrid, sliceBlock>>>(toPitchedPtr(c.gpuCost), toPitchedPtr(c.gpuImg1), toPitchedPtr(c.gpuImg2), c.w, c.h, d, d % c.tile_x, d / c.tile_x, c.dw, c.dh, c.dc, c.sigmaS, c.sigmaC, weights[grid]);
			}
		}
		CHECK_KERNEL_ERROR("Multi-Slice DCB grid kernel");
		CUDA_CALL(cudaThreadSynchronize());
		grids[grid]->unbindTexture<cudaReadModeElementType>(gpuGridTex);
	}

	// clean up: weights was allocated with new[], so it must be released with delete[]
	delete[] weights;
}


//---- 4. Partially colour-blind per-frame DCB grid -----------------------------------------------

// Slices the grid bound to gpuGridTex at the 5D position (cX, cY, cL1, cL2, cR1)
// and returns the normalised value, i.e. the interpolated .x divided by the
// interpolated .y of the float2 texels.
//
//   cX, cY        - spatial grid coordinates of the sample
//   cL1, cL2      - first and second colour coordinate of the left image pixel
//   cR1           - first colour coordinate of the right image pixel
//   dX, dY        - tile coordinates of the grid to slice (forwarded to index4D / index3D)
//   dw, dh        - spatial grid dimensions (forwarded to index4D / index3D)
//   dc1, dc2      - colour grid dimensions (forwarded to index4D / index3D)
float __device__ sliceGrid5D(
	const float cX,
	const float cY,
	const float cL1,
	const float cL2,
	const float cR1,
	const unsigned int dX,
	const unsigned int dY,
	const unsigned int dw,
	const unsigned int dh,
	const unsigned int dc1,
	const unsigned int dc2)
{
	//// nearest neighbour lookup
	//float2 newCost = tex2D(gpuGridTex, index4D(int(cL1), dc1, int(cL2), dc2, int(cX), dw, dX), index3D(int(cR1), dc1, int(cY), dh, dY));
	//float2 newCost = tex2D(gpuGridTex, index4D(cL1, dc1, cL2, dc2, int(cX), dw, dX), index3D(cR1, dc1, int(cY), dh, dY)); // with colour blur

	// Quintilinear interpolation based on 8 bilinear lookups
	// ------------------------------------------------------
	// Quintilinear interpolation is implemented using 2 quadrilinear interpolations:
	// near (N) and far (F) for the integers either side of cL2, the second colour axis of the left image.
	//
	// Each quadrilinear interpolation is based on 4 bilinearly interpolated values
	// around (cX, cY), named as follows:
	//
	//   TL--------TR                  TL - top left
	//   |          |                  TR - top right
	//   |      x <-|--- (cX, cY)
	//   |          |
	//   |          |                  BL - bottom left
	//   BL--------BR                  BR - bottom right
	//
	// Further comments:
	//   1. left-top-near (LTN) is (round(cX) - 1, round(cY) - 1, round(cL2) - 1)
	//   2. Why does this work? It follows the linear interpolation as described in the CUDA Programming Guide (E.2 Linear Filtering).
	//   3. Computation is interleaved to reduce register count
	//
	// NOTE(review): the lower spatial clamp uses dh + 1 while the right clamp uses dw - 1 -- confirm the asymmetry is intended.
	// NOTE(review): the far colour index int(cL2 + 0.5f) is only clamped from below (max with 0), not against dc2 -- confirm cL2 cannot reach the end of the axis.

	// near half: cL2 index int(cL2 - 0.5f), clamped to >= 0; top row first, then bottom row
	const float2 newCostNTL = tex2D(gpuGridTex, index4D(cL1, dc1, max(0,      int(cL2 - 0.5f)), dc2, max(0,      int(cX - 0.5f)), dw, dX), index3D(cR1, dc1, max(0,      int(cY - 0.5f)), dh, dY));
	const float2 newCostNTR = tex2D(gpuGridTex, index4D(cL1, dc1, max(0,      int(cL2 - 0.5f)), dc2, min(dw - 1, int(cX + 0.5f)), dw, dX), index3D(cR1, dc1, max(0,      int(cY - 0.5f)), dh, dY));
	const float2 newCostNT = lerp(newCostNTL, newCostNTR, cX - 0.5f - int(cX - 0.5f));
	const float2 newCostNBL = tex2D(gpuGridTex, index4D(cL1, dc1, max(0,      int(cL2 - 0.5f)), dc2, max(0,      int(cX - 0.5f)), dw, dX), index3D(cR1, dc1, min(dh + 1, int(cY + 0.5f)), dh, dY));
	const float2 newCostNBR = tex2D(gpuGridTex, index4D(cL1, dc1, max(0,      int(cL2 - 0.5f)), dc2, min(dw - 1, int(cX + 0.5f)), dw, dX), index3D(cR1, dc1, min(dh + 1, int(cY + 0.5f)), dh, dY));
	const float2 newCostNB = lerp(newCostNBL, newCostNBR, cX - 0.5f - int(cX - 0.5f));
	// blend the near top and bottom rows along y
	const float2 newCostN  = lerp(newCostNT, newCostNB, cY - 0.5f - int(cY - 0.5f));

	// far half: same lookups with cL2 index int(cL2 + 0.5f)
	const float2 newCostFTL = tex2D(gpuGridTex, index4D(cL1, dc1, max(0,      int(cL2 + 0.5f)), dc2, max(0,      int(cX - 0.5f)), dw, dX), index3D(cR1, dc1, max(0,      int(cY - 0.5f)), dh, dY));
	const float2 newCostFTR = tex2D(gpuGridTex, index4D(cL1, dc1, max(0,      int(cL2 + 0.5f)), dc2, min(dw - 1, int(cX + 0.5f)), dw, dX), index3D(cR1, dc1, max(0,      int(cY - 0.5f)), dh, dY));
	const float2 newCostFT = lerp(newCostFTL, newCostFTR, cX - 0.5f - int(cX - 0.5f));
	const float2 newCostFBL = tex2D(gpuGridTex, index4D(cL1, dc1, max(0,      int(cL2 + 0.5f)), dc2, max(0,      int(cX - 0.5f)), dw, dX), index3D(cR1, dc1, min(dh + 1, int(cY + 0.5f)), dh, dY));
	const float2 newCostFBR = tex2D(gpuGridTex, index4D(cL1, dc1, max(0,      int(cL2 + 0.5f)), dc2, min(dw - 1, int(cX + 0.5f)), dw, dX), index3D(cR1, dc1, min(dh + 1, int(cY + 0.5f)), dh, dY));
	const float2 newCostFB = lerp(newCostFBL, newCostFBR, cX - 0.5f - int(cX - 0.5f));
	// blend the far top and bottom rows along y
	const float2 newCostF  = lerp(newCostFT, newCostFB, cY - 0.5f - int(cY - 0.5f));
	// final blend between the near and far halves along cL2
	const float2 newCost  = lerp(newCostN, newCostF, cL2 - 0.5f - int(cL2 - 0.5f));

	// normalise: .x divided by .y
	return newCost.x / newCost.y;
}

// Slices the 5D grid bound to gpuGridTex for one disparity d and either stores
// (Init == true) or accumulates (Init == false) the normalised cost in slice d
// of the cost volume.
//
// Launch layout used by RunSliceGridKernel2: one thread per pixel, with
//   dim3 sliceBlock(min(256, w), 1, 1);
//   dim3 sliceGrid((w + sliceBlock.x - 1) / sliceBlock.x, h, 1);
//
//   gpuCost          - cost volume, addressed as floats
//   gpuImg1, gpuImg2 - left and right image, addressed as one 32-bit word per pixel
//   w, h             - image size in pixels
//   d                - disparity to slice
//   dX, dY           - tile coordinates of the grid for this disparity
//   dw, dh, dc1, dc2 - grid dimensions
//   sigmaS           - divisor mapping pixel position to grid coordinates
//   sigmaC1, sigmaC2 - divisors mapping the two colour values to grid coordinates
template <bool Init> void __global__ SliceGrid2Kernel(
	const cudaPitchedPtr gpuCost,
	const cudaPitchedPtr gpuImg1,
	const cudaPitchedPtr gpuImg2,
	const unsigned int w,
	const unsigned int h,
	const unsigned int d,
	const unsigned int dX,
	const unsigned int dY,
	const unsigned int dw,
	const unsigned int dh,
	const unsigned int dc1,
	const unsigned int dc2,
	const float sigmaS,
	const float sigmaC1,
	const float sigmaC2)
{
	const int x = blockDim.x * blockIdx.x + threadIdx.x;
	const int y = blockDim.y * blockIdx.y + threadIdx.y;

	// skip threads outside the image, and pixels whose right-image
	// counterpart at x - d would lie left of the image
	if(x < d || x >= w || y >= h)
		return;

	// fetch both pixels; pitch >> 2 turns the pitch into a count of 4-byte elements
	const unsigned int leftPixel  = ((unsigned int*)gpuImg1.ptr)[(gpuImg1.pitch >> 2) * y + x];
	const unsigned int rightPixel = ((unsigned int*)gpuImg2.ptr)[(gpuImg2.pitch >> 2) * y + x - d];

	// spatial position in the bilateral grid
	const float cX = x / sigmaS;
	const float cY = y / sigmaS;

	// colour position: lightness and 'colour axis' of the left pixel, lightness of the right pixel
	const float cL1 = getColour1(leftPixel) / sigmaC1;
	const float cL2 = getColour2(leftPixel) / sigmaC2;
	const float cR1 = getColour1(rightPixel) / sigmaC1;

	const float newCost = sliceGrid5D(cX, cY, cL1, cL2, cR1, dX, dY, dw, dh, dc1, dc2);

	// first pass overwrites the cost entry, later passes add to it
	float* const target = (float*)gpuCost.ptr + ((gpuCost.pitch >> 2) * (h * d + y) + x);
	if(Init)
	{
		*target = newCost;
	}
	else
	{
		*target += newCost;
	}
}

// Slices all grids in c.gpuGrids into the cost volume c.gpuCost.
// The grids are spread over several layouts; each layout is bound to the
// texture in turn and every tile in it is sliced as the next disparity level.
void RunSliceGridKernel2(Config2& c)
{
	// launch dimensions: one thread per pixel, blocks span up to 256 pixels of one image row
	dim3 sliceBlock(min(256, c.w), 1, 1);
	dim3 sliceGrid(((c.w + sliceBlock.x - 1) / sliceBlock.x), c.h, 1);

	// the buffers are the same for every launch
	const cudaPitchedPtr cost = toPitchedPtr(c.gpuCost);
	const cudaPitchedPtr img1 = toPitchedPtr(c.gpuImg1);
	const cudaPitchedPtr img2 = toPitchedPtr(c.gpuImg2);

	// disparity level of the next tile to slice, counted across all layouts
	unsigned int nextDisp = 0;

	for(unsigned int layoutIdx = 0; layoutIdx < c.gpuGrids->getNumLayouts(); layoutIdx++)
	{
		const GpuTiledImages2D<float2>* layout = c.gpuGrids->getLayout(layoutIdx);

		// sample this layout through a texture to get fast bilinear interpolation
		layout->getImage()->bindTexture<cudaReadModeElementType>(gpuGridTex);
		gpuGridTex.filterMode = cudaFilterModeLinear;

		RECORD_KERNEL_LAUNCH("Slice CDCB grid kernel", sliceGrid, sliceBlock);

		// tile n of the layout sits at (n % tileX, n / tileX) and holds disparity nextDisp
		for(unsigned int n = 0; n < layout->getNumTiles(); n++, nextDisp++)
		{
			SliceGrid2Kernel<1><<<sliceGrid, sliceBlock>>>(
				cost, img1, img2,
				c.w, c.h, nextDisp, n % layout->getTileX(), n / layout->getTileX(),
				c.dw, c.dh, c.dc1, c.dc2, c.sigmaS, c.sigmaC1, c.sigmaC2);
		}
		CHECK_KERNEL_ERROR("Slice CDCB grid kernel");
		CUDA_CALL(cudaThreadSynchronize());

		// release the texture so that the next layout can be bound
		layout->getImage()->unbindTexture<cudaReadModeElementType>(gpuGridTex);
	}
}