// $Id: DcbProcessGridKernel.cu 834 2009-10-24 11:12:50Z cr333 $

#include "CudaHelperCommon.cuh"
#include "CudaMath.h"
#include "DcbGrid.cuh"
#include "GpuTiledImages.hpp"

// Blur along both colour dimensions.
//
// Each thread block copies one blockDim.x x blockDim.y patch of the grid into
// shared memory and applies a separable 5-tap blur (weights NORMAL_0/1/2) to it:
// first along x within the patch, then along y within the patch. Taps that fall
// outside the patch are skipped (weights are not renormalised). The result is
// written back in place.
//
// gpuGrid: grid buffer, addressed as gpuGrid[pitch * y + x]
// pitch:   row pitch of gpuGrid in float2 elements
//
// Needs blockDim.x * blockDim.y * sizeof(float2) bytes of dynamic shared memory.
// There is no bounds check, so the launch grid must cover the buffer exactly.
void __global__ ProcessGrid1Kernel(float2* gpuGrid, const unsigned int pitch)
{
	// for reference:
	//   dim3 processBlock1(c.dc, c.dc, 1);
	//   dim3 processGrid1(c.tile_x * c.dw, c.tile_y * c.dh, 1);

	const int tx = threadIdx.x;
	const int ty = threadIdx.y;
	const int x = blockIdx.x * blockDim.x + threadIdx.x;
	const int y = blockIdx.y * blockDim.y + threadIdx.y;
	const int dc = blockDim.x;   // width of the shared tile (row stride)
	const int rows = blockDim.y; // height of the shared tile

	// initialise shared memory
	extern __shared__ char array[];
	float2* shared = (float2*)array;
	shared[dc * ty + tx] = gpuGrid[pitch * y + x];
	__syncthreads();

	// horizontal pass (left colour)
	float2          acc  = NORMAL_0 * shared[dc * ty + tx];
	if(tx >= 2)     acc += NORMAL_2 * shared[dc * ty + tx - 2];
	if(tx >= 1)     acc += NORMAL_1 * shared[dc * ty + tx - 1];
	if(tx + 1 < dc) acc += NORMAL_1 * shared[dc * ty + tx + 1];
	if(tx + 2 < dc) acc += NORMAL_2 * shared[dc * ty + tx + 2];

	// all threads must have read their neighbours before the tile is overwritten
	__syncthreads();

	shared[dc * ty + tx] = acc;
	__syncthreads();

	// vertical pass (right colour); bounded by the tile height, not its width
	                  acc  = NORMAL_0 * shared[dc * ty + tx];
	if(ty >= 2)       acc += NORMAL_2 * shared[dc * (ty - 2) + tx];
	if(ty >= 1)       acc += NORMAL_1 * shared[dc * (ty - 1) + tx];
	if(ty + 1 < rows) acc += NORMAL_1 * shared[dc * (ty + 1) + tx];
	if(ty + 2 < rows) acc += NORMAL_2 * shared[dc * (ty + 2) + tx];

	// no barrier required here: shared memory is not accessed again
	gpuGrid[pitch * y + x] = acc;
}


// Blur along the x-dimension.
//
// One block processes one line of blockDim.x samples. Consecutive threads of a
// block address elements that are gridDim.x apart in x; blockIdx.x selects the
// element within that stride and (dX, dY) select the tile being processed.
// The line is staged in shared memory, blurred with a 5-tap kernel (weights
// NORMAL_0/1/2, taps outside the line are skipped) and written back in place.
//
// gpuGrid: grid buffer, addressed as gpuGrid[pitch * y + x]
// pitch:   row pitch of gpuGrid in float2 elements
// dX, dY:  tile coordinates
//
// Needs blockDim.x * sizeof(float2) bytes of dynamic shared memory.
void __global__ ProcessGrid2Kernel(float2* gpuGrid, const unsigned int pitch, const unsigned int dX, const unsigned int dY)
{
	// launch configuration, for reference:
	//   dim3 processBlock2(c.dw, 1, 1);
	//   dim3 processGrid2(c.dc, c.dc * c.dh, 1);

	const unsigned int lane = threadIdx.x;
	const unsigned int lineLength = blockDim.x;

	// position of this thread's sample in the grid
	const int col = gridDim.x * blockDim.x * dX + gridDim.x * threadIdx.x + blockIdx.x;
	const int row = gridDim.y * blockDim.y * dY + blockIdx.y;
	float2* const cell = &gpuGrid[pitch * row + col];

	// stage the whole line in shared memory
	extern __shared__ char array[];
	float2* line = (float2*)array;
	line[lane] = *cell;
	__syncthreads();

	// 5-tap Gaussian blur: centre first, then the left taps, then the right taps
	float2 sum = NORMAL_0 * line[lane];
	if(lane >= 2)
		sum += NORMAL_2 * line[lane - 2];
	if(lane >= 1)
		sum += NORMAL_1 * line[lane - 1];
	if(lane + 1 < lineLength)
		sum += NORMAL_1 * line[lane + 1];
	if(lane + 2 < lineLength)
		sum += NORMAL_2 * line[lane + 2];

	*cell = sum;
}


// Blur along the y-dimension.
//
// Each block handles blockDim.x adjacent columns. Within a column, consecutive
// threads (threadIdx.y) address rows that are deltaY apart; blockIdx.y selects
// the row within that stride. Samples are staged in shared memory with each
// column stored contiguously, blurred with a 5-tap kernel (weights NORMAL_0/1/2,
// taps outside the column are skipped) and written back in place.
//
// gpuGrid:          grid buffer, addressed as gpuGrid[pitch * y + x]
// pitch:            row pitch of gpuGrid in float2 elements
// width:            exclusive upper bound on the absolute x coordinate;
//                   threads with x >= width neither load nor store
// offsetX, offsetY: origin of the region being processed
// deltaY:           distance in rows between consecutive threadIdx.y
//
// Needs blockDim.x * blockDim.y * sizeof(float2) bytes of dynamic shared memory.
void __global__ ProcessGrid3Kernel(
	float2* gpuGrid, const unsigned int pitch, const unsigned int width,
	const unsigned int offsetX, const unsigned int offsetY,
	const unsigned int deltaY)
{
	// for reference:
	//   dim3 processBlock3(processTileX3, c.dh, 1);
	//   dim3 processGrid3((c.dc * c.dw + processTileX3 - 1) / processTileX3, c.dc, 1);

	const int x = offsetX + blockDim.x * blockIdx.x + threadIdx.x;
	const int y = offsetY + blockIdx.y + deltaY * threadIdx.y;

	// initialise shared memory
	extern __shared__ char array[];
	float2* shared = (float2*)array;
	const int pixOffset = blockDim.y * threadIdx.x + threadIdx.y;
	if(x < width) shared[pixOffset] = gpuGrid[pitch * y + x];
	// out-of-range threads still run the blur below (their result is discarded),
	// so give them defined data instead of uninitialised shared memory
	else shared[pixOffset] = make_float2(0.0f, 0.0f);
	__syncthreads();

	// apply 5-tap Gaussian blur
	float2                           acc  = NORMAL_0 * shared[pixOffset];
	if(threadIdx.y >= 2)             acc += NORMAL_2 * shared[pixOffset - 2];
	if(threadIdx.y >= 1)             acc += NORMAL_1 * shared[pixOffset - 1];
	if(threadIdx.y + 1 < blockDim.y) acc += NORMAL_1 * shared[pixOffset + 1];
	if(threadIdx.y + 2 < blockDim.y) acc += NORMAL_2 * shared[pixOffset + 2];

	if(x < width) gpuGrid[pitch * y + x] = acc;
}


// Smooths the DCB grid in place with three successive blur stages:
//   1. along both colour dimensions (ProcessGrid1Kernel, one launch),
//   2. along the horizontal (x) dimension (ProcessGrid2Kernel, one launch per disparity),
//   3. along the vertical (y) dimension (ProcessGrid3Kernel, one launch per disparity).
// Disparity d is stored in tile (d % c.tile_x, d / c.tile_x) of the grid.
// The device is synchronised after every stage.
//
// c: configuration providing the grid buffer (c.gpuGrid) and its dimensions
//    (c.dc, c.dw, c.dh, c.tile_x, c.tile_y, c.numDisps)
void RunProcessGridKernel(Config& c)
{
	// smooth along colour dimensions -------------------------------------------------------------
	dim3 processBlock1(c.dc, c.dc, 1);
	dim3 processGrid1(c.tile_x * c.dw, c.tile_y * c.dh, 1);

	RECORD_KERNEL_LAUNCH("Process DCB grid 1 kernel", processGrid1, processBlock1);

	// the pitch is converted from bytes to float2 elements (>> 3)
	ProcessGrid1Kernel<<<processGrid1, processBlock1, c.dc * c.dc * sizeof(float2)>>>(c.gpuGrid->getBuffer(), c.gpuGrid->getPitch() >> 3);

	CHECK_KERNEL_ERROR("Process DCB grid 1 kernel");
	CUDA_CALL(cudaThreadSynchronize());

	// smooth along horizontal (x) dimension ------------------------------------------------------
	dim3 processBlock2(c.dw, 1, 1);
	dim3 processGrid2(c.dc, c.dc * c.dh, 1);

	RECORD_KERNEL_LAUNCH("Process DCB grid 2 kernel", processGrid2, processBlock2);

	for(unsigned int d = 0; d < c.numDisps; d++)
	{
		ProcessGrid2Kernel<<<processGrid2, processBlock2, c.dw * sizeof(float2)>>>(c.gpuGrid->getBuffer(), c.gpuGrid->getPitch() >> 3, d % c.tile_x, d / c.tile_x);
	}
	CHECK_KERNEL_ERROR("Process DCB grid 2 kernel");
	CUDA_CALL(cudaThreadSynchronize());

	// smooth along vertical (y) dimension --------------------------------------------------------
	// NB: Conceptually, this kernel is of size 1 x dh. However, as this is pretty inefficient,
	//     we tile together a few of them along the x direction. This tiling factor should ideally
	//     be 16, the size of a half-warp, or some other power of two. Let's set the limit at
	//     256 threads per block, so that we can achieve a good occupancy (on both G80 and GT200).
	int processTileX3 = floorPow2(256 / c.dh);
	// for dh > 256 the quotient is 0; fall back to one column per block rather than
	// creating a zero-sized block dimension and dividing by zero below
	if(processTileX3 < 1) processTileX3 = 1;
	dim3 processBlock3(processTileX3, c.dh, 1);
	dim3 processGrid3((c.dc * c.dw + processTileX3 - 1) / processTileX3, c.dc, 1);

	RECORD_KERNEL_LAUNCH("Process DCB grid 3 kernel", processGrid3, processBlock3);

	for(unsigned int d = 0; d < c.numDisps; d++)
	{
		// origin of the tile that holds disparity d
		const unsigned int tileOffsetX = c.dc * c.dw * (d % c.tile_x);
		const unsigned int tileOffsetY = c.dc * c.dh * (d / c.tile_x);

		// The x limit is the right edge of *this* tile, not of the whole grid: when
		// dc * dw is not a multiple of processTileX3, the surplus threads of the last
		// block would otherwise blur columns of the neighbouring tile, which are then
		// blurred a second time by that tile's own launch.
		const unsigned int tileLimitX = tileOffsetX + c.dc * c.dw;

		ProcessGrid3Kernel<<<processGrid3, processBlock3, processTileX3 * c.dh * sizeof(float2)>>>(
			c.gpuGrid->getBuffer(), c.gpuGrid->getPitch() >> 3, tileLimitX,
			tileOffsetX, tileOffsetY, c.dc);
	}
	CHECK_KERNEL_ERROR("Process DCB grid 3 kernel");
	CUDA_CALL(cudaThreadSynchronize());
}


//---- 4. Partially colour-blind per-frame DCB grid -----------------------------------------------

// Horizontal Blur
//
// Blurs lines of blockDim.x samples in place with a 5-tap kernel (weights
// NORMAL_0/1/2; taps outside the line are skipped). Sample i of a line sits at
//   x = offsetX + blockDelta * blockIdx.x + threadDelta * i
// and each threadIdx.y of the block handles its own row.
//
// grid:             pitched buffer of float2; the pitch is in bytes and is
//                   converted to float2 elements with >> 3
// width:            threads with x >= offsetX + width load zero and do not store
// height:           not used by this kernel
// offsetX, offsetY: origin of the tile being processed
// blockDelta:       x distance between the lines of consecutive blocks
// threadDelta:      x distance between consecutive samples of one line
//
// Needs blockDim.x * blockDim.y * sizeof(float2) bytes of dynamic shared memory.
void __global__ ProcessGridHorzKernel(
	const cudaPitchedPtr grid,
	const unsigned int width,
	const unsigned int height,
	const unsigned int offsetX,
	const unsigned int offsetY,
	const unsigned int blockDelta,
	const unsigned int threadDelta)
{
	const int x = offsetX + blockDelta * blockIdx.x + threadDelta * threadIdx.x;
	const int y = offsetY + blockDim.y * blockIdx.y + threadIdx.y;

	// initialise shared memory
	extern __shared__ char array[];
	float2* shared = (float2*)array;
	// one line of blockDim.x samples per threadIdx.y, so the row stride is blockDim.x
	// (a stride of blockDim.y is only correct while blockDim.y == 1)
	const int pixOffset = blockDim.x * threadIdx.y + threadIdx.x;
	if(x < offsetX + width) shared[pixOffset] = ((float2*)grid.ptr)[(grid.pitch >> 3) * y + x];
	else shared[pixOffset] = make_float2(0.0f, 0.0f);
	__syncthreads();

	// apply 5-tap Gaussian blur
	float2                           acc  = NORMAL_0 * shared[pixOffset];
	if(threadIdx.x >= 2)             acc += NORMAL_2 * shared[pixOffset - 2];
	if(threadIdx.x >= 1)             acc += NORMAL_1 * shared[pixOffset - 1];
	if(threadIdx.x + 1 < blockDim.x) acc += NORMAL_1 * shared[pixOffset + 1];
	if(threadIdx.x + 2 < blockDim.x) acc += NORMAL_2 * shared[pixOffset + 2];

	if(x < offsetX + width) ((float2*)grid.ptr)[(grid.pitch >> 3) * y + x] = acc;
}

// Horizontal Blur (special version for 2nd colour axis)
//
// Blurs lines of blockDim.x samples in place with a 5-tap kernel (weights
// NORMAL_0/1/2; taps outside the line are skipped). Sample i of a line sits at
//   x = offsetX + dc1 * dc2 * (blockIdx.x / dc1) + dc1 * i + (blockIdx.x % dc1)
// i.e. samples of one line are dc1 elements apart, and blockIdx.x enumerates
// the dc1 interleaved lines inside each group of dc1 * dc2 elements.
// Each threadIdx.y of the block handles its own row.
//
// grid:             pitched buffer of float2; the pitch is in bytes and is
//                   converted to float2 elements with >> 3
// width:            threads with x >= offsetX + width load zero and do not store
// height:           not used by this kernel
// offsetX, offsetY: origin of the tile being processed
// dc1, dc2:         sizes used to de-interleave x as described above
//
// Needs blockDim.x * blockDim.y * sizeof(float2) bytes of dynamic shared memory.
void __global__ ProcessGridHorz2Kernel(
	const cudaPitchedPtr grid,
	const unsigned int width,
	const unsigned int height,
	const unsigned int offsetX,
	const unsigned int offsetY,
	const unsigned int dc1,
	const unsigned int dc2)
{
	const int x = offsetX + dc1 * dc2 * (blockIdx.x / dc1) + dc1 * threadIdx.x + (blockIdx.x % dc1);
	const int y = offsetY + blockDim.y * blockIdx.y + threadIdx.y;

	// initialise shared memory
	extern __shared__ char array[];
	float2* shared = (float2*)array;
	// one line of blockDim.x samples per threadIdx.y, so the row stride is blockDim.x
	// (a stride of blockDim.y is only correct while blockDim.y == 1)
	const int pixOffset = blockDim.x * threadIdx.y + threadIdx.x;
	if(x < offsetX + width) shared[pixOffset] = ((float2*)grid.ptr)[(grid.pitch >> 3) * y + x];
	else shared[pixOffset] = make_float2(0.0f, 0.0f);
	__syncthreads();

	// apply 5-tap Gaussian blur
	float2                           acc  = NORMAL_0 * shared[pixOffset];
	if(threadIdx.x >= 2)             acc += NORMAL_2 * shared[pixOffset - 2];
	if(threadIdx.x >= 1)             acc += NORMAL_1 * shared[pixOffset - 1];
	if(threadIdx.x + 1 < blockDim.x) acc += NORMAL_1 * shared[pixOffset + 1];
	if(threadIdx.x + 2 < blockDim.x) acc += NORMAL_2 * shared[pixOffset + 2];

	if(x < offsetX + width) ((float2*)grid.ptr)[(grid.pitch >> 3) * y + x] = acc;
}

// Vertical Blur
//
// Blurs columns of blockDim.y samples in place with a 5-tap kernel (weights
// NORMAL_0/1/2; taps outside the column are skipped). Sample j of a column sits at
//   y = offsetY + blockDelta * blockIdx.y + threadDelta * j
// and each threadIdx.x of the block handles its own column; the samples of one
// column are stored contiguously in shared memory.
//
// grid:             pitched buffer of float2; the pitch is in bytes and is
//                   converted to float2 elements with >> 3
// width:            threads with x >= width + offsetX load zero and do not store
// height:           not used by this kernel
// offsetX, offsetY: origin of the tile being processed
// blockDelta:       y distance between the columns of consecutive blocks
// threadDelta:      y distance between consecutive samples of one column
//
// Needs blockDim.x * blockDim.y * sizeof(float2) bytes of dynamic shared memory.
void __global__ ProcessGridVertKernel(
	const cudaPitchedPtr grid,
	const unsigned int width,
	const unsigned int height,
	const unsigned int offsetX,
	const unsigned int offsetY,
	const unsigned int blockDelta,
	const unsigned int threadDelta)
{
	const int x = offsetX + blockDim.x * blockIdx.x + threadIdx.x;
	const int y = offsetY + blockDelta * blockIdx.y + threadDelta * threadIdx.y;

	// only threads inside the tile touch global memory
	const bool insideTile = x < width + offsetX;
	float2* const cells = (float2*)grid.ptr;
	const size_t cellIndex = (grid.pitch >> 3) * y + x;

	// stage the samples; 'centre' points at this thread's own sample
	extern __shared__ char array[];
	float2* const centre = (float2*)array + (blockDim.y * threadIdx.x + threadIdx.y);
	if(insideTile)
	{
		*centre = cells[cellIndex];
	}
	else
	{
		*centre = make_float2(0.0f, 0.0f);
	}
	__syncthreads();

	// 5-tap Gaussian blur: centre first, then the taps above, then the taps below
	const unsigned int j = threadIdx.y;
	const unsigned int columnLength = blockDim.y;

	float2 sum = NORMAL_0 * centre[0];
	if(j >= 2)
		sum += NORMAL_2 * centre[-2];
	if(j >= 1)
		sum += NORMAL_1 * centre[-1];
	if(j + 1 < columnLength)
		sum += NORMAL_1 * centre[1];
	if(j + 2 < columnLength)
		sum += NORMAL_2 * centre[2];

	if(insideTile)
	{
		cells[cellIndex] = sum;
	}
}

// Smooths every tile of every layout in c.gpuGrids in place, one grid axis at a
// time (five stages). Each stage launches its kernel once per tile, then checks
// for launch errors and synchronises the device before the next stage starts.
//
// c: configuration providing the tiled grids (c.gpuGrids) and the grid
//    dimensions (c.dc1, c.dc2, c.dw, c.dh)
void RunProcessGridKernel2(Config2& c)
{
	// stage 1: smooth along cL1 (left image lightness) -------------------------------------------
	{
		dim3 threads(c.dc1, 1, 1);
		dim3 blocks(c.dw * c.dc2, c.dh * c.dc1, 1);
		const size_t sharedBytes = threads.x * threads.y * sizeof(float2);

		RECORD_KERNEL_LAUNCH("Process CDCB grid 1", blocks, threads);
		for(unsigned int layoutIdx = 0; layoutIdx < c.gpuGrids->getNumLayouts(); ++layoutIdx)
		{
			const GpuTiledImages2D<float2>* tiles = c.gpuGrids->getLayout(layoutIdx);
			for(unsigned int tile = 0; tile < tiles->getNumTiles(); ++tile)
			{
				// lines start every dc1 elements; samples within a line are adjacent
				ProcessGridHorzKernel<<<blocks, threads, sharedBytes>>>(
					toPitchedPtr(tiles->getImage()),
					tiles->getTileWidth(), tiles->getTileHeight(),
					tiles->getXForTile(tile), tiles->getYForTile(tile),
					c.dc1, 1);
			}
		}
		CHECK_KERNEL_ERROR("Process CDCB grid 1");
		CUDA_CALL(cudaThreadSynchronize());
	}

	// stage 2: smooth along cL2 (left image colour axis) -----------------------------------------
	{
		dim3 threads(c.dc2, 1, 1);
		dim3 blocks(c.dw * c.dc1, c.dh * c.dc1, 1);
		const size_t sharedBytes = threads.x * threads.y * sizeof(float2);

		RECORD_KERNEL_LAUNCH("Process CDCB grid 2", blocks, threads);
		for(unsigned int layoutIdx = 0; layoutIdx < c.gpuGrids->getNumLayouts(); ++layoutIdx)
		{
			const GpuTiledImages2D<float2>* tiles = c.gpuGrids->getLayout(layoutIdx);
			for(unsigned int tile = 0; tile < tiles->getNumTiles(); ++tile)
			{
				ProcessGridHorz2Kernel<<<blocks, threads, sharedBytes>>>(
					toPitchedPtr(tiles->getImage()),
					tiles->getTileWidth(), tiles->getTileHeight(),
					tiles->getXForTile(tile), tiles->getYForTile(tile),
					c.dc1, c.dc2);
			}
		}
		CHECK_KERNEL_ERROR("Process CDCB grid 2");
		CUDA_CALL(cudaThreadSynchronize());
	}

	// stage 3: smooth along cX (horizontal pixels) -----------------------------------------------
	{
		dim3 threads(c.dw, 1, 1);
		dim3 blocks(c.dc1 * c.dc2, c.dh * c.dc1, 1);
		const size_t sharedBytes = threads.x * threads.y * sizeof(float2);

		RECORD_KERNEL_LAUNCH("Process CDCB grid 3", blocks, threads);
		for(unsigned int layoutIdx = 0; layoutIdx < c.gpuGrids->getNumLayouts(); ++layoutIdx)
		{
			const GpuTiledImages2D<float2>* tiles = c.gpuGrids->getLayout(layoutIdx);
			for(unsigned int tile = 0; tile < tiles->getNumTiles(); ++tile)
			{
				// lines start at adjacent elements; samples within a line are dc1 * dc2 apart
				ProcessGridHorzKernel<<<blocks, threads, sharedBytes>>>(
					toPitchedPtr(tiles->getImage()),
					tiles->getTileWidth(), tiles->getTileHeight(),
					tiles->getXForTile(tile), tiles->getYForTile(tile),
					1, c.dc1 * c.dc2);
			}
		}
		CHECK_KERNEL_ERROR("Process CDCB grid 3");
		CUDA_CALL(cudaThreadSynchronize());
	}

	// stage 4: smooth along cR1 (right image lightness) ------------------------------------------
	{
		dim3 threads(1, c.dc1, 1);
		dim3 blocks(c.dw * c.dc2 * c.dc1, c.dh, 1);
		const size_t sharedBytes = threads.x * threads.y * sizeof(float2);

		RECORD_KERNEL_LAUNCH("Process CDCB grid 4", blocks, threads);
		for(unsigned int layoutIdx = 0; layoutIdx < c.gpuGrids->getNumLayouts(); ++layoutIdx)
		{
			const GpuTiledImages2D<float2>* tiles = c.gpuGrids->getLayout(layoutIdx);
			for(unsigned int tile = 0; tile < tiles->getNumTiles(); ++tile)
			{
				// columns start every dc1 rows; samples within a column are adjacent
				ProcessGridVertKernel<<<blocks, threads, sharedBytes>>>(
					toPitchedPtr(tiles->getImage()),
					tiles->getTileWidth(), tiles->getTileHeight(),
					tiles->getXForTile(tile), tiles->getYForTile(tile),
					c.dc1, 1);
			}
		}
		CHECK_KERNEL_ERROR("Process CDCB grid 4");
		CUDA_CALL(cudaThreadSynchronize());
	}

	// stage 5: smooth along cY (vertical pixels) -------------------------------------------------
	{
		dim3 threads(1, c.dh, 1);
		dim3 blocks(c.dw * c.dc2 * c.dc1, c.dc1, 1);
		const size_t sharedBytes = threads.x * threads.y * sizeof(float2);

		RECORD_KERNEL_LAUNCH("Process CDCB grid 5", blocks, threads);
		for(unsigned int layoutIdx = 0; layoutIdx < c.gpuGrids->getNumLayouts(); ++layoutIdx)
		{
			const GpuTiledImages2D<float2>* tiles = c.gpuGrids->getLayout(layoutIdx);
			for(unsigned int tile = 0; tile < tiles->getNumTiles(); ++tile)
			{
				// columns start at adjacent rows; samples within a column are dc1 rows apart
				ProcessGridVertKernel<<<blocks, threads, sharedBytes>>>(
					toPitchedPtr(tiles->getImage()),
					tiles->getTileWidth(), tiles->getTileHeight(),
					tiles->getXForTile(tile), tiles->getYForTile(tile),
					1, c.dc1);
			}
		}
		CHECK_KERNEL_ERROR("Process CDCB grid 5");
		CUDA_CALL(cudaThreadSynchronize());
	}
}