// $Id: DcbCreateGridKernel.cu 835 2009-10-24 12:30:21Z cr333 $

#include "CudaHelperCommon.cuh"
#include "CudaMath.h"
#include "DcbGrid.cuh"
#include "GpuTiledImages.hpp"

// Accumulates one disparity slice of the DCB grid.
//
// Each thread block covers a blockDim.x x blockDim.y patch of pixels and builds,
// in dynamic shared memory, two dc x dc integer planes indexed by
// [right colour bin][left colour bin]:
//   plane 0: sum of int(100 * cost) over the patch
//   plane 1: sum of 100 per contributing pixel (i.e. the scaled weight)
// The planes are then written (Init == true) or added (Init == false) to gpuGrid
// as float2(cost sum, weight sum).
//
// Launch requirements (see the host wrappers in this file):
//   block = (ceil(sigmaS), ceil(sigmaS), 1), grid = (dw, dh, 1),
//   dynamic shared memory = 2 * dc * dc * sizeof(int) bytes.
//
// Parameters:
//   gpuCost   pitched cost volume of floats; the slice for disparity d starts at row h * d
//   gpuImg1   pitched left image, one unsigned int per pixel
//   gpuImg2   pitched right image, one unsigned int per pixel (sampled at x - d)
//   gpuGrid   output grid of float2 entries
//   pitch     row pitch of gpuGrid, counted in float2 elements
//   w, h      image size in pixels
//   d         disparity handled by this launch; also selects the output tile
//   tile_x    number of tiles per row in the tiled grid layout
//   dc        number of colour bins per axis
//   sigmaS    not used inside the kernel (the block size already encodes it)
//   sigmaC    colour bin width: bin = int(getColour1(pixel) / sigmaC)
//
// NOTE(review): the colour bins are not range-checked against dc here; this assumes
// getColour1(pixel) / sigmaC always lands in [0, dc) -- confirm with the caller's
// choice of dc.
template <bool Init> void __global__ CreateGridKernel(
	const cudaPitchedPtr gpuCost,
	const cudaPitchedPtr gpuImg1,
	const cudaPitchedPtr gpuImg2,
	float2* gpuGrid,
	const size_t pitch,
	const unsigned int w,
	const unsigned int h,
	const unsigned int d,
	const unsigned int tile_x,
	const unsigned int dc,
	const float sigmaS,
	const float sigmaC)
{
	// the launch grid has one block per grid cell, so its extent is the grid size
	const int dw = gridDim.x;
	const int dh = gridDim.y;

	// dynamic shared memory viewed as two consecutive dc x dc planes of ints
	extern __shared__ char array[];
	int* const bins = (int*)array;
	const unsigned int planeSize = dc * dc; // offset of the weight plane

	// clear both planes cooperatively
	for(int by = threadIdx.y; by < dc; by += blockDim.y)
	{
		for(int bx = threadIdx.x; bx < dc; bx += blockDim.x)
		{
			const unsigned int bin = dc * by + bx;
			bins[bin] = 0;
			bins[planeSize + bin] = 0;
		}
	}
	__syncthreads();

	// typed views of the pitched inputs; pitches are in bytes, elements are 4 bytes wide
	const unsigned int* const imgL = (const unsigned int*)gpuImg1.ptr;
	const unsigned int* const imgR = (const unsigned int*)gpuImg2.ptr;
	const float* const cost = (const float*)gpuCost.ptr;
	const size_t strideL = gpuImg1.pitch >> 2;
	const size_t strideR = gpuImg2.pitch >> 2;
	const size_t strideC = gpuCost.pitch >> 2;

#if !defined(__CUDA_ARCH__) ||  __CUDA_ARCH__ < 120

	// No shared-memory atomics available: a single thread walks the whole patch
	// so that the read-modify-write updates of the bins cannot collide.
	if(threadIdx.x == 0 && threadIdx.y == 0)
	{
		const unsigned int xBegin = blockIdx.x * blockDim.x;
		const unsigned int xEnd = xBegin + blockDim.x;
		const unsigned int yBegin = blockIdx.y * blockDim.y;
		const unsigned int yEnd = yBegin + blockDim.y;

		for(int yi = yBegin; yi < yEnd; yi++)
		{
			// skip rows below the image
			if(yi >= h)
				continue;

			for(int xi = xBegin; xi < xEnd; xi++)
			{
				// skip pixels outside the image or without a match at x - d
				if(xi >= w || xi < d)
					continue;

				const int cL = int(getColour1(imgL[strideL * yi + xi]) / sigmaC); // left image bin
				const int cR = int(getColour1(imgR[strideR * yi + xi - d]) / sigmaC); // right image bin
				const float iC = cost[strideC * (h * d + yi) + xi]; // cost space value

				// NB: both components scaled by same constant
				const unsigned int bin = dc * cR + cL;
				bins[bin] += int(100 * iC);
				bins[planeSize + bin] += 100;
			}
		}
	}

#else // compute capability 1.2 or higher: atomic adds to shared memory are available

	// one pixel per thread
	const int xi = blockIdx.x * blockDim.x + threadIdx.x;
	const int yi = blockIdx.y * blockDim.y + threadIdx.y;

	if(xi < w && xi >= d && yi < h)
	{
		const int cL = int(getColour1(imgL[strideL * yi + xi]) / sigmaC); // left image bin
		const int cR = int(getColour1(imgR[strideR * yi + xi - d]) / sigmaC); // right image bin
		const float iC = cost[strideC * (h * d + yi) + xi]; // cost space value

		// needs compute capability 1.2
		// NB: both components scaled by same constant
		const unsigned int bin = dc * cR + cL;
		atomicAdd(&bins[bin], int(100 * iC));
		atomicAdd(&bins[planeSize + bin], 100);
	}

#endif // end of code branch for different compute capabilities

	// all accumulation must be finished before the planes are read back
	__syncthreads();

	// position of this disparity's tile within the tiled grid layout
	const unsigned int tileRow = d / tile_x;
	const unsigned int tileCol = d % tile_x;

	// flush the shared planes to gpuGrid
	for(int by = threadIdx.y; by < dc; by += blockDim.y)
	{
		const size_t rowBase = pitch * index3D(by, dc, blockIdx.y, dh, tileRow);

		for(int bx = threadIdx.x; bx < dc; bx += blockDim.x)
		{
			const unsigned int bin = dc * by + bx;
			const float2 sums = make_float2(bins[bin], bins[planeSize + bin]);
			float2& cell = gpuGrid[rowBase + index3D(bx, dc, blockIdx.x, dw, tileCol)];

			// Init overwrites the grid entry, otherwise the sums are accumulated onto it
			if(Init)
				cell = sums;
			else
				cell += sums;
		}
	}
}


// Builds the DCB grid for the current frame by launching CreateGridKernel<true>
// once per disparity, then checks for kernel errors and waits for completion.
// Requires c.gpuGrid to be allocated.
void RunCreateGridKernel(Config& c)
{
	assert(c.gpuGrid != NULL);

	// square thread block of side ceil(sigmaS); one block per grid cell
	const int blockSide = (int)ceil(c.sigmaS);
	dim3 createBlock(blockSide, blockSide, 1);
	dim3 createGrid(c.dw, c.dh, 1);

	// the kernel keeps two dc x dc planes of ints in dynamic shared memory
	const size_t sharedBytes = c.dc * c.dc * 2 * sizeof(int);

	RECORD_KERNEL_LAUNCH("Create DCB grid kernel", createGrid, createBlock);

	// one launch per disparity; the grid pitch is passed in float2 (8-byte) elements
	unsigned int disparity = 0;
	while(disparity < c.numDisps)
	{
		CreateGridKernel<true><<<createGrid, createBlock, sharedBytes>>>(
			toPitchedPtr(c.gpuCost), toPitchedPtr(c.gpuImg1), toPitchedPtr(c.gpuImg2),
			c.gpuGrid->getBuffer(), c.gpuGrid->getPitch() >> 3,
			c.w, c.h, disparity, c.tile_x, c.dc, c.sigmaS, c.sigmaC);
		disparity++;
	}

	CHECK_KERNEL_ERROR("Create DCB grid kernel");
	CUDA_CALL(cudaThreadSynchronize());
}


//---- 2. Colour-blind double-frame DCB grid (doesn't work well) ----------------------------------

// Builds the DCB grid from two frames.
//
// Pass 1 initialises the grid from the current frame (c.gpuCost, c.gpuImg1,
// c.gpuImg2) using CreateGridKernel<1>. Pass 2 runs only if all three
// previous-frame buffers are non-NULL and accumulates the previous frame
// onto the same grid using CreateGridKernel<0>.
//
// Parameters:
//   c           configuration; c.gpuGrid must be allocated
//   prevCost    cost volume of the previous frame (may have a NULL buffer)
//   prevImageL  left image of the previous frame (may have a NULL buffer)
//   prevImageR  right image of the previous frame (may have a NULL buffer)
void RunCreateGridKernel(Config& c,
	const Cuda::DeviceMemoryReference3D<float>& prevCost,
	const Cuda::DeviceMemoryReference2D<const unsigned int>& prevImageL,
	const Cuda::DeviceMemoryReference2D<const unsigned int>& prevImageR)
{
	assert(c.gpuGrid != NULL);

	// compute block & grid dimensions
	dim3 createBlock((int)ceil(c.sigmaS), (int)ceil(c.sigmaS), 1);
	dim3 createGrid(c.dw, c.dh, 1);

	// the kernel keeps two dc x dc planes of ints in dynamic shared memory
	const size_t sharedBytes = c.dc * c.dc * 2 * sizeof(int);

// add first image
	RECORD_KERNEL_LAUNCH("Create DCB grid kernel #1", createGrid, createBlock);

	// create grid for all disparities (overwrites any previous grid contents)
	for(unsigned int d = 0; d < c.numDisps; d++)
	{
		CreateGridKernel<1><<<createGrid, createBlock, sharedBytes>>>(
			toPitchedPtr(c.gpuCost), toPitchedPtr(c.gpuImg1), toPitchedPtr(c.gpuImg2),
			c.gpuGrid->getBuffer(), c.gpuGrid->getPitch() >> 3,
			c.w, c.h, d, c.tile_x, c.dc, c.sigmaS, c.sigmaC);
	}

	CHECK_KERNEL_ERROR("Create DCB grid kernel #1");
	CUDA_CALL(cudaThreadSynchronize());

// add second image
	// debug trace of the previous-frame buffers; %p requires void* arguments,
	// so the typed device pointers are cast explicitly
	printf("Pre-SECOND %p %p %p\n",
		(const void*)prevCost.getBuffer(),
		(const void*)prevImageL.getBuffer(),
		(const void*)prevImageR.getBuffer());
	if(prevCost.getBuffer() != NULL && prevImageL.getBuffer() != NULL && prevImageR.getBuffer() != NULL)
	{
		RECORD_KERNEL_LAUNCH("Create DCB grid kernel #2", createGrid, createBlock);
		printf("SECOND\n");

		// accumulate the previous frame onto the grid for all disparities
		for(unsigned int d = 0; d < c.numDisps; d++)
		{
			CreateGridKernel<0><<<createGrid, createBlock, sharedBytes>>>(
				toPitchedPtr(prevCost), toPitchedPtr(prevImageL), toPitchedPtr(prevImageR),
				c.gpuGrid->getBuffer(), c.gpuGrid->getPitch() >> 3,
				c.w, c.h, d, c.tile_x, c.dc, c.sigmaS, c.sigmaC);
		}

		CHECK_KERNEL_ERROR("Create DCB grid kernel #2");
		CUDA_CALL(cudaThreadSynchronize());
	}
}


//---- 4. Partially colour-blind per-frame DCB grid -----------------------------------------------

// Accumulates one tile of the partially colour-aware DCB grid.
//
// Each thread block covers a blockDim.x x blockDim.y patch of pixels (one pixel
// per thread) and builds, in dynamic shared memory, two planes of
// dc1 * dc2 * dc1 ints each, addressed via index3D(cL1, dc1, cL2, dc2, cR1):
//   plane 0: sum of int(100 * cost)
//   plane 1: sum of 100 per contributing pixel (i.e. the scaled weight)
// Both planes are then written to gpuGrid as float2(cost sum, weight sum),
// overwriting the previous contents.
//
// Launch requirements (see RunCreateGridKernel2):
//   block = (ceil(sigmaS), ceil(sigmaS), 1), grid = (dw, dh, 1),
//   dynamic shared memory = 2 * dc1 * dc2 * dc1 * sizeof(int) bytes.
//
// Parameters:
//   gpuCost   pitched cost volume of floats; the slice for disparity d starts at row h * d
//   gpuImg1   pitched left image, one unsigned int per pixel
//   gpuImg2   pitched right image, one unsigned int per pixel (sampled at x - d)
//   gpuGrid   pitched output image of float2 entries
//   w, h      image size in pixels
//   d         disparity handled by this launch
//   n         tile within the layout that receives the result
//   tile_x    number of tiles per row in the layout
//   dc1       number of bins along the getColour1 axis (used for both images)
//   dc2       number of bins along the getColour2 axis (left image only)
//   sigmaS    not used inside the kernel
//   sigmaC1   bin width for getColour1
//   sigmaC2   bin width for getColour2
//
// NOTE: unlike CreateGridKernel there is no single-threaded fallback for devices
// without shared-memory atomics (it was disabled in the original source), so
// atomicAdd on shared memory is used unconditionally.
void __global__ CreateGridKernel2(
	const cudaPitchedPtr gpuCost,
	const cudaPitchedPtr gpuImg1,
	const cudaPitchedPtr gpuImg2,
	const cudaPitchedPtr gpuGrid,
	const unsigned int w,
	const unsigned int h,
	const unsigned int d,
	const unsigned int n, // tile within a layout
	const unsigned int tile_x,
	const unsigned int dc1,
	const unsigned int dc2,
	const float sigmaS,
	const float sigmaC1,
	const float sigmaC2)
{
	// the launch grid has one block per grid cell, so its extent is the grid size
	const int dw = gridDim.x;
	const int dh = gridDim.y;

	// dynamic shared memory viewed as two consecutive planes of ints
	extern __shared__ char array[];
	int* const bins = (int*)array;
	const unsigned int rowLen = dc1 * dc2;       // bins per row of a plane
	const unsigned int planeSize = rowLen * dc1; // offset of the weight plane

	// clear both planes cooperatively
	for(int by = threadIdx.y; by < dc1; by += blockDim.y)
	{
		for(int bx = threadIdx.x; bx < rowLen; bx += blockDim.x)
		{
			bins[index2D(bx, rowLen, by)] = 0;
			bins[planeSize + index2D(bx, rowLen, by)] = 0;
		}
	}
	__syncthreads();

	// one pixel per thread
	const int xi = blockIdx.x * blockDim.x + threadIdx.x;
	const int yi = blockIdx.y * blockDim.y + threadIdx.y;

	// only pixels inside the image that have a match at x - d contribute
	if(xi < w && xi >= d && yi < h)
	{
		// row pointers into the pitched inputs (pitch in bytes, 4-byte elements)
		const unsigned int* const rowL = (const unsigned int*)gpuImg1.ptr + (gpuImg1.pitch >> 2) * yi;
		const unsigned int* const rowR = (const unsigned int*)gpuImg2.ptr + (gpuImg2.pitch >> 2) * yi;
		const float* const rowC = (const float*)gpuCost.ptr + (gpuCost.pitch >> 2) * (h * d + yi);

		// left pixel
		const unsigned int pL = rowL[xi];
		const int cL1 = int(getColour1(pL) / sigmaC1); // lightness
		const int cL2 = int(getColour2(pL) / sigmaC2); // 'colour axis'

		// right pixel
		const unsigned int pR = rowR[xi - d];
		const int cR1 = int(getColour1(pR) / sigmaC1); // lightness

		// cost space value
		const float iC = rowC[xi];

		// needs compute capability 1.2
		// NB: both components scaled by same constant
		atomicAdd(&bins[index3D(cL1, dc1, cL2, dc2, cR1)], int(100 * iC));
		atomicAdd(&bins[planeSize + index3D(cL1, dc1, cL2, dc2, cR1)], 100);
	}

	// all accumulation must be finished before the planes are read back
	__syncthreads();

	// typed view of the output image (pitch in bytes, 8-byte float2 elements)
	float2* const grid = (float2*)gpuGrid.ptr;
	const size_t gridPitch = gpuGrid.pitch >> 3;

	// position of tile n within the tiled layout
	const unsigned int tileRow = n / tile_x;
	const unsigned int tileCol = n % tile_x;

	// flush the shared planes to gpuGrid
	for(int by = threadIdx.y; by < dc1; by += blockDim.y)
	{
		const size_t rowBase = gridPitch * index3D(by, dc1, blockIdx.y, dh, tileRow);

		for(int bx = threadIdx.x; bx < rowLen; bx += blockDim.x)
		{
			grid[rowBase + index3D(bx, rowLen, blockIdx.x, dw, tileCol)] =
				make_float2(bins[index2D(bx, rowLen, by)], bins[planeSize + index2D(bx, rowLen, by)]);
		}
	}
}

// Builds the partially colour-aware DCB grids: for every layout in c.gpuGrids
// and every tile in that layout, launches CreateGridKernel2 once. Disparities
// are numbered consecutively across layouts, so tile n of a layout handles
// disparity (tiles in all earlier layouts) + n. Requires c.gpuGrids to be allocated.
void RunCreateGridKernel2(Config2& c)
{
	assert(c.gpuGrids != NULL);

	// square thread block of side ceil(sigmaS); one block per grid cell
	const int blockSide = (int)ceil(c.sigmaS);
	dim3 createBlock(blockSide, blockSide, 1);
	dim3 createGrid(c.dw, c.dh, 1);

	// the kernel keeps two dc1 x dc2 x dc1 planes of ints in dynamic shared memory
	const size_t sharedBytes = c.dc1 * c.dc2 * c.dc1 * 2 * sizeof(int);

	RECORD_KERNEL_LAUNCH("CreateGridKernel2", createGrid, createBlock);

	// disparity handled by tile 0 of the layout currently being processed
	unsigned int firstDisp = 0;

	for(unsigned int layoutIdx = 0; layoutIdx < c.gpuGrids->getNumLayouts(); ++layoutIdx)
	{
		const GpuTiledImages2D<float2>* layout = c.gpuGrids->getLayout(layoutIdx);

		// one launch per tile, i.e. per disparity stored in this layout
		for(unsigned int tile = 0; tile < layout->getNumTiles(); ++tile)
		{
			CreateGridKernel2<<<createGrid, createBlock, sharedBytes>>>(
				toPitchedPtr(c.gpuCost), toPitchedPtr(c.gpuImg1), toPitchedPtr(c.gpuImg2), toPitchedPtr(layout->getImage()),
				c.w, c.h, firstDisp + tile, tile, layout->getTileX(), c.dc1, c.dc2, c.sigmaS, c.sigmaC1, c.sigmaC2);
		}

		firstDisp += layout->getNumTiles();
	}

	CHECK_KERNEL_ERROR("CreateGridKernel2");
	CUDA_CALL(cudaThreadSynchronize());
}