// $Id: NaiveDcb.cu 857 2009-11-02 10:16:14Z cr333 $

#include <exception>
#include <cutil.h>
#include "cudatemplates/copy.hpp"
#include "cudatemplates/devicememory.hpp"
#include "cudatemplates/devicememorypitched.hpp"
#include "cudatemplates/devicememoryreference.hpp"
#include "cudamath.h"
#include "CudaHelperCommon.cuh"

// Unnormalised Gaussian weight for an already-squared distance:
//   exp(-xsquared / (2 * sigma^2))
// evaluated with the __expf intrinsic (faster, lower accuracy than expf).
//
// xsquared  squared distance
// sigma     standard deviation of the Gaussian
inline __device__ float fastGaussianXSquared(const float xsquared, const float sigma)
{
	const float twoSigmaSquared = 2 * sigma * sigma;
	return __expf(-xsquared / twoSigmaSquared);
}

// Colour-similarity weight between two pixels: a Gaussian of their squared
// distance in Lab colour space,
//   exp(-|lab(c1) - lab(c2)|^2 / (2 * sigma^2))
// evaluated with the __expf intrinsic.
//
// c1, c2  colours to compare; every channel is divided by 255 before conversion
//         (presumably 8-bit sRGB values in [0, 255] -- confirm against callers)
// sigma   standard deviation of the Gaussian
inline __device__ float colourGaussian(const float3 c1, const float3 c2, const float sigma)
{
    // Alternative weightings, kept for experimentation. To use one, replace the
    // Lab version at the bottom of this function with it.
    //
    //// normal Gaussian
    //return __expf(- (
    //    ((float)c1.x - (float)c2.x) * ((float)c1.x - (float)c2.x) +
    //    ((float)c1.y - (float)c2.y) * ((float)c1.y - (float)c2.y) +
    //    ((float)c1.z - (float)c2.z) * ((float)c1.z - (float)c2.z)
    //) / (2 * sigma * sigma));
    //
    //// green Gaussian
    //return __expf(- (
    //    ((float)c1.y - (float)c2.y) * ((float)c1.y - (float)c2.y)
    //) / (2 * sigma * sigma));
    //
    //// grey Gaussian
    //float c1g = 0.2126729f * c1.x + 0.7151522f * c1.y + 0.0721750f * c1.z;
    //float c2g = 0.2126729f * c2.x + 0.7151522f * c2.y + 0.0721750f * c2.z;
    //return __expf(- (c1g - c2g) * (c1g - c2g) / (2 * sigma * sigma));

    // (colour)Lab Gaussian: convert both colours, then take the difference once
    const float3 labDiff = xyz2lab(rgb2xyz(srgb2rgb(c1 / 255.0f)))
                         - xyz2lab(rgb2xyz(srgb2rgb(c2 / 255.0f)));
    //labDiff.y = 0.0f; labDiff.z = 0.0f; // only use L (requires a non-const labDiff)

    const float distanceSquared = dot(labDiff, labDiff);
    return __expf(-distanceSquared / (2 * sigma * sigma));
}


// Aggregates one disparity slice (index d) of a pitched cost volume with a naive
// (brute-force) DCB filter: every output cost is the weighted average of the input
// costs in a (2 * radius + 1)^2 window, where a neighbour's weight is the product of
//   - a spatial Gaussian of its squared pixel distance to the centre (sigmaS),
//   - a colour Gaussian between centre and neighbour in the left image (sigmaC),
//   - a colour Gaussian between the corresponding pixels in the right image (sigmaC).
//
// Launch layout: 2D grid of 2D blocks, one thread per pixel (x from the .x indices,
// y from the .y indices). Threads outside width x height return immediately.
// No shared memory is used.
//
// gpuCost         input cost volume of floats; slice d occupies rows
//                 [height * d, height * (d + 1)) of the pitched allocation
// gpuOutCost      output cost volume, addressed the same way; only slice d is written
// gpuImg1         left image, one packed 32-bit value per pixel
// gpuImg2         right image, same packing; left pixel x pairs with right pixel x - d
// width, height   image size in pixels
// d               disparity (slice index) to process
// radius          half-size of the aggregation window; expected to be >= 0
// sigmaS, sigmaC  standard deviations of the spatial and colour Gaussians
//
// All pitches are in bytes and must be multiples of 4, because rows are addressed
// in 4-byte elements.
__global__ void NaiveDCBAggregationKernel(
	const cudaPitchedPtr gpuCost,
	const cudaPitchedPtr gpuOutCost,
	const cudaPitchedPtr gpuImg1,
	const cudaPitchedPtr gpuImg2,
	const unsigned int width, const unsigned int height, const unsigned int d, const int radius, const float sigmaS, const float sigmaC)
{
	const int x = blockIdx.x * blockDim.x + threadIdx.x;
	const int y = blockIdx.y * blockDim.y + threadIdx.y;

	// Signed copies of the extents, so that every bounds test below is an ordinary
	// signed comparison rather than one that depends on unsigned wrap-around of a
	// negative neighbour coordinate.
	const int imgW = int(width);
	const int imgH = int(height);
	const int disp = int(d);

	// only pixels inside the images
	if(x >= imgW || y >= imgH)
		return;

	float* const costIn  = (float*)gpuCost.ptr;
	float* const costOut = (float*)gpuOutCost.ptr;
	int* const   imgL    = (int*)gpuImg1.ptr;
	int* const   imgR    = (int*)gpuImg2.ptr;

	// row strides in 4-byte elements
	const size_t costStride = gpuCost.pitch    >> 2;
	const size_t outStride  = gpuOutCost.pitch >> 2;
	const size_t imgLStride = gpuImg1.pitch    >> 2;
	const size_t imgRStride = gpuImg2.pitch    >> 2;

	// First row of slice d inside the cost volumes. Computed in size_t so that the
	// element index is never truncated to 32 bits.
	const size_t sliceBase = size_t(height) * d;
	const size_t outIndex  = outStride * (sliceBase + y) + x;

	if(x < disp)
	{
		// Some disparity values map the left-most pixels (in the left image) outside the
		// corresponding right image. So just copy their costs from the input cost space.
		costOut[outIndex] = costIn[costStride * (sliceBase + y) + x];
		return;
	}

	// Remaining pixels have a corresponding pixel in the right view.

	// homogeneous accumulator: x = sum of weight * cost, y = sum of weights
	float2 acc = make_float2(0.0f, 0.0f);

	// read centre pixels only once
	const float3 pixL1 = select_xyz<float4, float3>(unpack_xyzw<float4>(imgL[imgLStride * y + x]));
	const float3 pixR1 = select_xyz<float4, float3>(unpack_xyzw<float4>(imgR[imgRStride * y + (x - disp)]));

	// loop over all pixels in the neighbourhood
	for(int dy = -radius; dy <= radius; dy++)
	{
		const int ny = y + dy;
		if(ny < 0 || ny >= imgH) // row outside the cost space
			continue;

		for(int dx = -radius; dx <= radius; dx++)
		{
			const int nx = x + dx;

			// only consider pixels in the neighbourhood that have corresponding
			// pixels in the right image (nx >= disp) and lie inside the image
			if(nx < disp || nx >= imgW)
				continue;

			float w = 1.0f; // start with uniform weight

			// distance weighting
			w *= fastGaussianXSquared(dx * dx + dy * dy, sigmaS);

			// colour weighting (left image)
			const float3 pixL2 = select_xyz<float4, float3>(unpack_xyzw<float4>(imgL[imgLStride * ny + nx]));
			w *= colourGaussian(pixL1, pixL2, sigmaC);

			// colour weighting (right image)
			const float3 pixR2 = select_xyz<float4, float3>(unpack_xyzw<float4>(imgR[imgRStride * ny + (nx - disp)]));
			w *= colourGaussian(pixR1, pixR2, sigmaC);

			acc += make_float2(w * costIn[costStride * (sliceBase + ny) + nx], w);
		}
	}

	// Weighted average. For radius >= 0 the centre pixel itself always passes the
	// bounds tests above, so acc.y is expected to be non-zero (given non-zero sigmas).
	costOut[outIndex] = acc.x / acc.y;
}


// Host entry point for the naive DCB aggregation: runs NaiveDCBAggregationKernel
// once per disparity slice into a temporary volume, then copies that volume back
// over gpuCost.
//
// gpuCost           pitched cost volume, wrapped as w x h x numDisps floats;
//                   overwritten with the aggregated costs on success
// numDisps          number of disparity slices (one kernel launch each)
// gpuImg1, gpuImg2  the two input images, wrapped as w x h arrays of unsigned int
// pitch             row pitch applied to both images
// w, h              image width and height
// radius            half-size of the aggregation window
// sigmaS, sigmaC    Gaussian parameters forwarded unchanged to the kernel
//
// Any std::exception raised inside the try block is reported on stderr and
// swallowed; the function then returns normally.
void RunAggregationNaiveDCB(
	const cudaPitchedPtr& gpuCost,
	const unsigned int numDisps,
	const unsigned int* gpuImg1,
	const unsigned int* gpuImg2,
	const unsigned int pitch,
	const unsigned int w, const unsigned int h,
	const unsigned int radius, const float sigmaS, const float sigmaC
	)
{
	// wrap the caller's buffers in cudatemplates references, each with its row pitch
	Cuda::DeviceMemoryReference3D<float> ctGpuCost(w, h, numDisps, (float*)gpuCost.ptr);
	ctGpuCost.setPitch(gpuCost.pitch);

	Cuda::DeviceMemoryReference2D<const unsigned int> ctGpuImg1(w, h, gpuImg1);
	ctGpuImg1.setPitch(pitch);

	Cuda::DeviceMemoryReference2D<const unsigned int> ctGpuImg2(w, h, gpuImg2);
	ctGpuImg2.setPitch(pitch);

	try
	{
		// temporary output volume, same extents as the input cost volume
		Cuda::DeviceMemoryPitched3D<float> ctTempCost(w, h, numDisps);

		// the kernel arguments are the same for every slice, so convert them once
		const cudaPitchedPtr costIn   = toPitchedPtr(ctGpuCost);
		const cudaPitchedPtr costOut  = toPitchedPtr(ctTempCost);
		const cudaPitchedPtr imgFirst = toPitchedPtr(ctGpuImg1);
		const cudaPitchedPtr imgSecond = toPitchedPtr(ctGpuImg2);

		// wide blocks for maximum coalescing; the grid is rounded up to cover the image
		dim3 aggBlock(32, 8, 1);
		const unsigned int blocksX = (w + aggBlock.x - 1) / aggBlock.x;
		const unsigned int blocksY = (h + aggBlock.y - 1) / aggBlock.y;
		dim3 aggGrid(blocksX, blocksY, 1);

		RECORD_KERNEL_LAUNCH("Naive DCB aggregation kernel", aggGrid, aggBlock);

		for(unsigned int disp = 0; disp < numDisps; ++disp)
		{
			NaiveDCBAggregationKernel<<<aggGrid, aggBlock>>>(
				costIn, costOut, imgFirst, imgSecond,
				w, h, disp, radius, sigmaS, sigmaC);

			// Check and synchronise after every slice: the total runtime of all
			// launches might otherwise exceed the 2 second timeout in Windows.
			CHECK_KERNEL_ERROR("Naive DCB aggregation kernel");
			CUDA_CALL(cudaThreadSynchronize());
		}

		// move the aggregated costs back into the caller's volume, then release the temporary
		Cuda::copy(ctGpuCost, ctTempCost);
		ctTempCost.free();
	}
	catch(const std::exception &e)
	{
		fprintf(stderr, "Error: %s", e.what());
	}
}