// $Id: YoonKweonKernel.cu 855 2009-11-01 13:13:39Z cr333 $

// Build-time switches for this file (each one is #undef'd again at the end of the file).

// redirect stdout & stderr to files:
// when defined, RunAggregationYoonKweon reopens stdout and stderr onto
// "YK-stdout.txt" and "YK-stderr.txt" and closes both streams before returning.
// Disabled by default (the define is commented out).
//#define REDIRECT_OUTPUTS

// Not referenced directly in this file; presumably read by the logging macros
// of the helper headers included below -- TODO confirm.
#define VERBOSE_OUTPUT 0

// Selects the standalone interface: raw device pointers are passed to
// RunAggregationYoonKweon and "CudaHelperCommon.cuh" is included instead of
// "settings.h" / "stereo.h".
#define YK_STANDALONE

#include <exception>
#include <cutil.h>
#include "cudatemplates/copy.hpp"
#include "cudatemplates/devicememory.hpp"
#include "cudatemplates/devicememorypitched.hpp"
#include "cudatemplates/devicememoryreference.hpp"
#include "cudamath.h"

#ifdef YK_STANDALONE
#define AGG_TILE_SIZE 16
#include "CudaHelperCommon.cuh"
#else
#include "settings.h"
#include "stereo.h"
#endif

// spatial weight: plot exp(-x/17.5) and exp(-x^2/(2*16*16)), x=0..20

// Support weight between two pixels that lie (dx, dy) apart.
//   dx, dy  - pixel offset between the two pixels
//   c1, c2  - colours of the two pixels
//   gammaC  - divisor applied to the colour distance
//   gammaP  - divisor applied to the spatial distance
// Returns exp(-(colourDistance / gammaC + spatialDistance / gammaP)), evaluated
// with the fast __expf intrinsic.
inline __device__ float CostYK(const int dx, const int dy, const uchar4 c1, const uchar4 c2, const float gammaC, const float gammaP)
{
	// Spatial term: Euclidean length of the pixel offset.
	const int offsetSquared = dx * dx + dy * dy;
	const float spatialDistance = sqrtf(float(offsetSquared));

	// Colour term: Euclidean distance in Lab, assuming linear RGB.
	//
	// Alternative colour distances (kept for reference, currently disabled):
	//   ignore colour difference:
	//       0
	//   AD in RGB:
	//       fabsf(float(c1.x) - float(c2.x)) + fabsf(float(c1.y) - float(c2.y)) + fabsf(float(c1.z) - float(c2.z))
	//   Euclidean distance in RGB:
	//       Euclidean(uchar4_to_float3(c1), uchar4_to_float3(c2))
	//   Euclidean distance in Lab, assuming sRGB:
	//       Euclidean(xyz2lab(rgb2xyz(srgb2rgb(uchar4_to_float3(c1)))), xyz2lab(rgb2xyz(srgb2rgb(uchar4_to_float3(c2)))))
	const float colourDistance = Euclidean(
		xyz2lab(rgb2xyz(uchar4_to_float3(c1))),
		xyz2lab(rgb2xyz(uchar4_to_float3(c2)))
	);

	const float colourTerm = colourDistance / gammaC;
	const float spatialTerm = spatialDistance / gammaP;

	// Yoon & Kweon weighting.
	// Alternative (DCB), currently disabled:
	//   __expf(-(deltaC * deltaC / (2 * gammaC * gammaC))) * sqrtf(__expf(-(deltaP * deltaP / (2 * gammaP * gammaP))))
	return __expf(-(colourTerm + spatialTerm));
}

//inline __device__ float Gs(const float x) { return fastGaussian2<SIGMA_S>(x); }
//inline __device__ float Gc(const float x) { return fastGaussian(x, SIGMA_C); }

// Aggregates one disparity slice of a cost volume with adaptive support weights
// (Yoon & Kweon), writing the result to a second cost volume.
//
// Launch layout: 2D grid of 2D blocks covering width x height; each thread
// handles the pixel (x, y) = (blockIdx * blockDim + threadIdx). Threads outside
// the image do nothing. No shared memory is used.
//
// Parameters:
//   gpuCost     - input cost volume (float); row y of slice d is row (height * d + y)
//   gpuOutCost  - output cost volume (float), same row layout as gpuCost
//   gpuImg1     - left image, one 32-bit word per pixel (converted via int_to_uchar4)
//   gpuImg2     - right image, same format as gpuImg1
//   width       - image width in pixels
//   height      - image height in pixels
//   d           - disparity (slice index) processed by this launch
//   radius      - half-size of the square support window; with radius < 0 no
//                 neighbour is visited and the written value is 0/0
//   gammaC      - colour divisor forwarded to CostYK
//   gammaP      - spatial divisor forwarded to CostYK
//
// All pitches are in bytes and are converted to 4-byte element strides (pitch >> 2).
//
// Pixels with x < d have no corresponding pixel in the right image; their cost
// is copied unchanged from gpuCost. For all other pixels the output is the
// weighted average of the input costs in the window, where the weight of a
// neighbour is CostYK(left centre, left neighbour) * CostYK(right centre, right neighbour).
// Neighbours outside the image, or whose right-image counterpart would lie
// outside the image (x + dx < d), carry zero weight and are therefore skipped.
__global__ void YoonKweonAggregationKernel(
	const cudaPitchedPtr gpuCost,
	const cudaPitchedPtr gpuOutCost,
	const cudaPitchedPtr gpuImg1,
	const cudaPitchedPtr gpuImg2,
	const unsigned int width, const unsigned int height, const unsigned int d, const int radius, const float gammaC, const float gammaP)
{
	const int x = blockIdx.x * blockDim.x + threadIdx.x;
	const int y = blockIdx.y * blockDim.y + threadIdx.y;

	// only pixels inside the images
	if(x >= int(width) || y >= int(height))
		return;

	const int disp = int(d);

	// row strides in 4-byte elements (pitches are given in bytes)
	const size_t costStride = gpuCost.pitch >> 2;
	const size_t outStride  = gpuOutCost.pitch >> 2;
	const size_t img1Stride = gpuImg1.pitch >> 2;
	const size_t img2Stride = gpuImg2.pitch >> 2;

	const float* const inCost  = (const float*)gpuCost.ptr;
	float* const       outCost = (float*)gpuOutCost.ptr;
	const int* const   img1    = (const int*)gpuImg1.ptr;
	const int* const   img2    = (const int*)gpuImg2.ptr;

	// first row of disparity slice d inside the cost volumes;
	// computed in size_t so that large volumes cannot wrap a 32-bit index
	const size_t sliceRow0 = size_t(height) * d;
	const size_t outIndex  = outStride * (sliceRow0 + y) + x;

	if(x < disp)
	{
		// Some disparity values map the left-most pixels (in the left image) outside the
		// corresponding right image. So just copy their costs from the input cost space.
		outCost[outIndex] = inCost[costStride * (sliceRow0 + y) + x];
		return;
	}

	// remaining pixels have a corresponding pixel in the right view

	// read centre pixels only once
	const uchar4 pixL1 = int_to_uchar4(img1[img1Stride * y + x]);
	const uchar4 pixR1 = int_to_uchar4(img2[img2Stride * y + (x - disp)]);

	// Clamp the window to the part that can contribute:
	//   rows:    0 <= y + dy < height
	//   columns: d <= x + dx < width   (x + dx >= d also implies x + dx >= 0)
	// The visiting order is the same as an unclamped scan with per-pixel checks.
	const int rowsAbove  = y;
	const int rowsBelow  = int(height) - 1 - y;
	const int colsLeft   = x - disp;
	const int colsRight  = int(width) - 1 - x;
	const int dyMin = (rowsAbove < radius) ? -rowsAbove : -radius;
	const int dyMax = (rowsBelow < radius) ?  rowsBelow :  radius;
	const int dxMin = (colsLeft  < radius) ? -colsLeft  : -radius;
	const int dxMax = (colsRight < radius) ?  colsRight :  radius;

	// homogeneous accumulator: (sum of weight * cost, sum of weights)
	float2 acc = make_float2(0.0f, 0.0f);

	// loop over all pixels in the (clamped) neighbourhood
	for(int dy = dyMin; dy <= dyMax; dy++)
	{
		// row bases are invariant across the inner loop
		const int* const   rowL    = img1 + img1Stride * (y + dy);
		const int* const   rowR    = img2 + img2Stride * (y + dy);
		const float* const rowCost = inCost + costStride * (sliceRow0 + (y + dy));

		for(int dx = dxMin; dx <= dxMax; dx++)
		{
			const int xn = x + dx;

			const uchar4 pixL2 = int_to_uchar4(rowL[xn]);
			const uchar4 pixR2 = int_to_uchar4(rowR[xn - disp]);

			// combined support weight from the left and the right window
			const float w =
				CostYK(dx, dy, pixL1, pixL2, gammaC, gammaP) *
				CostYK(dx, dy, pixR1, pixR2, gammaC, gammaP);

			acc += make_float2(w * rowCost[xn], w);
		}
	}

	outCost[outIndex] = acc.x / acc.y; // weighted average
}


// Host entry point: runs YoonKweonAggregationKernel once per disparity and
// replaces the contents of the cost volume with the aggregated costs.
//
// Standalone build (YK_STANDALONE defined):
//   gpuCost          - pitched device cost volume, aggregated in place
//   numDisps         - number of disparity slices in the cost volume
//   gpuImg1, gpuImg2 - device pointers to the two images
//   pitch            - pitch set on both image references
//   w, h             - image width and height
//   radius, gammaP, gammaC - forwarded to the kernel
// Otherwise the cost volume and images arrive as cudatemplates objects and the
// remaining settings are read from the Parameters argument (radius is fixed to 17).
//
// Any std::exception raised inside is caught and reported on stderr; nothing is
// propagated to the caller.
void RunAggregationYoonKweon(
#ifdef YK_STANDALONE
	const cudaPitchedPtr& gpuCost,
	const unsigned int numDisps,
	const unsigned int* gpuImg1,
	const unsigned int* gpuImg2,
	const unsigned int pitch,
#else
	Cuda::DeviceMemory<float, 3>& ctGpuCost,
	const Cuda::DeviceMemory<unsigned int, 2>& ctGpuImg1,
	const Cuda::DeviceMemory<unsigned int, 2>& ctGpuImg2,
#endif
	const unsigned int w, const unsigned int h,
#ifdef YK_STANDALONE
	const int radius, const float gammaP, const float gammaC
#else
	Parameters p
#endif
	)
{
#ifdef YK_STANDALONE
	// wrap the raw device pointers in cudatemplates references and attach their pitches
	Cuda::DeviceMemoryReference3D<float> ctGpuCost(w, h, numDisps, (float*)gpuCost.ptr);
	ctGpuCost.setPitch(gpuCost.pitch);

	Cuda::DeviceMemoryReference2D<const unsigned int> ctGpuImg1(w, h, gpuImg1);
	ctGpuImg1.setPitch(pitch);

	Cuda::DeviceMemoryReference2D<const unsigned int> ctGpuImg2(w, h, gpuImg2);
	ctGpuImg2.setPitch(pitch);
#else
	// pull the aggregation settings out of the parameter set
	const int radius = 17;
	const float gammaP = p.aggr.gammaP;
	const float gammaC = p.aggr.gammaC;
	const unsigned int numDisps = p.numDisps;
	const bool verbose = p.verbose;
#endif

#ifdef REDIRECT_OUTPUTS
	// send stdout & stderr to log files; give up if either cannot be opened
	FILE* outLog = freopen("YK-stdout.txt", "w", stdout);
	FILE* errLog = freopen("YK-stderr.txt", "w", stderr);

	if (!outLog || !errLog)
		exit(1);
#endif // REDIRECT_OUTPUTS

	try
	{
		// scratch cost volume that receives the aggregated costs
		Cuda::DeviceMemoryPitched3D<float> aggregatedCost(w, h, numDisps);

		// square tiles; enough of them to cover the whole image (rounding up)
		dim3 tile(AGG_TILE_SIZE, AGG_TILE_SIZE, 1);
		const unsigned int tilesAcross = (w + tile.x - 1) / tile.x;
		const unsigned int tilesDown   = (h + tile.y - 1) / tile.y;
		dim3 tiling(tilesAcross, tilesDown, 1);

		RECORD_KERNEL_LAUNCH("Yoon Kweon aggregation kernel", tiling, tile);

		// one launch per disparity slice
		for(unsigned int disparity = 0; disparity < numDisps; disparity++)
		{
			YoonKweonAggregationKernel<<<tiling, tile>>>(
				toPitchedPtr(ctGpuCost),
				toPitchedPtr(aggregatedCost),
				toPitchedPtr(ctGpuImg1),
				toPitchedPtr(ctGpuImg2),
				w, h, disparity, radius, gammaC, gammaP);

			// check and synchronise after every launch, as the total runtime of all
			// launches together might exceed the 2 second timeout in Windows
			CHECK_KERNEL_ERROR("Yoon Kweon aggregation kernel");
			CUDA_CALL(cudaThreadSynchronize());
		}

		// overwrite the input costs with the aggregated ones, then release the scratch volume
		Cuda::copy(ctGpuCost, aggregatedCost);
		aggregatedCost.free();
	}
	catch(const std::exception &e)
	{
		fprintf(stderr, "Error: %s", e.what());
	}

#ifdef REDIRECT_OUTPUTS
	fflush(outLog); fclose(outLog);
	fflush(errLog); fclose(errLog);
#endif
}

// clean up: remove the build-time switches defined at the top of this file so
// they do not stay defined for any code that follows.
// (REDIRECT_OUTPUTS is only defined when its #define is un-commented; #undef of
// an undefined macro is harmless.)
#undef YK_STANDALONE
#undef VERBOSE_OUTPUT
#undef REDIRECT_OUTPUTS