// $Id: CostBlurKernel.cu 792 2009-10-01 18:24:11Z daho2 $
#include "UnmanagedDownsamplers.h"
#include "CudaHelperCommon.cuh"

// See http://cnx.org/content/m11067/latest/ for details
// Approximate weight of filter tap n (n > 0) for a Gaussian with standard deviation sigma.
// The tap spans [n - 0.5, n + 0.5]; both edges are expressed in units of sigma and the
// weight is the difference of the tail term
//     exp(-t^2 / 2) / (1.64 t + sqrt(0.76 t^2 + 4))
// evaluated at the near and the far edge (presumably the Gaussian tail approximation
// described at the link above - confirm against that reference).
// sigma is used as a divisor; n <= 0 makes the near edge negative, which the n > 0
// precondition excludes.
__host__ __device__ float GaussianWeight(float sigma, int n) // n > 0
{
	const float nearEdge = (0.5f + (float)(n - 1)) / sigma;
	const float farEdge = (0.5f + (float)n) / sigma;

	// Tail term at the edge closer to the kernel centre
	const float nearTail = expf(-nearEdge * nearEdge / 2.0f) / (1.64f * nearEdge + sqrtf(0.76f * nearEdge * nearEdge + 4.0f));

	// Tail term at the edge further from the kernel centre
	const float farTail = expf(-farEdge * farEdge / 2.0f) / (1.64f * farEdge + sqrtf(0.76f * farEdge * farEdge + 4.0f));

	return nearTail - farTail;
}
// Approximate weight of the centre tap for a Gaussian with standard deviation sigma:
// one minus twice the tail term exp(-t^2 / 2) / (1.64 t + sqrt(0.76 t^2 + 4))
// evaluated at t = 0.5 / sigma, i.e. half a sample on either side of the centre.
// sigma is used as a divisor.
__host__ __device__ float GaussianWeightZero(float sigma)
{
	const float halfStep = 0.5f / sigma;
	const float expTerm = expf(-halfStep * halfStep / 2.0f);
	const float denom = 1.64f * halfStep + sqrtf(0.76f * halfStep * halfStep + 4.0f);

	return 1.0f - 2.0f * expTerm / denom;
}

// Blurs each row of costIn along x with a symmetric filter of (2 * kernelHalfSize - 1) taps
// whose weights come from GaussianWeightZero / GaussianWeight, scales the result by
// normFactor and stores it in costOut with the first two index arguments swapped relative
// to the read (presumably a transposed write - confirm against ACCESS_3D).
//
// Launch requirements:
//   - blockIdx.y selects the input row; blockDim.x * blockIdx.x + threadIdx.x selects the column.
//   - Dynamic shared memory must hold (blockDim.x + 2 * copyFwdThreshold) floats.
//   - copyFwdThreshold must be >= kernelHalfSize - 1 so every tap stays inside the halo.
//
// Parameters:
//   costIn           - input volume; costIn.xsize / sizeof(float) is used as the row width
//   depth            - number of slices processed sequentially by every thread
//   sigma            - standard deviation passed to the weight functions
//   kernelHalfSize   - taps 1 .. kernelHalfSize - 1 are applied on each side of the centre
//   copyFwdThreshold - halo width (in floats) staged on each side of the block's tile
//   normFactor       - factor multiplied into the blurred value before it is written
//   costOut          - output volume
//
// Columns outside [0, width) contribute zero to the blur.
__global__ void CostBlurKernel(const cudaPitchedPtr costIn, const int depth, const float sigma, 
								const int kernelHalfSize, const int copyFwdThreshold, const float normFactor, const cudaPitchedPtr costOut)
{
	extern __shared__ float shared_vals[];

	// Loop-invariant values, computed once per thread
	const int width = (int)(costIn.xsize / sizeof(float));
	const int blockStart = blockDim.x * blockIdx.x;
	const int x = blockStart + threadIdx.x;
	const int centre = threadIdx.x + copyFwdThreshold;	// this thread's slot in the shared tile
	const int haloCount = 2 * copyFwdThreshold;			// right halo + left halo
	const float centreWeight = GaussianWeightZero(sigma);

	for(int d = 0; d < depth; ++d)
	{
		// Read in the main 'responsibility' value (zero past the end of the row)
		float costVal = 0.0f;
		if(x < width)
			costVal = ACCESS_3D(costIn, x, blockIdx.y, d);
		shared_vals[centre] = costVal;

		// Read in the edge values. The loop is strided by blockDim.x so that the whole halo
		// is filled even when 2 * copyFwdThreshold exceeds the number of threads in the block;
		// a single guarded pass would leave part of the left halo unwritten in that case.
		for(int h = threadIdx.x; h < haloCount; h += blockDim.x)
		{
			int readIdx, saveIdx;
			bool doSave;

			if(h < copyFwdThreshold)
			{
				// Right halo: columns directly after this block's tile
				readIdx = blockStart + blockDim.x + h;
				saveIdx = blockDim.x + copyFwdThreshold + h;
				doSave = (readIdx < width);
			}
			else
			{
				// Left halo: columns directly before this block's tile
				readIdx = blockStart + h - 2 * copyFwdThreshold;
				saveIdx = h - copyFwdThreshold;
				doSave = (readIdx >= 0);
			}

			float haloVal = 0.0f;
			if(doSave)
				haloVal = ACCESS_3D(costIn, readIdx, blockIdx.y, d);
			shared_vals[saveIdx] = haloVal;
		}

		// The tile must be complete before any thread reads its neighbours
		__syncthreads();

		// Apply the blur kernel
		if(x < width)
		{
			costVal *= centreWeight;

			for(int i = 1; i < kernelHalfSize; ++i)
				costVal += GaussianWeight(sigma, i) * 
					(shared_vals[centre + i] + shared_vals[centre - i]);
			
			// Uncoalesced write
			ACCESS_3D(costOut, blockIdx.y, x, d) = costVal * normFactor;
		}

		// Nobody may overwrite the tile for the next slice while it is still being read
		__syncthreads();
	}
}

// Host-side launcher for CostBlurKernel.
//
// Derives the kernel half size and halo width from sigma, computes the factor that
// normalises the summed filter weights to one, and launches one 128-thread block per
// 128 columns of each row of costIn.
//
// Parameters:
//   costIn  - input volume; costIn.xsize / sizeof(float) is used as the row width and
//             costIn.ysize as the number of rows
//   depth   - number of slices the kernel iterates over
//   sigma   - standard deviation of the blur
//   costOut - output volume, passed through to the kernel
//
// Returns without launching when the volume is empty (zero width, zero rows or depth <= 0).
void RunCostBlurKernel(const cudaPitchedPtr & costIn, int depth, float sigma, const cudaPitchedPtr & costOut)
{
	// An empty row would make the unsigned ceil-division below wrap around to a huge grid,
	// and a zero-sized grid is an invalid launch configuration; there is nothing to blur anyway.
	const size_t width = costIn.xsize / sizeof(float);
	if(width == 0 || costIn.ysize == 0 || depth <= 0)
		return;

	const int kernelHalfSize = max((int)(sigma + 2), 3);

	// Sum all of the weights in order to get an appropriate normalizing factor
	// (centre tap once, every other tap once per side)
	float sumGaussianWeights = GaussianWeightZero(sigma);
	for(int i = 1; i < kernelHalfSize; ++i)
		sumGaussianWeights += 2 * GaussianWeight(sigma, i);

	// The copy forward threshold determines how many elements each side should be loaded;
	// it is (kernelHalfSize - 1) rounded up to the next multiple of 16 (always at least 16)
	const int copyFwdThreshold = 16 * ((kernelHalfSize - 1) / 16 + 1);
	
	dim3 blockDimension(128, 1);
	dim3 gridDimension((width - 1) / blockDimension.x + 1, costIn.ysize);

	// One float per thread plus a halo of copyFwdThreshold floats on each side
	const size_t sharedMemBytes = (blockDimension.x + 2 * copyFwdThreshold) * sizeof(float);

	RECORD_KERNEL_LAUNCH("Cost space blur kernel", gridDimension, blockDimension);

	CostBlurKernel<<<gridDimension, blockDimension, sharedMemBytes>>>(costIn, depth, sigma, kernelHalfSize, copyFwdThreshold, 
		1.0f / sumGaussianWeights, costOut);

	CHECK_KERNEL_ERROR("Cost space blur kernel");
}