// $Id: ImageXBlurKernel.cu 792 2009-10-01 18:24:11Z daho2 $
#include "UnmanagedPreProcessors.h"
#include "CudaHelperCommon.cuh"

// See http://cnx.org/content/m11067/latest/ for details
//
// Weight of the n-th tap on either side of the centre (n > 0).
// With f(t) = exp(-t^2 / 2) / (1.64 t + sqrt(0.76 t^2 + 4)), the result is
// f((n - 0.5) / sigma) - f((n + 0.5) / sigma), i.e. f at the near edge of the
// tap minus f at its far edge.
// NOTE(review): f looks like the closed-form Gaussian tail approximation described
// at the link above -- confirm against that reference.
__device__ float CalculateGaussianWeight(float sigma, int n) // n > 0
{
	// Edges of tap n, expressed in units of sigma
	const float farEdge = (0.5f + (float)n) / sigma;
	const float nearEdge = (0.5f + (float)(n - 1)) / sigma;

	const float nearDenominator = 1.64f * nearEdge + sqrtf(0.76f * nearEdge * nearEdge + 4.0f);
	const float farDenominator = 1.64f * farEdge + sqrtf(0.76f * farEdge * farEdge + 4.0f);

	return expf(-nearEdge * nearEdge / 2.0f) / nearDenominator - 
		expf(-farEdge * farEdge / 2.0f) / farDenominator;
}
// Weight of the centre tap (n == 0).
// With t = 0.5 / sigma this returns 1 - 2 * exp(-t^2 / 2) / (1.64 t + sqrt(0.76 t^2 + 4)),
// i.e. one minus twice the same tail expression that CalculateGaussianWeight evaluates at its tap edges.
__device__ float CalculateGaussianWeightZero(float sigma)
{
	// Half-width of the centre tap, expressed in units of sigma
	const float halfTap = 0.5f / sigma;
	const float denominator = 1.64f * halfTap + sqrtf(0.76f * halfTap * halfTap + 4.0f);

	return 1.0f - 2.0f * expf(-halfTap * halfTap / 2.0f) / denominator;
}

// Blurs one image row along x and writes the result transposed.
//
// Launch layout: 1-D thread blocks along x; blockIdx.y selects the input row.
// Dynamic shared memory required: (blockDim.x + 2 * copyFwdThreshold) * sizeof(unsigned int).
//
// imageIn / imageInStride    input pixels; element (x, row) is imageIn[imageInStride * row + x]
// imageOut / imageOutStride  output pixels; the blurred (x, row) is stored at imageOut[imageOutStride * x + row]
// width                      number of valid pixels per input row
// height                     not used by the kernel (the grid's y extent selects the rows)
// sigma                      passed to CalculateGaussianWeight / CalculateGaussianWeightZero
// kernelHalfSize             taps 1 .. kernelHalfSize - 1 are applied on each side of the centre
// copyFwdThreshold           number of halo pixels staged in shared memory on each side of the block
//
// Only the low 24 bits of each pixel are processed, as three 8-bit channels
// (bits 16-23, 8-15 and 0-7); bits 24-31 of every output pixel are written as zero.
__global__ void ImageXBlurKernel(unsigned int* imageIn, int imageInStride, unsigned int* imageOut, int imageOutStride, 
								 int width, int height, float sigma, int kernelHalfSize, int copyFwdThreshold)
{
	extern __shared__ unsigned int shared_vals[];

	const int blockSize = (int)blockDim.x;
	const int localIdx = (int)threadIdx.x;
	const int blockStart = blockSize * (int)blockIdx.x;
	const int x = blockStart + localIdx;

	// Row base computed in size_t so stride * row cannot wrap in 32-bit arithmetic
	const unsigned int* rowIn = imageIn + (size_t)imageInStride * blockIdx.y;

	// Read in the main 'responsibility' values (zero beyond the end of the row)
	unsigned int pixVal = shared_vals[localIdx + copyFwdThreshold] = (x >= width ? 0 : rowIn[x]);

	// Read in the edge (halo) values.
	// A block-stride loop is used so that every one of the 2 * copyFwdThreshold halo slots is
	// written even when 2 * copyFwdThreshold exceeds blockDim.x; a single pass over the first
	// 2 * copyFwdThreshold threads would leave part of the left halo uninitialised in that case.
	for(int h = localIdx; h < 2 * copyFwdThreshold; h += blockSize)
	{
		int readIdx, saveIdx;

		if(h < copyFwdThreshold)
		{
			// Right halo: the pixels immediately after this block's span
			readIdx = blockStart + blockSize + h;
			saveIdx = copyFwdThreshold + blockSize + h;
		}
		else
		{
			// Left halo: the pixels immediately before this block's span
			saveIdx = h - copyFwdThreshold;
			readIdx = blockStart - copyFwdThreshold + saveIdx;
		}

		// Anything outside [0, width) contributes a zero pixel
		shared_vals[saveIdx] = ((readIdx >= 0 && readIdx < width) ? rowIn[readIdx] : 0);
	}

	__syncthreads();

	// Apply the blur kernel
	if(x < width)
	{
		float weight = CalculateGaussianWeightZero(sigma);
		float rval = (float)((pixVal >> 16) & 0xFF) * weight;
		float gval = (float)((pixVal >> 8) & 0xFF) * weight;
		float bval = (float)(pixVal & 0xFF) * weight;

		// Compute the normalizing factor as the kernel is applied
		float sumWeight = weight;

		// Never reach further than the halo that was actually staged, so the shared-memory
		// indices below stay inside the allocation even if the caller passes
		// kernelHalfSize - 1 > copyFwdThreshold
		const int tapEnd = min(kernelHalfSize, copyFwdThreshold + 1);

		for(int i = 1; i < tapEnd; ++i)
		{
			weight = CalculateGaussianWeight(sigma, i);

			unsigned int rightVal = shared_vals[localIdx + copyFwdThreshold + i];
			unsigned int leftVal = shared_vals[localIdx + copyFwdThreshold - i];

			// Weight each colour-component value independently
			rval += (float)((rightVal >> 16) & 0xFF) * weight;
			rval += (float)((leftVal >> 16) & 0xFF) * weight;
			gval += (float)((rightVal >> 8) & 0xFF) * weight;
			gval += (float)((leftVal >> 8) & 0xFF) * weight;
			bval += (float)(rightVal & 0xFF) * weight;
			bval += (float)(leftVal & 0xFF) * weight;

			// N.B. the following uses the default conversion bool->int
			// Out-of-bounds pixels were loaded as zero, so their weights are left out of the
			// normalizing sum in order to preserve brightness at the image edges
			sumWeight += (x + i < width) * weight + (x - i >= 0) * weight;
		}
		
		// Uncoalesced write - due to co-ordinates transpose action
		imageOut[(size_t)imageOutStride * x + blockIdx.y] = (unsigned int)(
			(((unsigned int)min(rval / sumWeight, 255.0f) & 0xFF) << 16) + 
			(((unsigned int)min(gval / sumWeight, 255.0f) & 0xFF) << 8) + 
			((unsigned int)min(bval / sumWeight, 255.0f) & 0xFF));
	}
}

// Host-side launcher for ImageXBlurKernel.
//
// inputImage / inputImageStride    passed to the kernel as imageIn / imageInStride
// outputImage / outputImageStride  passed to the kernel as imageOut / imageOutStride
// width, height                    image size; one block row is launched per image row
// sigma                            blur strength; also determines how many taps are applied
//
// Does nothing when width or height is not positive.
void RunImageXBlurKernel(unsigned int* inputImage, int inputImageStride, unsigned int* outputImage, int outputImageStride, 
						 int width, int height, float sigma)
{
	// Nothing to process. This also keeps (width - 1) from wrapping around in the unsigned
	// grid-size arithmetic below and avoids launching with a zero-sized grid.
	if(width <= 0 || height <= 0)
	{
		return;
	}

	// Taps 1 .. kernelHalfSize - 1 are applied on each side; always at least two per side
	int kernelHalfSize = max((int)(sigma + 2), 3);

	// The copy forward threshold determines how many elements each side should be loaded
	// (kernelHalfSize rounded up to a multiple of 16)
	int copyFwdThreshold = 16 * ((kernelHalfSize - 1) / 16 + 1);
	
	const unsigned int threadsPerBlock = 128;
	dim3 blockDimension(threadsPerBlock, 1);

	// Ceil-div done explicitly in unsigned arithmetic; width > 0 is guaranteed above
	dim3 gridDimension(((unsigned int)width + threadsPerBlock - 1) / threadsPerBlock, (unsigned int)height);

	// One slot per thread plus a halo of copyFwdThreshold pixels on each side
	int sharedMemBytes = (blockDimension.x + 2 * copyFwdThreshold) * sizeof(unsigned int);

	RECORD_KERNEL_LAUNCH("Image blur kernel", gridDimension, blockDimension);

	ImageXBlurKernel<<<gridDimension, blockDimension, sharedMemBytes>>>(inputImage, inputImageStride, outputImage, outputImageStride, 
		width, height, sigma, kernelHalfSize, copyFwdThreshold);

	CHECK_KERNEL_ERROR("Image blur kernel");
}