#include "GpuCostSpace.h"
#include "CudaHelperCommon.cuh"

// (Re)allocates the device grid when the requested type or dimensions differ
// from the ones currently stored; otherwise the existing allocation is kept.
//   type   - cost space type to create
//   width  - elements per row (each element takes sizeof(float) bytes)
//   height - number of rows
//   depth  - number of slices
void GpuCostSpace::Create(GpuCostSpaceType type, int width, int height, int depth)
{
	const bool needsReallocation = type != gridType || width != w || height != h || depth != d;
	if(!needsReallocation)
		return;

	// Drop the current allocation before requesting the new one.
	Destroy();

	LOG_EVENT("Creating cost space");

	// Both known types use the same float layout; any other type leaves the
	// extent exactly as it was.
	if(type == COST_SPACE_TYPE_ANY || type == COST_SPACE_TYPE_SINGLE)
	{
		extent.depth = depth;
		extent.height = height;
		extent.width = width * sizeof(float); // the extent width is given in bytes
	}

	CUDA_CALL(cudaMalloc3D(&gpuGrid, extent));

	// Remember what was allocated so an identical request becomes a no-op.
	gridType = type;
	d = depth;
	h = height;
	w = width;
}

// Releases the device grid through CUDA_FREE and zeroes the stored dimensions.
void GpuCostSpace::Destroy()
{
	LOG_EVENT("Destroying cost space");

	// Hand the device pointer to the project's free helper.
	CUDA_FREE(gpuGrid.ptr);

	// Reset the stored dimensions.
	d = 0;
	h = 0;
	w = 0;
}

// Exchanges the device grid and all bookkeeping that describes it (extent,
// type and dimensions) between this cost space and `other`. No device memory
// is copied; only the handles and metadata change owner.
void GpuCostSpace::SwapData(GpuCostSpace & other)
{
	Swap<cudaPitchedPtr>(gpuGrid, other.gpuGrid);

	// The extent describes the allocation behind gpuGrid, so it has to travel
	// with the pointer. Create() skips reallocation when type and dimensions
	// match, so a stale extent would otherwise be used by later 3D copies.
	Swap<cudaExtent>(extent, other.extent);

	Swap<GpuCostSpaceType>(gridType, other.gridType);
	Swap<int>(w, other.w);
	Swap<int>(h, other.h);
	Swap<int>(d, other.d);
}

// Makes this cost space a copy of `other`: resizes it to match and then copies
// the device grid device-to-device.
//   other  - source cost space; must not be null
//   stream - currently unused: despite the function name the copy is issued
//            with the blocking cudaMemcpy3D (the asynchronous call is disabled)
void GpuCostSpace::AsyncCopyFrom(const GpuCostSpace* const other, const GpuExecutionStream & stream)
{
	// Copying a space onto itself changes nothing, and cudaMemcpy3D has
	// undefined behaviour when source and destination overlap.
	if(other == this)
		return;

	SizeToMatch(*other);

	cudaMemcpy3DParms params = { 0 };

	params.srcPtr = other->gpuGrid;
	params.dstPtr = gpuGrid;
	params.extent = extent;
	params.kind = cudaMemcpyDeviceToDevice;

	//cudaMemcpy3DAsync(&params, stream.GetStream());
	CUDA_CALL(cudaMemcpy3D(&params));
}

// Gives this cost space the same type and dimensions as `other` by forwarding
// them to Create(); the contents of `other` are not copied.
void GpuCostSpace::SizeToMatch(const GpuCostSpace & other)
{
	const int targetWidth = other.GetWidth();
	const int targetHeight = other.GetHeight();
	const int targetDepth = other.GetDepth();

	Create(other.gridType, targetWidth, targetHeight, targetDepth);
}

#pragma region Copying out
// Copies a pitched device volume into a tightly packed host buffer.
// The volume is transferred as one 2D copy of h * d rows, each w floats wide.
//   devicePtr - pitched device allocation to read from
//   hostPtr   - destination buffer; rows are written w * sizeof(float) bytes apart
void GpuCostSpace::CopyDeviceSpaceToHost(const cudaPitchedPtr & devicePtr, float* hostPtr)
{
	// Bytes of payload per row; also used as the (unpadded) host pitch.
	const size_t rowBytes = w * sizeof(float);
	// All slices are copied as one long run of rows.
	const size_t rowCount = h * d;

	CUDA_CALL(cudaMemcpy2D(hostPtr, rowBytes, devicePtr.ptr, devicePtr.pitch, rowBytes, rowCount, cudaMemcpyDeviceToHost));
}

// Expands a cost volume stored as bytes packed four-per-word along the depth
// axis into a volume of floats, scaling each byte by 1/255.
//   bytesSpaceIn   - input volume, read as unsigned int words
//   maxDepth       - number of depth entries that may be written to the output
//   maxDepthVal    - number of packed words to read along the depth axis
//   floatsSpaceOut - output volume of floats
// Each thread handles one (x, y) position taken from a 2D launch; threads that
// fall outside the input's xsize / ysize do nothing.
// NOTE(review): the addressing is done by the ACCESS_3D / ACCESS_3D_UINT macros
// defined elsewhere - presumably pitched 3D accessors; confirm against their definition.
__global__ void TransformBytesToFloatsKernel(const cudaPitchedPtr bytesSpaceIn, const int maxDepth, const int maxDepthVal, const cudaPitchedPtr floatsSpaceOut)
{
	int x = blockDim.x * blockIdx.x + threadIdx.x;
	int y = blockDim.y * blockIdx.y + threadIdx.y;

	// xsize is divided by the word size to get the number of words per row.
	if(x < bytesSpaceIn.xsize / sizeof(unsigned int) && y < bytesSpaceIn.ysize)
	{
		// Convert the depth-wise packed bytes (four per word) to unpacked floats
		for(int dval = 0; dval < maxDepthVal; ++dval)
		{
			unsigned int costVal = ACCESS_3D_UINT(bytesSpaceIn, x, y, dval);

			// i selects the byte inside the word: i == 3 is the most significant
			// byte and lands at output depth 4 * dval, i == 0 is the least
			// significant byte and lands at 4 * dval + 3. Entries at or beyond
			// maxDepth are skipped.
#pragma unroll 4
			for(int i = 3; i >= 0; --i)
				if(4 * dval + 3 - i < maxDepth)
					ACCESS_3D(floatsSpaceOut, x, y, 4 * dval + 3 - i) = ((float)((costVal >> (8 * i)) & 0xFF)) / 255.0f;
		}
	}
}

// Copies the device grid into the host buffer `result`.
// Only the ANY and SINGLE grid types are handled; for any other type the
// function does nothing.
void GpuCostSpace::CopyDataOut(float* result)
{
	const bool isHandledType = (gridType == COST_SPACE_TYPE_ANY) || (gridType == COST_SPACE_TYPE_SINGLE);
	if(isHandledType)
		CopyDeviceSpaceToHost(gpuGrid, result);
}
#pragma endregion

#pragma region Down-sampling

// Down-samples a float cost volume in x and y by picking every dsFactor-th
// element; the depth axis is copied one-to-one.
//   costIn   - source volume
//   dsFactor - sampling step applied to both x and y
//   depth    - number of depth entries to copy
//   costOut  - destination volume; its xsize / ysize bound the work
// Each thread handles one output (column, row) of a 2D launch and walks the
// depth axis itself.
__global__ void DownSampleCostsFloat(const cudaPitchedPtr costIn, int dsFactor, int depth, const cudaPitchedPtr costOut)
{
	const int col = blockDim.x * blockIdx.x + threadIdx.x;
	const int row = blockDim.y * blockIdx.y + threadIdx.y;

	// Threads of the padded grid that fall outside the output have nothing to do.
	if(col >= costOut.xsize / sizeof(float) || row >= costOut.ysize)
		return;

	for(int slice = 0; slice < depth; ++slice)
		ACCESS_3D(costOut, col, row, slice) = ACCESS_3D(costIn, dsFactor * col, dsFactor * row, slice);
}

// Resizes this cost space to `other`'s dimensions divided by downsampleFactor
// in width and height (depth is kept) and fills it by sampling every
// downsampleFactor-th element of `other`.
//   other            - source cost space; must not be null
//   downsampleFactor - sampling step; must be >= 1 (it is used as a divisor)
// Only the ANY and SINGLE grid types launch a kernel.
void GpuCostSpace::DownsampleFrom(const GpuCostSpace* const other, int downsampleFactor)
{
	Create(other->GetType(), other->w / downsampleFactor, other->h / downsampleFactor, other->d);

	// An empty result has nothing to sample. Bail out before the grid size is
	// computed: with w or h equal to 0, (w - 1) is converted to unsigned and
	// wraps around, yielding an enormous (and possibly invalid) launch grid.
	if(w <= 0 || h <= 0 || d <= 0)
		return;

	dim3 blockDimension(32, 8);
	// Round up so partially filled blocks still cover the right/bottom edges.
	dim3 gridDimension((w - 1) / blockDimension.x + 1, (h - 1) / blockDimension.y + 1);

	switch(gridType)
	{
	case COST_SPACE_TYPE_ANY: //fall-through
	case COST_SPACE_TYPE_SINGLE:

		RECORD_KERNEL_LAUNCH("Down-sample (floats) kernel", gridDimension, blockDimension);
		DownSampleCostsFloat<<<gridDimension, blockDimension>>>(other->gpuGrid, downsampleFactor, d, gpuGrid);
		CHECK_KERNEL_ERROR("Down-sample (floats) kernel");

		break;
	}
}

#pragma endregion