// $Id: DcbGrid.cu 860 2009-11-03 13:42:55Z cr333 $

// redirect stdout & stderr to files
//#define REDIRECT_OUTPUTS
#define VERBOSE_OUTPUT 0

#include <exception>
#include "cudatemplates/devicememorypitched.hpp"
#include "GpuTiledImages.hpp"
#include "DcbGrid.cuh"


// Computes a layout of <count> tiles of size <width> x <height>,
// to fit into a square of length <max_length> x <max_length>.
// The resulting tiling will be <tile_x> columns by <tile_y> rows.
// NB: <tile_x> x <tile_y> can be larger than <count> for non-rectangular layouts.
//
// A layout fits if its total extent does not exceed <max_length> in either
// dimension; an exact fit (extent == <max_length>) is accepted.
//
// Degenerate inputs:
//   - <count> == 0 yields the empty layout 0 x 0.
//   - If <width> is zero or larger than <max_length>, not even one column fits:
//     this asserts in debug builds and yields 0 x 0 otherwise (instead of
//     dividing by zero).
void ComputeLayout(const unsigned int width, const unsigned int height, const unsigned int count,
				   const unsigned int max_length, unsigned int& tile_x, unsigned int& tile_y)
{
	tile_x = 0;
	tile_y = 0;

	// no tiles => empty layout
	if(count == 0)
		return;

	// not a single column of tiles fits horizontally
	if(width == 0 || width > max_length)
	{
		assert(width > 0 && width <= max_length);
		return;
	}

	// maximum number of columns that fit horizontally (at least 1 from here on)
	const unsigned int max_tile_x = max_length / width;

	// First try to find a rectangular layout for all tiles.
	// For example: 3 x 4 layout of 12 tiles:
	//   +--+--+--+~~~
	//   +--+--+--+~~~
	//   +--+--+--+~~~
	//   +--+--+--+~~~
	//   ~~~~~~~~~~~~~ <- unused space
	//   ~~~~~~~~~~~~~
	// Start at floor(sqrt(count)) columns, capped by what fits horizontally.
	unsigned int cols = (unsigned int)(sqrtf((float)count) + 0.1f); // add in fiddle factor of 0.1 to avoid potential rounding issues
	if(cols > max_tile_x)
		cols = max_tile_x;

	for(; cols > 0; cols--)
	{
		// only column counts that divide <count> give a full rectangle
		if(count % cols != 0)
			continue;

		const unsigned int rows = count / cols;

		// cols <= max_tile_x already guarantees the horizontal fit;
		// check the vertical fit in 64 bits so the product cannot wrap around
		if((unsigned long long)height * rows <= max_length)
		{
			tile_x = cols;
			tile_y = rows;
			return;
		}
	}

	// Avoid pathological cases by tiling in reading order (which may be inefficient).
	// For example: 3 x 4 layout of 11 tiles:
	//   +---+---+---+
	//   +---+---+---+
	//   +---+---+---+
	//   +---+---+#### <- not used, but inside the <tile_x> x <tile_y> rectangle
	//   ~~~~~~~~~~~~~ <- unused space
	//   ~~~~~~~~~~~~~
	tile_x = (count < max_tile_x) ? count : max_tile_x;
	tile_y = (count - 1) / tile_x + 1; // ceil(count / tile_x); tile_x >= 1 here

	// check if this is valid
	assert((unsigned long long)width * tile_x <= max_length);
	assert((unsigned long long)height * tile_y <= max_length);
}


// Fills in all derived dimensions of <c> (for internal use):
// the downsampled grid extents <dw>, <dh>, <dc> and the tile layout <tile_x> x <tile_y>.
void CalculateGridTextureSize(Config& c)
{
	// Dimensions of the bilateral grid are computed using floor, e.g. downsampled width:
	//   x = 0..floor(x_max/s_s) => floor(x_max/s_s) + 1 levels
	// (A rounding variant, int(x_max / s_s + 0.5f) + 1, was used previously.)
	c.dw = static_cast<unsigned int>((c.w - 1) / c.sigmaS) + 1;
	c.dh = static_cast<unsigned int>((c.h - 1) / c.sigmaS) + 1;
	c.dc = static_cast<unsigned int>(100.0f / c.sigmaC) + 1;

	// Lay out <numDisps> tiles of (dw * dc) x (dh * dc) each,
	// so that they fit in an 8192 x 8192 texture.
	const unsigned int tileWidth        = c.dw * c.dc;
	const unsigned int tileHeight       = c.dh * c.dc;
	const unsigned int maxTextureLength = 1 << 13;
	ComputeLayout(tileWidth, tileHeight, c.numDisps, maxTextureLength, c.tile_x, c.tile_y);
}


// Convenience overload that only computes the texture size:
// derives the grid dimensions and tile layout for the given image size,
// disparity count and sigmas, and returns the result in <texWidth> x <texHeight>.
void CalculateGridTextureSize(
	const unsigned int w, const unsigned int h, const unsigned int numDisps,
	const float sigmaS, const float sigmaC,
	unsigned int& texWidth, unsigned int& texHeight)
{
	// temporary configuration with all pointers NULL and all derived fields zeroed
	Config cfg = {
		NULL, NULL, NULL, NULL,
		w, h, numDisps,
		sigmaS, sigmaC,
		0, 0, 0, 0, 0
	};
	CalculateGridTextureSize(cfg);

	// texture extent = tile extent x number of tiles along each axis
	texWidth  = cfg.dw * cfg.dc * cfg.tile_x;
	texHeight = cfg.dh * cfg.dc * cfg.tile_y;

#ifdef _DEBUG
	// print texture size
	printf("GridTexture(%u x %u x %u, %.1f, %.1f) => dw=%u, dh=%u, dc=%u => tile %ux%u => %u x %u (%.1f MB)\n",
		w, h, numDisps, sigmaS, sigmaC,
		cfg.dw, cfg.dh, cfg.dc,
		cfg.tile_x, cfg.tile_y,
		texWidth, texHeight,
		8 * texWidth * texHeight / 1024.0f / 1024.0f);
#endif // _DEBUG
}


// Runs the DCB grid aggregation: computes the grid dimensions, then runs
// RunCreateGridKernel, RunProcessGridKernel and RunSliceGridKernel in that order.
//
// If no grid is passed in (NULL), a temporary grid of the required size is allocated
// for the duration of the call and released again before returning, also when one of
// the steps throws. Exceptions derived from std::exception are caught and reported on
// stderr; they do not propagate to the caller.
//
// With DCBGRID_STANDALONE defined, the raw device pointers are wrapped using
// cudatemplates first; otherwise the cudatemplates objects are passed in directly.
void RunAggregationDCBGrid(
#ifdef DCBGRID_STANDALONE
	const cudaPitchedPtr& gpuCost,
	const unsigned int numDisps,
	const unsigned int* gpuImg1,
	const unsigned int* gpuImg2,
	const int pitch,
	float2* gpuGrid,
	const unsigned int gridWidth,
	const unsigned int gridHeight,
	const unsigned int gridPitch,
#else
	const Cuda::DeviceMemory<float, 3>& ctGpuCost,
	const Cuda::DeviceMemory<unsigned int, 2>& ctGpuImg1,
	const Cuda::DeviceMemory<unsigned int, 2>& ctGpuImg2,
	const Cuda::DeviceMemory<float2, 2>* ctGrid,
#endif
	unsigned int w, unsigned int h,
#ifdef DCBGRID_STANDALONE
	float sigmaS, float sigmaC
#else
	Parameters p
#endif
	)
{
#ifdef DCBGRID_STANDALONE
	// wrap all memory pointers using cudatemplates
	Cuda::DeviceMemoryReference3D<float> ctGpuCost(w, h, numDisps, (float*)gpuCost.ptr);
	Cuda::DeviceMemoryReference2D<const unsigned int> ctGpuImg1(w, h, gpuImg1);
	Cuda::DeviceMemoryReference2D<const unsigned int> ctGpuImg2(w, h, gpuImg2);
	Cuda::DeviceMemoryReference2D<float2>* ctGrid = NULL;
	if(gpuGrid != NULL)
	{
		// heap-allocated wrapper around the caller's grid; deleted at the end of this function
		ctGrid = new Cuda::DeviceMemoryReference2D<float2>(gridWidth, gridHeight, gpuGrid);
		ctGrid->setPitch(gridPitch);
	}
	ctGpuCost.setPitch(gpuCost.pitch);
	ctGpuImg1.setPitch(pitch);
	ctGpuImg2.setPitch(pitch);
	const bool verbose = VERBOSE_OUTPUT;
#else
	const float sigmaS = p.aggr.sigmaS;
	const float sigmaC = p.aggr.sigmaC;
	const int numDisps = p.numDisps;
	const bool verbose = p.verbose;
#endif

#ifdef REDIRECT_OUTPUTS
	// redirect stdout & stderr to log files
	FILE* new_stdout = freopen("DCB-grid-stdout.txt", "w", stdout);
	FILE* new_stderr = freopen("DCB-grid-stderr.txt", "w", stderr);

	if (new_stdout == NULL || new_stderr == NULL)
		exit(1);
#endif // REDIRECT_OUTPUTS

	Config c = { &ctGpuCost, &ctGpuImg1, &ctGpuImg2, ctGrid, w, h, numDisps, sigmaS, sigmaC, 0, 0, 0, 0, 0 };

	// Temporary grid, only allocated if the caller did not provide one.
	// c.gpuGrid cannot be used to free the memory, so we keep an additional
	// pointer of the allocating type to release it afterwards.
	Cuda::DeviceMemoryPitched2D<float2>* gridToFree = NULL;

	try
	{
		// compute dimensions of bilateral grid
		CalculateGridTextureSize(c);

		// allocate grid memory if none was passed in
		if(c.gpuGrid == NULL)
		{
			gridToFree = new Cuda::DeviceMemoryPitched2D<float2>(c.dw * c.dc * c.tile_x, c.dh * c.dc * c.tile_y);
			c.gpuGrid = gridToFree;
		}

		if(verbose)
		{
			printf("\nRunning DCB-Grid with s_s = %.1f, s_c = %.1f\n", c.sigmaS, c.sigmaC);
			printf("  => (%i x %i x %i) x (%i x %i x %i) = %i x %i texture\n", c.dc, c.dw, c.tile_x, c.dc, c.dh, c.tile_y, c.dw * c.dc * c.tile_x, c.dh * c.dc * c.tile_y);
		}

		// create grid ----------------------------------------------------------------------------
		RunCreateGridKernel(c);

		//// debug: save grid as huge PGM image
		//{
		//    Cuda::HostMemoryLocked2D<float2> hostGrid(*(c.gpuGrid));
		//    Cuda::copy(*(c.gpuGrid), hostGrid);
		//    cutSavePGMf("out/DCBgrid-pre-process.pgm", reinterpret_cast<float*>(hostGrid.getBuffer()), 2 * c.gpuGrid->size[0], c.gpuGrid->size[1]);
		//}

		// process grid ---------------------------------------------------------------------------
		RunProcessGridKernel(c);

		//// debug: save grid as huge PGM image
		//{
		//    Cuda::HostMemoryLocked2D<float2> hostGrid(*(c.gpuGrid));
		//    Cuda::copy(*(c.gpuGrid), hostGrid);
		//    cutSavePGMf("out/DCBgrid-post-process.pgm", reinterpret_cast<float*>(hostGrid.getBuffer()), 2 * c.gpuGrid->size[0], c.gpuGrid->size[1]);
		//}

		// slice grid -----------------------------------------------------------------------------
		RunSliceGridKernel(c);
	}
	catch(const std::exception &e)
	{
		fprintf(stderr, "Error: %s", e.what());
	}

	// Release the temporary grid on every path, including after a failed step.
	if(gridToFree != NULL)
	{
		try
		{
			gridToFree->free();
		}
		catch(const std::exception &e)
		{
			fprintf(stderr, "Error: %s", e.what());
		}
		delete gridToFree;
	}

#ifdef DCBGRID_STANDALONE
	// Delete the wrapper object created above (no-op if the caller passed no grid).
	// NOTE(review): relies on DeviceMemoryReference2D not owning the wrapped memory,
	// as for the stack-allocated reference wrappers in this function.
	delete ctGrid;
#endif

#ifdef REDIRECT_OUTPUTS
	fflush(new_stdout); fclose(new_stdout);
	fflush(new_stderr); fclose(new_stderr);
#endif
}


//==== 3. Colour-blind multi-frame DCB grid =======================================================

// Deletes the <count> wrapper objects stored in <wrappers> and the pointer array itself.
// NULL entries and a NULL array are skipped.
// NOTE(review): relies on DeviceMemoryReference2D not owning the memory it wraps,
// so the grids themselves stay with the caller.
static void ReleaseGridWrappers(Cuda::DeviceMemoryReference2D<float2>** wrappers, const unsigned int count)
{
	if(wrappers == NULL)
		return;

	for(unsigned int i = 0; i < count; i++)
		delete wrappers[i]; // deleting NULL is a no-op

	delete[] wrappers;
}

// Runs the multi-frame DCB grid aggregation: creates and processes the grid for the
// current frame (RunCreateGridKernel, RunProcessGridKernel) and then slices across
// all grids using RunMultiSliceGridKernel with the given <weighting>, <wa> and <wb>.
//
// The correct number of grids must be provided in gpuGrids (at least one).
// The previous grid should be stored in gpuGrids[1],
// and the new grid will be in gpuGrids[0].
// All grids share the same <gridWidth>, <gridHeight> and <gridPitch>.
//
// Exceptions thrown by any step propagate to the caller; the temporary wrapper
// objects created here are released in either case.
void RunAggregationTDCBGrid(
	const cudaPitchedPtr& gpuCost,
	const unsigned int numDisps,
	const unsigned int* gpuImg1,
	const unsigned int* gpuImg2,
	const int pitch,
	float2** gpuGrids,
	const unsigned int numGrids,
	const unsigned int gridWidth,
	const unsigned int gridHeight,
	const unsigned int gridPitch,
	unsigned int w, unsigned int h,
	float sigmaS, float sigmaC, const int weighting, const float wa, const float wb
	)
{
	// gpuGrids[0] is always accessed below, so at least one grid is required
	assert(gpuGrids != NULL && numGrids > 0);
	if(gpuGrids == NULL || numGrids == 0)
	{
		fprintf(stderr, "Error: RunAggregationTDCBGrid needs at least one grid\n");
		return;
	}

	// wrap all memory pointers using cudatemplates
	Cuda::DeviceMemoryReference3D<float> ctGpuCost(w, h, numDisps, (float*)gpuCost.ptr);
	Cuda::DeviceMemoryReference2D<const unsigned int> ctGpuImg1(w, h, gpuImg1);
	Cuda::DeviceMemoryReference2D<const unsigned int> ctGpuImg2(w, h, gpuImg2);
	ctGpuCost.setPitch(gpuCost.pitch);
	ctGpuImg1.setPitch(pitch);
	ctGpuImg2.setPitch(pitch);
	const bool verbose = VERBOSE_OUTPUT;

	// zero-initialised, so that a partially filled array can be released safely
	Cuda::DeviceMemoryReference2D<float2>** ctGrids = new Cuda::DeviceMemoryReference2D<float2>*[numGrids]();

	try
	{
		for(unsigned int i = 0; i < numGrids; i++)
		{
			ctGrids[i] = new Cuda::DeviceMemoryReference2D<float2>(gridWidth, gridHeight, gpuGrids[i]);
			ctGrids[i]->setPitch(gridPitch);
		}

		Config c = { &ctGpuCost, &ctGpuImg1, &ctGpuImg2, ctGrids[0], w, h, numDisps, sigmaS, sigmaC, 0, 0, 0, 0, 0 };

		// compute dimensions of bilateral grid
		CalculateGridTextureSize(c);

		if(verbose)
		{
			printf("\nRunning DCB-Grid with s_s = %.1f, s_c = %.1f\n", c.sigmaS, c.sigmaC);
			printf("  => (%i x %i x %i) x (%i x %i x %i) = %i x %i texture\n", c.dc, c.dw, c.tile_x, c.dc, c.dh, c.tile_y, c.dw * c.dc * c.tile_x, c.dh * c.dc * c.tile_y);
		}

		// create grid ----------------------------------------------------------------------------
		RunCreateGridKernel(c);

		// process grid ---------------------------------------------------------------------------
		RunProcessGridKernel(c);

		// slice grid -----------------------------------------------------------------------------
		RunMultiSliceGridKernel(c, ctGrids, numGrids, weighting, wa, wb);
	}
	catch(...)
	{
		// do not leak the wrappers if any step fails; the error is passed on unchanged
		ReleaseGridWrappers(ctGrids, numGrids);
		throw;
	}

	ReleaseGridWrappers(ctGrids, numGrids);
}


//==== 4. Partially colour-blind per-frame DCB grid ===============================================

// Runs the second DCB grid variant, which takes two separate sigmas <sigmaC1> and
// <sigmaC2> next to the spatial <sigmaS>: the grids are allocated in <gpuGrids>, then
// RunCreateGridKernel2, RunProcessGridKernel2 and RunSliceGridKernel2 run in that order.
// All pointer arguments are asserted to be non-NULL.
void RunAggregationDCBGrid2(
	const cudaPitchedPtr& gpuCost,
	const unsigned int numDisps,
	const unsigned int* gpuImg1,
	const unsigned int* gpuImg2,
	const int pitch,
	GpuTiledImages3D<float2>* gpuGrids,
	unsigned int w, unsigned int h,
	float sigmaS, float sigmaC1, float sigmaC2)
{
	// sanity-check the raw inputs
	assert(gpuCost.ptr != NULL);
	assert(gpuImg1 != NULL);
	assert(gpuImg2 != NULL);
	assert(gpuGrids != NULL);

	// wrap the cost volume using cudatemplates
	Cuda::DeviceMemoryReference3D<float> costWrapper(w, h, numDisps, (float*)gpuCost.ptr);
	costWrapper.setPitch(gpuCost.pitch);

	// wrap both images using cudatemplates; they share the same pitch
	Cuda::DeviceMemoryReference2D<const unsigned int> img1Wrapper(w, h, gpuImg1);
	img1Wrapper.setPitch(pitch);
	Cuda::DeviceMemoryReference2D<const unsigned int> img2Wrapper(w, h, gpuImg2);
	img2Wrapper.setPitch(pitch);

	// the wrapped images and the cost volume must agree in size
	assert(img1Wrapper.size == img2Wrapper.size);
	assert(costWrapper.size[0] == img1Wrapper.size[0]);
	assert(costWrapper.size[1] == img1Wrapper.size[1]);
	assert(costWrapper.size[2] == numDisps);

	Config2 cfg = {
		&costWrapper, &img1Wrapper, &img2Wrapper, gpuGrids,
		w, h, numDisps,
		sigmaS, sigmaC1, sigmaC2,
		0, 0, 0, 0
	};

	// Dimensions of the bilateral grid are computed using floor, e.g. downsampled width:
	//   x = 0..floor(x_max/s_s) => floor(x_max/s_s) + 1 levels
	cfg.dw  = static_cast<unsigned int>((cfg.w - 1) / cfg.sigmaS) + 1;
	cfg.dh  = static_cast<unsigned int>((cfg.h - 1) / cfg.sigmaS) + 1;
	cfg.dc1 = static_cast<unsigned int>(100.0f / cfg.sigmaC1) + 1;
	cfg.dc2 = static_cast<unsigned int>(100.0f / cfg.sigmaC2) + 1;

	// allocate space for the grids: <numDisps> grids of (dw * dc1 * dc2) x (dh * dc1)
	gpuGrids->allocGrids(cfg.dw * cfg.dc1 * cfg.dc2, cfg.dh * cfg.dc1, numDisps);

	// create, process and slice the grid, in this order
	RunCreateGridKernel2(cfg);
	RunProcessGridKernel2(cfg);
	RunSliceGridKernel2(cfg);
}