// $Id: GpuTiledImages.hpp 859 2009-11-03 13:41:30Z cr333 $
#ifndef GPUTILEDIMAGES_HPP_INCLUDED
#define GPUTILEDIMAGES_HPP_INCLUDED

// standard headers for assert, sqrtf and printf/fflush used below
#include <cassert>
#include <cmath>
#include <cstdio>

// ignore static_assert
#define CUDA_STATIC_ASSERT_H
#ifndef CUDA_STATIC_ASSERT
#define CUDA_STATIC_ASSERT(x) ;
#endif

#include "cudatemplates/devicememorypitched.hpp"

// undo ignore
#undef CUDA_STATIC_ASSERT_H
#undef CUDA_STATIC_ASSERT

// Describes a layout of tiles of size <width> x <height> within a 2D array of maximum size <max_length> x <max_length>.
template <typename T> class GpuTiledImages2D
{
private:
	Cuda::DeviceMemoryPitched2D<T>* gpuImage;
	unsigned int width;
	unsigned int height;
	unsigned int max_length;
	unsigned int tile_x;
	unsigned int tile_y;
	unsigned int num_tiles;

	// Example:
	// +---+---+---+   width  = 4 (characters)   tile_x = 3   num_tiles = 11
	// +---+---+---+   height = 1 (characters)   tile_y = 4
	// +---+---+---+
	// +---+---+#### <- not used, but inside the <tile_x> x <tile_y> rectangle
	// ~~~~~~~~~~~~~ <- unused space
	// ~~~~~~~~~~~~~

public:
	GpuTiledImages2D<typename T>(const unsigned int max_length)
		: gpuImage(NULL), width(0), height(0), max_length(max_length), tile_x(0), tile_y(0), num_tiles(0)
	{
	}

	GpuTiledImages2D<typename T>(const unsigned int width, const unsigned int height, const unsigned int max_length)
		: gpuImage(NULL), width(width), height(height), max_length(max_length), tile_x(0), tile_y(0), num_tiles(0)
	{
	}

	~GpuTiledImages2D()
	{
		freeImage();
	}

//---- Getter & Setters ---------------------------------------------------------------------------

	// getters
	inline Cuda::DeviceMemoryPitched2D<T>* getImage() const { return gpuImage; }
	inline unsigned int getMaxLength() const { return max_length; }
	inline unsigned int getTileWidth() const { return width; }
	inline unsigned int getTileHeight() const { return height; }
	inline unsigned int getTileX() const { return tile_x; }
	inline unsigned int getTileY() const { return tile_y; }
	inline unsigned int getNumTiles() const { return num_tiles; }
	inline unsigned int getXForTile(const unsigned int tile) const { assert(tile < num_tiles); return width * (tile % tile_x); }
	inline unsigned int getYForTile(const unsigned int tile) const { assert(tile < num_tiles); return height * (tile / tile_x); }

	// setters
	inline void setWidth(const unsigned int x) { assert(gpuImage == NULL); width = x; } // cannot change size after allocation
	inline void setHeight(const unsigned int x) { assert(gpuImage == NULL); height = x; }
	inline void setSize(const unsigned int w, const unsigned int h) { setWidth(w); setHeight(h); }
	inline void setMaxLength(const unsigned int x) { assert(x >= max_length); max_length = x; }

//---- Layouting routines -------------------------------------------------------------------------

	unsigned int computeMaxTiles() const
	{
		assert(width > 0);
		assert(height > 0);
		
		return (max_length / width) * (max_length / height);
	}

	// Computes a layout of <count> tiles of the configured size <width> x <height>,
	// to fit into a square of length <max_length> x <max_length>.
	// The resulting tiling will be <tile_x> x <tile_y>.
	// Return true if the layout was successful, otherwise false;
	bool computeLayout(const unsigned int count)
	{
		// First try to find a rectangular layout for all tiles.
		// For example: 3 x 4 layout of 12 tiles:
		//   +--+--+--+~~~
		//   +--+--+--+~~~
		//   +--+--+--+~~~
		//   +--+--+--+~~~
		//   ~~~~~~~~~~~~~ <- unused space
		//   ~~~~~~~~~~~~~
		const int max_tile_x = max_length / width;
		for(int cols = min(int(sqrtf((float)count)+0.1f), max_tile_x); cols > 0; cols--) // add in fiddle factor of 0.1 to avoid potential rounding issues
		{
			if(cols * (count / cols) == count)
			{
				tile_x = cols;
				tile_y = count / cols;
				num_tiles = count;

				// only return if it fits inside the big square
				if(width * tile_x < max_length && height * tile_y < max_length)
					return true;
			}
		}

		// Avoid pathological cases by tiling in reading order (which may be inefficient).
		// For example: 3 x 4 layout of 11 tiles:
		//   +---+---+---+
		//   +---+---+---+
		//   +---+---+---+
		//   +---+---+#### <- not used, but inside the <tile_x> x <tile_y> rectangle
		//   ~~~~~~~~~~~~~ <- unused space
		//   ~~~~~~~~~~~~~
		tile_x = min(count, max_tile_x);
		tile_y = (count - 1) / tile_x + 1;
		num_tiles = count;

		// check if this is valid
		if(width * tile_x < max_length && height * tile_y < max_length)
			return true;

		// reset state and indicate failure to layout
		tile_x = 0;
		tile_y = 0;
		num_tiles = 0;
		return false;
	}
	
//---- Memory (de-)allocation ---------------------------------------------------------------------

	// allocate space for current layout
	void allocImage()
	{
		freeImage();

		assert(tile_x > 0);
		assert(tile_y > 0);
		assert(width > 0);
		assert(height > 0);
		assert(tile_x * width <= max_length);
		assert(tile_y * height <= max_length);

		printf("GpuTiledImages2D: allocating (%u x %u) x (%u x %u) = %u x %u (%.1f MB)\n",
			tile_x, width, tile_y, height, tile_x * width, tile_y * height,
			sizeof(T) * tile_x * width * tile_y * height / 1024.0f / 1024.0f);
		gpuImage = new Cuda::DeviceMemoryPitched2D<T>(tile_x * width, tile_y * height);
	}

	// free the allocated space
	inline void freeImage()
	{
		if(gpuImage != NULL)
		{
			printf("GpuTiledImages2D: freeing (%u x %u) x (%u x %u) = %u x %u (%.1f MB)\n",
				tile_x, width, tile_y, height, tile_x * width, tile_y * height,
				sizeof(T) * tile_x * width * tile_y * height / 1024.0f / 1024.0f);
			gpuImage->free();
			delete gpuImage;
			gpuImage = NULL;
		}
	}
};

// Tiles small images across multiple 2D arrays, as necessary.
template <typename T> class GpuTiledImages3D
{
private:
	GpuTiledImages2D<T>** layouts;
	unsigned int num_layouts;

	unsigned int width;
	unsigned int height;
	unsigned int max_length;
	unsigned int num_tiles;

public:
	GpuTiledImages3D(const unsigned int max_length)
		: layouts(NULL), num_layouts(0), width(0), height(0), max_length(max_length), num_tiles(0)
	{
	}
	
	GpuTiledImages3D(const unsigned int width, const unsigned int height, const unsigned int max_length)
		: layouts(NULL), num_layouts(0), width(width), height(height), max_length(max_length), num_tiles(0)
	{
	}

	~GpuTiledImages3D()
	{
		freeLayouts();
		delete[] layouts;
	}

//---- Getter & Setters ---------------------------------------------------------------------------

	// getters
	//inline Cuda::DeviceMemoryPitched2D<T>* getImage() const { return gpuImage; }
	inline GpuTiledImages2D<T>* getLayout(const unsigned int index) const { assert(index < num_layouts); assert(layouts != NULL); return layouts[index]; }
	inline unsigned int getNumLayouts() const { return num_layouts; }
	inline unsigned int getWidth() const { return width; }
	inline unsigned int getHeight() const { return height; }
	inline unsigned int getMaxLength() const { return max_length; }
	inline unsigned int getNumTiles() const { return num_tiles; }

	//// setters
	//inline void setWidth(const unsigned int x) { width = x; }
	//inline void setHeight(const unsigned int x) { height = x; }
	//inline void setSize(const unsigned int w, const unsigned int h) { width = w; height = h; }
	//inline void setMaxLength(const unsigned int x) { max_length = x; }

//---- Memory (de-)allocation ---------------------------------------------------------------------

	inline void freeLayouts()
	{
		if(layouts != NULL)
		{
			for(unsigned int i = 0; i < num_layouts; i++)
			{
				delete layouts[i];
				layouts[i] = NULL;
			}
		}
		num_layouts = 0;
	}

	void allocGrids(const unsigned int _width, const unsigned int _height, const unsigned int _num_tiles)
	{
		if(_width == width && _height == height && _num_tiles == num_tiles)
		{
			return; // nothing to do
		}

		printf("allocGrid(%u, %u, %u)\n", _width, _height, _num_tiles);
		flushall();

		assert(_width < (1 << 13));
		assert(_height < (1 << 13));

		// basic realloc
		freeLayouts();

		width = _width;
		height = _height;
		layouts = new GpuTiledImages2D<float2>*[16];

		// create as many layouts as necessayr to hold all tiles
		int tiles_todo = _num_tiles;
		while(tiles_todo > 0 && num_layouts < 16)
		{
			// create a new layout
			GpuTiledImages2D<float2>* layout = new GpuTiledImages2D<float2>(width, height, 1<<13);

			// allocate as many tiles as possible/necessary
			unsigned int max_tiles = layout->computeMaxTiles();
			assert(layout->computeLayout(min(tiles_todo, max_tiles)));
			printf("%u. ", num_layouts);
			layout->allocImage();
			flushall();

			tiles_todo -= layout->getNumTiles();
			layouts[num_layouts] = layout;
			num_layouts++;
		}

		num_tiles = _num_tiles;
	}


};

#endif // GPUTILEDIMAGES_HPP_INCLUDED