// $Id: RunDCBGridAgg.cu 795 2009-10-02 10:53:26Z cr333 $

// redirect stdout & stderr to files
//#define REDIRECT_OUTPUTS
#define VERBOSE_OUTPUT 0
#define DCBGRID_STANDALONE

#include <exception>
#include <cutil.h>
#include <cutil_inline_runtime.h>
#include <device_functions.h>
#include "cudatemplates/copy.hpp"
#include "cudatemplates/devicememory.hpp"
#include "cudatemplates/devicememoryreference.hpp"
#include "cudamath.h"
#include "CudaHelperCommon.cuh"

#ifdef DCBGRID_STANDALONE
#include "UnmanagedAggregators.h"
#else
#include "settings.h"
#include "stereo.h"
#endif

// Weights of the 5-tap blur used by the ProcessGrid*Kernel functions, for tap offsets
// 0, +-1 and +-2. They are not normalised to sum to one.
//
// Earlier precomputed variant, kept for reference (0.60653066 = e^-0.5, 0.135335283 = e^-2):
//#define NORMAL_0 1.0f
//#define NORMAL_1 0.60653066f
//#define NORMAL_2 0.135335283f
// Current variant: e^0, e^-1 and e^-4, evaluated with the fast __expf intrinsic,
// so these macros can only be expanded in device code.
#define NORMAL_0 1.0f
#define NORMAL_1 __expf(-1.0f/1.0f)
#define NORMAL_2 __expf(-4.0f/1.0f)

// Component-wise linear interpolation between two float2 values.
// Returns a for t == 0 and b for t == 1; t is not clamped, so values outside
// [0, 1] extrapolate.
inline __device__ __host__ float2 lerp(float2 a, float2 b, float t)
{
    const float weightA = 1.0f - t;

    float2 blended;
    blended.x = weightA * a.x + t * b.x;
    blended.y = weightA * a.y + t * b.y;
    return blended;
}

//// just gets the green component (which is the biggest contributor to grey)
//inline __device__ __host__ float getColour(const unsigned int pixel)
//{
//    return (pixel >> 8) & 0xff;
//}

//// luminance (assuming linear sRGB primaries)
//inline __device__ __host__ float getColour(const unsigned int pixel)
//{
//    return 0.2126729f * (pixel & 0xff) + 0.7151522f * ((pixel >>  8) & 0xff) + 0.0721750f * ((pixel >> 16) & 0xff);
//}

//// luminance (assuming sRGB primaries)
//inline __device__ float getColour(const unsigned int pixel)
//{
//    return
//        0.2126729f * 255.0f * __powf(( pixel        & 0xff) / 255.0, 1/2.2f) +
//        0.7151522f * 255.0f * __powf(((pixel >>  8) & 0xff) / 255.0, 1/2.2f) +
//        0.0721750f * 255.0f * __powf(((pixel >> 16) & 0xff) / 255.0, 1/2.2f);
//}

// Converts a packed 32-bit pixel to a single lightness value.
//
// The three low bytes are read as 8-bit channels (bits 0-7, 8-15, 16-23; the top byte is
// ignored), decoded with the sRGB transfer curve, combined into a relative luminance Y and
// finally mapped through the cube-root lightness formula, so the result is about 0 for
// black and about 100 for white. Device-only because it uses the __powf intrinsic.
inline __device__ float getColour(const unsigned int pixel)
{
    // unpack the channels into the range 0..1
    float3 c = make_float3(float(pixel & 0xff) / 255.0f, float((pixel >>  8) & 0xff) / 255.0f, float((pixel >> 16) & 0xff) / 255.0f);

    // undo the sRGB encoding: linear segment near black, power curve elsewhere
    float3 lin;
    if(c.x <= 0.04045f) lin.x = c.x / 12.92f;
    else                lin.x = __powf((c.x + 0.055f) / 1.055f, 2.4f);
    if(c.y <= 0.04045f) lin.y = c.y / 12.92f;
    else                lin.y = __powf((c.y + 0.055f) / 1.055f, 2.4f);
    if(c.z <= 0.04045f) lin.z = c.z / 12.92f;
    else                lin.z = __powf((c.z + 0.055f) / 1.055f, 2.4f);
    c = lin;

    // weighted sum of the linear channels gives the luminance Y
    float y = 0.2126729f * c.x + 0.7151522f * c.y + 0.0721750f * c.z;

    // Y to lightness: cube root above the threshold, linear segment below it
    return 116.0f * (y > 216.0f / 24389.0f ? cbrtf(y) : (24389.0f / 27.0f * y + 16.0f) / 116.0f) - 16.0f;
}

//// unsafe macros for 2D/3D indexing
//#define index2D(x1, w1, x2) ((x2) * (w1) + (x1))
//#define index3D(x1, w1, x2, w2, x3) (((x3) * (w2) + (x2)) * (w1) + (x1))

// type-safe inline functions for 2D/3D indexing
// Flattens a 2D coordinate (x1, x2) into a linear index, where w1 is the extent of the
// first (fastest-varying) dimension. The result is converted to the type of x1.
template <typename T, typename U> inline __device__ T index2D(
    const T x1, const unsigned int w1,
    const U x2)
{
    return x1 + x2 * w1;
}

// Flattens a 3D coordinate (x1, x2, x3) into a linear index, where w1 and w2 are the
// extents of the first and second dimension (x1 varies fastest).
// The result is converted to the type of x1.
template <typename T, typename U, typename V> inline __device__ T index3D(
    const T x1, const unsigned int w1,
    const U x2, const unsigned int w2,
    const V x3)
{
    return x1 + w1 * (x2 + w2 * x3);
}


// 2D texture reference through which SliceGridKernel samples the bilateral grid;
// it is bound to the grid buffer in RunSliceGridKernel.
texture<float2, 2, cudaReadModeElementType> gpuGridTex;
// Dynamic shared memory used by the kernels in this file. Each kernel casts it to the
// element type it needs, so every launch has to pass a matching size in bytes.
extern __shared__ char array[];


// Accumulates one disparity slice of the cost volume into the bilateral grid.
//
// Each block zeroes two dc x dc integer tables in dynamic shared memory, adds every pixel
// of its blockDim.x x blockDim.y image patch into the bin [cR][cL] (first table:
// int(100 * cost), second table: 100 per pixel) and finally copies both tables into
// gpuGrid as float2(cost sum, weight sum).
//
// Template parameter:
//   Init     if true the tile in gpuGrid is overwritten, otherwise the new values are
//            added to what is already stored there
// Parameters:
//   gpuCost  cost volume; the slice for disparity d starts at row h * d
//   gpuImg1  left image, read at (xi, yi)
//   gpuImg2  right image, read at (xi - d, yi)
//   gpuGrid  output grid; pitch is its row stride in float2 elements
//   w, h     image size in pixels
//   d        disparity being processed
//   tile_x   number of disparity tiles per row of the grid
//   dc       number of colour bins per colour axis
//   sigmaS   not used by this kernel
//   sigmaC   colour bin width (lightness / sigmaC is truncated to the bin index)
// Launch: grid of (dw, dh) blocks, 2 * dc * dc * sizeof(int) bytes of dynamic shared memory.
template <bool Init> void __global__ CreateGridKernel(
    const cudaPitchedPtr gpuCost,
    const cudaPitchedPtr gpuImg1,
    const cudaPitchedPtr gpuImg2,
    float2* gpuGrid,
    const size_t pitch,
    const unsigned int w,
    const unsigned int h,
    const unsigned int d,
    const unsigned int tile_x,
    const unsigned int dc,
    const float sigmaS,
    const float sigmaC)
{
    // for reference:
    //   dim3 createBlock((int)ceil(c.p.aggr.sigmaS), (int)ceil(c.p.aggr.sigmaS), 1);
    //   dim3 createGrid(c.dw, c.dh, 1);

    // number of grid cells in x and y (one block per cell)
    const int dw = gridDim.x;
    const int dh = gridDim.y;

    // initialise shared memory
    // (first dc * dc ints hold the cost sums, the following dc * dc ints the weights)
    int* shared = (int*)array;
    for(int y = threadIdx.y; y < dc; y += blockDim.y)
    {
        for(int x = threadIdx.x; x < dc; x += blockDim.x)
        {
            shared[dc * y + x] = 0;
            shared[dc * dc + dc * y + x] = 0;
        }
    }
    __syncthreads();

#if !defined(__CUDA_ARCH__) ||  __CUDA_ARCH__ < 120

    // go single threaded to avoid contention when reading from or writing to shared memory
    if(threadIdx.x == 0 && threadIdx.y == 0)
    {
        // thread (0, 0) walks over the whole image patch covered by this block
        for(int yi = blockIdx.y * blockDim.y; yi < blockIdx.y * blockDim.y + blockDim.y; yi++)
        {
            if(yi < h)
            {
                for(int xi = blockIdx.x * blockDim.x; xi < blockIdx.x * blockDim.x + blockDim.x; xi++)
                {
                    // skip pixels outside the image and those whose match (xi - d) would be left of the right image
                    if(xi < w && xi >= d)
                    {
                        // pitches are in bytes; >> 2 turns them into strides of 4-byte elements
                        const int cL = int(getColour(((unsigned int*)gpuImg1.ptr)[(gpuImg1.pitch >> 2) * yi + xi]) / sigmaC); // left image pixel
                        const int cR = int(getColour(((unsigned int*)gpuImg2.ptr)[(gpuImg2.pitch >> 2) * yi + xi - d]) / sigmaC); // right image pixel
                        const float iC = ((float*)gpuCost.ptr)[(gpuCost.pitch >> 2) * (h * d + yi) + xi]; // cost space value

                        // NB: both components scaled by same constant
                        shared[dc * cR + cL] += int(100 * iC);
                        shared[dc * dc + dc * cR + cL] += 100;
                    }
                }
            }
        }
    }

#else // compiling for compute capability 1.2 or higher, i.e. have atomic adds to shared memory
    
    // one thread per pixel of the patch
    const int xi = blockIdx.x * blockDim.x + threadIdx.x;
    const int yi = blockIdx.y * blockDim.y + threadIdx.y;

    // skip pixels outside the image and those whose match (xi - d) would be left of the right image
    if(xi < w && xi >= d && yi < h)
    {
        // pitches are in bytes; >> 2 turns them into strides of 4-byte elements
        const int cL = int(getColour(((unsigned int*)gpuImg1.ptr)[(gpuImg1.pitch >> 2) * yi + xi]    ) / sigmaC); // left image pixel
        const int cR = int(getColour(((unsigned int*)gpuImg2.ptr)[(gpuImg2.pitch >> 2) * yi + xi - d]) / sigmaC); // right image pixel
        const float iC = ((float*)gpuCost.ptr)[(gpuCost.pitch >> 2) * (h * d + yi) + xi]; // cost space value

        // needs compute capability 1.2
        // NB: both components scaled by same constant
        atomicAdd(&shared[dc * cR + cL], int(100 * iC));
        atomicAdd(&shared[dc * dc + dc * cR + cL], 100);
    }

#endif // end of code branch for different compute capabilities

    // all accumulation must be finished before the tables are copied out
    __syncthreads();
    
    // copy data from shared memory to gpuGrid
    // row    = right-colour bin y within grid cell blockIdx.y of tile row d / tile_x
    // column = left-colour bin x within grid cell blockIdx.x of tile column d % tile_x
    for(int y = threadIdx.y; y < dc; y += blockDim.y)
    {
        for(int x = threadIdx.x; x < dc; x += blockDim.x)
        {
            if(Init)
                gpuGrid[pitch *
                    index3D(y, dc, blockIdx.y, dh, d / tile_x) +
                    index3D(x, dc, blockIdx.x, dw, d % tile_x)
                ] = make_float2(shared[dc * y + x], shared[dc * dc + dc * y + x]);
            else
                // accumulate on top of the existing contents
                gpuGrid[pitch *
                    index3D(y, dc, blockIdx.y, dh, d / tile_x) +
                    index3D(x, dc, blockIdx.x, dw, d % tile_x)
                ] += make_float2(shared[dc * y + x], shared[dc * dc + dc * y + x]);
        }
    }
}


// Blurs the grid in place along both colour dimensions.
//
// Every block loads one blockDim.x x blockDim.y tile into shared memory and applies the
// 5-tap kernel (NORMAL_2, NORMAL_1, NORMAL_0, NORMAL_1, NORMAL_2) first along x (left
// colour) and then along y (right colour). Taps that fall outside the tile are skipped
// and the result is not renormalised.
//
// Launch layout (see RunProcessGridKernel): dc x dc threads per block, one block per
// grid cell, dc * dc * sizeof(float2) bytes of dynamic shared memory.
// pitch is the row stride of gpuGrid in float2 elements.
void __global__ ProcessGrid1Kernel(float2* gpuGrid, const unsigned int pitch)
{
    const int side = blockDim.x;        // tile edge length; also used as the shared row stride
    const int col  = threadIdx.x;
    const int row  = threadIdx.y;
    const int self = side * row + col;  // this thread's slot in the shared tile

    // location of this thread's element in the global grid
    const int gx = blockIdx.x * blockDim.x + threadIdx.x;
    const int gy = blockIdx.y * blockDim.y + threadIdx.y;
    const unsigned int cell = pitch * gy + gx;

    // stage the tile in shared memory
    float2* tile = (float2*)array;
    tile[self] = gpuGrid[cell];
    __syncthreads();

    // first pass: neighbours along x (left colour)
    float2 sum = NORMAL_0 * tile[self];
    if(col >= 2)       sum += NORMAL_2 * tile[self - 2];
    if(col >= 1)       sum += NORMAL_1 * tile[self - 1];
    if(col + 1 < side) sum += NORMAL_1 * tile[self + 1];
    if(col + 2 < side) sum += NORMAL_2 * tile[self + 2];
    __syncthreads(); // everyone has finished reading before the tile is overwritten

    tile[self] = sum;
    __syncthreads();

    // second pass: neighbours along y (right colour), reading the first-pass results
    sum = NORMAL_0 * tile[self];
    if(row >= 2)       sum += NORMAL_2 * tile[self - 2 * side];
    if(row >= 1)       sum += NORMAL_1 * tile[self - side];
    if(row + 1 < side) sum += NORMAL_1 * tile[self + side];
    if(row + 2 < side) sum += NORMAL_2 * tile[self + 2 * side];
    __syncthreads();

    gpuGrid[cell] = sum;
}


// Blurs one disparity tile of the grid in place along the spatial x dimension.
//
// Each block handles one (left colour, grid row) pair: its threads pick up the elements
// that lie gridDim.x apart along the row, i.e. the same colour bin in consecutive grid
// cells, and apply the 5-tap kernel across them. Taps beyond either end are skipped.
//
// Launch layout (see RunProcessGridKernel): block (dw, 1, 1), grid (dc, dc * dh, 1),
// dw * sizeof(float2) bytes of dynamic shared memory.
// dX / dY select the tile column / row; pitch is the row stride in float2 elements.
void __global__ ProcessGrid2Kernel(float2* gpuGrid, const unsigned int pitch, const unsigned int dX, const unsigned int dY)
{
    const unsigned int lane  = threadIdx.x;  // grid cell index along x
    const unsigned int lanes = blockDim.x;   // number of grid cells along x

    // element handled by this thread
    const int x = gridDim.x * blockDim.x * dX + gridDim.x * threadIdx.x + blockIdx.x;
    const int y = gridDim.y * blockDim.y * dY + blockIdx.y;
    const unsigned int cell = pitch * y + x;

    // stage the line in shared memory
    float2* line = (float2*)array;
    line[lane] = gpuGrid[cell];
    __syncthreads();

    // 5-tap blur; neighbours outside the line contribute nothing
    float2 sum = NORMAL_0 * line[lane];
    if(lane >= 2)        sum += NORMAL_2 * line[lane - 2];
    if(lane >= 1)        sum += NORMAL_1 * line[lane - 1];
    if(lane + 1 < lanes) sum += NORMAL_1 * line[lane + 1];
    if(lane + 2 < lanes) sum += NORMAL_2 * line[lane + 2];

    gpuGrid[cell] = sum;
}


// Blurs the grid in place along the spatial y dimension.
//
// Thread (tx, ty) of a block works on column offsetX + blockDim.x * blockIdx.x + tx and on
// row offsetY + blockIdx.y + deltaY * ty, so the threads that share tx form one vertical
// line of elements spaced deltaY rows apart. The 5-tap kernel is applied along that line;
// taps beyond either end are skipped.
//
// Columns with x >= width are neither loaded nor written.
// Launch layout (see RunProcessGridKernel): block (processTileX3, dh, 1) with
// blockDim.x * blockDim.y * sizeof(float2) bytes of dynamic shared memory.
// pitch is the row stride of gpuGrid in float2 elements.
void __global__ ProcessGrid3Kernel(
    float2* gpuGrid, const unsigned int pitch, const unsigned int width,
    const unsigned int offsetX, const unsigned int offsetY,
    const unsigned int deltaY)
{
    const unsigned int tap  = threadIdx.y;  // position along the vertical line
    const unsigned int taps = blockDim.y;   // length of the vertical line

    // element handled by this thread
    const int x = offsetX + blockDim.x * blockIdx.x + threadIdx.x;
    const int y = offsetY + blockIdx.y + deltaY * threadIdx.y;
    const bool active = x < width;
    const unsigned int cell = pitch * y + x;

    // stage the data in shared memory, one contiguous run of blockDim.y entries per column
    float2* lines = (float2*)array;
    const int slot = blockDim.y * threadIdx.x + threadIdx.y;
    if(active) lines[slot] = gpuGrid[cell];
    __syncthreads();

    // 5-tap blur; neighbours outside the line contribute nothing
    float2 sum = NORMAL_0 * lines[slot];
    if(tap >= 2)       sum += NORMAL_2 * lines[slot - 2];
    if(tap >= 1)       sum += NORMAL_1 * lines[slot - 1];
    if(tap + 1 < taps) sum += NORMAL_1 * lines[slot + 1];
    if(tap + 2 < taps) sum += NORMAL_2 * lines[slot + 2];

    if(active) gpuGrid[cell] = sum;
}


// Slices the processed bilateral grid: for every pixel of disparity slice d the grid is
// sampled at the pixel's (x, y, left colour, right colour) position and the ratio
// (cost sum / weight sum) is written back into the cost volume.
//
// The two colour axes are interpolated by the texture unit (gpuGridTex is expected to be
// bound with linear filtering), the two spatial axes by four fetches that are blended
// with lerp().
//
// Parameters:
//   gpuCost      cost volume, updated in place; the slice for disparity d starts at row h * d
//   gpuImg1/2    left / right image (32-bit pixels); the right image is read at x - d
//   w, h         image size in pixels
//   d            disparity being processed
//   dX, dY       tile column / row of this disparity inside the grid texture
//   dw, dh, dc   grid resolution in x, y and along each colour axis
//   sigmaS       spatial cell size in pixels
//   sigmaC       colour bin width
// Launch: one thread per pixel, see RunSliceGridKernel.
void __global__ SliceGridKernel(
    const cudaPitchedPtr gpuCost,
    const cudaPitchedPtr gpuImg1,
    const cudaPitchedPtr gpuImg2,
    const unsigned int w,
    const unsigned int h,
    const unsigned int d,
    const unsigned int dX,
    const unsigned int dY,
    const unsigned int dw,
    const unsigned int dh,
    const unsigned int dc,
    const float sigmaS,
    const float sigmaC)
{
    // for reference:
    //   dim3 sliceBlock(256, 1, 1);
    //   dim3 sliceGrid(c.tile_x * ((c.w + sliceBlock.x - 1) / sliceBlock.x), c.tile_y * c.h, 1);

    const int x = blockDim.x * blockIdx.x + threadIdx.x;
    const int y = blockDim.y * blockIdx.y + threadIdx.y;

    if(x >= d && x < w && y < h) // ignore outside pixels
    {
        // compute coordinates in bilateral grid
        // (pitches are in bytes; >> 2 turns them into strides of 4-byte elements)
        const float cX = x / sigmaS;
        const float cY = y / sigmaS;
        const float cL = getColour(((unsigned int*)gpuImg1.ptr)[(gpuImg1.pitch >> 2) * y + x]) / sigmaC; // left image pixel
        const float cR = getColour(((unsigned int*)gpuImg2.ptr)[(gpuImg2.pitch >> 2) * y + x - d]) / sigmaC; // right image pixel

        //// nearest neighbour lookup
        //float2 newCost = tex2D(gpuGridTex, index3D(int(cL), dc, int(cX), dw, dX), index3D(int(cR), dc, int(cY), dh, dY));
        //float2 newCost = tex2D(gpuGridTex, index3D(cL, dc, int(cX), dw, dX), index3D(cR, dc, int(cY), dh, dY)); // with colour blur

        // Quadrilinear interpolation based on 4 bilinear lookups: the spatial cell indices
        // of the four neighbours are clamped to the tile of this disparity, 0..dw-1 in x
        // and 0..dh-1 in y. The lower clamp used to be dh + 1; since int(cY + 0.5f) can
        // reach dh, the bottom taps then sampled the first row of the tile below, i.e. a
        // different disparity.
        const float2 newCostTL = tex2D(gpuGridTex, index3D(cL, dc, max(0,      int(cX - 0.5f)), dw, dX), index3D(cR, dc, max(0,      int(cY - 0.5f)), dh, dY));
        const float2 newCostBL = tex2D(gpuGridTex, index3D(cL, dc, max(0,      int(cX - 0.5f)), dw, dX), index3D(cR, dc, min(dh - 1, int(cY + 0.5f)), dh, dY));
        const float2 newCostTR = tex2D(gpuGridTex, index3D(cL, dc, min(dw - 1, int(cX + 0.5f)), dw, dX), index3D(cR, dc, max(0,      int(cY - 0.5f)), dh, dY));
        const float2 newCostBR = tex2D(gpuGridTex, index3D(cL, dc, min(dw - 1, int(cX + 0.5f)), dw, dX), index3D(cR, dc, min(dh - 1, int(cY + 0.5f)), dh, dY));

        // blend left/right first, then top/bottom; at the tile border both neighbours
        // are the same cell, so the (possibly negative) weight has no effect there
        const float2 newCostT = lerp(newCostTL, newCostTR, cX - 0.5f - int(cX - 0.5f));
        const float2 newCostB = lerp(newCostBL, newCostBR, cX - 0.5f - int(cX - 0.5f));
        const float2 newCost  = lerp(newCostT, newCostB, cY - 0.5f - int(cY - 0.5f));

        // write to cost space: accumulated cost divided by accumulated weight
        ((float*)gpuCost.ptr)[(gpuCost.pitch >> 2) * (h * d + y) + x] = newCost.x / newCost.y; // cost space value
    }
}


// Bundle of everything the Run*Kernel helpers need for one aggregation pass.
// NB: instances are brace-initialised positionally by the aggregation entry points,
//     so the order of the members below must not change.
struct Config
{
    // cost volume: read when the grid is created, overwritten when the grid is sliced
    const Cuda::DeviceMemory<float, 3>* gpuCost;
    // left and right input images (32-bit pixels)
    const Cuda::DeviceMemory<const unsigned int, 2>* gpuImg1;
    const Cuda::DeviceMemory<const unsigned int, 2>* gpuImg2;
    // bilateral grid storage (all disparity tiles in one 2D buffer)
    Cuda::DeviceMemory<float2, 2>* gpuGrid;
    // image size in pixels
    unsigned int w;
    unsigned int h;
    //Parameters p;
    // number of disparities, i.e. slices of the cost volume
	unsigned int numDisps;
    // spatial cell size (pixels) and colour bin width of the grid
	float sigmaS; 
	float sigmaC;
    //bool verbose;

    // grid dimensions and tile layout, filled in by CalculateGridTextureSize:
    // number of grid cells in x and y, number of colour bins per colour axis,
    // and the number of disparity tiles per row / column of the grid buffer
    unsigned int dw;
    unsigned int dh;
    unsigned int dc;
    unsigned int tile_x;
    unsigned int tile_y;

    //// some clean-up code
    //~Config()
    //{
    //    if(gpuGrid != NULL)
    //    {
    //        //gpuGrid.unbindTexture<cudaReadModeElementType>(gpuGridTex); // is this necessary?
    //        delete gpuGrid;
    //        gpuGrid = NULL;
    //    }
    //}
};

// Compute all dimensions (for internal use)
//
// Inputs:
//   w, h      image size in pixels
//   numDisps  number of disparities (one grid tile per disparity)
//   sigmaS    spatial cell size in pixels
//   sigmaC    colour bin width (lightness range 0..100)
// Outputs:
//   dw, dh    number of grid cells in x and y
//   dc        number of colour bins per colour axis
//   tile_x    number of disparity tiles per row of the grid texture
//   tile_y    number of tile rows; tile_x * tile_y >= numDisps
//
// All outputs are always assigned. For numDisps == 0 the layout is 1 x 0 tiles. If a
// single tile is already wider than the 8192 texel limit assumed here, tiles are stacked
// in one column (tile_x == 1); the caller has to cope with the oversized texture.
void CalculateGridTextureSize(
    const unsigned int w, const unsigned int h, const unsigned int numDisps,
    const float sigmaS, const float sigmaC,
    unsigned int& dw, unsigned int& dh, unsigned int& dc, unsigned int& tile_x, unsigned int& tile_y)
{
    //// compute dimensions of bilateral grid (using rounding), e.g. downsampled width:
    ////   x = 0..round(x_max/s_s) => round(x_max/s_s) + 1 levels
    //int dw = int((w - 1) / p.aggrDCBSigmaS + 0.5f) + 1;
    //int dh = int((h - 1) / p.aggrDCBSigmaS + 0.5f) + 1;
    //int dc = int(255.0f / p.aggrDCBSigmaC + 0.5f) + 1;

    // compute dimensions of bilateral grid (using floor), e.g. downsampled width:
    //   x = 0..floor(x_max/s_s) => floor(x_max/s_s) + 1 levels
    dw = (unsigned int)((w - 1) / sigmaS) + 1;
    dh = (unsigned int)((h - 1) / sigmaS) + 1;
    dc = (unsigned int)(100.0f / sigmaC) + 1;

    // safe default: all tiles in a single column. This also covers the cases in which
    // the search below cannot run, which previously left tile_x and tile_y unassigned.
    tile_x = 1;
    tile_y = numDisps;
    if(numDisps == 0)
        return;

    // largest number of tiles that fit side by side into a texture 8192 texels wide;
    // clamped to one so that the fallback below can never divide by zero
    int max_tile_x = (1 << 13) / (dw * dc);
    if(max_tile_x < 1)
        max_tile_x = 1;

    // try to find a rectangular layout for all grid tiles, to fill the entire texture
    int cols = int(sqrtf((float)numDisps) + 0.1f); // add in fiddle factor of 0.1 to avoid potential rounding issues
    if(cols > max_tile_x)
        cols = max_tile_x;
    for(; cols > 0; cols--)
    {
        if(numDisps % cols == 0)
        {
            tile_x = cols;
            tile_y = numDisps / cols;
            break;
        }
    }

    // avoid pathological cases by tiling in reading order (may be inefficient)
    if(tile_x == 1)
    {
        tile_x = (numDisps < (unsigned int)max_tile_x) ? numDisps : (unsigned int)max_tile_x;
        tile_y = (numDisps - 1) / tile_x + 1;
    }
}

// Convenience overload that reports only the size (in float2 texels) of the buffer
// needed to hold the grid tiles of all disparities.
void CalculateGridTextureSize(
    const unsigned int w, const unsigned int h, const unsigned int numDisps,
    const float sigmaS, const float sigmaC,
    unsigned int& texWidth, unsigned int& texHeight)
{
    // full layout: grid cells in x / y, colour bins, tiles per row / column
    unsigned int cellsX, cellsY, bins, tilesX, tilesY;
    CalculateGridTextureSize(w, h, numDisps, sigmaS, sigmaC, cellsX, cellsY, bins, tilesX, tilesY);

    // every tile spans (cells * bins) texels in each direction
    texWidth  = tilesX * bins * cellsX;
    texHeight = tilesY * bins * cellsY;
}

// Builds the bilateral grid from the current cost volume and image pair by launching
// CreateGridKernel<1> (overwrite mode) once per disparity. Blocks until all launches
// have finished.
void RunCreateGridKernel(Config& c)
{
    assert(c.gpuGrid != NULL);

    // compute block & grid dimensions: one block per grid cell,
    // one thread per pixel of the image patch covered by that cell
    const int patchSide = (int)ceil(c.sigmaS);
    dim3 createBlock(patchSide, patchSide, 1);
    dim3 createGrid(c.dw, c.dh, 1);

    // two dc x dc integer tables (cost sums and weights) per block
    const size_t sharedBytes = c.dc * c.dc * 2 * sizeof(int);

    RECORD_KERNEL_LAUNCH("Create DCB grid kernel", createGrid, createBlock);

    // create grid for all disparities
    for(unsigned int disp = 0; disp != c.numDisps; ++disp)
    {
        CreateGridKernel<1><<<createGrid, createBlock, sharedBytes>>>(
            toPitchedPtr(c.gpuCost), toPitchedPtr(c.gpuImg1), toPitchedPtr(c.gpuImg2),
            c.gpuGrid->getBuffer(), c.gpuGrid->getPitch() >> 3, // byte pitch -> float2 elements
            c.w, c.h, disp, c.tile_x, c.dc, c.sigmaS, c.sigmaC);
    }

    CHECK_KERNEL_ERROR("Create DCB grid kernel");
    CUDA_CALL(cudaThreadSynchronize());
}

// Builds the bilateral grid from the current frame and, if available, adds the
// previous frame on top of it.
//
// Pass 1 launches CreateGridKernel<1> (overwrite mode) once per disparity using the
// cost volume and images stored in c. Pass 2 runs only when prevCost, prevImageL and
// prevImageR all wrap non-NULL buffers; it launches CreateGridKernel<0> (accumulate
// mode) on them with the same launch configuration and image size. Blocks until each
// pass has finished.
void RunCreateGridKernel(Config& c,
    const Cuda::DeviceMemoryReference3D<float>& prevCost,
    const Cuda::DeviceMemoryReference2D<const unsigned int>& prevImageL,
    const Cuda::DeviceMemoryReference2D<const unsigned int>& prevImageR)
{
    assert(c.gpuGrid != NULL);

    // compute block & grid dimensions: one block per grid cell,
    // one thread per pixel of the image patch covered by that cell
    dim3 createBlock((int)ceil(c.sigmaS), (int)ceil(c.sigmaS), 1);
    dim3 createGrid(c.dw, c.dh, 1);

    // two dc x dc integer tables (cost sums and weights) per block
    const size_t sharedBytes = c.dc * c.dc * 2 * sizeof(int);

// add first image
    RECORD_KERNEL_LAUNCH("Create DCB grid kernel #1", createGrid, createBlock);

    // create grid for all disparities
    for(unsigned int d = 0; d < c.numDisps; d++)
    {
        CreateGridKernel<1><<<createGrid, createBlock, sharedBytes>>>(
            toPitchedPtr(c.gpuCost), toPitchedPtr(c.gpuImg1), toPitchedPtr(c.gpuImg2),
            c.gpuGrid->getBuffer(), c.gpuGrid->getPitch() >> 3, // byte pitch -> float2 elements
            c.w, c.h, d, c.tile_x, c.dc, c.sigmaS, c.sigmaC);
    }

    CHECK_KERNEL_ERROR("Create DCB grid kernel #1");
    CUDA_CALL(cudaThreadSynchronize());

// add second image
    // The debug trace below used to be printed unconditionally on every call; it now
    // follows the file-wide VERBOSE_OUTPUT switch. %p requires void pointers.
#if VERBOSE_OUTPUT
    printf("Pre-SECOND %p %p %p\n",
        (const void*)prevCost.getBuffer(),
        (const void*)prevImageL.getBuffer(),
        (const void*)prevImageR.getBuffer());
#endif
    if(prevCost.getBuffer() != NULL && prevImageL.getBuffer() != NULL && prevImageR.getBuffer() != NULL)
    {
        RECORD_KERNEL_LAUNCH("Create DCB grid kernel #2", createGrid, createBlock);
#if VERBOSE_OUTPUT
        printf("SECOND\n");
#endif

        // accumulate the previous frame for all disparities
        for(unsigned int d = 0; d < c.numDisps; d++)
        {
            CreateGridKernel<0><<<createGrid, createBlock, sharedBytes>>>(
                toPitchedPtr(prevCost), toPitchedPtr(prevImageL), toPitchedPtr(prevImageR),
                c.gpuGrid->getBuffer(), c.gpuGrid->getPitch() >> 3,
                c.w, c.h, d, c.tile_x, c.dc, c.sigmaS, c.sigmaC);
        }

        CHECK_KERNEL_ERROR("Create DCB grid kernel #2");
        CUDA_CALL(cudaThreadSynchronize());
    }
}


// Blurs the bilateral grid in place along all four dimensions, in three steps:
//   1. both colour dimensions (ProcessGrid1Kernel, all tiles in one launch),
//   2. spatial x (ProcessGrid2Kernel, one launch per disparity tile),
//   3. spatial y (ProcessGrid3Kernel, one launch per disparity tile).
// Blocks until each step has finished.
void RunProcessGridKernel(Config& c)
{
    // row stride of the grid in float2 elements (getPitch() is in bytes)
    const unsigned int gridPitch = (unsigned int)(c.gpuGrid->getPitch() >> 3);

    // smooth along colour dimensions -------------------------------------------------------------
    dim3 processBlock1(c.dc, c.dc, 1);
    dim3 processGrid1(c.tile_x * c.dw, c.tile_y * c.dh, 1);

    RECORD_KERNEL_LAUNCH("Process DCB grid 1 kernel", processGrid1, processBlock1);

    ProcessGrid1Kernel<<<processGrid1, processBlock1, c.dc * c.dc * sizeof(float2)>>>(c.gpuGrid->getBuffer(), gridPitch);

    CHECK_KERNEL_ERROR("Process DCB grid 1 kernel");
    CUDA_CALL(cudaThreadSynchronize());

    // smooth along horizontal (x) dimension ------------------------------------------------------
    dim3 processBlock2(c.dw, 1, 1);
    dim3 processGrid2(c.dc, c.dc * c.dh, 1);

    RECORD_KERNEL_LAUNCH("Process DCB grid 2 kernel", processGrid2, processBlock2);

    for(unsigned int d = 0; d < c.numDisps; d++)
    {
        ProcessGrid2Kernel<<<processGrid2, processBlock2, c.dw * sizeof(float2)>>>(c.gpuGrid->getBuffer(), gridPitch, d % c.tile_x, d / c.tile_x);
    }
    CHECK_KERNEL_ERROR("Process DCB grid 2 kernel");
    CUDA_CALL(cudaThreadSynchronize());

    // smooth along vertical (y) dimension --------------------------------------------------------
    // NB: Conceptually, this kernel is of size 1 x dh. However, as this is pretty inefficient,
    //     we tile together a few of them along the x direction. This tiling factor should ideally
    //     be 16, the size of a half-warp, or some other power of two. Let's set the limit at
    //     256 threads per block, so that we can achieve a good occupancy (on both G80 and GT200).
    int processTileX3 = floorPow2(256 / c.dh);
    if(processTileX3 < 1)
        processTileX3 = 1; // dh > 256: never launch with a zero-sized block or divide by zero below

    const unsigned int tileWidth = c.dc * c.dw; // width of one disparity tile in texels
    dim3 processBlock3(processTileX3, c.dh, 1);
    dim3 processGrid3((tileWidth + processTileX3 - 1) / processTileX3, c.dc, 1);

    RECORD_KERNEL_LAUNCH("Process DCB grid 3 kernel", processGrid3, processBlock3);

    for(unsigned int d = 0; d < c.numDisps; d++)
    {
        const unsigned int tileOffsetX = tileWidth * (d % c.tile_x);
        const unsigned int tileOffsetY = c.dc * c.dh * (d / c.tile_x);

        // The x bound handed to the kernel is the right edge of THIS tile. Passing the
        // width of the whole texture let the padding threads of the last block (present
        // whenever tileWidth is not a multiple of processTileX3) blur the first columns
        // of the neighbouring tile, which were then blurred a second time by that tile's
        // own launch.
        ProcessGrid3Kernel<<<processGrid3, processBlock3, processTileX3 * c.dh * sizeof(float2)>>>(
            c.gpuGrid->getBuffer(), gridPitch, tileOffsetX + tileWidth,
            tileOffsetX, tileOffsetY, c.dc);
    }
    CHECK_KERNEL_ERROR("Process DCB grid 3 kernel");
    CUDA_CALL(cudaThreadSynchronize());
}


// Samples the processed grid back into the cost volume by launching SliceGridKernel
// once per disparity (one thread per pixel). Blocks until all launches have finished.
void RunSliceGridKernel(Config& c)
{
    // convert to texture for fast bilinear interpolation
    // The filter mode is configured BEFORE binding so that it is already in effect for
    // this binding; it used to be set afterwards.
    gpuGridTex.filterMode = cudaFilterModeLinear;
    c.gpuGrid->bindTexture<cudaReadModeElementType>(gpuGridTex);

    // compute block & grid dimensions: rows of up to 256 threads, one grid row per image row
    dim3 sliceBlock(min(256, c.w), 1, 1);
    dim3 sliceGrid(((c.w + sliceBlock.x - 1) / sliceBlock.x), c.h, 1);

    RECORD_KERNEL_LAUNCH("Slice DCB grid kernel", sliceGrid, sliceBlock);

    // slice the grid of every disparity; d % tile_x / d / tile_x locate its tile
    for(unsigned int d = 0; d < c.numDisps; d++)
    {
        SliceGridKernel<<<sliceGrid, sliceBlock>>>(toPitchedPtr(c.gpuCost), toPitchedPtr(c.gpuImg1), toPitchedPtr(c.gpuImg2), c.w, c.h, d, d % c.tile_x, d / c.tile_x, c.dw, c.dh, c.dc, c.sigmaS, c.sigmaC);
    }
    CHECK_KERNEL_ERROR("Slice DCB grid kernel");
    CUDA_CALL(cudaThreadSynchronize());
}


// Runs DCB-grid cost aggregation on a cost volume in place: create grid, blur grid,
// slice grid.
//
// Standalone build (DCBGRID_STANDALONE):
//   gpuCost     pitched device cost volume (w x h x numDisps floats), updated in place
//   numDisps    number of disparities
//   gpuImg1/2   left / right device images (32-bit pixels) sharing the byte pitch `pitch`
//   gpuGrid     optional caller-owned grid buffer of gridWidth x gridHeight float2 with
//               byte pitch gridPitch; if NULL a grid is allocated and freed internally
//   w, h        image size in pixels
//   sigmaS      spatial cell size, sigmaC colour bin width
//
// Errors reported as std::exception are printed to stderr and not propagated.
void RunAggregationDCBGrid(
#ifdef DCBGRID_STANDALONE
    const cudaPitchedPtr& gpuCost,
    const unsigned int numDisps,
    const unsigned int* gpuImg1,
    const unsigned int* gpuImg2,
    const int pitch,
    float2* gpuGrid,
    const unsigned int gridWidth,
    const unsigned int gridHeight,
    const unsigned int gridPitch,
#else
    const Cuda::DeviceMemory<float, 3>& ctGpuCost,
    const Cuda::DeviceMemory<unsigned int, 2>& ctGpuImg1,
    const Cuda::DeviceMemory<unsigned int, 2>& ctGpuImg2,
    const Cuda::DeviceMemory<float2, 2>* ctGrid,
#endif
    unsigned int w, unsigned int h,
#ifdef DCBGRID_STANDALONE
    float sigmaS, float sigmaC
#else
    Parameters p
#endif
    )
{
#ifdef DCBGRID_STANDALONE
    // wrap the raw device pointers; the wrappers do not own the memory
    Cuda::DeviceMemoryReference3D<float> ctGpuCost(w, h, numDisps, (float*)gpuCost.ptr);
    Cuda::DeviceMemoryReference2D<const unsigned int> ctGpuImg1(w, h, gpuImg1);
    Cuda::DeviceMemoryReference2D<const unsigned int> ctGpuImg2(w, h, gpuImg2);
    Cuda::DeviceMemoryReference2D<float2>* ctGrid = NULL;
    if(gpuGrid != NULL)
    {
        ctGrid = new Cuda::DeviceMemoryReference2D<float2>(gridWidth, gridHeight, gpuGrid);
        ctGrid->setPitch(gridPitch);
    }
    ctGpuCost.setPitch(gpuCost.pitch);
    ctGpuImg1.setPitch(pitch);
    ctGpuImg2.setPitch(pitch);
    const bool verbose = VERBOSE_OUTPUT;
#else
    const float sigmaS = p.aggr.sigmaS;
    const float sigmaC = p.aggr.sigmaC;
    const int numDisps = p.numDisps;
    const bool verbose = p.verbose;
#endif

#ifdef REDIRECT_OUTPUTS
    // redirect stdout & stderr to log files
    FILE* new_stdout = freopen("DCB-grid-stdout.txt", "w", stdout);
    FILE* new_stderr = freopen("DCB-grid-stderr.txt", "w", stderr);

    if (new_stdout == NULL || new_stderr == NULL)
        exit(1);
#endif // REDIRECT_OUTPUTS

    // The grid dimensions (dw, dh, dc, tile_x, tile_y) start out as zero and are filled
    // in by CalculateGridTextureSize below. (A stray `verbose` used to be appended here,
    // which silently initialised dw.)
    Config c = { &ctGpuCost, &ctGpuImg1, &ctGpuImg2, ctGrid, w, h, numDisps, sigmaS, sigmaC };

    // remember if we need to free the memory if allocated locally
    bool freeGrid = (c.gpuGrid == NULL);

    // c.gpuGrid is a DeviceMemoryReference2D and as such cannot be freed,
    // even though it is allocated using DeviceMemoryPitched2D.
    // We therefore keep an additional pointer to it, to free it afterwards.
    Cuda::DeviceMemoryPitched2D<float2>* gridToFree = NULL;

    try
    {
        // compute dimensions of bilateral grid
        CalculateGridTextureSize(w, h, numDisps, sigmaS, sigmaC, c.dw, c.dh, c.dc, c.tile_x, c.tile_y);

        // allocate grid memory
        if(freeGrid)
        {
            gridToFree = new Cuda::DeviceMemoryPitched2D<float2>(c.dw * c.dc * c.tile_x, c.dh * c.dc * c.tile_y);
            c.gpuGrid = gridToFree;
        }

        if(verbose)
        {
            printf("\nRunning DCB-Grid with s_s = %.1f, s_c = %.1f\n", c.sigmaS, c.sigmaC);
            printf("  => (%i x %i x %i) x (%i x %i x %i) = %i x %i texture\n", c.dc, c.dw, c.tile_x, c.dc, c.dh, c.tile_y, c.dw * c.dc * c.tile_x, c.dh * c.dc * c.tile_y);
        }

        // create grid ----------------------------------------------------------------------------
        RunCreateGridKernel(c);

        //// debug: save grid as huge PGM image
        //{
        //    Cuda::HostMemoryLocked2D<float2> hostGrid(*(c.gpuGrid));
        //    Cuda::copy(*(c.gpuGrid), hostGrid);
        //    cutSavePGMf("out/DCBgrid-pre-process.pgm", reinterpret_cast<float*>(hostGrid.getBuffer()), 2 * c.gpuGrid->size[0], c.gpuGrid->size[1]);
        //}

        // process grid ---------------------------------------------------------------------------
        RunProcessGridKernel(c);

        //// debug: save grid as huge PGM image
        //{
        //    Cuda::HostMemoryLocked2D<float2> hostGrid(*(c.gpuGrid));
        //    Cuda::copy(*(c.gpuGrid), hostGrid);
        //    cutSavePGMf("out/DCBgrid-post-process.pgm", reinterpret_cast<float*>(hostGrid.getBuffer()), 2 * c.gpuGrid->size[0], c.gpuGrid->size[1]);
        //}

        // slice grid -----------------------------------------------------------------------------
        RunSliceGridKernel(c);
    }
    catch(const std::exception &e)
    {
        fprintf(stderr, "Error: %s", e.what());
    }

    // Release the locally allocated grid on the success AND the error path; it used to
    // leak whenever one of the steps above threw.
    try
    {
        if(gridToFree != NULL)
        {
            gridToFree->free();
            delete gridToFree;
            gridToFree = NULL;
            c.gpuGrid = NULL;
        }
    }
    catch(const std::exception &e)
    {
        fprintf(stderr, "Error: %s", e.what());
    }

#ifdef DCBGRID_STANDALONE
    // The wrapper around a caller-supplied grid was allocated with new above and was
    // never deleted. Like the stack-allocated references it does not own the buffer,
    // so only the small host object is released (deleting NULL is a no-op).
    delete ctGrid;
#endif

#ifdef REDIRECT_OUTPUTS
    fflush(new_stdout); fclose(new_stdout);
    fflush(new_stderr); fclose(new_stderr);
#endif
}

// Temporal variant of DCB-grid aggregation: the grid is built from the current frame
// plus, if available, the previous frame, then blurred and sliced back into the current
// cost volume.
//
//   gpuCostP      cost volume of the previous frame (ptr may be NULL)
//   gpuCost       cost volume of the current frame, updated in place
//   numDisps      number of disparities
//   gpuImg1p/2p   left / right image of the previous frame (may be NULL)
//   gpuImg1/2     left / right image of the current frame
//   pitch         byte pitch of the images; it is applied to the previous-frame images
//                 too, as there is no separate parameter for them
//   gpuGrid       optional caller-owned grid buffer of gridWidth x gridHeight float2 with
//                 byte pitch gridPitch; if NULL a grid is allocated and freed internally
//   w, h          image size in pixels
//   sigmaS        spatial cell size, sigmaC colour bin width
//
// Errors reported as std::exception are printed to stderr and not propagated.
void runAggregationTDCBGrid(
    const cudaPitchedPtr& gpuCostP,
    const cudaPitchedPtr& gpuCost,
    const unsigned int numDisps,
    const unsigned int* gpuImg1p,
    const unsigned int* gpuImg2p,
    const unsigned int* gpuImg1,
    const unsigned int* gpuImg2,
    const int pitch,
    float2* gpuGrid,
    const unsigned int gridWidth,
    const unsigned int gridHeight,
    const unsigned int gridPitch,
    unsigned int w, unsigned int h,
    float sigmaS, float sigmaC
    )
{
    // wrap the raw device pointers; the wrappers do not own the memory
    Cuda::DeviceMemoryReference3D<float> ctGpuCostP(w, h, numDisps, (float*)gpuCostP.ptr);
    Cuda::DeviceMemoryReference3D<float> ctGpuCost(w, h, numDisps, (float*)gpuCost.ptr);
    Cuda::DeviceMemoryReference2D<const unsigned int> ctGpuImg1(w, h, gpuImg1);
    Cuda::DeviceMemoryReference2D<const unsigned int> ctGpuImg2(w, h, gpuImg2);
    Cuda::DeviceMemoryReference2D<const unsigned int> ctGpuImg1p(w, h, gpuImg1p);
    Cuda::DeviceMemoryReference2D<const unsigned int> ctGpuImg2p(w, h, gpuImg2p);
    Cuda::DeviceMemoryReference2D<float2>* ctGrid = NULL;
    if(gpuGrid != NULL)
    {
        ctGrid = new Cuda::DeviceMemoryReference2D<float2>(gridWidth, gridHeight, gpuGrid);
        ctGrid->setPitch(gridPitch);
    }
    ctGpuCost.setPitch(gpuCost.pitch);
    ctGpuImg1.setPitch(pitch);
    ctGpuImg2.setPitch(pitch);

    // The previous-frame buffers are read by the same kernel through their pitch, but
    // their pitch was never set. Only touch references that wrap a buffer, so the
    // "no previous frame" case behaves exactly as before.
    if(gpuCostP.ptr != NULL) ctGpuCostP.setPitch(gpuCostP.pitch);
    if(gpuImg1p != NULL)     ctGpuImg1p.setPitch(pitch);
    if(gpuImg2p != NULL)     ctGpuImg2p.setPitch(pitch);

    const bool verbose = VERBOSE_OUTPUT;

    // The grid dimensions (dw, dh, dc, tile_x, tile_y) start out as zero and are filled
    // in by CalculateGridTextureSize below. (A stray `verbose` used to be appended here,
    // which silently initialised dw.)
    Config c = { &ctGpuCost, &ctGpuImg1, &ctGpuImg2, ctGrid, w, h, numDisps, sigmaS, sigmaC };

    // remember if we need to free the memory if allocated locally
    bool freeGrid = (c.gpuGrid == NULL);

    // c.gpuGrid is a DeviceMemoryReference2D and as such cannot be freed,
    // even though it is allocated using DeviceMemoryPitched2D.
    // We therefore keep an additional pointer to it, to free it afterwards.
    Cuda::DeviceMemoryPitched2D<float2>* gridToFree = NULL;

    try
    {
        // compute dimensions of bilateral grid
        CalculateGridTextureSize(w, h, numDisps, sigmaS, sigmaC, c.dw, c.dh, c.dc, c.tile_x, c.tile_y);

        // allocate grid memory
        if(freeGrid)
        {
            gridToFree = new Cuda::DeviceMemoryPitched2D<float2>(c.dw * c.dc * c.tile_x, c.dh * c.dc * c.tile_y);
            c.gpuGrid = gridToFree;
        }

        if(verbose)
        {
            printf("\nRunning DCB-Grid with s_s = %.1f, s_c = %.1f\n", c.sigmaS, c.sigmaC);
            printf("  => (%i x %i x %i) x (%i x %i x %i) = %i x %i texture\n", c.dc, c.dw, c.tile_x, c.dc, c.dh, c.tile_y, c.dw * c.dc * c.tile_x, c.dh * c.dc * c.tile_y);
        }

        // create grid ----------------------------------------------------------------------------
        RunCreateGridKernel(c, ctGpuCostP, ctGpuImg1p, ctGpuImg2p);

        //// debug: save grid as huge PGM image
        //{
        //    Cuda::HostMemoryLocked2D<float2> hostGrid(*(c.gpuGrid));
        //    Cuda::copy(*(c.gpuGrid), hostGrid);
        //    cutSavePGMf("out/DCBgrid-pre-process.pgm", reinterpret_cast<float*>(hostGrid.getBuffer()), 2 * c.gpuGrid->size[0], c.gpuGrid->size[1]);
        //}

        // process grid ---------------------------------------------------------------------------
        RunProcessGridKernel(c);

        //// debug: save grid as huge PGM image
        //{
        //    Cuda::HostMemoryLocked2D<float2> hostGrid(*(c.gpuGrid));
        //    Cuda::copy(*(c.gpuGrid), hostGrid);
        //    cutSavePGMf("out/DCBgrid-post-process.pgm", reinterpret_cast<float*>(hostGrid.getBuffer()), 2 * c.gpuGrid->size[0], c.gpuGrid->size[1]);
        //}

        // slice grid -----------------------------------------------------------------------------
        RunSliceGridKernel(c);
    }
    catch(const std::exception &e)
    {
        fprintf(stderr, "Error: %s", e.what());
    }

    // Release the locally allocated grid on the success AND the error path; it used to
    // leak whenever one of the steps above threw.
    try
    {
        if(gridToFree != NULL)
        {
            gridToFree->free();
            delete gridToFree;
            gridToFree = NULL;
            c.gpuGrid = NULL;
        }
    }
    catch(const std::exception &e)
    {
        fprintf(stderr, "Error: %s", e.what());
    }

    // The wrapper around a caller-supplied grid was allocated with new above and was
    // never deleted. Like the stack-allocated references it does not own the buffer,
    // so only the small host object is released (deleting NULL is a no-op).
    delete ctGrid;
}