#ifndef CUDA_HELPER_COMMON_H
#define CUDA_HELPER_COMMON_H

#include <builtin_types.h>
#include "device_functions.h"

#include "CudaDebug.h"
#include "Utils.cuh"

// Useful bit-masks when working with 32-bit unsigned integers.
// Every mask carries the 'u' suffix, so it is an unsigned literal.
// Bits 0-15: the lower 16-bit half of the word
#define UINT_LOWER_MASK 0x0000FFFFu
// Bits 16-31: the upper 16-bit half of the word
#define UINT_UPPER_MASK 0xFFFF0000u
// Bits 0-7: the byte at bit offset 0
#define UINT_MASK_0 0x000000FFu
// Bits 8-15: the byte at bit offset 8
#define UINT_MASK_8 0x0000FF00u
// Bits 16-23: the byte at bit offset 16
#define UINT_MASK_16 0x00FF0000u
// Bits 24-31: the byte at bit offset 24
#define UINT_MASK_24 0xFF000000u

// Shift distance (in bits) between the lower and upper 16-bit halves of a
// 32-bit word; used by the half-precision pack/unpack macros in this file.
#define USHORT_SIZE 16
// 255 as an unsigned literal: the largest value that fits in 8 bits.
#define UINT_BYTE_MAX 255u

// Yields an lvalue for the float stored at element (x, y, z) of a pitched 3D
// allocation. 'pptr' must expose the members 'ptr' (base address), 'pitch'
// (byte stride between consecutive rows) and 'ysize' (rows per z-slice), as
// cudaPitchedPtr does.
// Note: 'pptr' is expanded more than once, so pass an expression without side effects.
#define ACCESS_3D(pptr, x, y, z) \
	(*(float*)((char*)(pptr).ptr + \
				(pptr).pitch * ((pptr).ysize * (z) + (y)) + \
				sizeof(float) * (x)))

// Yields an lvalue for the unsigned int stored at element (x, y, z) of a
// pitched 3D allocation. 'pptr' must expose the members 'ptr' (base address),
// 'pitch' (byte stride between consecutive rows) and 'ysize' (rows per
// z-slice), as cudaPitchedPtr does.
// Note: 'pptr' is expanded more than once, so pass an expression without side effects.
#define ACCESS_3D_UINT(pptr, x, y, z) \
	(*(unsigned int*)((char*)(pptr).ptr + \
				(pptr).pitch * ((pptr).ysize * (z) + (y)) + \
				sizeof(unsigned int) * (x)))

// Read the 32-bit word at (x, y, z) of a pitched 3D allocation and convert one
// of its two 16-bit halves to float through __half2float:
//   READ_HALF_LOWER uses bits 0-15, READ_HALF_UPPER uses bits 16-31.
// Both simply fetch the word with ACCESS_3D_UINT and hand it to the matching
// UNPACK_HALF_* macro.
// NOTE(review): the raw 16 bits are passed to __half2float as an unsigned short;
// confirm the toolkit in use provides that overload.
#define READ_HALF_LOWER(pptr, x, y, z) UNPACK_HALF_LOWER(ACCESS_3D_UINT((pptr), (x), (y), (z)))
#define READ_HALF_UPPER(pptr, x, y, z) UNPACK_HALF_UPPER(ACCESS_3D_UINT((pptr), (x), (y), (z)))

// Extract one 16-bit half of the 32-bit value 'n' and convert it to float
// through __half2float:
//   UNPACK_HALF_LOWER takes bits 0-15,
//   UNPACK_HALF_UPPER takes bits 16-31 (masked, then shifted down by USHORT_SIZE).
// NOTE(review): the raw 16 bits are passed to __half2float as an unsigned short;
// confirm the toolkit in use provides that overload.
#define UNPACK_HALF_LOWER(n) __half2float((unsigned short)(UINT_LOWER_MASK & (n)))
#define UNPACK_HALF_UPPER(n) __half2float((unsigned short)((UINT_UPPER_MASK & (unsigned int)(n)) >> USHORT_SIZE))

// Converts two floats with __float2half_rn and stores both results in a single
// 32-bit unsigned integer: 'lo' occupies bits 0-15, 'hi' is shifted up by
// USHORT_SIZE into bits 16-31.
// NOTE(review): assumes __float2half_rn yields the raw 16-bit pattern — confirm
// against the toolkit in use.
#define PACK_HALFS(hi, lo) \
	((unsigned int)__float2half_rn(lo) | ((unsigned int)__float2half_rn(hi) << USHORT_SIZE))

// Extracts one byte from 'val' (normally a 32-bit unsigned integer): the value
// is shifted right by 'offset' bits and everything above the low 8 bits is
// discarded. 'offset' is a bit count (0, 8, 16 or 24 select whole bytes).
#define UNPACK_BYTE(val, offset) (0xFF & ((val) >> (offset)))

// Packs four byte values into one 32-bit unsigned integer:
//   val0 -> bits 0-7, val1 -> bits 8-15, val2 -> bits 16-23, val3 -> bits 24-31.
// Warning: the arguments are not masked to 8 bits, so a value above 255
// carries into the neighbouring byte(s).
// The macro is a plain expression (no trailing semicolon), so it can be used
// inside larger expressions, as a function argument, or in a brace-less if/else.
#define PACK_BYTES(val3, val2, val1, val0) \
	((unsigned int)(val0) + ((unsigned int)(val1) << 8) + ((unsigned int)(val2) << 16) + ((unsigned int)(val3) << 24))

// Overwrites one byte of 'target' (normally an unsigned int) with the lowest
// 8 bits of 'val'. 'offset' is the bit position of the byte (0, 8, 16 or 24).
// The other bits of 'target' are left untouched.
// The whole assignment is parenthesised so the macro can be used as a
// sub-expression, and the masks are unsigned so that shifting by 24 never
// moves a bit into the sign bit of a signed int.
// Note: 'target' and 'offset' are expanded more than once; avoid side effects.
#define SET_BYTE(target, val, offset) \
	((target) = ((target) & ~(0xFFu << (offset))) | (((val) & 0xFFu) << (offset)))

// Releases a pointer with cudaFree and resets it to NULL; does nothing when
// the pointer is already NULL. The value returned by cudaFree is discarded.
// Wrapped in do { } while (0) so the macro behaves like a single statement:
// it is safe inside a brace-less if/else and cannot capture a following
// 'else'. Call sites must end the invocation with a semicolon.
// Note: 'ptr' is expanded more than once, so pass a plain lvalue.
#define CUDA_FREE(ptr) \
	do { \
		if ((ptr) != NULL) { \
			cudaFree(ptr); \
			(ptr) = NULL; \
		} \
	} while (0)

// Exchanges the values of 'a' and 'b'.
// Performs one copy-initialisation and two assignments, so T must be
// copy-constructible and assignable.
template<typename T>
inline void Swap(T & a, T & b)
{
	T original_a = a;	// keep a's value before it is overwritten
	a = b;
	b = original_a;
}

#endif //CUDA_HELPER_COMMON_H