// Number of texels along x that one thread group writes (CS_Simple strides groups by this).
#define GROUP_SIZE 512
// Number of extra texels staged on each side of a group's span; upper bound for the blur extent.
#define BLUR_SIZE 64
// Threads per group and shared-memory slots: the group's span plus twice the radius to allow lookups.
// Parenthesized so the macro still evaluates correctly when used inside a larger expression.
#define GROUP_BLUR_SIZE (GROUP_SIZE + (BLUR_SIZE * 2))

// Blur extent in texels; CS_Simple uses it as the bound of its averaging loop around
// each pixel. Should not exceed BLUR_SIZE, because CS_Simple only stages BLUR_SIZE
// extra texels on each side of the group's span.
int kernelSize = 3;


// Source image; CS_Simple reads it with Load at mip level 0.
Texture2D InputTexture;

// Per-group staging buffer: each thread of the group stores one loaded texel here
// before the group barrier, so neighbours can be read without further texture fetches.
groupshared float4 sharedMem[GROUP_BLUR_SIZE];

// Destination written by CS_Simple.
// NOTE(review): the BACKBUFFER semantic is presumably resolved by the host framework - confirm.
RWTexture2D<float4> RWOutputTexture : BACKBUFFER;

// Horizontal box blur of InputTexture into RWOutputTexture.
//
// Work layout: each thread group handles GROUP_SIZE consecutive texels of row gid.y,
// starting at column gid.x * GROUP_SIZE. The group runs GROUP_BLUR_SIZE threads: every
// thread loads one texel into sharedMem (the group's span plus BLUR_SIZE texels on
// each side), and after the barrier only the GROUP_SIZE "inner" threads average their
// neighbourhood and write a result.
//
// tid  : SV_DispatchThreadID (unused, kept for interface compatibility)
// gtid : SV_GroupThreadID, x in [0, GROUP_BLUR_SIZE)
// gid  : SV_GroupID, x selects the column span, y selects the row
//
// The window is symmetric: texels at offsets -radius..+radius are averaged, where
// radius is kernelSize clamped to [0, BLUR_SIZE]. Window positions that fall outside
// the texture contribute nothing to the sum and are excluded from the divisor.
[numthreads(GROUP_BLUR_SIZE,1,1)]
void CS_Simple(uint3 tid : SV_DispatchThreadID, uint3 gtid : SV_GroupThreadID, uint3 gid : SV_GroupID)
{
	// First texel column owned by this group.
	int baseLocation = gid.x * GROUP_SIZE;
	// Column offset relative to baseLocation; negative or >= GROUP_SIZE for the
	// threads that only load the side margins.
	int slot = (int)gtid.x;
	int readOffset = slot - BLUR_SIZE;

	int2 texel = int2(baseLocation + readOffset, gid.y);

	uint texWidth, texHeight, mipCount;
	InputTexture.GetDimensions(0, texWidth, texHeight, mipCount);
	int width = (int)texWidth;
	int height = (int)texHeight;

	// Every thread stages exactly one texel. Out-of-range Load coordinates yield zero,
	// so margin texels beyond the texture edge add nothing to the sum below.
	sharedMem[slot] = InputTexture.Load(int3(texel, 0));

	// Must be reached by all threads of the group, hence placed before any branching.
	GroupMemoryBarrierWithGroupSync();

	// Only inner threads whose texel actually exists produce output. Skipping texels
	// past the right/bottom edge also guarantees the divisor below is at least 1.
	bool ownsOutput = readOffset >= 0 && readOffset < GROUP_SIZE;
	bool insideTexture = texel.x < width && texel.y < height;
	if (ownsOutput && insideTexture)
	{
		// Clamp so the window never reaches outside the staged margin in sharedMem.
		int radius = clamp(kernelSize, 0, BLUR_SIZE);

		float4 sum = 0.0f;
		float weight = 0.0f;
		// Inclusive upper bound keeps the window centred on the output texel.
		for (int i = -radius; i <= radius; i++)
		{
			sum += sharedMem[slot + i];
			int column = texel.x + i;
			// Count only taps that lie inside the texture.
			weight += (column >= 0 && column < width) ? 1.0f : 0.0f;
		}

		RWOutputTexture[texel] = sum / weight;
	}
}



// Effect technique exposing the blur: a single pass that binds CS_Simple,
// compiled against the cs_5_0 compute-shader profile.
// CS_Simple addresses columns gid.x * GROUP_SIZE + offset and row gid.y, so the host
// is presumably expected to dispatch ceil(width / GROUP_SIZE) x height x 1 groups - confirm.
technique11 Simple
{
	pass P0
	{
		SetComputeShader( CompileShader( cs_5_0, CS_Simple() ) );
	}
}




