Compute shader writes random noise to image

#1

but all pixels have to be same.

Problem started when I tried to implement volumetric clouds (you can see noise here video). Then I just simplified code. Also original code and shader which draw volumetric clouds work fine on OpenGL.

CMake project is available here

There are steps:

  1. Init vulkan.

  2. Create pipeline, image and etc.

  3. Call vkCmdDispatch()

  4. Call vkMapMemory() to get result

     VkCommandBuffer pCommandBuffer;
    
     //draw
     {
       pCommandBuffer = beginCommandBuffer(commandPool);
    
       vkCmdBindPipeline(pCommandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline);
    
       vkCmdBindDescriptorSets(
         pCommandBuffer,
         VK_PIPELINE_BIND_POINT_COMPUTE,
         pipelineLayout,
         0,
         1,
         &descriptorSet,
         0,
         nullptr);
    
       vkCmdDispatch(pCommandBuffer, alignUp(WIDTH, (uint32)8) / 8, alignUp(HEIGHT, (uint32)8) / 8, 1);
    
       endCommandBuffer(pCommandBuffer);
     }
    
     //output
     {
       pCommandBuffer = beginCommandBuffer(commandPool);
       imageBarrier(
         pCommandBuffer,
         image,
         VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
         0,
         VK_IMAGE_LAYOUT_GENERAL,
         VK_PIPELINE_STAGE_TRANSFER_BIT,
         VK_ACCESS_TRANSFER_READ_BIT,
         VK_IMAGE_LAYOUT_GENERAL);
       endCommandBuffer(pCommandBuffer);
    
       void* bufData;
       vkMapMemory(vulkanRHI_GetDevice(), imageMemory, 0, sizeof(FPixel)*WIDTH*HEIGHT, 0, &bufData);
       memcpy(pixels.data(), bufData, sizeof(FPixel)*WIDTH*HEIGHT);
       vkUnmapMemory(vulkanRHI_GetDevice(), imageMemory);
    
       pCommandBuffer = beginCommandBuffer(commandPool);
       imageBarrier(
         pCommandBuffer,
         image,
         VK_PIPELINE_STAGE_TRANSFER_BIT,
         VK_ACCESS_TRANSFER_READ_BIT,
         VK_IMAGE_LAYOUT_GENERAL,
         VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
         0,
         VK_IMAGE_LAYOUT_GENERAL);
       endCommandBuffer(pCommandBuffer);
    
       fstream file;
       file.open(("attempt_" + to_string(i) + ".txt").c_str(), ios::out);
       file << "\n\n\n";
       for (int x = 0; x < 300; x++)
       {
         for (int y = 0; y < 300; y++)
           file << (pixels[x + y * WIDTH].r > 128 ? '+' : ' ');
         file << "\n";
       }
     }
    

Compute shader just does same computation for every pixel. What I get:


You can see random holes which have size 8x8.

Shader code:

#version 450
#extension GL_ARB_separate_shader_objects : enable

layout(local_size_x = 8, local_size_y = 8) in;

layout(rgba8, binding = 3) uniform image2D ufmResultImage;

bool crossRaySphereOut(vec3 rayStart, vec3 rayDirection, vec3 sphereCenter, float sphereRadius, out vec3 result1, out vec3 result2)
{
	vec3 rayPoint=rayStart+dot(sphereCenter-rayStart, rayDirection)*rayDirection;
	float height=length(rayPoint-sphereCenter);
	if(height<=sphereRadius)
	{
		float dist=sqrt(sphereRadius*sphereRadius-height*height);
		result1=rayPoint-rayDirection*dist;
		result2=rayPoint+rayDirection*dist;
		return dot(result1-rayStart,rayDirection)>0.0f || dot(result2-rayStart,rayDirection)>0.0f;
	}
	return false;
}

bool crossRaySphereOutFar(vec3 rayStart, vec3 rayDirection, vec3 sphereCenter, float sphereRadius, out vec3 result)
{
	vec3 tmpPoint;
	return crossRaySphereOut(rayStart, rayDirection, sphereCenter, sphereRadius, tmpPoint, result);
}

float remap(float value, float minValue, float maxValue, float newMinValue, float newMaxValue)
{
    return newMinValue+(value-minValue)/(maxValue-minValue)*(newMaxValue-newMinValue);
}

float cloudGetStepLength(vec3 position, float avrStep)
{
	return avrStep;
}

float cloudGetHeight(vec3 position)
{
	return clamp((length(position)-(6400+15))/((6400+35)-(6400+15)), 0, 1);
}

float cloudSampleDensity(vec3 position, float mip, bool fast)
{
	float gc=1;
	vec4 weather=vec4(1);
	float wc0=weather.r;
	float wc1=weather.g;
	float wh=weather.b;
	float wd=weather.a;
	float ph=cloudGetHeight(position);
	
	float WMc=max(wc0, clamp(gc-0.5, 0, 1)*wc1*2);

	float SRb=clamp(remap(ph, 0, 0.07, 0, 1), 0, 1);
	float SRt=clamp(remap(ph, wh*0.2, wh, 1, 0), 0, 1);
	float SA=SRb*SRt;
	
	float DRb=ph*clamp(remap(ph, 0, 0.15, 0, 1), 0, 1);
	float DRt=ph*clamp(remap(ph, 0.9, 1, 1, 0), 0, 1);
	float DA=DRb*DRt*wd*2;
	
	float SNsample=1; 
	
	float d;
	if(fast)
	{
		d=clamp(remap(SNsample*SA, 1-gc*WMc, 1, 0, 1), 0, 1)*DA;
	}
	else
	{
		float SNnd=clamp(remap(SNsample*SA, 1-gc*WMc, 1, 0, 1), 0, 1);
		float DNfbm = 1;
		float DNmod=0.35*exp(-gc*0.75)*mix(DNfbm, 1-DNfbm, clamp(ph*5, 0, 1));
		d=clamp(remap(SNnd, DNmod, 1, 0, 1)*DA, 0, 1);
	}
	
	return d;
}

float cloudSampleSunDensity(vec3 position, vec3 sunDir, float mip)
{
	float avrStep=((6400+35)-(6400+15));
	
	float sumDensity=0;
	float prevDensity=0;
	
	for(int i=0;i<6;i++)
	{
		float step=cloudGetStepLength(position, avrStep);
		position+=sunDir*step;
		
		float density=cloudSampleDensity(position, mip, false);
		float actualDensity=(density+prevDensity)*0.5;
		prevDensity=density;
		
		sumDensity+=actualDensity*step;
	}

	return sumDensity;
}

vec4 mainMarching(vec3 viewDir, vec3 sunDir, vec3 sunColor)
{
	vec3 position;
	crossRaySphereOutFar(vec3(0, 6400, 0), viewDir, vec3(0), (6400+15), position);
	
	float avrStep=((6400+35)-(6400+15))/float(3/2);
	position=position+viewDir*cloudGetStepLength(position, avrStep);
	
	vec3 color=vec3(0);
	float transmittance=1;
	
	int zeroSamplesCount=0;
	float sampleTest=0;
	float step;
	float prevDensity=0;
	for(int i=0;i<160;i++)
	{
		if(sampleTest>0)
		{
			float density=cloudSampleDensity(position, 0, false);
			float actualDensity=(density+prevDensity)*0.5;
			prevDensity=density;
			
			if(actualDensity>0)
			{
				float sunDensity=cloudSampleSunDensity(position, sunDir, 0);
				vec3 light=15*sunColor*exp(-4*sunDensity)*(1-exp(-4*actualDensity*step));
				light*=4*actualDensity;
				color+=light*transmittance;
			}
			else
			{
				zeroSamplesCount++;
			}
			
			if(zeroSamplesCount<6)
			{
				step=cloudGetStepLength(position, avrStep);
				position+=viewDir*step;
			}
			else
			{
				zeroSamplesCount=0;
				sampleTest=0;
			}
		}
		else
		{
			sampleTest=cloudSampleDensity(position, 0, true);
			if(sampleTest<=0)
			{
				step=cloudGetStepLength(position, avrStep);
				position+=viewDir*step;
			}
		}

		if(transmittance<0.00001 || length(position)>(6400+35))
			break;
	}
	
	return vec4(color*10, 1);
}

void main()
{
    ivec2 g_id = ivec2(gl_GlobalInvocationID.x, gl_GlobalInvocationID.y);
    if (g_id.x>=0 && g_id.x < 300 && g_id.y>=0 && g_id.y < 300)
	{
		vec3 viewDir=normalize(vec3(1,1,1));
		vec3 sunDir= normalize(vec3(1,1,1));
		vec3 sunColor=vec3(1,1,1);
		imageStore(ufmResultImage, g_id, mainMarching(viewDir, sunDir, sunColor));
	}
} 

I have no idea what is going on. Looks like shader little bit complicated and somehow it gets access to wrong memory.

#2

Is this some kind of pseudocode? You are not even submitting your command buffers.

It may also be bad sync. But you do not show enough code to be certain.

Yes, your shader is complicated, and your test only tests if stuff is >128. Without carefully reading the whole complicated shader that might as well be the correct output AFAICT. What is the desired output?

There is a lost art of making a minimal example. Start deleting stuff. Try to paint a simple gradient or something, and see if the bug persists even for such non-complicated shader.

#4

It is as minimal as possible. In shader all calculations are same for every pixel. Color has to be white. You have missed the link to full project ) https://drive.google.com/file/d/1RZFPQXqjvUtLjoQlFp2DaXYVwJYVovBT/view

#5

Exactly. Then why are you confusing us with bunch of unrelated code, where simple color constant instead whole mainMarching would suffice. Example is not minimal if it contains non-essential code.

Right, so your endCommandBuffer autosubmits as well as paranoidly do vkDeviceWaitIdle.

That means your two imageBarriers do exactly nothing. Assuming it does not change the result, it is another thing that can be removed to achieve minimal example. (BTW I am also pretty sure the flags used are wrong in the barrier, but AIS it should not matter and a problem to be solved later)

Another problem I see is that you round up the dimensions to vkCmdDispatch. I would assume that would result in OOBA into the image.

Furtermore you are using local size (of 8x8), and using (gl_GlobalInvocationID.x, gl_GlobalInvocationID.y) as coordinates into the image, which would result into even more massive OOBA.

E.g. if I take your HEIGHT, which is 972. That height is directly used to create your image.
If I do alignUp(8), that is (972 + 8 - 1) / 8 * 8 = 976. And gl_GlobalInvocationID.y therefore spans from 0 to 975*8 + 7 = 7807.

Maybe that leads to some memory corruption or undefined behavior…

PS: correction: the first image barrier is actually technically required. But it needs correct flags, i.e. VK_PIPELINE_STAGE_HOST_BIT and VK_ACCESS_HOST_READ_BIT.

#6

Cause simple color constant in shader works fine. To reproduce bug you need such complicated shader. There is no line which cause error. You can remove half of all lines in random order and then discover that shader works correctly.

? How do you suggest to synchronize with CPU? Using fence a bit more complicated.

Ok. It is part of previous version but it is not a reason.

No. Look carefully. Formula is (972 + 8 - 1) / 8 * 8 / 8 = 122.

Actually I’m using square 300x300 to output result

 ivec2 g_id = ivec2(gl_GlobalInvocationID.x, gl_GlobalInvocationID.y);
 if (g_id.x>=0 && g_id.x < 300 && g_id.y>=0 && g_id.y < 300)

Early there was imageSize() checking but little square has more chances for bug to occur.

#7

Now that is interesting.
Try fixing the sync as suggested in the “PS” above (using the host flags in first barrier).

I did not sugest anything. Just provided context for what I was saying.

(In serious program you would of course use the fences; vkDeviceWaitIdle is a blunt tool only suitable for app cleanup, or perhaps swapchain re-creation.)

Right, scratch that.

Alright.

#8

First I thought it was memory barriers problem. That is why I excluded renderpass stage. Now I think it may be videocard’s driver.

I have done. Nothing is changed.

That is how problem looks if render image to screen: link

#9

My GPU e.g. does not have VK_IMAGE_USAGE_STORAGE_BIT for linear images of r8g8b8a8_unorm. You should check that format support.

Also your glfwInit() is below glfwWindowHint. It results in surface creation error for me.

#10

Whoops. I returned optimal tiling and staging buffer. You can use this link to download: link

#11

FWIW I get all spaces with that version.

#12

When I read your message I figured out what cause problem. Looks like we spent time to fix junior’s bug :slight_smile: There was uninitialized variable in the shader. Anyway thank you for help.