Image Layout Transition Bug?

NewOpenGLUser · March 5, 2020, 3:43pm

Perhaps it is not a bug, and I am just stupid?

vkBeginCommandBuffer(commandBuffer, &commandBufferBeginInfo);
setImageLayout(commandBuffer, vkImage, VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL,VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT);
vkCmdPushConstants(commandBuffer, pipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, 4, (void *)&offset);
vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipelineLayout, 0, 1, &descriptorSet, 0, nullptr);
vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline);
vkCmdDispatch(commandBuffer, (uint32_t)ANativeWindow_getWidth(android->app->window), (uint32_t)ANativeWindow_getHeight(android->app->window), 1);
setImageLayout(commandBuffer, vkImage, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_PRESENT_SRC_KHR, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT);
vkEndCommandBuffer(commandBuffer);

That is the command buffer template, the template is used for all vkImages in a swapchain → there are 2 images in the swapchain.

However, when I submit the command buffer to the queue, the result is a validation error:

Submitted command buffer expects VkImage 0x1e[]  (subresource: aspectMask 0x1 array layer 0, mip level 0) to be in layout VK_IMAGE_LAYOUT_GENERAL--instead, current layout is VK_IMAGE_LAYOUT_UNDEFINED.

What’s even more strange is that this is only a problem for the first vkImage in the swapchain; the second image has no problems.

I thought this to be a synchronization bug, so I broke the command buffer into 3 separate buffers, and synchronized them with semaphores, but the error persisted.

The purpose of this compute shader is to copy a texture to a swapchain image; after completion, the image is submitted for presentation.

The final visual over time is the texture flickering instead of a constant image, with the error recurring every time the first image’s command buffer is submitted.

If you have any experience with the above, please share…

NewOpenGLUser · March 5, 2020, 11:20pm

I could not solve this problem, so I instead decided to drop the swap chain and use a persistently mapped buffer to manually move the texture from gpu memory to cpu display memory.

This is my only option as of now, so if you know why the swap chain behaved the way it did, I would appreciate hearing about your experience…

NewOpenGLUser · March 6, 2020, 12:01am

Frame rate is horrible with this solution…

ahah

NewOpenGLUser · March 6, 2020, 8:58pm

It turns out to be a problem with the imageStore function in the following shader:

#version 310 es
layout (local_size_x=1) in;
layout (rgba8, binding = 0, set = 0) lowp uniform writeonly image2D imageB;
void main(){
imageStore(imageB, ivec2(gl_WorkGroupID.x, gl_WorkGroupID.y), vec4(10,10,10,10));
int x = 0;
}";

Once I hide the imageStore op, there are no errors reported, but the program runs at 20 fps, even though there is only one command buffer for each image in the swap chain, and each command buffer is recorded as follows (a snapshot of the loop that records them):

vkBeginCommandBuffer(commandBuffer[swapChainImageIndex], &commandBufferBeginInfo);
VkImageMemoryBarrier imageMemoryBarrier = {
		.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
		.image = swapChainImages[swapChainImageIndex],
		.subresourceRange = {
				.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
				.layerCount = 1,
				.baseArrayLayer = 0,
				.levelCount = 1,
				.baseMipLevel = 0,
		},
		.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED,
		.newLayout = VK_IMAGE_LAYOUT_GENERAL,
		.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
		.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
		.dstAccessMask = 0,
		.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
		.pNext = nullptr,
};
vkCmdPipelineBarrier(commandBuffer[swapChainImageIndex], VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, 0, nullptr, 0, &bufferMemoryBarrier, 1, &imageMemoryBarrier);
vkCmdBindDescriptorSets(commandBuffer[swapChainImageIndex], VK_PIPELINE_BIND_POINT_COMPUTE, computeShaderPipelineLayout, 0, 1, &descriptorSets[swapChainImageIndex], 0, nullptr);
vkCmdBindPipeline(commandBuffer[swapChainImageIndex], VK_PIPELINE_BIND_POINT_COMPUTE, computeShaderPipeline);
vkCmdDispatch(commandBuffer[swapChainImageIndex], width, height, 1);
imageMemoryBarrier = {
		.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
		.image = swapChainImages[swapChainImageIndex],
		.subresourceRange = {
				.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
				.layerCount = 1,
				.baseArrayLayer = 0,
				.levelCount = 1,
				.baseMipLevel = 0,
		},
		.oldLayout = VK_IMAGE_LAYOUT_GENERAL,
		.newLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR,
		.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
		.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
		.dstAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
		.srcAccessMask = VK_ACCESS_MEMORY_READ_BIT,
		.pNext = nullptr,
};
vkCmdPipelineBarrier(commandBuffer[swapChainImageIndex], VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, 0, nullptr, 0, nullptr, 1, &imageMemoryBarrier);
vkEndCommandBuffer(commandBuffer[swapChainImageIndex]);

The descriptor sets are updated as follows (a snapshot of the actual loop):

VkDescriptorImageInfo descriptorImageInfo = {
		.sampler = sampler,
		.imageLayout = VK_IMAGE_LAYOUT_GENERAL,
		.imageView = swapChainImageViews[descriptorSetIndex],
};
writeDescriptorSets[descriptorSetIndex] = {
.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
.dstSet = descriptorSets[descriptorSetIndex],
.dstBinding = 0,
.dstArrayElement = 0,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
.descriptorCount = 1,
.pBufferInfo = nullptr,
.pImageInfo = &descriptorImageInfo,
.pTexelBufferView = nullptr,
.pNext = nullptr,
};

The swap chain and its images are created as follows:

uint32_t chosenFormat;
for (chosenFormat = 0; chosenFormat < formatCount; chosenFormat++) {
if (formats[chosenFormat].format == VK_FORMAT_R8G8B8A8_UNORM) break;
}
assert(chosenFormat < formatCount);

VkSurfaceCapabilitiesKHR surfaceCap;
vkGetPhysicalDeviceSurfaceCapabilitiesKHR(device.gpuDevice_, device.surface_, &surfaceCap);
assert(surfaceCap.supportedCompositeAlpha | VK_COMPOSITE_ALPHA_INHERIT_BIT_KHR && surfaceCap.supportedUsageFlags & VK_IMAGE_USAGE_STORAGE_BIT);

VkSwapchainCreateInfoKHR swapchainCreateInfoKhr = {
		.sType = VK_STRUCTURE_TYPE_SWAPCHAIN_CREATE_INFO_KHR,
		.queueFamilyIndexCount = 1,
		.pQueueFamilyIndices = &device.queueFamilyIndex_,
		.surface = device.surface_,
		.minImageCount = swapChainImageCount,
		.presentMode = VK_PRESENT_MODE_FIFO_KHR,
		.imageExtent = surfaceCapabilities.currentExtent,
		.imageArrayLayers = 1,
		.imageSharingMode = VK_SHARING_MODE_EXCLUSIVE,
		.imageColorSpace = formats[chosenFormat].colorSpace,
		.imageFormat = formats[chosenFormat].format,
		.imageUsage = VK_IMAGE_USAGE_STORAGE_BIT,
		.preTransform = VK_SURFACE_TRANSFORM_IDENTITY_BIT_KHR,
		.compositeAlpha = VK_COMPOSITE_ALPHA_INHERIT_BIT_KHR,
		.clipped = VK_FALSE,
		.oldSwapchain = 0,
		.flags = 0,
		.pNext = nullptr,
};
vkCreateSwapchainKHR(device.device_, &swapchainCreateInfoKhr, nullptr, &swapChain);

I make the pipeline layout as follows:

descriptorSetLayoutBindings[0] = {
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
.descriptorCount = 1,
.binding = 0,
.pImmutableSamplers = nullptr,
};
VkDescriptorSetLayoutCreateInfo descriptorSetLayoutCreateInfo = {
		.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
		.bindingCount = 1,
		.pBindings = descriptorSetLayoutBindings,
		.flags = 0,
		.pNext = nullptr,
};
vkCreateDescriptorSetLayout(device.device_, &descriptorSetLayoutCreateInfo, nullptr, &descriptorSetLayout);

// Pipeline Layout
VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo = {
		.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
		.setLayoutCount = 1,
		.pSetLayouts = &descriptorSetLayout,
		.pushConstantRangeCount = 0,
		.pPushConstantRanges = &pushConstantRange,
		.flags = 0,
		.pNext = nullptr,
};
vkCreatePipelineLayout(device.device_, &pipelineLayoutCreateInfo, nullptr, &computeShaderPipelineLayout);

I use the descriptor set layout made above for the descriptor set layouts needed when allocating the actual descriptor sets in the descriptor pool used for each swapchain image.

If you have any questions, let me know. If you have any jokes, let me laugh!

With all this code in action, the result is still the same, with the flickering, and only swapChainIndex = 0 hitting the validation layer, but when imageStore is dropped from the shader, both swapChainImages transition without a problem.

I still do not understand why swapChainIndex = 1 is not affected at all. The synchronization specs seem to agree with my implementation (this is to clear up my previous beliefs), and given that imageStore is the cause, it must be something to do with how I am building and binding descriptor sets, descriptor layouts, and pipeline layouts.

NewOpenGLUser · March 7, 2020, 12:46am

To work around what I believe to be an implementation bug, I use vkCmdCopyBufferToImage instead of dispatching a compute shader. The performance is the best I’ve achieved using only the display feature and a single texture transfer op to a swap chain image; almost 100 fps, tested on an Android mobile phone (I hope this is not the start of another forum entry… ahhaa).

However, I still do not know why the compute shader was unable to pack the image…

krOoze · March 7, 2020, 1:31am

Having good ol’ fashioned one man conversation here I see…

Submitted command buffer expects VkImage 0x1e (subresource: aspectMask 0x1 array layer 0, mip level 0) to be in layout VK_IMAGE_LAYOUT_GENERAL--instead, current layout is VK_IMAGE_LAYOUT_UNDEFINED.

Yeah, it is a pretty common error. There is some discussion at the layers repo about how to improve the message. It is possible you forgot your layout transition. But more often than not, this means the program is mis-synchronized. It means something like “it has to be in layout VK_IMAGE_LAYOUT_GENERAL, but it may or may not be VK_IMAGE_LAYOUT_UNDEFINED at the same time due to bad synchronization”.

Why 0?

NewOpenGLUser:

vkCmdPipelineBarrier(
commandBuffer[swapChainImageIndex],
VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
0, 0, nullptr, 0, &bufferMemoryBarrier, 1, &imageMemoryBarrier
);

What’s the deal with the bufferMemoryBarrier? The count is 0.

ALL_COMMANDS is overkill, but you know that. But anyway, if you use ALL_COMMANDS, you usually want MEMORY_READ | MEMORY_WRITE for access masks as well.

NewOpenGLUser:

    .dstAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
    .srcAccessMask = VK_ACCESS_MEMORY_READ_BIT,
   .pNext = nullptr,
};
vkCmdPipelineBarrier(
commandBuffer[swapChainImageIndex],
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
0, 0, nullptr, 0, nullptr, 1, &imageMemoryBarrier
);

This seems to be a similar mistake to the one in the first barrier. It feels like you have the .srcAccessMask and .dstAccessMask confused with each other.

Also overkill. Appropriate flags for presentation would be VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT and a .dstAccessMask of 0. The semaphore signal deals with this instead.

You should add your vkQueueSubmit code. It might also be missing a semaphore (or using the wrong one), or have a bad pWaitDstStageMask.

NewOpenGLUser · March 7, 2020, 7:40pm

yeaa… no. Still would not work.

Yeah, I sampled the wrong test environment for some parts of the code, where the correct changes were made.

Excusing the rest of my mistakes

When running the code with the appropriate flags you shared, the compute shader was not functional for that first swapchain image.

I then used the same setup using vkCmdCopyBufferToImage instead of vkCmdDispatch, except that I had to tweak a couple of the src/dst stage and access flags. Everything worked great!

I did not use any wait semaphores for the command buffer, only signals. I believe pWaitDstStageMask will only need to be assigned if I use a wait; but the Vulkan API I am using does not list a pWaitDstStageMask in the presentInfo struct, so I have completely nulled it from my code.

The render loop is as follows:

vkAcquireNextImageKHR(device.device_, swapChain, UINT64_MAX, 0, fence, &imageIndex);
vkWaitForFences(device.device_, 1, &fence, VK_TRUE, UINT64_MAX);
vkResetFences(device.device_, 1, &fence);
submitInfo = {
		.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
		.pCommandBuffers = &commandBuffer[imageIndex],
		.commandBufferCount = 1,
		.waitSemaphoreCount = 0,
		.pWaitSemaphores = nullptr,
		.pWaitDstStageMask = nullptr,
		.signalSemaphoreCount = 1,
		.pSignalSemaphores = &semaphore,
		.pNext = nullptr,
};
presentInfo = {
		.sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR,
		.waitSemaphoreCount = 1,
		.pWaitSemaphores = &semaphore,
		.pResults = nullptr,
		.swapchainCount = 1,
		.pSwapchains = &swapChain,
		.pImageIndices = &imageIndex,
		.pNext = nullptr,
};
vkQueueSubmit(device.device_, 1, &submitInfo, fence);
vkQueuePresent(queue, &presentInfo);
vkWaitForFences(device.device_, 1, &fence, VK_TRUE, UINT64_MAX);
vkResetFences(device.device_, 1, &fence);

krOoze · March 8, 2020, 3:50pm

Well, what is the presumably correct code, then?

Right, you seem to use fences…

system · October 19, 2021, 1:57pm

This topic was automatically closed 183 days after the last reply. New replies are no longer allowed.