I have a program that is supposed to run on the GPU. I measured the performance of one function, initEdgesX, on the CPU, which took roughly 150 ms for a 400³ array of data. I now want to parallelize it on the GPU, and I expected a large speedup because of the GPU's parallel nature.
However, when I run the code on the GPU, it is only roughly twice as fast as the CPU version. I'm using an OpenGL compute shader.
This is my code:
CPU:
ComputeShader computeShader("./AVISE_GPU/Shader/initEdgesX.cs");
computeShader.use();
Buffer scalarFieldBuffer(GL_SHADER_STORAGE_BUFFER, scalarFieldSizeTotal * 4, scalarField, GL_DYNAMIC_COPY);
scalarFieldBuffer.bindBufferBase(0);
Buffer heightmapBufferNeg(GL_SHADER_STORAGE_BUFFER, sizeEdgesX * sizeY * sizeZ * 4, nullptr, GL_DYNAMIC_COPY);
heightmapBufferNeg.bindBufferBase(1);
Buffer heightmapBufferPos(GL_SHADER_STORAGE_BUFFER, sizeEdgesX * sizeY * sizeZ * 4, nullptr, GL_DYNAMIC_COPY);
heightmapBufferPos.bindBufferBase(2);
Buffer heightmapIndexOffsetBufferNeg(GL_SHADER_STORAGE_BUFFER, sizeY * sizeZ * 4, nullptr, GL_DYNAMIC_COPY);
heightmapIndexOffsetBufferNeg.bindBufferBase(3);
Buffer heightmapIndexOffsetBufferPos(GL_SHADER_STORAGE_BUFFER, sizeY * sizeZ * 4, nullptr, GL_DYNAMIC_COPY);
heightmapIndexOffsetBufferPos.bindBufferBase(4);
unsigned int testCounter = 0;
Buffer atomicCounter(GL_ATOMIC_COUNTER_BUFFER, 4, &testCounter, GL_DYNAMIC_COPY);
atomicCounter.bindBufferBase(5);
computeShader.setUInt("sizeX", sizeX);
computeShader.setUInt("sizeY", sizeY);
computeShader.setUInt("sizeZ", sizeZ);
computeShader.setUInt("sizeEdgesX", sizeEdgesX);
glfwSetTime(0.0);
/*for (int x = 0; x < sizeX - 1; ++x) {
computeShader.setUInt("currentX", x);
glDispatchCompute(1, ceil((float)sizeY / 8), ceil((float)sizeZ / 8));
}*/
glDispatchCompute(1, ceil((float)sizeY / 8), ceil((float)sizeZ / 8));
glFinish();
std::cout << glfwGetTime() << std::endl;
And the shader:
#version 450 core

// Work-group shape: a single invocation along x, an 8 x 8 tile along y and z.
const int localSizeX = 1;
const int localSizeY = 8;
const int localSizeZ = 8;
layout(local_size_x = localSizeX, local_size_y = localSizeY, local_size_z = localSizeZ) in;

// Extents of the scalar field along each axis.
uniform uint sizeX;
uniform uint sizeY;
uniform uint sizeZ;
// Declared for a per-slice dispatch variant; not read by main().
uniform uint currentX;
// Number of heightmap slots reserved per (y, z) column.
uniform uint sizeEdgesX;

// Counter incremented by the invocations that pass the bounds checks in main().
layout(binding = 5) uniform atomic_uint testCounter;

// Input scalar field, addressed through getScalarIndex().
layout(std430, binding = 0) readonly buffer scalarField
{
    float density[];
} inputScalarField;

// x positions stored for negative -> non-negative sign changes.
layout(std430, binding = 1) buffer heightmapBuffer1
{
    uint height[];
} heightmapZYNeg;

// x positions stored for non-negative -> negative sign changes.
layout(std430, binding = 2) buffer heightmapBuffer2
{
    uint height[];
} heightmapZYPos;

// Per-column count of entries written to heightmapZYNeg.
layout(std430, binding = 3) buffer heightmapIndexOffsetBuffer1
{
    uint indexOffset[];
} heightmapIndexOffsetZYNeg;

// Per-column count of entries written to heightmapZYPos.
layout(std430, binding = 4) buffer heightmapIndexOffsetBuffer2
{
    uint indexOffset[];
} heightmapIndexOffsetZYPos;
// Flattens an (x, y, z) sample coordinate into an index into the scalar field.
// x is the fastest-varying axis, followed by y, then z; the row and slice
// strides come from the sizeX and sizeY uniforms.
uint getScalarIndex(uint x, uint y, uint z)
{
    uint rowStart = (z * sizeY + y) * sizeX;
    return rowStart + x;
}
// Flattens a (widthIndex, heightIndex, depthIndex) coordinate into an index
// into a heightmap array laid out as height x width x depth, with depthIndex
// varying fastest.
//   width - number of entries along the width axis
//   depth - number of entries along the depth axis
uint getHeightmapIndex(uint widthIndex, uint heightIndex, uint depthIndex, uint width, uint depth)
{
    uint columnStart = (heightIndex * width + widthIndex) * depth;
    return columnStart + depthIndex;
}
// Entry point: each invocation owns one (y, z) column of the scalar field,
// walks it along x, and records every x at which the sign of the density
// changes between sample x and sample x + 1.
//   negative -> non-negative crossings go to heightmapZYNeg,
//   non-negative -> negative crossings go to heightmapZYPos,
// and the number of entries written per column is stored in the matching
// indexOffset buffer.
//
// NOTE(review): neighbouring invocations differ in y/z while x is the
// fastest-varying axis of the scalar field (see getScalarIndex), so the
// addresses they read in the same loop iteration are sizeX elements apart.
// That access pattern is a likely reason for the modest speedup; changing it
// needs a different thread mapping on both the shader and the host side.
// NOTE(review): nothing limits the number of crossings per column to
// sizeEdgesX; a column with more crossings writes into the next column's
// slots. Confirm the bound holds for the input data.
void main()
{
    // gl_GlobalInvocationID == gl_WorkGroupID * work-group size + gl_LocalInvocationID.
    uint currentYIndex = gl_GlobalInvocationID.y;
    uint currentZIndex = gl_GlobalInvocationID.z;

    // Valid indices are 0 .. size - 1, so an index equal to the size must be
    // rejected too; otherwise padding invocations of partial work groups write
    // out of range or into another column's slot.
    if (currentYIndex >= sizeY || currentZIndex >= sizeZ)
    {
        return;
    }

    uint heightmapIndexOffsetIndex = currentYIndex * sizeZ + currentZIndex;
    atomicCounterIncrement(testCounter);

    // Each column is touched by exactly one invocation, so the running entry
    // counts are kept in locals and stored once at the end instead of being
    // read and rewritten in buffer memory for every crossing.
    uint negativeCount = 0u;
    uint positiveCount = 0u;

    uint rowStart = getScalarIndex(0u, currentYIndex, currentZIndex);
    // Carry the right-hand sample over to the next iteration so every sample
    // is loaded once instead of twice.
    float previous = (sizeX > 0u) ? inputScalarField.density[rowStart] : 0.0;

    // "x + 1u < sizeX" avoids the unsigned wrap-around of "sizeX - 1" when sizeX is 0.
    for (uint x = 0u; x + 1u < sizeX; ++x)
    {
        float next = inputScalarField.density[rowStart + x + 1u];
        if (previous < 0.0 && next >= 0.0)
        {
            uint arrayIndex = getHeightmapIndex(currentZIndex, currentYIndex, negativeCount, sizeZ, sizeEdgesX);
            heightmapZYNeg.height[arrayIndex] = x;
            ++negativeCount;
        }
        else if (previous >= 0.0 && next < 0.0)
        {
            uint arrayIndex = getHeightmapIndex(currentZIndex, currentYIndex, positiveCount, sizeZ, sizeEdgesX);
            heightmapZYPos.height[arrayIndex] = x;
            ++positiveCount;
        }
        previous = next;
    }

    heightmapIndexOffsetZYNeg.indexOffset[heightmapIndexOffsetIndex] = negativeCount;
    heightmapIndexOffsetZYPos.indexOffset[heightmapIndexOffsetIndex] = positiveCount;
}