Improving my occlusion culling code

EDIT:
I fixed the bug which caused flickering of the render objects.

So now I would like to ask you if you see any improvement potential?
Updating the VBO seems to take a bit long (the vboByteBuffer.putFloat calls). It has to convert every float to an int and then write the four bytes of that int into the VBO one at a time. Is there maybe a way to put a float into the VBO directly?

Original Question:
Hello,
I’m trying to implement occlusion culling using an ssbo and reading the results in the next frame.
Only render objects that are inside the frustum get occlusion culled and rendered.
The problem I currently have is that when some render objects move into or out of the frustum, some of the visible render objects start flickering. But when all render objects stay inside (or all stay outside) the frustum, there is no flickering at all.
I believe the problem is that the buffer access isn’t synchronized correctly and/or that the memory access is also not synchronized correctly.

Thank you in advance for your help.
If you need the initialization code or the shaders, just ask.

This is my code:

	// Occlusion-culling pass: uploads one bounding box per frustum-visible render object,
	// draws them as points with colour/depth writes disabled, and applies the visibility
	// bytes read back from the SSBO to each object's culled flag.

	// Remember the currently bound program so it can be restored at the very end.
	int shaderProgram = glGetInteger(GL_CURRENT_PROGRAM);
	// Pick this frame's slot in the multi-buffered resources; frameNumber is post-incremented.
	int bufferNumber = frameNumber++ % buffering;

	// Save the pieces of GL state this pass changes (restored after the fence is issued).
	glGetBooleanv(GL_COLOR_WRITEMASK, COLOR_MASK_BUFFER);
	boolean depthMaskEnabled = glGetBoolean(GL_DEPTH_WRITEMASK);
	boolean depthTestEnabled = glGetBoolean(GL_DEPTH_TEST);
	boolean texture2dEnabled = glGetBoolean(GL_TEXTURE_2D);
	// No colour or depth writes, depth testing on, texturing off while the boxes are drawn.
	glColorMask(false, false, false, false);
	glDepthMask(false);
	glEnable(GL_DEPTH_TEST);
	glDisable(GL_TEXTURE_2D);

	// Activate the culling shader and bind this slot's UBO (binding 0) and SSBO (binding 1).
	shader.use();
	glBindBufferBase(GL_UNIFORM_BUFFER, 0, uboBuffer.buffers[bufferNumber]);
	glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, ssboBuffer1.buffers[bufferNumber]);

	// Wait for the fence issued the last time this slot was used (absent on the first use).
	// The timeout argument is in nanoseconds, so this waits at most 100 microseconds.
	// NOTE(review): flags is 0, i.e. GL_SYNC_FLUSH_COMMANDS_BIT is not set -- confirm the
	// commands preceding the fence have been flushed, otherwise the wait may just time out.
	if (fences[bufferNumber] != 0) {
		int result = glClientWaitSync(fences[bufferNumber], 0, 100_000);
		if (result == GL_TIMEOUT_EXPIRED || result == GL_WAIT_FAILED) {
			// Only reported; execution continues even though the GPU may not be finished.
			System.err.println("waiting failed: " + result);
		}
		// The fence is deleted whether or not the wait succeeded.
		glDeleteSync(fences[bufferNumber]);
	}

	// While debug > 0 it is counted down and the readback is skipped; afterwards the
	// results buffer of this slot is copied into the client-side ssboByteBuffer.
	// NOTE(review): the follow-up in this thread reports this as the cause of the flickering:
	// the slot of the previous frame should be read here instead of bufferNumber.
	if (debug > 0) {
		debug--;
	} else {
		glGetNamedBufferSubData(ssboBuffer2.buffers[bufferNumber], 0, ssboByteBuffer);
	}

	// Upload the combined matrix: projection * pose * translation by (-x, -y, -z).
	// The mapping is unsynchronized, so the GL does not wait for pending use of the buffer.
	// NOTE(review): (x, y, z) is presumably the camera position -- confirm with the caller.
	ByteBuffer uboByteBuffer = glMapNamedBufferRange(uboBuffer.buffers[bufferNumber], 0, uboBuffer.size, GL_MAP_WRITE_BIT | GL_MAP_UNSYNCHRONIZED_BIT);
	Matrix4f projectionViewMatrix = projectionIn.copy();
	projectionViewMatrix.multiply(matrixStackIn.last().pose());
	MatrixHelper.multiplyWithTranslationMatrix(projectionViewMatrix, (float) -x, (float) -y, (float) -z);
	MatrixHelper.store(projectionViewMatrix, uboByteBuffer, true);
	glUnmapNamedBuffer(uboBuffer.buffers[bufferNumber]);

	// Frustum test on the CPU; only objects that pass are written to the vertex buffer.
	FrustumCulling cullingTest = new FrustumCulling(projectionViewMatrix);
	int entityCount = 0;
	ByteBuffer vboByteBuffer = glMapNamedBufferRange(vboBuffer.buffers[bufferNumber], 0, vboBuffer.size, GL_MAP_WRITE_BIT | GL_MAP_UNSYNCHRONIZED_BIT);
	for (RenderObject renderObject : renderObjects) {
		AABB aabb = renderObject.getAABB();
		if (!cullingTest.isAABBInFrustum(aabb)) {
			// Skipped objects keep their previous culled flag and previous culling ID.
			continue;
		}
		// Byte offset of this record: 7 four-byte values (28 bytes) per visible object.
		int i = (entityCount * 7) << 2;
		// Box minimum and maximum, each grown by 0.5 on every axis, written at absolute offsets.
		vboByteBuffer.putFloat(i, (float) (aabb.minX - 0.5D));
		vboByteBuffer.putFloat(i + 4, (float) (aabb.minY - 0.5D));
		vboByteBuffer.putFloat(i + 8, (float) (aabb.minZ - 0.5D));
		vboByteBuffer.putFloat(i + 12, (float) (aabb.maxX + 0.5D));
		vboByteBuffer.putFloat(i + 16, (float) (aabb.maxY + 0.5D));
		vboByteBuffer.putFloat(i + 20, (float) (aabb.maxZ + 0.5D));
		// Index of this record, passed to the shader as an integer attribute.
		vboByteBuffer.putInt(i + 24, entityCount);
		// Looks up the ID this object was given the last time it was submitted (4 bytes per
		// entry, only the first byte is inspected); a zero byte marks the object as culled.
		renderObject.setIsCulled(ssboByteBuffer.get(renderObject.getLastCullingID() << 2) == 0);
		// Remember the ID used this frame for the next lookup.
		renderObject.setLastCullingID(entityCount);
		entityCount++;
	}
	glUnmapNamedBuffer(vboBuffer.buffers[bufferNumber]);

	// Vertex layout, stride 28 bytes: attribute 0 = 3 floats at offset 0 (box min),
	// attribute 1 = 3 floats at offset 12 (box max), attribute 2 = 1 int at offset 24 (ID).
	glBindBuffer(GL_ARRAY_BUFFER, vboBuffer.buffers[bufferNumber]);
	glEnableVertexAttribArray(0);
	glEnableVertexAttribArray(1);
	glEnableVertexAttribArray(2);
	glVertexAttribPointer(0, 3, GL_FLOAT, false, 28, 0);
	glVertexAttribPointer(1, 3, GL_FLOAT, false, 28, 12);
	glVertexAttribIPointer(2, 1, GL_INT, 28, 24);
	// One point per visible object.
	// NOTE(review): the shaders are not shown; presumably they expand each point into a box
	// and write visibility into the SSBO -- confirm.
	glDrawArrays(GL_POINTS, 0, entityCount);
	glDisableVertexAttribArray(2);
	glDisableVertexAttribArray(1);
	glDisableVertexAttribArray(0);
	glBindBuffer(GL_ARRAY_BUFFER, 0);

	// Once the debug countdown has expired: copy this frame's results into the readback
	// buffer, then clear the results buffer (a null data pointer requests a fill with zeros).
	if (debug <= 0) {
		glCopyNamedBufferSubData(ssboBuffer1.buffers[bufferNumber], ssboBuffer2.buffers[bufferNumber], 0, 0, ssboBuffer1.size);
		glClearNamedBufferData(ssboBuffer1.buffers[bufferNumber], GL_R8I, GL_RED, GL_BYTE, (ByteBuffer) null);
	}

	// Fence that the next use of this slot waits on before touching its buffers.
	fences[bufferNumber] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);

	// Restore the GL state captured at the top.
	if (texture2dEnabled) {
		glEnable(GL_TEXTURE_2D);
	} else {
		glDisable(GL_TEXTURE_2D);
	}
	if (depthTestEnabled) {
		glEnable(GL_DEPTH_TEST);
	} else {
		glDisable(GL_DEPTH_TEST);
	}
	glDepthMask(depthMaskEnabled);
	glColorMask(COLOR_MASK_BUFFER.get(0) == 1, COLOR_MASK_BUFFER.get(1) == 1, COLOR_MASK_BUFFER.get(2) == 1, COLOR_MASK_BUFFER.get(3) == 1);

	// Re-bind the program that was active before this pass.
	glUseProgram(shaderProgram);

I found the problem. I was reading the wrong buffer. Instead it should be:

	glGetNamedBufferSubData(ssboBuffer2.buffers[bufferNumber - 1 >= 0 ? bufferNumber - 1 : buffering - 1], 0, ssboByteBuffer);

So now that it’s fixed do you see any improvement potential?
Updating the VBO seems to take a bit long (the vboByteBuffer.putFloat calls). It has to convert every float to an int and then write the four bytes of that int into the VBO one at a time. Is there maybe a way to put a float into the VBO directly?

That’s basically a Java issue. I would expect putFloat to be essentially just:
*(float *)(this->base + offset) = value
But the issue is likely to be the Java method call overhead. It might be more efficient to store the calculated values in a float[6] then copy that in one call (with FloatBuffer.put()). Or even create one large float[] and copy that in one go (using a separate VBO for the entity counts).

ByteBuffer should be specially optimized by the JIT. Perhaps it hadn’t warmed up yet?