OpenGL searching a way to not render distant objects

Besseo · October 12, 2022, 1:44pm

Hi,

Last week I jumped back into OpenGL and wanted to work on a project I never finished. The project is about displaying a terrain generated with Perlin noise and displaying some objects on top of it.

When I was working on this project I created a C++ API that goes through all my objects (using the vertex array) and then, depending on their distance from the camera, displays them or not. What I did was use GL_DYNAMIC_DRAW so that, before drawing, I can refill my buffers with only the objects I want to draw (I also use this mechanism to move my objects).

But I figured out that testing the distance from the camera for each object of my vertex array is very slow and has a big impact on the runtime (CPU side).
However, as I have a lot of objects to render, if I do not do that it has a big impact on the GPU side.

So I was wondering whether there is a way to display only the objects that are near the camera.
The other limitation is that the terrain should be rendered even far from the camera.

Camera :

	glm::mat4 view = glm::lookAt(glm::vec3(x, y, z), glm::vec3(x + directionx, y + directiony, z + directionz), glm::vec3(0, 1, 0));
	glm::mat4 proj = glm::perspective(yfov, xfov, znear,zfar);
	glUniformMatrix4fv(get_uni_loc(programId, "projection"), 1, false, glm::value_ptr(proj));
	glUniformMatrix4fv(get_uni_loc(programId, "modelview"), 1, false, glm::value_ptr(view));

Vertex shader :

#version 330 core
// Per-vertex inputs.
layout (location = 0) in vec3 positions;
layout (location = 1) in vec3 normals;

// Outputs passed on to the next shader stage.
out vec3 coord3d;
out vec3 realCoord3d;
flat out vec3 normal;
out vec3 light;
// NOTE(review): tex_coord is declared here but the main() in this listing never writes it - confirm.
out vec2 tex_coord;

// Uniforms.
uniform mat4 modelview;
uniform mat4 projection;
uniform vec4 translation;
uniform vec3 rotation;   // components are fed to cos()/sin() below, i.e. used as angles

// Light position forwarded to the next stage through the `light` output.
vec3 light_pos = vec3(100.0f, 200.0f, 100.0f);

// Per-axis rotation matrices built from the `rotation` uniform.
// mat4 constructors consume their arguments column by column.
// The stray ';' that followed rotationy and rotationz was removed: an empty declaration
// at global scope may be rejected by strict "#version 330" compilers.
mat4 rotationx = mat4(1, 0, 0, 0,
                      0, cos(rotation.x), -sin(rotation.x), 0,
                      0, sin(rotation.x), cos(rotation.x), 0,
                      0, 0, 0, 1);

mat4 rotationy = mat4(cos(rotation.y), 0, sin(rotation.y), 0,
                      0, 1, 0, 0,
                      -sin(rotation.y), 0, cos(rotation.y), 0,
                      0, 0, 0, 1);

mat4 rotationz = mat4(cos(rotation.z), -sin(rotation.z), 0, 0,
                      sin(rotation.z), cos(rotation.z), 0, 0,
                      0, 0, 1, 0,
                      0, 0, 0, 1);


// Vertex entry point: rotates and translates the vertex, applies the modelview and
// projection matrices, and forwards position, normal and light position to the next stage.
void main(){
	// The untransformed input position is forwarded through both outputs.
	coord3d = positions;
	realCoord3d = positions;

	// Rotate (vector on the left of each matrix), then add the translation, then apply modelview.
	// NOTE(review): translation is added as a full vec4, so translation.w is added to the
	// position's w of 1.0 - confirm the application always passes w = 0.
	vec4 p = modelview * (vec4(positions,1.0f)*rotationx*rotationy*rotationz + translation);

	// The normal is forwarded untransformed.
	normal = normals;
	light = light_pos;

	gl_Position = projection * p;
}   // closing brace was missing in the original listing

The way I render :

// Creates the vertex and index buffer objects of a billboard group and reserves their storage.
// No data is uploaded here (the data pointer is nullptr); updateBillboard fills both buffers.
void Renderer::initBillboard(Billboard& object) {

	// Vertex array buffer generation: 36 floats are reserved per billboard.
	// GL_DYNAMIC_DRAW is used because updateBillboard rewrites the whole buffer with
	// glBufferSubData on every renderBillboard call; GL_STATIC_DRAW would advertise
	// data that is written once.
	glGenBuffers(1, &object.vertexBuffer);
	glBindBuffer(GL_ARRAY_BUFFER, object.vertexBuffer);
	glBufferData(GL_ARRAY_BUFFER, (object.nbBillboards)* 36 * sizeof(float), nullptr, GL_DYNAMIC_DRAW);

	// Index array buffer generation: 6 indices are reserved per billboard.
	// Same usage hint, for the same reason.
	glGenBuffers(1, &object.indexBuffer);
	glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, object.indexBuffer);
	glBufferData(GL_ELEMENT_ARRAY_BUFFER, object.nbBillboards * 6 * sizeof(int), nullptr, GL_DYNAMIC_DRAW);

}
// Draws every billboard of the group with a single indexed draw call.
//
// object    - billboard group; its CPU-side arrays are re-uploaded before drawing.
// programId - shader program to use and whose uniforms are set.
// texture   - value written to the "texture" uniform.
void Renderer::renderBillboard(Billboard& object, GLuint programId, GLuint texture) {

	glDisable(GL_POLYGON_OFFSET_FILL);
	PRINT_OPENGL_ERROR();
	glUseProgram(programId);
	PRINT_OPENGL_ERROR();
	// Refresh the GPU buffers from the CPU-side arrays before drawing.
	updateBillboard(object);
	PRINT_OPENGL_ERROR();

	//Buffer binding
	// NOTE(review): no vertex array object is bound here; a core-profile context
	// requires one - confirm the caller binds a VAO.
	glBindBuffer(GL_ARRAY_BUFFER, object.vertexBuffer);
	glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, object.indexBuffer);

	//Attrib pointers: three 3-float attributes interleaved with a stride of 9 floats
	glVertexAttribPointer(0, 3, GL_FLOAT, GL_FALSE, 9 * sizeof(float), nullptr);
	glEnableVertexAttribArray(0);
	glVertexAttribPointer(1, 3, GL_FLOAT, GL_FALSE, 9 * sizeof(float), buffer_offset(3 * sizeof(float)));
	glEnableVertexAttribArray(1);
	glVertexAttribPointer(2, 3, GL_FLOAT, GL_FALSE, 9 * sizeof(float), buffer_offset(6 * sizeof(float)));
	glEnableVertexAttribArray(2);

	//Uniform
	PRINT_OPENGL_ERROR();
	const float c[4]{ 0.0f,0.0f,0.0f,0.0f };
	glUniform4fv(get_uni_loc(programId, "color"), 1, c);
	glUniform3fv(get_uni_loc(programId, "rotation"), 1, object.rotation);
	glUniform4fv(get_uni_loc(programId, "translation"), 1, object.translate);
	// glGetUniformLocation returns a signed GLint (-1 when the uniform is not active),
	// so the location is kept signed instead of being stored in a GLuint.
	const GLint text_id = glGetUniformLocation(programId, "texture");
	// NOTE(review): a sampler uniform expects a texture unit index, not a texture
	// object name - confirm what `texture` holds.
	glUniform1i(text_id, texture);
	PRINT_OPENGL_ERROR();

	//Draw: 6 indices per billboard
	glDrawElements(GL_TRIANGLES, 6 * object.nbBillboards, GL_UNSIGNED_INT, nullptr);

	// Leave the attribute arrays disabled for whatever is drawn next.
	glDisableVertexAttribArray(0);
	glDisableVertexAttribArray(1);
	glDisableVertexAttribArray(2);

}

// Copies the CPU-side vertex and index arrays of a billboard group into its GPU buffers.
// Both copies start at offset 0; the sizes are nbBillboards * 36 floats and
// nbBillboards * 6 ints respectively.
void Renderer::updateBillboard(Billboard& object) {

	// Byte sizes of the two uploads.
	const auto vertexBytes = (object.nbBillboards) * 36 * sizeof(float);
	const auto indexBytes = object.nbBillboards * 6 * sizeof(int);

	// Vertex data
	glBindBuffer(GL_ARRAY_BUFFER, object.vertexBuffer);
	glBufferSubData(GL_ARRAY_BUFFER, 0, vertexBytes, &object.vertexArray[0]);

	// Index data
	glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, object.indexBuffer);
	glBufferSubData(GL_ELEMENT_ARRAY_BUFFER, 0, indexBytes, &object.indexArray[0]);
}

//Terrains 

// Creates the vertex and index buffer objects of a terrain and uploads its
// CPU-side arrays into them straight away.
void Renderer::initTerrain(Terrain& object) {

	// Byte sizes: 24 floats and 6 indices per terrain square.
	const auto vertexBytes = object.nbSquares * 24 * sizeof(float);
	const auto indexBytes = object.nbSquares * 6 * sizeof(int);

	// Vertex buffer: create, bind, upload.
	glGenBuffers(1, &object.vertexBuffer);
	glBindBuffer(GL_ARRAY_BUFFER, object.vertexBuffer);
	glBufferData(GL_ARRAY_BUFFER, vertexBytes, &object.vertexArray[0], GL_DYNAMIC_DRAW);

	// Index buffer: create, bind, upload.
	glGenBuffers(1, &object.indexBuffer);
	glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, object.indexBuffer);
	glBufferData(GL_ELEMENT_ARRAY_BUFFER, indexBytes, &object.indexArray[0], GL_DYNAMIC_DRAW);

}
// Draws the whole terrain with a single indexed draw call.
//
// object    - terrain whose vertex/index buffers, color and alpha are used.
// programId - shader program to use and whose uniforms are set.
// texture   - value written to the "texture" uniform.
void Renderer::renderTerrain(Terrain& object, GLuint programId,GLuint texture) {

	glDisable(GL_POLYGON_OFFSET_FILL);
	glUseProgram(programId);

	//Buffer binding
	// NOTE(review): no vertex array object is bound here; a core-profile context
	// requires one - confirm the caller binds a VAO.
	glBindBuffer(GL_ARRAY_BUFFER, object.vertexBuffer);
	glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, object.indexBuffer);

	//Attrib pointers: two 3-float attributes interleaved with a stride of 6 floats
	glVertexAttribPointer(0, 3, GL_FLOAT, GL_FALSE, 6 * sizeof(float), nullptr);
	glEnableVertexAttribArray(0);

	glVertexAttribPointer(1, 3, GL_FLOAT, GL_FALSE, 6 * sizeof(float), buffer_offset(3 * sizeof(float)));
	glEnableVertexAttribArray(1);

	//Uniform: the terrain color with its alpha as the fourth component
	const float c[4]{ object.color.x,object.color.y,object.color.z,object.alpha };
	glUniform4fv(get_uni_loc(programId, "color"), 1, c);
	// glGetUniformLocation returns a signed GLint (-1 when the uniform is not active),
	// so the location is kept signed instead of being stored in a GLuint.
	const GLint text_id = glGetUniformLocation(programId, "texture");
	// NOTE(review): a sampler uniform expects a texture unit index, not a texture
	// object name - confirm what `texture` holds.
	glUniform1i(text_id, texture);

	//Draw: 6 indices per terrain square
	glDrawElements(GL_TRIANGLES, 6 * object.nbSquares, GL_UNSIGNED_INT, nullptr);

}

Thank you in advance!

Alfonse_Reinheart · October 12, 2022, 2:11pm

Before getting into viewport culling or even distance culling, you need to resolve all of the bad things your code is doing (depending on how many objects constitutes “a lot of objects”).

There are techniques for updating buffer data every frame. This is not one of them. It’s random-access per-object rather than sequential. It makes no effort to avoid GPU/CPU synchronization with multiple buffering or invalidation. You’re giving each “billboard” object a separate buffer object instead of putting them all into a single buffer. Etc.

It’s just a litany of bad practices from a performance perspective. Just look at your index buffer. Are you not using the exact same index data for every billboard? Oh sure, some “billboards” have more quads to render, but that’s a matter of the count you pass to glDrawElements. They’re all using the same index data. You could just have one “big” buffer with a bunch of quad indices in them which they all share.

You need to resolve all of these performance issues regardless of how you choose not to render distant objects.

Besseo · October 12, 2022, 2:35pm

In fact I made a mistake and my code is not clear.
The billboard object contains all the vertices for one type of billboard; for example, for the trees, the billboard object contains and renders the vertices of every tree.
In fact, what I called a billboard object should really be called groupOfBillboards, as it contains a lot of them. I made this object in order to be able to create different types of billboards.

In fact, for all my trees, for example, I only make one glDrawElements call.
I’m still learning, and I’m not sure I understand what is wrong. Can you help me?

Besseo · October 12, 2022, 3:07pm

I have 450 000 billboards to draw and they are split in 8 different billboard objects.
When I run it with my 3060 I have 0 issues (60 frames per seconds) ! But when I run it with my laptop (cheap nvidia card) I get like 30 frames per second at best.

Those 450 000 billboards are always rendered, as I don’t have a mechanism to render only the objects near the camera. I guess that explains the performance issue.

Dark_Photon · October 13, 2022, 2:51pm

I wouldn’t assume. Profile to determine what your largest bottleneck is.

Are you CPU bound? GPU bound? And bound on what specifically? You have no idea what to optimize until you know what your largest bottleneck is. And it sounds like you’re running with VSync ON. Disable it for profiling.

If/when you establish that you are definitely not CPU limited (reworking your rendering to ensure this; see tips from Alfonse above), then figure out what on the GPU you’re bound by.

When I see numbers of “450,000 billboards”, the GPU-side issues that come to mind are 1) vertex transforms, and 2) fill (fragment shader executions, framebuffer pixels written, blend cost (framebuffer pixels read), alpha-test cost/tradeoffs, etc.).

Good culling (already discussed) will cut back on some of this cost. But even with that, perf may not be sufficient for your needs on a low-end laptop GPU. You’ll just have to see.

As a quick kick-the-tires test, on your laptop, try rendering the same field-of-view to a 4X larger window (2x2 more pixels), and/or a 4X smaller window (0.5 x 0.5 pixels). What do you see? About the same frame rate with VSync OFF? If so, you might be vertex transform limited. If not, you might be fill limited. Consider techniques to reduce this (e.g. billboard thinning, tile-based rendering, etc.)

mhagain · October 13, 2022, 10:58pm

450k billboards says “particle system” to me. Now, 450k can run fast or can run slow, even on low end hardware, but it’s heavily dependent on so many factors, including information you haven’t yet provided.

A valid approach to culling particles is what I call “hang 'em all and let God sort 'em out” - in other words, the built-in culling on the GPU can be pretty efficient, and for simple quads is often more efficient than per-quad CPU culling, which can come with knock-ons such as breaking batches, needing to update buffers, etc.

Speaking of updating buffers, if you can run your particle physics entirely on the GPU, it can be a win. Particle physics, if it’s simple enough that it doesn’t need to interact with other objects, but is just basic velocity and gravity, can be calculated each frame on the GPU from initial parameters, and can help you keep the data static, avoiding updating buffers.

If you must cull, you probably have a concept of particle emitters, each of which spawns multiple particles. Culling per emitter can work well, and this kind of coarse CPU culling, leaving the GPU to handle finer culling, is the way to go.

There are lots of online articles about the respective benefits or disadvantages of geometry shaders vs instancing vs just sending 4 vertices per quad for this kind of drawing. Anecdotally, I’ve had both good and bad results from all of these approaches. I’ve built million particle systems that run well on Intel integrateds. A lot of it is going to depend on what other work your GPU is doing in each frame, and I’d advise to trust nobody, benchmark them all, and pick the approach that gives you the best.

That leaves you with fillrate and blending. It’s just an unfortunate fact that large, overlapping alpha-blended quads are just going to run slower than the same number of smaller, non-overlapping, non-blended quads. The GPU just has to do more work. You can get benefit from simplifying your fragment shader, using smaller textures, or even avoiding alpha blending if you can get away with it.