Deferred Shading is very slow

My OpenGL application (written in Java using LWJGL) works fine when rendering the scene normally. Now I have implemented deferred rendering and it’s running at only about 30 fps. I haven’t got many models in my 3D scene.

Sadly it is very difficult to debug these performance issues because I’m not able to measure the time of each method call in code, since the calls aren’t executed immediately. Also I’m not able to use gDEBugger with my Java application (or only with great difficulty).

Rendering code:

glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
glColor4f(1.0f, 1.0f, 1.0f, 1.0f);
GraphicsData.camera.setView();

glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);

multipleRenderTarget.start();

drawEntities();

glBindTexture(GL_TEXTURE_2D, 0);

glUseProgram(0);

multipleRenderTarget.stop();

if(1==0) //debug switch: change to 1==1 to display the raw G-buffer textures instead of the lit result
{
multipleRenderTarget.showTexture(0, 512, 384, 0, 0);
multipleRenderTarget.showTexture(1, 512, 384, 512, 0);
multipleRenderTarget.showTexture(2, 512, 384, 0, 384);
}
else
{
DeferredRendering.render();
}

The object that manages the framebuffer object (called multipleRenderTarget):

private int fbo;
private int diffuseRT;
private int positionRT;
private int normalsRT;
private int depthBuffer;

private int diffuseTexture;
private int positionTexture;
private int normalsTexture;

//Constructor
public FBORenderTexture() throws Exception
{

fbo = glGenFramebuffers();
diffuseRT = glGenRenderbuffers();
positionRT = glGenRenderbuffers();
normalsRT = glGenRenderbuffers();
depthBuffer = glGenRenderbuffers();

glBindFramebuffer(GL_FRAMEBUFFER, fbo);

glBindRenderbuffer(GL_RENDERBUFFER, diffuseRT);
glRenderbufferStorage(GL_RENDERBUFFER, GL_RGBA, GraphicsData.WIDTH, GraphicsData.HEIGHT);
glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER, diffuseRT);

glBindRenderbuffer(GL_RENDERBUFFER, positionRT);
glRenderbufferStorage(GL_RENDERBUFFER, GL_RGBA32F, GraphicsData.WIDTH, GraphicsData.HEIGHT);
glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT1, GL_RENDERBUFFER, positionRT);

glBindRenderbuffer(GL_RENDERBUFFER, normalsRT);
glRenderbufferStorage(GL_RENDERBUFFER, GL_RGBA16F, GraphicsData.WIDTH, GraphicsData.HEIGHT);
glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT2, GL_RENDERBUFFER, normalsRT);

glBindRenderbuffer(GL_RENDERBUFFER, depthBuffer);
glRenderbufferStorage(GL_RENDERBUFFER, GL_DEPTH_COMPONENT24, GraphicsData.WIDTH, GraphicsData.HEIGHT);
glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_RENDERBUFFER, depthBuffer);

diffuseTexture = glGenTextures();
glBindTexture(GL_TEXTURE_2D, diffuseTexture);
glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, GraphicsData.WIDTH, GraphicsData.HEIGHT, 0, GL_RGBA, GL_UNSIGNED_BYTE, (ByteBuffer)null);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, diffuseTexture, 0);

positionTexture = glGenTextures();
glBindTexture(GL_TEXTURE_2D, positionTexture);
glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA32F, GraphicsData.WIDTH, GraphicsData.HEIGHT, 0, GL_RGBA, GL_UNSIGNED_BYTE, (ByteBuffer)null);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT1, GL_TEXTURE_2D, positionTexture, 0);

normalsTexture = glGenTextures();
glBindTexture(GL_TEXTURE_2D, normalsTexture);
glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, GraphicsData.WIDTH, GraphicsData.HEIGHT, 0, GL_RGBA, GL_UNSIGNED_BYTE, (ByteBuffer)null);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT2, GL_TEXTURE_2D, normalsTexture, 0);

if(glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE)
throw new Exception("Can't initialize FBO!");

glBindFramebuffer(GL_FRAMEBUFFER, 0);
}

public void start()
{

glBindFramebuffer(GL_FRAMEBUFFER, fbo);
glPushAttrib(GL_VIEWPORT_BIT);
glViewport(0, 0, GraphicsData.WIDTH, GraphicsData.HEIGHT);

glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
glClearColor(0f, 0f, 0f, 1f);

glActiveTexture(GL_TEXTURE0);
glEnable(GL_TEXTURE_2D);

IntBuffer buffers = BufferUtils.createIntBuffer(3);
buffers.put(0, GL_COLOR_ATTACHMENT0);
buffers.put(1, GL_COLOR_ATTACHMENT1);
buffers.put(2, GL_COLOR_ATTACHMENT2);

glDrawBuffers(buffers);

glUseProgram(shaderEngine.mrt);
shaderEngine.loadMatrix(glGetUniformLocation(shaderEngine.mrt, "projectionMatrix"), GraphicsData.camera.projectionMatrix);
shaderEngine.loadMatrix(glGetUniformLocation(shaderEngine.mrt, "viewMatrix"), GraphicsData.camera.viewMatrix);
}

public void stop()
{

glBindFramebuffer(GL_FRAMEBUFFER, 0);
glPopAttrib();

glUseProgram(0);
}

Final rendering code:

//Projection setup
glMatrixMode(GL_PROJECTION);
glPushMatrix();
glLoadIdentity();
glOrtho(0,GraphicsData.WIDTH,0,GraphicsData.HEIGHT,0.1f,2);

//Model setup
glMatrixMode(GL_MODELVIEW);
glPushMatrix();

glUseProgram(shaderEngine.rendering);
glUniform3f(glGetUniformLocation(shaderEngine.rendering, "cameraPosition"), GraphicsData.camera.x, GraphicsData.camera.y, GraphicsData.camera.z);

glActiveTexture(GL_TEXTURE0);
glEnable(GL_TEXTURE_2D);
glBindTexture(GL_TEXTURE_2D, renderEngine.multipleRenderTarget.getDiffuseTexture());
glUniform1i(glGetUniformLocation(shaderEngine.rendering, "diffuseTexture"), 0);

glActiveTexture(GL_TEXTURE1);
glEnable(GL_TEXTURE_2D);
glBindTexture(GL_TEXTURE_2D, renderEngine.multipleRenderTarget.getPositionTexture());
glUniform1i(glGetUniformLocation(shaderEngine.rendering, "positionTexture"), 1);

glActiveTexture(GL_TEXTURE2);
glEnable(GL_TEXTURE_2D);
glBindTexture(GL_TEXTURE_2D, renderEngine.multipleRenderTarget.getNormalsTexture());
glUniform1i(glGetUniformLocation(shaderEngine.rendering, "normalsTexture"), 2);

//Render the quad
glLoadIdentity();
glColor4f(1f, 1f, 1f, 1f);
glTranslatef(0f, 0f, -1.0f);

glBegin(GL_QUADS);
glTexCoord2f( 0, 0 );
glVertex3f( 0.0f, 0.0f, 0.0f);
glTexCoord2f( 1, 0 );
glVertex3f( (float) GraphicsData.WIDTH, 0.0f, 0.0f);
glTexCoord2f( 1, 1 );
glVertex3f( (float) GraphicsData.WIDTH, (float) GraphicsData.HEIGHT, 0.0f);
glTexCoord2f( 0, 1 );
glVertex3f( 0.0f, (float) GraphicsData.HEIGHT, 0.0f);
glEnd();

//Reset OpenGL state
glActiveTexture(GL_TEXTURE0);
// glDisable(GL_TEXTURE_2D);
glBindTexture(GL_TEXTURE_2D, 0);

glActiveTexture(GL_TEXTURE1);
glDisable(GL_TEXTURE_2D);
glBindTexture(GL_TEXTURE_2D, 0);

glActiveTexture(GL_TEXTURE2);
glDisable(GL_TEXTURE_2D);
glBindTexture(GL_TEXTURE_2D, 0);

glActiveTexture(GL_TEXTURE0);

glUseProgram(0);

glMatrixMode(GL_PROJECTION);
glPopMatrix();
glMatrixMode(GL_MODELVIEW);
glPopMatrix();

Deferred shading vertex shader:

#version 130

uniform mat4 projectionMatrix;
uniform mat4 viewMatrix;
uniform mat4 modelMatrix;
uniform mat3 normalMatrix;

out vec3 normals;
out vec4 position;

void main()
{
//use normalMatrix calculated on CPU
//mat4 worldMatrix = modelMatrix;
//mat3 worldRotationInverse = transpose(mat3(worldMatrix));

gl_Position = projectionMatrix * viewMatrix * modelMatrix * gl_Vertex;
gl_TexCoord[0] = gl_TextureMatrix[0] * gl_MultiTexCoord0;
normals = normalize(normalMatrix * gl_Normal);
position = modelMatrix * gl_Vertex;
gl_FrontColor = vec4(1.0, 1.0, 1.0, 1.0);

}

Deferred shading fragment shader:

#version 130

uniform sampler2D tex;

in vec3 normals;
in vec4 position;

void main()
{
gl_FragData[0] = vec4(texture2D(tex, gl_TexCoord[0].st).rgb, 1);
gl_FragData[1] = vec4(position.xyz, 1);
gl_FragData[2] = vec4(normals.xyz,1);
}

Deferred rendering vertex shader:

#version 130

void main()
{
gl_Position = gl_ModelViewProjectionMatrix * gl_Vertex;
gl_TexCoord[0] = gl_MultiTexCoord0;

gl_FrontColor = vec4(1.0, 1.0, 1.0, 1.0);

}

Deferred rendering fragment shader:

#version 130

uniform sampler2D diffuseTexture;
uniform sampler2D positionTexture;
uniform sampler2D normalsTexture;
uniform vec3 cameraPosition;

void main()
{
vec4 image = texture2D(diffuseTexture, gl_TexCoord[0].xy);
vec4 position = texture2D(positionTexture, gl_TexCoord[0].xy);
vec3 normal = texture2D(normalsTexture, gl_TexCoord[0].xy).xyz;

vec3 light = vec3(0, 0, 0);
vec3 lightDir = light - position.xyz;

normal = normalize(normal);
lightDir = normalize(lightDir);

vec3 eyeDir = normalize(cameraPosition - position.xyz);
vec3 vHalfVector = normalize(lightDir.xyz + eyeDir);

vec4 diffuseComponent = max(dot(normal,lightDir),0.0) * image;
float specularComponent = pow(max(dot(normal,vHalfVector),0.0), 100) * 1.5;

gl_FragColor = diffuseComponent + specularComponent;

}

How can I speed up the rendering process? Which tools can I use to debug it?

I’m going to ignore your very strange combination of legacy GL with modern methods. That’s probably not the issue here.

There are several things you’re not doing very well.

First, let’s manage your expectations. Deferred rendering is not a magical salve that you coat a rendering system with to make it go faster. It is an optimization technique, and like all such techniques does not always result in faster execution.

Deferred rendering is designed to make scenes that use lots of lights render fast. And by “lots,” I mean “hundreds”, dozens of which are affecting multiple objects at any one time. If your regular scene is only running a couple of lights, it won’t be faster and will almost certainly be a good deal slower.

Second:

My OpenGL application (written in Java using LWJGL) works fine when rendering the scene normally. Now I have implemented deferred rendering and it’s running at only about 30 fps.

You didn’t say what your framerate was pre-deferred rendering. Equally importantly, don’t measure framerate. Instead, measure the time it takes to render frames (typically in milliseconds). If available, you should use OpenGL timer queries to do this.
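For reference, here is a minimal LWJGL sketch of a GL_TIME_ELAPSED timer query. It assumes a GL 3.3 / ARB_timer_query-capable context and LWJGL 2’s single-value convenience getters (adapt the getter names if you’re on a different binding); drawEntities() stands in for whatever section you want to time.

import static org.lwjgl.opengl.GL11.GL_TRUE;
import static org.lwjgl.opengl.GL15.*;
import static org.lwjgl.opengl.GL33.*;

int query = glGenQueries();          // create once, reuse every frame

// In the render loop, around the section to be measured:
glBeginQuery(GL_TIME_ELAPSED, query);
drawEntities();                      // the work being timed
glEndQuery(GL_TIME_ELAPSED);

// A frame or two later, poll first so the read-back doesn't stall the GPU:
if (glGetQueryObjecti(query, GL_QUERY_RESULT_AVAILABLE) == GL_TRUE) {
    long nanos = glGetQueryObjectui64(query, GL_QUERY_RESULT);
    System.out.printf("section time: %.3f ms%n", nanos / 1.0e6);
}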

Third, you’re creating 3 renderbuffers that you don’t need and never use. That’s probably not a performance problem per se, but it is a problem in that you’re allocating lots of memory you don’t need.

Fourth:

glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, GraphicsData.WIDTH, GraphicsData.HEIGHT, 0, GL_RGBA, GL_UNSIGNED_BYTE, (ByteBuffer)null);

You are using way too much precision for mere normals. GL_R11F_G11F_B10F is perfectly acceptable for storing normal data, and it only takes up 4 bytes per pixel (unlike the eight here). GL_RGBA8_SNORM also works fine. It’d also give you an extra 8 bits to play with for other data.

The size of your data is a big part of performance in deferred renderers. Less data to fetch means less time to fetch that data.
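As a hedged sketch of the second option, here is the normals allocation with a signed 8-bit format instead (GL_RGBA8_SNORM needs a GL 3.1 context; it stores each component as a signed normalized byte, which covers the -1..1 range of unit normals and leaves a spare channel for other data):

glBindTexture(GL_TEXTURE_2D, normalsTexture);
glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8_SNORM, GraphicsData.WIDTH, GraphicsData.HEIGHT,
             0, GL_RGBA, GL_BYTE, (ByteBuffer) null);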

Fifth:

glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA32F, GraphicsData.WIDTH, GraphicsData.HEIGHT, 0, GL_RGBA, GL_UNSIGNED_BYTE, (ByteBuffer)null);

You are using 32-bit floats to store positions. Equally importantly, you’re storing 4 components, when one of them is pretty much guaranteed to be 1.0. Unless you have some pressing need, you should only be storing 3 components, and those components ought to be 16-bit floats, unless you can prove that you absolutely need the extra precision.

But that doesn’t matter, because:

Sixth: you’re storing positions at all. Never do this; you can reconstruct the position in the fragment shader of your deferred pass, just from the depth value stored in the depth buffer, part of gl_FragCoord, the viewport settings, and a few elements of your projection matrix.

There’s even a more optimal way that uses your vertex shader.

You should be able to do lighting in eye-space (transform the lights into eye-space before using them), or you can transform the position into whatever space you need.
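Here is a minimal (untested) GLSL sketch of that reconstruction. It assumes you add a depthTexture sampler (the depth attachment, made readable as a texture) and an invProjectionMatrix uniform (the inverse of the projection matrix used in the MRT pass), and that the full-screen quad’s texture coordinates cover 0..1:

#version 130

uniform sampler2D depthTexture;
uniform mat4 invProjectionMatrix;

vec3 reconstructEyePosition(vec2 texCoord)
{
    float depth = texture2D(depthTexture, texCoord).r;              // 0..1 window-space depth
    vec4 ndc = vec4(texCoord * 2.0 - 1.0, depth * 2.0 - 1.0, 1.0);  // back to NDC (-1..1)
    vec4 eye = invProjectionMatrix * ndc;                            // un-project
    return eye.xyz / eye.w;                                          // perspective divide
}

void main()
{
    vec3 eyePos = reconstructEyePosition(gl_TexCoord[0].xy);
    gl_FragColor = vec4(eyePos, 1.0);   // visualize; real lighting would use eyePos instead
}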

I don’t guarantee any particular performance improvement from this, since it’s not clear how much stress you’re putting on the system.

My OpenGL application (written in Java using LWJGL) works fine when rendering the scene normally. Now I have implemented deferred rendering and it’s running at only about 30 fps. I haven’t got many models in my 3D scene.

Sadly it is very difficult to debug these performance issues because I’m not able to measure the time of each method call in code, since the calls aren’t executed immediately.
Correct. So what can you do?

Well, without doing anything special at all, you can make your camera always fill the window (same field of view, same eyepoint, just a different resolution). Now resize the window (i.e. change the resolution; you can do this by changing the viewport as well). Does the frame time go up when you make the window larger? If so, look for ways to reduce your fragment load (cost per fragment or number of fragments). If not, look higher up the pipeline (e.g. CPU or vertex shader). True, this won’t give you the smoking gun, but it helps narrow down your problem a bit without any instrumentation (besides accurately measuring your whole-frame render time, including swap). NOTE: in this test (and all the ones below) I’m assuming you’re measuring frame times with V-Sync (sync-to-vblank) disabled. Otherwise, you’re timing variable sleep times and this is all pointless. ALSO NOTE: here I assume you have a glFinish() right after your SwapBuffers call, and that you are running on a desktop GPU (or Tegra); more on that below.

So what else can you do without any instrumentation? You can measure time deltas. Render one way and get a frame time. Now remove something (X), retest, and get a frame time. See any delta? There you go; that’s the cost of X (if you’re careful). Having “knobs” in your engine to turn different subfeatures on and off at runtime can be very useful for this kind of quick diagnosis, letting you nail down where the culprit(s) might be without changing any code.

Now, anything else you can do without instrumentation? Sure. You can time large subsections of your frame with high-precision CPU timers, separating each with a glFinish(). NOTE: do NOT have these glFinish()s enabled by default. Have a knob, “enable sub-frame timing statistics” (disabled by default). When it’s enabled, time each subsection, with a glFinish() before sampling the “end subsection” time. When it’s disabled, skip these glFinish()s. NOTE: this adds some time to your frame (so don’t do it in production mode), but it’s an easy way to ballpark which section of your frame is the “big fish”. NOTE: here I assume you’re running on a desktop GPU (or Tegra), where all sections of the GPU pipeline run in parallel “on the current frame”! This advice does not apply to non-Tegra mobile GPUs (i.e. tile-based deferred GPUs, e.g. PowerVR, Mali, Adreno), which rasterize and shade fragments a frame late. glFinish() really hoses those GPUs up.
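A minimal sketch of that pattern in Java, assuming an LWJGL static import for glFinish(); timingEnabled is an assumed debug flag and the two subsection names are just placeholders for however your frame is structured:

import static org.lwjgl.opengl.GL11.glFinish;

long t0 = System.nanoTime();

drawEntities();                          // subsection 1: geometry / MRT pass
if (timingEnabled) glFinish();           // force the GPU to catch up before sampling
long t1 = System.nanoTime();

DeferredRendering.render();              // subsection 2: lighting pass
if (timingEnabled) glFinish();
long t2 = System.nanoTime();

if (timingEnabled)
    System.out.printf("MRT pass: %.3f ms, lighting pass: %.3f ms%n",
                      (t1 - t0) / 1.0e6, (t2 - t1) / 1.0e6);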

As to the rest, Alfonse has already given you some good input. You are aware that deferred shading can have pretty high framebuffer bandwidth requirements, right? What is your GPU? Is it a top-of-the-line high-end desktop GPU? A low-end desktop? A laptop GPU? Which? If it doesn’t have pretty hefty GPU memory bandwidth, you might want to be cautious, or just get a better GPU for comparison. What resolution are you running at? If it’s high, be doubly cautious. And as Alfonse said, if you don’t have lots of lights, or you can easily determine which lights illuminate which objects, then you might want to reconsider why you’re doing this and what you hope to gain from it. Your example above, as written, doesn’t appear to warrant deferred rendering.

Thanks for your answers, I’m currently trying to implement your suggestions.

You didn’t say what your framerate was pre-deferred rendering.

It was always at 60 (using Display.sync(60)). Now it’s jumping between 30 and 60. The more important part is that it feels much slower, and it also feels like the input is delayed.

Instead, measure the time it takes to render frames (typically in miliseconds). If available, you should use OpenGL timer queries to do this.

I’ve measured time with Timer Queries:

NEW VERSION using deferred Shading:
Frame 951 : 24.464ms
Rendering : 21.986ms
Clearing Buffers : 0.039ms
MRT : 19.191ms
MRT setup : 0.049ms
Rendering : 19.133ms
Rendering 0 : 3.854ms (skybox)
Rendering 1 : 15.027ms (a model with many polygons)
Rendering 2 : 0.23ms
Rendering 3 : 0.014ms
Stopping MRT : 0.001ms
Rendering lighting: : 2.75ms
Updating Display : 2.474ms (includes remaining waiting time)

OLD VERSION:
Frame 2750 : 15.256ms
Rendering : 11.187ms
Clearing Buffers : 0.044ms
Starfield setup : 0.001ms
Rendering starfield : 0.001ms
Rendering setup : 0.011ms
Rendering objects : 11.116ms (includes lighting)
Rendering 0 : 1.52ms (skybox)
Rendering 1 : 9.37ms (a model with many polygons, runs fluently at all times)
Rendering 2 : 0.185ms
Rendering 3 : 0.024ms
Updating Display : 4.062ms (includes remaining waiting time)

Third, you’re creating 3 renderbuffers that you don’t need and never use

Where?

I’m assuming you’re measuring frame times with V-Sync (sync-to-vblank) disabled.

Of course.

What is your GPU?

I’m running it on a laptop. I was using the Intel HD Graphics integrated GPU of an i7-4500U (which I only noticed just now). NVIDIA chooses the integrated card automatically for all Java applications, so I used the NVIDIA control panel to make it use my GeForce GT 740M. Now it’s running much better, but the frame time is still jumping around. The rendering time of the old code has decreased even more. I wonder why that is, because the problem isn’t caused by the framebuffer itself but by rendering to the framebuffer. Maybe it’s also caused by the shaders used for multiple render target rendering; I’ll check them.

why you’re doing this

I want to use post-processing, which I cannot do in the shader while drawing the scene.

I’ve measured time with Timer Queries:

The idea with deferred rendering is that the cost increases for the primary rendering should be offset by the cost decreases from removing and consolidating lighting computations. Your scene doesn’t have enough lighting complexity, overdraw, and other elements for deferred rendering to improve overall performance.

Also, those numbers look very much like you’re killing the card’s bandwidth, which exacerbates the cost increase for primary rendering. This is reinforced by:

I’m running it on a laptop. I was using the Intel HD Graphics integrated GPU of an i7-4500U (which I only noticed just now).

Yeah, bandwidth on that’s really going to hurt. Especially with that 128-bit-per-pixel position buffer.

Third, you’re creating 3 renderbuffers that you don’t need and never use

Where?

The part where you’re calling glRenderbufferStorage, to allocate storage for them, but then never using them for anything.

Framebuffer objects have multiple attachment points. But you can only ever have one image attached to any particular attachment point at any one time. You call glFramebufferRenderbuffer(GL_COLOR_ATTACHMENT0) to attach a renderbuffer to the FBO. But later, you call glFramebufferTexture2D(GL_COLOR_ATTACHMENT0), which detaches the previously attached renderbuffer, changing the attachment to an image in the texture.

So you create renderbuffers, attach them to the FBO, then immediately detach them. So they are never used by anything.
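In other words, the constructor only needs the depth attachment plus the three texture attachments. A hedged sketch of the trimmed-down setup (keeping a depth renderbuffer here; better still would be a depth texture so you can read it back later):

fbo = glGenFramebuffers();
depthBuffer = glGenRenderbuffers();

glBindFramebuffer(GL_FRAMEBUFFER, fbo);

glBindRenderbuffer(GL_RENDERBUFFER, depthBuffer);
glRenderbufferStorage(GL_RENDERBUFFER, GL_DEPTH_COMPONENT24, GraphicsData.WIDTH, GraphicsData.HEIGHT);
glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_RENDERBUFFER, depthBuffer);

// The three G-buffer textures (diffuse, position, normals) are then created and
// attached with glFramebufferTexture2D exactly as in the original constructor.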

I’ve reduced the precision of the normals to GL_R11F_G11F_B10F. That got me 0.15 ms. Sadly it doesn’t render all sides; apparently it can’t store negative numbers.

I’ve also removed the renderbuffers.

Especially with that 128-bit-per-pixel position buffer

Is there a way to use a buffer with lower precision?

Currently I’m trying to reconstruct the position (in the fragment shader of the lighting pass) from the depth. How do I read the depth? gl_FragCoord.z? How can I then convert it back to eye space using the inverse of the projection matrix? Can I use the inverse of the viewMatrix afterwards to convert it back to world space? (That’s what I’m using for my lighting calculation.) I’m a little bit confused about how to implement this in my shader.

Doh! My fault; I forgot that those image formats are for positive floats only.

But GL_RGBA8_SNORM ought to work. I’m rather surprised that image formats don’t provide a 10-bit-per-channel signed normalized integer format (there is GL_RGB10_A2, but that’s unsigned).

Well, 16-bit floats could work, depending on your precision needs and your distance from the center of the world.

By reading from the depth buffer you wrote during your main rendering passes. That’s why the depth attachment should be a texture rather than a renderbuffer: so that you can read from it in your deferred passes.

You’d be replacing a 128-bit read with a 32-bit read.
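A hedged sketch of what that change to the FBO setup might look like; depthTexture is an assumed new field replacing the depth renderbuffer:

depthTexture = glGenTextures();
glBindTexture(GL_TEXTURE_2D, depthTexture);
glTexImage2D(GL_TEXTURE_2D, 0, GL_DEPTH_COMPONENT24, GraphicsData.WIDTH, GraphicsData.HEIGHT,
             0, GL_DEPTH_COMPONENT, GL_FLOAT, (ByteBuffer) null);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
glFramebufferTexture2D(GL_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_TEXTURE_2D, depthTexture, 0);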

I showed you a link, explaining in detail exactly what you have to do, with (admittedly untested) example GLSL code. Also, you need a little more than just the inverse projection matrix.

Well, I don’t know what spaces your viewMatrix works with. But generally speaking, people try to avoid using an explicit world space in shaders, preferring to do lighting in view/eye/camera space.

But if your viewMatrix goes from world to view space, then the inverse of that matrix goes from view to world space.
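As a minimal (untested) sketch of that last step, with invViewMatrix as an assumed uniform holding the inverse of the camera’s view matrix, applied to the eye-space position reconstructed as in the earlier sketch:

uniform mat4 invViewMatrix;

vec3 toWorldSpace(vec3 eyePos)
{
    return (invViewMatrix * vec4(eyePos, 1.0)).xyz;
}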