Optimisation problem when using "per pixel linked lists"

Hi! VBOs can speed up rendering a lot, but I have a problem when using multiple VBOs.
In fact, the problem appears when I draw one VBO and then another, as here:


                // Pass 1: draw every non-empty entry of m_normals into frameBuffer with the
                // perPixelLinkedList2 shader (presumably the pass that builds the per-pixel lists — confirm).
                for (unsigned int i = 0; i < m_normals.size(); i++) {
                   if (m_normals[i].getAllVertices().getVertexCount() > 0) {
                        //std::cout<<"next frame draw normal"<<std::endl;
                        // Tell the shader whether this entry's material has a texture; when it does,
                        // also pass the texture's matrix.
                        if (m_normals[i].getMaterial().getTexture() == nullptr) {
                            perPixelLinkedList2.setParameter("haveTexture", 0.f);
                        } else {
                            math::Matrix4f texMatrix = m_normals[i].getMaterial().getTexture()->getTextureMatrix();
                            perPixelLinkedList2.setParameter("textureMatrix", texMatrix);
                            perPixelLinkedList2.setParameter("haveTexture", 1.f);
                        }
                        // The "water" flag is taken from the entity of the entry's first vertex array only.
                        if (m_normals[i].getVertexArrays()[0]->getEntity()->isWater()) {
                            perPixelLinkedList2.setParameter("water", 1.0f);
                        } else {
                            perPixelLinkedList2.setParameter("water", 0.0f);
                        }
                        // Pass the application clock's elapsed time (in seconds) when an application exists.
                        // NOTE(review): this is re-sent for every entry although it does not depend on i.
                        if (core::Application::app != nullptr) {
                            float time = core::Application::getTimeClk().getElapsedTime().asSeconds();
                            perPixelLinkedList2.setParameter("time", time);
                        }
                        // Blending is disabled for this pass.
                        currentStates.blendMode = sf::BlendNone;
                        currentStates.shader = &perPixelLinkedList2;
                        currentStates.texture = m_normals[i].getMaterial().getTexture();
                        // Refill vb from scratch with this entry's vertices, one append per vertex,
                        // then call update() (presumably the upload to the GPU — confirm).
                        vb.clear();
                        vb.setPrimitiveType(m_normals[i].getAllVertices().getPrimitiveType());
                        for (unsigned int j = 0; j < m_normals[i].getAllVertices().getVertexCount(); j++) {
                            vb.append(m_normals[i].getAllVertices()[j]);
                        }
                        vb.update();
                        frameBuffer.drawVertexBuffer(vb, currentStates);
                    }
                }
                // Synchronise between the two passes.
                // NOTE(review): glFinish() blocks the CPU until the GPU has finished all issued commands;
                // glMemoryBarrier alone is presumably enough to order the shader writes before the next
                // pass — confirm, since this stall happens every frame.
                glCheck(glFinish());
                // NOTE(review): only shader-storage writes are covered here; if the first pass also writes
                // an image (e.g. the head-pointer image), GL_SHADER_IMAGE_ACCESS_BARRIER_BIT may be needed
                // as well — TODO confirm against the first-pass shader.
                glCheck(glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT));
                // NOTE(review): unlike the neighbouring GL calls, this one is not wrapped in glCheck.
                glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT );
                // Pass 2: draw one quad with the perPixelLinkedListP2 shader. The four corners are
                // (0,0), (w,0), (w,h), (0,h) of quad.getSize(), all at z = quad.getSize().z.
                vb2.clear();
                vb2.setPrimitiveType(sf::Quads);
                Vertex v1 (sf::Vector3f(0, 0, quad.getSize().z));
                Vertex v2 (sf::Vector3f(quad.getSize().x,0, quad.getSize().z));
                Vertex v3 (sf::Vector3f(quad.getSize().x, quad.getSize().y, quad.getSize().z));
                Vertex v4 (sf::Vector3f(0, quad.getSize().y, quad.getSize().z));
                vb2.append(v1);
                vb2.append(v2);
                vb2.append(v3);
                vb2.append(v4);
                vb2.update();
                // The quad's transform matrix is transposed before being passed as "worldMat".
                math::Matrix4f matrix = quad.getTransform().getMatrix().transpose();
                perPixelLinkedListP2.setParameter("worldMat", matrix);
                // Only the shader is swapped; blend mode and texture keep the values left by pass 1.
                currentStates.shader = &perPixelLinkedListP2;
                frameBuffer.drawVertexBuffer(vb2, currentStates);
                // NOTE(review): second full CPU/GPU stall of the frame — confirm it is required before display().
                glCheck(glFinish());
                frameBuffer.display();

Performance is poor.
This is because I have to set the vertex attribute pointers again each time I switch VBOs here (even if I set the pointers once for all my VBOs at the first frame, it doesn’t work: the scene is no longer displayed from frame 2 onwards, so I really have to reset the vertex attribute pointers every frame when I use multiple VBOs for rendering):

/// Draws a vertex buffer to this render target with the given render states.
///
/// Returns without drawing when the buffer holds no vertices or when
/// activate(true) fails. Otherwise it applies the cached-state updates (GL
/// states, view, blend mode, texture, shader), re-specifies the vertex layout if
/// `vertexBuffer` is not the buffer used by the previous call, and issues an
/// indexed draw when the buffer has indexes or an array draw when it has none.
///
/// Vertex layout (both code paths): position = 3 floats at offset 0, colour =
/// 4 unsigned bytes at offset 12, texture coordinates = 2 floats at offset 16,
/// all with stride sizeof(Vertex) in vboVertexBuffer; normals = 3 floats,
/// tightly packed, in the separate vboNormalBuffer.
///
/// @param vertexBuffer  Buffer to draw; its vboVertexBuffer, vboNormalBuffer,
///                      vboIndexBuffer and m_indexes members are read.
/// @param states        Blend mode, texture and shader used for this draw.
void RenderTarget::drawVertexBuffer(VertexBuffer& vertexBuffer, RenderStates states) {
            if (vertexBuffer.getVertexCount() == 0) {
                return;
            }

            if (activate(true))
            {

                if (!m_cache.glStatesSet)
                    resetGLStates();
                // Apply the view
                if (m_cache.viewChanged)
                    applyCurrentView();

                if (states.blendMode != m_cache.lastBlendMode)
                    applyBlendMode(states.blendMode);

                // Apply the texture
                sf::Uint64 textureId = states.texture ? states.texture->getNativeHandle() : 0;
                if (textureId != m_cache.lastTextureId)
                    applyTexture(states.texture);
                // Apply the shader
                if (states.shader)
                    applyShader(states.shader);

                // Contexts >= 3.3 go through the VAO and generic vertex attributes;
                // older contexts use the fixed-function client-state arrays.
                const bool useVertexAttribs = m_versionMajor > 3 || (m_versionMajor == 3 && m_versionMinor >= 3);

                if (useVertexAttribs)
                    glCheck(glBindVertexArray(m_vao));

                // Re-specify the layout only when the buffer changed since the last draw.
                // Setting a pointer does not require its array to be enabled, so the
                // arrays are enabled once, just before the draw call below.
                if (m_cache.lastVboBuffer != &vertexBuffer) {
                    if (useVertexAttribs) {
                        glCheck(glBindBuffer(GL_ARRAY_BUFFER, vertexBuffer.vboVertexBuffer));
                        glCheck(glVertexAttribPointer(0, 3,GL_FLOAT,GL_FALSE,sizeof(Vertex), (GLvoid*) 0));
                        glCheck(glVertexAttribPointer(1, 4,GL_UNSIGNED_BYTE,GL_TRUE,sizeof(Vertex),(GLvoid*) 12));
                        glCheck(glVertexAttribPointer(2, 2, GL_FLOAT, GL_FALSE, sizeof(Vertex), (GLvoid*) 16));
                        glCheck(glBindBuffer(GL_ARRAY_BUFFER, vertexBuffer.vboNormalBuffer));
                        glCheck(glVertexAttribPointer(3, 3, GL_FLOAT, GL_FALSE, sizeof(sf::Vector3f), (GLvoid*) 0));
                        glCheck(glBindBuffer(GL_ARRAY_BUFFER, 0));
                    } else {
                        glCheck(glBindBuffer(GL_ARRAY_BUFFER, vertexBuffer.vboVertexBuffer));
                        glCheck(glVertexPointer(3, GL_FLOAT, sizeof(Vertex), (GLvoid*) 0 ));
                        glCheck(glColorPointer(4, GL_UNSIGNED_BYTE, sizeof(Vertex), (GLvoid*) 12));
                        glCheck(glTexCoordPointer(2, GL_FLOAT, sizeof(Vertex),(GLvoid*) 16));
                        glCheck(glBindBuffer(GL_ARRAY_BUFFER, vertexBuffer.vboNormalBuffer));
                        glCheck(glNormalPointer(GL_FLOAT, sizeof(sf::Vector3f), (GLvoid*) 0));
                        glCheck(glBindBuffer(GL_ARRAY_BUFFER, 0));
                    }
                    m_cache.lastVboBuffer = &vertexBuffer;
                }

                // Enable the four vertex arrays for the duration of the draw.
                if (useVertexAttribs) {
                    glCheck(glEnableVertexAttribArray(0));
                    glCheck(glEnableVertexAttribArray(1));
                    glCheck(glEnableVertexAttribArray(2));
                    glCheck(glEnableVertexAttribArray(3));
                } else {
                    glCheck(glEnableClientState(GL_COLOR_ARRAY));
                    glCheck(glEnableClientState(GL_TEXTURE_COORD_ARRAY));
                    glCheck(glEnableClientState(GL_VERTEX_ARRAY));
                    glCheck(glEnableClientState(GL_NORMAL_ARRAY));
                }

                // Find the OpenGL primitive type
                static const GLenum modes[] = {GL_POINTS, GL_LINES, GL_LINE_STRIP, GL_TRIANGLES,
                                                   GL_TRIANGLE_STRIP, GL_TRIANGLE_FAN, GL_QUADS};
                GLenum mode = modes[vertexBuffer.getPrimitiveType()];
                if (vertexBuffer.m_indexes.size() > 0) {
                    // Indexed draw: the index buffer is bound only around the call.
                    glCheck(glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, vertexBuffer.vboIndexBuffer));
                    glCheck(glDrawElements(mode, vertexBuffer.m_indexes.size(), GL_UNSIGNED_INT, (GLvoid*) 0));
                    glCheck(glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0));
                } else {
                    //std::cout<<"draw arrays"<<std::endl;
                    // NOTE(review): the framebuffer is re-bound only on this non-indexed path;
                    // confirm whether the indexed path relies on activate(true) for that.
                    glCheck(glBindFramebuffer(GL_FRAMEBUFFER, m_framebufferId));
                    glCheck(glDrawArrays(mode, 0, vertexBuffer.getVertexCount()));
                }

                // Restore: disable the arrays again and, on the VAO path, unbind the VAO.
                if (useVertexAttribs) {
                    glCheck(glDisableVertexAttribArray(0));
                    glCheck(glDisableVertexAttribArray(1));
                    glCheck(glDisableVertexAttribArray(2));
                    glCheck(glDisableVertexAttribArray(3));
                    glCheck(glBindVertexArray(0));
                } else {
                    glCheck(glDisableClientState(GL_COLOR_ARRAY));
                    glCheck(glDisableClientState(GL_TEXTURE_COORD_ARRAY));
                    glCheck(glDisableClientState(GL_VERTEX_ARRAY));
                    glCheck(glDisableClientState(GL_NORMAL_ARRAY));
                }
            }

        }

When I comment out the line where I draw the second VBO, it runs much faster. Or is it my second fragment shader that is slow?

    // Second-pass fragment shader. For the current pixel it walks the linked list
    // of fragments (head index read from the headPointers image, nodes read from
    // the linkedLists buffer), keeps at most MAX_FRAGMENTS of them, sorts them by
    // descending depth with a bottom-up merge sort and blends them, last to first,
    // into fcolor.
    // The merge scratch array is sized MAX_FRAGMENTS: each merge copies a left run
    // of `step` nodes into it, and step reaches 16 as soon as a pixel holds more
    // than 16 fragments, which indexed past the end of the former
    // MAX_FRAGMENTS/2 (= 10) array.
    const std::string fragmentShader2 =
                   R"(
                   #version 460
                   #define MAX_FRAGMENTS 20
                   struct NodeType {
                      vec4 color;
                      float depth;
                      uint next;
                   };
                   layout(binding = 0, r32ui) uniform uimage2D headPointers;
                   layout(binding = 0, std430) buffer linkedLists {
                       NodeType nodes[];
                   };
                   layout(location = 0) out vec4 fcolor;
                   void main() {
                      NodeType frags[MAX_FRAGMENTS];
                      int count = 0;
                      uint n = imageLoad(headPointers, ivec2(gl_FragCoord.xy)).r;
                      while( n != 0xffffffffu && count < MAX_FRAGMENTS) {
                           frags[count] = nodes[n];
                           n = frags[count].next;
                           count++;
                      }
                      //merge sort
                      int i, j1, j2, k;
                      int a, b, c;
                      int step = 1;
                      NodeType leftArray[MAX_FRAGMENTS]; //for merge sort (a left run holds up to 16 nodes once count > 16)
                      while (step <= count)
                      {
                          i = 0;
                          while (i < count - step)
                          {
                              ////////////////////////////////////////////////////////////////////////
                              //merge(step, i, i + step, min(i + step + step, count));
                              a = i;
                              b = i + step;
                              c = (i + step + step) >= count ? count : (i + step + step);
                              for (k = 0; k < step; k++)
                                  leftArray[k] = frags[a + k];
                              j1 = 0;
                              j2 = 0;
                              for (k = a; k < c; k++)
                              {
                                  if (b + j1 >= c || (j2 < step && leftArray[j2].depth > frags[b + j1].depth))
                                      frags[k] = leftArray[j2++];
                                  else
                                      frags[k] = frags[b + j1++];
                              }
                              ////////////////////////////////////////////////////////////////////////
                              i += 2 * step;
                          }
                          step *= 2;
                      }
                      vec4 color = vec4(0, 0, 0, 0);
                      for( int i = count - 1; i >= 0; i--)
                      {
                        color.rgb = frags[i].color.rgb * frags[i].color.a + color.rgb * (1 - frags[i].color.a);
                        color.a = frags[i].color.a + color.a * (1 - frags[i].color.a);
                      }
                      fcolor = color;
                   })";

But I don’t think so; I think calling the glBindXXX functions and resetting the vertex attribute pointers is what slows things down.
What should I do? Use only one VBO to draw everything? I don’t think that’s a good idea, because if I want to draw particles, for example, which can be added to and removed from the scene while it is being rendered, I need to insert/remove indexes in the CPU-side index array and then pass it to the indexed VBO, and modifying the index array on the CPU is really slow. (I’ve already tested this.)
It is stupid to have to redefine them, because the attribute pointers are the same for all my VBOs.

What should I do ? Thanks.

EDIT: Maybe I should put all visible objects into one VBO and update it every frame, but then I have to pass this single VBO to all my renderers, which is really…

Or maybe using a static variable.

EDIT 2: Oh! Using the same VBO for the two passes also decreases performance, so it’s the shader that is slow.

// Pass 1: draw every non-empty entry of m_normals into frameBuffer with the
// perPixelLinkedList2 shader. This variant reuses the single buffer vb for both passes.
for (unsigned int i = 0; i < m_normals.size(); i++) {
                   if (m_normals[i].getAllVertices().getVertexCount() > 0) {
                        //std::cout<<"next frame draw normal"<<std::endl;
                        // Tell the shader whether this entry's material has a texture; when it does,
                        // also pass the texture's matrix.
                        if (m_normals[i].getMaterial().getTexture() == nullptr) {
                            perPixelLinkedList2.setParameter("haveTexture", 0.f);
                        } else {
                            math::Matrix4f texMatrix = m_normals[i].getMaterial().getTexture()->getTextureMatrix();
                            perPixelLinkedList2.setParameter("textureMatrix", texMatrix);
                            perPixelLinkedList2.setParameter("haveTexture", 1.f);
                        }
                        // The "water" flag is taken from the entity of the entry's first vertex array only.
                        if (m_normals[i].getVertexArrays()[0]->getEntity()->isWater()) {
                            perPixelLinkedList2.setParameter("water", 1.0f);
                        } else {
                            perPixelLinkedList2.setParameter("water", 0.0f);
                        }
                        // Pass the application clock's elapsed time (in seconds) when an application exists.
                        if (core::Application::app != nullptr) {
                            float time = core::Application::getTimeClk().getElapsedTime().asSeconds();
                            perPixelLinkedList2.setParameter("time", time);
                        }
                        // Blending is disabled for this pass.
                        currentStates.blendMode = sf::BlendNone;
                        currentStates.shader = &perPixelLinkedList2;
                        currentStates.texture = m_normals[i].getMaterial().getTexture();
                        // Refill vb from scratch with this entry's vertices, then update and draw it.
                        vb.clear();
                        vb.setPrimitiveType(m_normals[i].getAllVertices().getPrimitiveType());
                        for (unsigned int j = 0; j < m_normals[i].getAllVertices().getVertexCount(); j++) {
                            vb.append(m_normals[i].getAllVertices()[j]);
                        }
                        vb.update();
                        frameBuffer.drawVertexBuffer(vb, currentStates);
                    }
                }
                // Synchronise between the two passes.
                // NOTE(review): glFinish() blocks the CPU until the GPU is idle; glMemoryBarrier alone is
                // presumably sufficient here — confirm, since this stall happens every frame.
                glCheck(glFinish());
                glCheck(glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT));
                glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT );
                // Pass 2: reuse vb for one quad whose corners are (0,0), (w,0), (w,h), (0,h) of
                // quad.getSize(), all at z = quad.getSize().z.
                vb.clear();
                vb.setPrimitiveType(sf::Quads);
                Vertex v1 (sf::Vector3f(0, 0, quad.getSize().z));
                Vertex v2 (sf::Vector3f(quad.getSize().x,0, quad.getSize().z));
                Vertex v3 (sf::Vector3f(quad.getSize().x, quad.getSize().y, quad.getSize().z));
                Vertex v4 (sf::Vector3f(0, quad.getSize().y, quad.getSize().z));
                vb.append(v1);
                vb.append(v2);
                vb.append(v3);
                vb.append(v4);
                vb.update();
                // The quad's transform matrix is transposed before being passed as "worldMat".
                math::Matrix4f matrix = quad.getTransform().getMatrix().transpose();
                perPixelLinkedListP2.setParameter("worldMat", matrix);
                currentStates.shader = &perPixelLinkedListP2;
                // Draw the buffer that was just filled with the quad. The snippet previously passed
                // vb2 here, which is never populated in this variant.
                frameBuffer.drawVertexBuffer(vb, currentStates);
                glCheck(glFinish());
                frameBuffer.display();

Note that with OpenGL 4.3 or the ARB_vertex_attrib_binding extension, you can use glBindVertexBuffer to change the buffer without affecting the format.

The ARB_vertex_attrib_binding interface essentially splits the work of glVertexAttribPointer into three distinct parts: glVertexAttribFormat sets the format, glVertexAttribBinding associates attribute locations with binding points, and glBindVertexBuffer binds buffers to binding points.

But I doubt that this is going to make much difference performance-wise.

1 Like

OK, I used only one VBO to draw everything, but it doesn’t improve performance, so I think it is this shader that hurts performance: I go from 570 to 370 FPS just because of this shader!

// Second-pass fragment shader. For the current pixel it walks the linked list of
// fragments (head index read from the headPointers image, nodes read from the
// linkedLists buffer), keeps at most MAX_FRAGMENTS of them, sorts them by
// descending depth with a bottom-up merge sort and blends them, last to first,
// into fcolor.
// The merge scratch array is sized MAX_FRAGMENTS: each merge copies a left run of
// `step` nodes into it, and step reaches 16 as soon as a pixel holds more than 16
// fragments, which indexed past the end of the former MAX_FRAGMENTS/2 (= 10) array.
const std::string fragmentShader2 =
               R"(
               #version 460
               #define MAX_FRAGMENTS 20
               struct NodeType {
                  vec4 color;
                  float depth;
                  uint next;
               };
               layout(binding = 0, r32ui) uniform uimage2D headPointers;
               layout(binding = 0, std430) buffer linkedLists {
                   NodeType nodes[];
               };
               layout(location = 0) out vec4 fcolor;
               void main() {
                  NodeType frags[MAX_FRAGMENTS];
                  int count = 0;
                  uint n = imageLoad(headPointers, ivec2(gl_FragCoord.xy)).r;
                  while( n != 0xffffffffu && count < MAX_FRAGMENTS) {
                       frags[count] = nodes[n];
                       n = frags[count].next;
                       count++;
                  }
                  //merge sort
                  int i, j1, j2, k;
                  int a, b, c;
                  int step = 1;
                  NodeType leftArray[MAX_FRAGMENTS]; //for merge sort (a left run holds up to 16 nodes once count > 16)
                  while (step <= count)
                  {
                      i = 0;
                      while (i < count - step)
                      {
                          ////////////////////////////////////////////////////////////////////////
                          //merge(step, i, i + step, min(i + step + step, count));
                          a = i;
                          b = i + step;
                          c = (i + step + step) >= count ? count : (i + step + step);
                          for (k = 0; k < step; k++)
                              leftArray[k] = frags[a + k];
                          j1 = 0;
                          j2 = 0;
                          for (k = a; k < c; k++)
                          {
                              if (b + j1 >= c || (j2 < step && leftArray[j2].depth > frags[b + j1].depth))
                                  frags[k] = leftArray[j2++];
                              else
                                  frags[k] = frags[b + j1++];
                          }
                          ////////////////////////////////////////////////////////////////////////
                          i += 2 * step;
                      }
                      step *= 2;
                  }
                  vec4 color = vec4(0, 0, 0, 0);
                  for( int i = count - 1; i >= 0; i--)
                  {
                    color.rgb = frags[i].color.rgb * frags[i].color.a + color.rgb * (1 - frags[i].color.a);
                    color.a = frags[i].color.a + color.a * (1 - frags[i].color.a);
                  }
                  fcolor = color;
               })";

So using “per-pixel linked lists” seems to decrease performance a lot, but I want my engine to support OIT (order-independent transparency). I’ve heard about other methods, but they did not work very well.

Please do not look at framerate to detect performance. Look at the actual time a frame takes to render.

570fps == 1.8ms per frame.
370fps == 2.7ms per frame.

The actual difference is 1 millisecond.

I wouldn’t be too concerned. One millisecond isn’t a trivial difference, but that difference isn’t based on how much stuff you render. It’s a screen-space process, so it will depend on how complex the overlapping blended stuff is. More overlap means more time spent in shaders.

Also, consider using an insertion sort rather than a merge sort. You’re not sorting that many things, and the main cost is going to be reading the data from the buffer.

That was my next question: which sorting algorithm to use. It’s a bit faster with an insertion sort.

Reading the data from the buffer does seem to slow things down a bit.