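// This is my way: asynchronous framebuffer readback through a rotating pool of PBOs, followed by a fast SSE copy into page-locked system memory.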
// Thin wrappers over the Win32 virtual-memory API; all of them act on the current process.
// valloc(size, prot): reserve and commit `size` bytes with page protection `prot`.
//                     Evaluates to the base address, or NULL if the allocation failed.
// vfree(mem):         release a whole region previously obtained from valloc().
// vlock(mem, size):   ask the OS to keep the given range resident in physical memory.
// Every macro argument is parenthesized so that arbitrary expressions can be passed safely.
#define valloc(size, prot) VirtualAllocEx(GetCurrentProcess(), NULL, (size), MEM_RESERVE | MEM_COMMIT, (prot))
#define vfree(mem) VirtualFreeEx(GetCurrentProcess(), (mem), 0, MEM_RELEASE)
#define vlock(mem, size) VirtualLock((mem), (size))
// Size in bytes of the staging buffer handed to FastMemCopy().
// FastMemCopy() derives its chunk count with a bit scan, so this must stay a power of two (>= 128).
#define BUFSIZE (4*1024)
// ---- readback state shared by InitReadback() and ReadBack() ----

// Pool of pixel-pack buffer object names. The name order is rotated by ReadBack() on every call.
GLuint m_pbos[NUMR_PBO];

// Slot in m_pbos that receives the framebuffer (glReadPixels target).
int gpu2vram;
// Slot in m_pbos that is mapped and copied out to system memory.
int vram2sys;

// Size in bytes of one frame (xsize * ysize * 4), set by InitReadback().
unsigned int memsize;

// Destination of the readback in system memory; holds memsize bytes once initialized.
unsigned char *membuffer = NULL;
// Small staging buffer (BUFSIZE bytes) used by FastMemCopy() during the copy.
unsigned char *tempbuff = NULL;
// Set up (or re-size) everything ReadBack() needs. Call this with the size of the framebuffer,
// and call it again whenever that size changes.
//
//   xsize, ysize - framebuffer dimensions in pixels; each pixel takes 4 bytes.
//
// Effects:
//   - allocates the BUFSIZE-byte staging buffer once (it is reused across re-initializations),
//   - creates the PBO names on first use and (re)allocates storage for every PBO,
//   - resets the pool indices,
//   - replaces `membuffer` with a fresh block of xsize * ysize * 4 bytes.
// If an allocation fails the corresponding pointer is left NULL.
void InitReadback( int xsize, int ysize)
{
    // The staging buffer has a fixed size, so allocate it only once; allocating it on every
    // call would leak the previous region each time the readback is re-initialized.
    if (tempbuff == NULL)
    {
        tempbuff = (unsigned char*)valloc(BUFSIZE, PAGE_READWRITE);
        if (tempbuff != NULL)
        {
            // Best effort: a failed lock only means the pages stay pageable.
            vlock(tempbuff, BUFSIZE);
        }
    }

    memsize = xsize * ysize * 4;

    // File-scope arrays start zeroed, so a zero first name means the pool was never generated.
    if (m_pbos[0] == 0)
    {
        glGenBuffers(NUMR_PBO, m_pbos);
    }

    // Give every PBO room for one full frame; passing NULL allocates without uploading data.
    for (int i = 0; i < NUMR_PBO; i++)
    {
        glBindBuffer(GL_PIXEL_PACK_BUFFER_ARB, m_pbos[i]);
        glBufferData(GL_PIXEL_PACK_BUFFER_ARB, memsize, NULL, GL_STATIC_READ);
    }
    glBindBuffer(GL_PIXEL_PACK_BUFFER_ARB, 0);

    // ReadBack() maps the first slot and reads pixels into the last one.
    vram2sys = 0;
    gpu2vram = NUMR_PBO-1;

    // The frame size may have changed, so always replace the destination buffer.
    if (membuffer != NULL)
    {
        vfree(membuffer);
        membuffer = NULL;
    }
    membuffer = (unsigned char*)valloc(memsize, PAGE_READWRITE);
    if (membuffer != NULL)
    {
        // NOTE(review): locking a whole frame can exceed the process working-set quota, in which
        // case VirtualLock fails; the result is deliberately ignored (best effort) - confirm
        // whether the working set should be enlarged first.
        vlock(membuffer, memsize);
    }
}
// FastMemCopy() is defined further down in this file; declare it before its first use.
void FastMemCopy(void *dst, const void *src, void *buf, size_t bufsize, size_t nbytes);

// Call this once per frame (or slice).
//
//   xsize, ysize - size in pixels of the region to read; should match the values given to
//                  InitReadback(), because every PBO holds exactly `memsize` bytes.
//
// Each call does three things:
//   1. issues glReadPixels into the PBO at slot `gpu2vram`,
//   2. maps the PBO at slot `vram2sys` and copies `memsize` bytes from it into `membuffer`,
//   3. rotates the PBO names by one position.
// Because of the rotation, the PBO copied out in step 2 is the one that was the glReadPixels
// target NUMR_PBO-1 calls earlier; until that many calls have been made its contents are
// whatever the freshly allocated buffer storage happens to hold.
void ReadBack(int xsize, int ysize)
{
    // First, post the pixel read. With a pack PBO bound, the last argument of glReadPixels
    // is a byte offset into that buffer rather than a client pointer.
    glBindBuffer(GL_PIXEL_PACK_BUFFER_ARB, m_pbos[gpu2vram]);
    glReadPixels(0, 0, xsize, ysize, GL_BGRA, GL_UNSIGNED_BYTE, 0);

    // Then copy an earlier frame from the PBO into system memory (membuffer).
    glBindBuffer(GL_PIXEL_PACK_BUFFER_ARB, m_pbos[vram2sys]);
    void* data = glMapBuffer(GL_PIXEL_PACK_BUFFER_ARB, GL_READ_ONLY);
    if (data != NULL)
    {
        // Skip the copy if InitReadback() could not allocate the host-side buffers.
        if (membuffer != NULL && tempbuff != NULL)
        {
            FastMemCopy(membuffer, data, tempbuff, BUFSIZE, memsize);
        }
        // Unmap only after a successful map; unmapping a buffer that is not mapped
        // raises GL_INVALID_OPERATION.
        glUnmapBuffer(GL_PIXEL_PACK_BUFFER_ARB);
    }

    // Unbind the PBO so later pixel operations use client memory again.
    glBindBuffer(GL_PIXEL_PACK_BUFFER_ARB, 0);

    // Shift the names down by one slot and move the first name to the end.
    GLuint first = m_pbos[0];
    for (int i = 1; i < NUMR_PBO; i++)
    {
        m_pbos[i - 1] = m_pbos[i];
    }
    m_pbos[NUMR_PBO - 1] = first;
}
// Copy nbytes from src to dst, staging the data through a small intermediate buffer.
// (Original SSE routine courtesy of audiofreak - thanks.)
//
//   dst     - destination; written with non-temporal (movntps) stores. Must be 16-byte aligned.
//   src     - source; read with aligned SSE loads. Must be 16-byte aligned.
//   buf     - staging buffer of at least bufsize bytes. Must be 16-byte aligned.
//   bufsize - staging size in bytes. The SSE path is used only when this is a power of two
//             and at least 128, because the chunk count is derived with a bit scan and the
//             inner loops move 128 bytes per iteration.
//   nbytes  - total number of bytes to copy.
//
// The SSE path copies whole bufsize-sized chunks: each chunk is first loaded from src into
// buf (with prefetch hints running ahead of the loads), then streamed from buf to dst.
// Whatever the SSE path does not cover - the tail when nbytes is not a multiple of bufsize,
// or everything when bufsize is unsuitable - is copied byte by byte, so all nbytes always
// reach dst.
//
// The assembly is MSVC inline assembly for 32-bit x86.
void FastMemCopy(void *dst, const void *src, void *buf, size_t bufsize, size_t nbytes)
{
    size_t copied = 0; // number of leading bytes handled by the SSE path

    if (bufsize >= 128 && (bufsize & (bufsize - 1)) == 0)
    {
        __asm
        {
            mov esi, src                    // esi = read cursor in src
            mov edi, dst                    // edi = write cursor in dst
            mov eax, buf                    // eax = base of the staging buffer
            mov ebx, bufsize
            bsr ecx, ebx                    // ecx = log2(bufsize)
            mov ebx, nbytes
            shr ebx, cl                     // ebx = number of whole chunks
        main_loop:
            test ebx, ebx
            jz main_loop_end
            mov edx, eax                    // edx = write cursor in the staging buffer
            mov ecx, bufsize
            shr ecx, 7                      // ecx = 128-byte blocks per chunk
        L1_cache_loop:                      // phase 1: src -> staging buffer
            test ecx, ecx
            jz L1_cache_loop_end
            prefetchnta [esi + 64 * 10]
            movaps xmm0, [esi]
            movaps xmm1, [esi + 16]
            movaps xmm2, [esi + 32]
            prefetchnta [esi + 64 * 11]
            movaps xmm3, [esi + 48]
            movaps xmm4, [esi + 64]
            movaps xmm5, [esi + 80]
            movaps xmm6, [esi + 96]
            movaps xmm7, [esi + 112]
            movaps [edx], xmm0
            movaps [edx + 16], xmm1
            movaps [edx + 32], xmm2
            movaps [edx + 48], xmm3
            movaps [edx + 64], xmm4
            movaps [edx + 80], xmm5
            movaps [edx + 96], xmm6
            movaps [edx + 112], xmm7
            add esi, 128
            add edx, 128
            sub ecx, 1
            jmp L1_cache_loop
        L1_cache_loop_end:
            mov edx, eax                    // rewind to the start of the staging buffer
            mov ecx, bufsize
            shr ecx, 7
        stream_loop:                        // phase 2: staging buffer -> dst
            test ecx, ecx
            jz stream_loop_end
            movaps xmm0, [edx]
            movaps xmm1, [edx + 16]
            movaps xmm2, [edx + 32]
            movaps xmm3, [edx + 48]
            movaps xmm4, [edx + 64]
            movaps xmm5, [edx + 80]
            movaps xmm6, [edx + 96]
            movaps xmm7, [edx + 112]
            movntps [edi], xmm0
            movntps [edi + 16], xmm1
            movntps [edi + 32], xmm2
            movntps [edi + 48], xmm3
            movntps [edi + 64], xmm4
            movntps [edi + 80], xmm5
            movntps [edi + 96], xmm6
            movntps [edi + 112], xmm7
            add edx, 128
            add edi, 128
            sub ecx, 1
            jmp stream_loop
        stream_loop_end:
            sub ebx, 1                      // one chunk done
            jmp main_loop
        main_loop_end:
            sfence                          // order the non-temporal stores before returning
        }

        // The loop above handled floor(nbytes / bufsize) whole chunks.
        copied = nbytes & ~(bufsize - 1);
    }

    // Copy whatever the SSE path did not cover.
    {
        const unsigned char *tail_src = (const unsigned char *)src;
        unsigned char *tail_dst = (unsigned char *)dst;
        for (; copied < nbytes; copied++)
        {
            tail_dst[copied] = tail_src[copied];
        }
    }
}