OpenCL Stucture Aligment

Hello,

I have a problem with structure aligment. I think.

Header on HOST


#define CL_HPP_MINIMUM_OPENCL_VERSION 110
#define CL_HPP_TARGET_OPENCL_VERSION 120
#define CL_HPP_CL_1_2_DEFAULT_BUILD
#include <CL/cl2.hpp>

#pragma pack (push, 16)
typedef struct		s_ray
{
	cl_float4		dir;
	cl_float4		origin;
	cl_uint		pos;
	cl_float		intensity;
}					t_ray;
#pragma pack (pop)


Header on Device :


struct __attribute__ ((aligned (16))) s_ray
{
  float4		dir;
  float4		origin;
  uint			pos;
  float			intensity;
};

typedef struct s_ray t_ray;


GenRay Kernel :


__kernel void GenRays(__global float4 *PosUpLeft,
					  __global float4 *Origin,
					  __global float4 *NormCam,
					  __global t_ray *BufferRaysOut,
					  uint width,
					  uint MaxRaysLen)
{
	uint id = get_global_id(0);

	BufferRaysOut[id].pos = id;
}

Trace Kernel :



__kernel void trace(
__global t_ray *rays,
__global uint *buffer,
uint		raysLen)
{
	uint id = get_global_id(0);

	if (id < raysLen)
	{
		buffer[id] = 0xFF000000 + (rays[id].pos % 255) << 2;
	}
}

Allocation Code :


	RayBufferMaxLen = width * height;
	std::cout << "kernelGenerateRayInit..." << std::endl;
	Ray1 = new cl::Buffer(*context, CL_MEM_READ_ONLY, sizeof(cl_float4));
	Ray2 = new cl::Buffer(*context, CL_MEM_READ_ONLY, sizeof(cl_float4));
	Ray3 = new cl::Buffer(*context, CL_MEM_READ_ONLY, sizeof(cl_float4) * 3);
	Rays = new cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(t_ray) * RayBufferMaxLen);
	KernelGenerateRay = new cl::Kernel(*program, "GenRays");
	KernelGenerateRay->setArg(0, *Ray1);
	KernelGenerateRay->setArg(1, *Ray2);
	KernelGenerateRay->setArg(2, *Ray3);
	KernelGenerateRay->setArg(3, *Rays);
	KernelGenerateRay->setArg(4, width);
	KernelGenerateRay->setArg(5, RayBufferMaxLen);

Execution Code :


	for (size_t i = 0; i < bufferLen; i++)
	{
		rays[i].pos = 0;
	}
	ret = queue->enqueueWriteBuffer(*Ray1, CL_TRUE, 0, sizeof(cl_float4), positionOnScreen);
	ret |= queue->enqueueWriteBuffer(*Ray2, CL_TRUE, 0, sizeof(cl_float4), originCamera);
	ret |= queue->enqueueWriteBuffer(*Ray3, CL_TRUE, 0, sizeof(cl_float4) * 3, vec3Cam);
	ret |= queue->enqueueNDRangeKernel(*KernelGenerateRay, cl::NullRange, cl::NDRange(bufferLen), cl::NDRange(64));
	ret |= queue->finish();
	ret |= queue->enqueueReadBuffer(*Rays, CL_TRUE, 0, sizeof(t_ray) * bufferLen, rays);
	displayOpenCLerror(ret);
	for (size_t i = 0; i < bufferLen; i++)
	{
		std::cout << "I == " << i << std::endl;
		if (rays[i].pos != i)
			std::cout << "Diff : " << i << " and " << rays[i].pos, system("pause") ,exit(1);
	}

The seconde step is diff 1 != 0
But If I look All Values, I Read Correct Data for many pixels.

Just Why ?

Thank for your help :slight_smile:

I solved my problem with a power of two.

The answer at this problem isn’t easy. When I make my first struct, I had 40 bytes in sizeof(t_ray).
After many calculus, I have found why I don’t work. I solved my problem with 64 bytes.

When I solve my problem. I don’t use pragam and packed method.