How to fill image2d_t from float vector?

I have already asked this question on Stack Overflow, to no avail. In a little program I wrote, I work with grayscale images, which I read into standard float vectors. I think a lot of the work my kernels do could be done in hardware by the TMUs (texture mapping units) if I used image2d_t memory objects instead of regular buffers. My question is: how can I copy a float vector into an image? If I understand it correctly, I basically have to write the entries of my float vector into a vector that is four times as long, like so:

// Illustrative fragment: spread 256 grayscale values over a 1024-float buffer,
// using one group of four consecutive floats per source value.
std::vector<float> A(256,0.0);
std::vector<float> A_img(1024, 0.0);

for(int i=0; i<256; i++)
{
    // First float of the group carries the gray value; the second and third
    // keep their initial 0.0.
    A_img[4*i] = A[i];
    // Fourth float of the group is set to 1.
    A_img[(4*i)+3] = 1;
}

Maybe my device being little-endian causes the order of the components to be reversed, but I am not sure about that. However, I can’t make this idea work. Here is a minimal working example:

#define CL_HPP_TARGET_OPENCL_VERSION 120
#define CL_HPP_MINIMUM_OPENCL_VERSION 120
#if defined(__APPLE__) 
#include <OpenCL/cl2.hpp>
#else 
#include <CL/cl2.hpp>
#endif
#include <iostream>
#include <string>
#include <vector>
#include <fstream>

// Builds the kernel from "read_write_image.cl", uploads a 16x16 test image,
// runs the kernel once per pixel and prints the buffer that is read back.
//
// NOTE(review): apart from the program build, the status codes returned by the
// OpenCL calls below are not inspected, so a failing call goes unnoticed and
// the zero-initialised output buffer would be printed unchanged.
int main(void)
{
    // Set up platform, device and context
    std::vector<cl::Platform> platforms;
    std::vector<cl::Device> devices;
    cl::Device default_device;
    cl::Platform::get(&platforms);
    
    if (platforms.size() == 0)
    {
        std::cout << "No OpenCL platform found, check installation!" << std::endl;
        exit(-1);
    }
    // Only the first platform is considered.
    platforms[0].getDevices(CL_DEVICE_TYPE_ALL, &devices);
    
    if (devices.size() == 0)
    {
        std::cout << "No devices found in platform, check installation!" << std::endl;
        exit(-1);
    }
    // Only the first device of that platform is used.
    default_device = devices[0];
    cl::Context context(default_device);
    
    // Read the whole kernel source file into a string and build it.
    // NOTE(review): the stream is not checked after opening; a missing file
    // would presumably yield an empty source string - confirm.
    std::ifstream program_file("read_write_image.cl");
    std::string program_string(std::istreambuf_iterator<char>(program_file), (std::istreambuf_iterator<char>()));
    cl::Program::Sources source { program_string };
    cl::Program dummy_program(context, source);
    if (dummy_program.build()!=CL_SUCCESS)
    {
        // Print the compiler log for the selected device before giving up.
        std::cout << "Error building: " << dummy_program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(default_device)<< std::endl;
        exit(-1);
    }
    cl::Kernel kernel(dummy_program, "read_write_image");
    cl::CommandQueue queue(context, default_device);
    
    // Set up dummy grayscale image
    // Values descend from 255 to 0 and are echoed to stdout as they are set.
    std::vector<float> A(256, 0.0);
    for(int i=0; i<256; i++)
    {
        A[i] = 255.0f - i;
        std::cout << A[i] << "  ";
    }
    std::cout << std::endl;
    
    // Blow up to float4 array
    // Each source value occupies four consecutive floats: (value, 0, 0, 1).
    std::vector<float> A_img(1024, 0.0);
    for(int i=0; i<256; i++)
    {
        A_img[4*i] = A[i];
        A_img[(4*i)+3] = 1;
    }
    // Host buffer that receives the kernel output; starts out all zero.
    std::vector<float> B_img(1024, 0.0);
    
    // NOTE(review): CL_R names a single channel per pixel, so a 16x16 transfer
    // presumably consumes only 256 floats of the 1024-element host buffers -
    // confirm against the OpenCL image format documentation.
    cl::ImageFormat grayscale(CL_R, CL_FLOAT);
    cl::Image2D Input_Image(context, CL_MEM_READ_ONLY, grayscale, 16, 16);
    cl::Image2D Output_Image(context, CL_MEM_WRITE_ONLY, grayscale, 16, 16);
    
    // Transfer region: the full 16x16 image starting at the origin.
    std::array<cl::size_type, 3> origin {0,0,0};
    std::array<cl::size_type, 3> region {16, 16, 1};

    // Blocking upload (CL_TRUE); both pitch arguments are passed as 0.
    queue.enqueueWriteImage(Input_Image, CL_TRUE, origin, region, 0, 0, &A_img[0]);

    kernel.setArg(0, Input_Image);
    kernel.setArg(1, Output_Image);
    
    // One work-item per pixel; the local work size is left unspecified.
    queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(16,16), cl::NullRange, NULL); 
    // Blocking download (CL_TRUE) of the kernel output into B_img.
    queue.enqueueReadImage(Output_Image, CL_TRUE, origin, region, 0, 0, &B_img[0]);

    // Print all 1024 floats of the host-side output buffer.
    for(int i=0; i<1024; i++)
    {
        std::cout << B_img[i] << "  ";
    }   
    std::cout << std::endl;
    
    return EXIT_SUCCESS;
}

with this kernel:

// Sampler for integer-coordinate reads. read_imagef with int2 coordinates is
// only defined for unnormalized coordinates, CLAMP / CLAMP_TO_EDGE / NONE
// addressing and NEAREST filtering; with CLK_FILTER_LINEAR the returned values
// are undefined and may differ between platforms.
__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE|CLK_ADDRESS_CLAMP|CLK_FILTER_NEAREST;

// Copies the first channel of every pixel of input_image to the same pixel of
// output_image. Expects a 2D NDRange with one work-item per pixel.
__kernel void read_write_image(read_only image2d_t input_image, write_only image2d_t output_image)
{
    const int i = get_global_id(0);
    const int j = get_global_id(1);

    // Writing outside the image is undefined, so ignore surplus work-items
    // (e.g. when the host rounds the global size up).
    if (i >= get_image_width(output_image) || j >= get_image_height(output_image))
    {
        return;
    }

    const int2 coord = (int2) (i,j);
    const float tmp = read_imagef(input_image, sampler, coord).x;
    // Only the channels present in the image format are stored; the remaining
    // components of the float4 are ignored.
    write_imagef(output_image, coord, (float4) (tmp,0,0,1));
}

The output I receive is only zeroes. What am I doing wrong? How can I make this work?

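The functions “read_imagef” and “write_imagef” work with float4s, but not every image is necessarily a float4 array: the host-side layout is determined by the image format, and with CL_R/CL_FLOAT each pixel is a single float. Skipping the A_img part and just working with A itself, i.e. using the host code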

#define CL_HPP_TARGET_OPENCL_VERSION 120
#define CL_HPP_MINIMUM_OPENCL_VERSION 120
#if defined(__APPLE__) 
#include <OpenCL/cl2.hpp>
#else 
#include <CL/cl2.hpp>
#endif
#include <array>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <iterator>
#include <string>
#include <vector>

int main(void)
{
	// Set up platform, device and context
	std::vector<cl::Platform> platforms;
	std::vector<cl::Device> devices;
	cl::Device default_device;
	cl::Platform::get(&platforms);
	
	if (platforms.size() == 0)
	{
		std::cout << "No OpenCL platform found, check installation!" << std::endl;
		exit(-1);
	}
	platforms[0].getDevices(CL_DEVICE_TYPE_ALL, &devices);
	
	if (devices.size() == 0)
	{
		std::cout << "No devices found in platform, check installation!" << std::endl;
		exit(-1);
	}
	default_device = devices[0];
	cl::Context context(default_device);
	
	std::ifstream program_file("read_write_image.cl");
	std::string program_string(std::istreambuf_iterator<char>(program_file), (std::istreambuf_iterator<char>()));
	cl::Program::Sources source { program_string };
	cl::Program dummy_program(context, source);
    if (dummy_program.build()!=CL_SUCCESS)
	{
        std::cout << "Error building: " << dummy_program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(default_device)<< std::endl;
        exit(-1);
    }
	cl::Kernel kernel(dummy_program, "read_write_image");
	cl::CommandQueue queue(context, default_device);
	
	// Set up dummy grayscale image
	std::vector<float> A(256, 0.0);
	for(int i=0; i<256; i++)
	{
		A[i] = 255.0f - i;
	}
	// Set up empty target image
	std::vector<float> B(256, 0.0);
	
	cl::ImageFormat grayscale(CL_R, CL_FLOAT);
	cl::Image2D Input_Image(context, CL_MEM_READ_ONLY, grayscale, 16, 16);
	cl::Image2D Output_Image(context, CL_MEM_WRITE_ONLY, grayscale, 16, 16);
	
	std::array<cl::size_type, 3> origin {0,0,0};
	std::array<cl::size_type, 3> region {16, 16, 1};

	queue.enqueueWriteImage(Input_Image, CL_TRUE, origin, region, 0, 0, &A[0]);

	kernel.setArg(0, Input_Image);
	kernel.setArg(1, Output_Image);
	
	queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(16,16), cl::NullRange, NULL); 
	queue.enqueueReadImage(Output_Image, CL_TRUE, origin, region, 0, 0, &B[0]);

	for(int i=0; i<256; i++)
	{
		std::cout << B[i] << "  ";
	}	
	std::cout << std::endl;
	
	return EXIT_SUCCESS;
}

makes this a working example. However, it seems that support may be platform-dependent: on my MacBook it still doesn’t work, while on my AMD graphics card it works with these changes.