Access violation when writing buffer to device

I have a project in OpenCL. It’s matrix decomposition on the GPU. All works fine and the results are okay. The only thing I’m seeing is that when I execute the program multiple times in a row (once every second or so), I get access violations when I write my initial buffers to the device.

It’s always at writing the buffers that it gets stuck. I’m very new to OpenCL and I’m wondering if perhaps I have to clear the memory in my GPU when I exit my program? Sometimes it crashes on the first run, but succeeds after 2 or 3 tries. Then again, sometimes is immediatly succeeds, as well as the subsequent runs. It’s just pretty random. The actual buffer write that fails differs from time to time as well. Sometimes it’s the third buffer write that fails, sometimes the fourth one.

The parameters I run this program with is a workgroup size of 7 and a matrix of 70*70 elements. At first I figured it could be that my matrix is too big for the GPU (GT650M with 2GB), but sometimes a run with a matrix ox 10.000 elements succeeds as well.

The code up until the buffer writes is given below.

Any help is greatly appreciated.

Ps: for clarity’s sake, PRECISION is a macro #define PRECISION float.

    int main(int argc, char *argv[])
    	//// INITIALIZATION PART ///////////////////////////////////////////////////////////////////////////////////////
    	try {
    		if (argc != 5) {
    			std::ostringstream oss;
    			oss << "Usage: " << argv[0] << " <kernel_file> <kernel_name> <workgroup_size> <array width>";
    			throw std::runtime_error(oss.str());
    		// Read in arguments.
    		std::string kernel_file(argv[1]);
    		std::string kernel_name(argv[2]);
    		unsigned int workgroup_size = atoi(argv[3]);
    		unsigned int array_dimension = atoi(argv[4]);
    		int total_matrix_length = array_dimension * array_dimension;
    		int total_workgroups = total_matrix_length / workgroup_size;
    		total_workgroups += total_matrix_length % workgroup_size == 0 ? 0 : 1;
    		// Print parameters
    		std::cout << "Workgroup size:  "   << workgroup_size      << std::endl;
    		std::cout << "Total workgroups:  " << total_workgroups    << std::endl;
    		std::cout << "Array dimension: "   << array_dimension     << " x " << array_dimension << std::endl;
    		std::cout << "Total elements:  "   << total_matrix_length << std::endl;
    		// OpenCL initialization
    		std::vector<cl::Platform> platforms;
    		std::vector<cl::Device> devices;
    		platforms[0].getDevices(CL_DEVICE_TYPE_GPU, &devices);
    		cl::Context context(devices);
    		cl::CommandQueue queue(context, devices[0], CL_QUEUE_PROFILING_ENABLE);
    		// Load the kernel source.
    		std::string file_text;
    		std::ifstream file_stream(kernel_file.c_str());
    		if (!file_stream) {
    			std::ostringstream oss;
    			oss << "There is no file called " << kernel_file;
    			throw std::runtime_error(oss.str());
    		file_text.assign(std::istreambuf_iterator<char>(file_stream), std::istreambuf_iterator<char>());
    		// Compile the kernel source.
    		std::string source_code = file_text;
    		std::pair<const char *, size_t> source(source_code.c_str(), source_code.size());
    		cl::Program::Sources sources;
    		cl::Program program(context, sources);
    		try {;
    		catch (cl::Error& e) {
    			std::string msg;
    			program.getBuildInfo<std::string>(devices[0], CL_PROGRAM_BUILD_LOG, &msg);
    			std::cerr << "Your kernel failed to compile" << std::endl;
    			std::cerr << "-----------------------------" << std::endl;
    			std::cerr << msg;
    		//// CREATE RANDOM INPUT DATA //////////////////////////////////////////////////////////////////////////////////
    		// Create matrix to work on.
    		// Create a random array.
    		int matrix_width         = sqrt(total_matrix_length);
    		PRECISION* random_matrix = new PRECISION[total_matrix_length];
    		random_matrix            = randommatrix(total_matrix_length);
    		PRECISION* A             = new PRECISION[total_matrix_length];
    		for (int i = 0; i < total_matrix_length; i++)
    			A[i] = random_matrix[i];
    		PRECISION* L_SEQ = new PRECISION[total_matrix_length];
    		PRECISION* U_SEQ = new PRECISION[total_matrix_length];
    		PRECISION* P_SEQ = new PRECISION[total_matrix_length];
    		// Do the sequential algorithm.
    		decompose(A, L_SEQ, U_SEQ, P_SEQ, matrix_width);
    		float* PA = multiply(P_SEQ, A, total_matrix_length);
    		float* LU = multiply(L_SEQ, U_SEQ, total_matrix_length);
    		std::cout << "PA = LU?" << std::endl;
    		bool eq = equalMatrices(PA, LU, total_matrix_length);
    		std::cout << eq << std::endl;
    		//// RUN AND SETUP KERNELS /////////////////////////////////////////////////////////////////////////////////////
    		// Initialize arrays for GPU.
    		PRECISION* L_PAR = new PRECISION[total_matrix_length];
    		PRECISION* U_PAR = new PRECISION[total_matrix_length];
    		PRECISION* P_PAR = new PRECISION[total_matrix_length];
    		PRECISION* ROW_IDX = new PRECISION[matrix_width];
    		PRECISION* ROW_VAL = new PRECISION[matrix_width];
    		// Write A to U and initialize P.
    		for (int i = 0; i < total_matrix_length; i++)
    			U_PAR[i] = A[i];
    		// Initialize P_PAR.
    		for (int row = 0; row < matrix_width; row++)
    			for (int i = 0; i < matrix_width; i++)
    				IDX(P_PAR, row, i) = 0;
    			IDX(P_PAR, row, row) = 1;
    		// Allocate memory on the device
    		cl::Buffer P_BUFF(context, CL_MEM_READ_WRITE, total_matrix_length*sizeof(PRECISION));
    		cl::Buffer L_BUFF(context, CL_MEM_READ_WRITE, total_matrix_length*sizeof(PRECISION));
    		cl::Buffer U_BUFF(context, CL_MEM_READ_WRITE, total_matrix_length*sizeof(PRECISION));
    		// Buffer to determine maximum row value.
    		cl::Buffer MAX_ROW_IDX_BUFF(context, CL_MEM_READ_WRITE, total_workgroups*sizeof(PRECISION));
    		cl::Buffer MAX_ROW_VAL_BUFF(context, CL_MEM_READ_WRITE, total_workgroups*sizeof(PRECISION));
    		// Create the actual kernels.
    		cl::Kernel kernel(program, kernel_name.c_str());
    		std::string max_row_kernel_name = "max_row";
    		cl::Kernel max_row(program, max_row_kernel_name.c_str());
    		std::string swap_row_kernel_name = "swap_row";
    		cl::Kernel swap_row(program, swap_row_kernel_name.c_str());
    		// transfer source data from the host to the device
    		std::cout << "Writing buffers" << std::endl;
    		queue.enqueueWriteBuffer(P_BUFF, CL_TRUE, 0, total_matrix_length*sizeof(PRECISION), P_PAR);
    		queue.enqueueWriteBuffer(L_BUFF, CL_TRUE, 0, total_matrix_length*sizeof(PRECISION), L_PAR);
    		queue.enqueueWriteBuffer(U_BUFF, CL_TRUE, 0, total_matrix_length*sizeof(PRECISION), U_PAR);
    		queue.enqueueWriteBuffer(MAX_ROW_IDX_BUFF, CL_TRUE, 0, total_workgroups*sizeof(PRECISION), ROW_IDX);
    		queue.enqueueWriteBuffer(MAX_ROW_VAL_BUFF, CL_TRUE, 0, total_workgroups*sizeof(PRECISION), ROW_VAL);

The full error that I get when I hook in with the debugger is the following:

    Unhandled exception at 0x55903CC0 (nvopencl.dll) in Project.exe:
     0xC0000005: Access violation reading location 0x0068F004.
    If there is a handler for this exception, the program may be safely continued.

The function the debugger shows me is the following, in the namespace cl:

    cl_int enqueueWriteBuffer(
        const Buffer& buffer,
        cl_bool blocking,
        ::size_t offset,
        ::size_t size,
        const void* ptr,
        const VECTOR_CLASS<Event>* events = NULL,
        Event* event = NULL) const
        return detail::errHandler(
                object_, buffer(), blocking, offset, size,
                (events != NULL) ? (cl_uint) events->size() : 0,
                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
                (cl_event*) event),

I noticed something in your code. When the code writes the MAX_ROW_IDX_BUFF and MAX_ROW_VAL_BUFF buffers, it uses total_workgroups*sizeof(PRECISION) as the amount of data to write. The data comes from the ROW_IDX and ROW_VAL regions of memory, but they allocated using new PRECISION[matrix_width]. So the size of the the source data and the size of the memory write aren’t obviously correlated. Could it be that more data is being read from ROW_IDX and/or ROW_VAL than is being allocated for them?