Segmentation fault (core dumped)

Hi everyone,

I have been doing parallel programming (using CUDA) for over a year now, but I am new to OpenCL. I am writing my first OpenCL program (matrix multiplication), and I wrote the following code:

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <SDKCommon.hpp>
#include <SDKApplication.hpp>
#include <SDKCommandArgs.hpp>
#include <SDKFile.hpp>
#include <CL/cl.h>

#define MAX_SOURCE_SIZE (0x100000)
#define MATSIZE 16

/*
 * initmat: takes three float buffers (Aa, Bb, Cc) and three int dimensions
 * (row, colrow, col).
 *
 * NOTE(review): only the declaration of `i` is visible in this listing; the
 * braces and the rest of the body appear to be missing, so what the function
 * does with its arguments cannot be determined from here.
 */
void initmat(float *Aa,float *Bb,float *Cc,int row,int colrow,int col);

void initmat(float *Aa,float *Bb,float *Cc,int row,int colrow,int col)
    unsigned int i;





/*
 * Host program for the OpenCL matrix-multiplication test. In the order shown
 * below it: reads the kernel source from a file, allocates the host matrices,
 * queries a platform, creates a context and a command queue, creates three
 * device buffers, builds the program, creates the "matmul" kernel, sets its
 * six arguments and enqueues it as an NDRange, then waits with clFinish.
 *
 * NOTE(review): as posted, this listing has no braces and two string literals
 * are cut off, so parts of the code appear to have been lost when the post was
 * formatted. The review notes below refer only to what is visible here.
 */
int main(void)

// Load the kernel source code into the array source_str
    FILE *fp;
    char *source_str;
    size_t source_size;

// NOTE(review): the path passed to fopen is an empty string in this listing;
// if the real code had the same, fopen would be expected to fail and leave fp
// NULL. Whether the if-block below exits is not visible, so confirm that
// fread/fclose are never reached with a NULL fp.
    fp = fopen("", "r");
    if (!fp) {
        fprintf(stderr, "Failed to load kernel.
    source_str = (char*)malloc(MAX_SOURCE_SIZE);
    source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
    fclose( fp );

// matrix declaration
    float *A;
    float *B;
    float *C;

// set dimensions
// NOTE(review): no assignment to Arow, AcolBrow or Bcol is visible before they
// are read below; they need values (MATSIZE, presumably) before the element
// counts and the malloc sizes are computed from them.
    int Arow,AcolBrow,Bcol;


// no. of elements in matrix
    int sizea, sizeb, sizec;

// Error code from opencl

    int err;

// Setting up matrices
    sizea= Arow*AcolBrow;
    sizeb= AcolBrow*Bcol;
    sizec= Arow*Bcol;

    A = (float *) malloc(sizeof(float)*sizea);
    B = (float *) malloc(sizeof(float)*sizeb);
    C = (float *) malloc(sizeof(float)*sizec);


// get platform id & device id
// NOTE(review): err is not checked after clGetPlatformIDs, and no
// clGetDeviceIDs call is visible, so deviceid is still NULL when it is passed
// to clCreateContext and clCreateCommandQueue below.

    cl_uint numplatform;
    cl_platform_id platformid=NULL;
    cl_device_id deviceid=NULL;

    err= clGetPlatformIDs(1,&platformid,&numplatform);

    cl_context_properties properties[]= 
            CL_CONTEXT_PLATFORM, (cl_context_properties)platformid,0 

// create context
    cl_context context= clCreateContext(properties,1,&deviceid,NULL,NULL,&err);

/* when more than one gpu is installed on the system then we make use of the approach as we stated in the notes !! */

// create command queue

    cl_command_queue queue = clCreateCommandQueue(context,deviceid,0,&err); // I have disabled profiling option

// Allocate buffer object for Ad,Bd,Cd
// NOTE(review): the last argument (errcode_ret) is NULL for all three calls,
// so a failed buffer creation would go unnoticed here.

    cl_mem Ad = clCreateBuffer(context,CL_MEM_READ_ONLY,sizeof(cl_float)*sizea,NULL,NULL);
    cl_mem Bd = clCreateBuffer(context,CL_MEM_READ_ONLY,sizeof(cl_float)*sizeb,NULL,NULL);
    cl_mem Cd = clCreateBuffer(context,CL_MEM_WRITE_ONLY,sizeof(cl_float)*sizec,NULL,NULL);

// The program is created from the source text read from the kernel file above (see notes)

    cl_program program= clCreateProgramWithSource(context,1,(const char **)&source_str, (const size_t *)&source_size,&err);

// Build program using program object just created

    err = clBuildProgram(program,0,NULL,NULL,NULL,NULL);

// On build failure the build log is fetched into a 2048-byte buffer.
// NOTE(review): no statement that prints the fetched log is visible in this
// listing, and the early return below is commented out.
    if(err !=CL_SUCCESS)

        size_t len;
        char buffer[2048];
        printf("ERROR: Failed to build executable 
        clGetProgramBuildInfo(program,deviceid,CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer , &len);
//return FAILURE;


// Create kernel object
// NOTE(review): errcode_ret is NULL here as well, so a failed kernel lookup
// is not detected before the kernel handle is used.

    cl_kernel kernel = clCreateKernel(program,"matmul",NULL);

// set kernel argument values: three int dimensions followed by the three buffers

    err= clSetKernelArg(kernel,0,sizeof(int),&Arow);
    err|= clSetKernelArg(kernel,1,sizeof(int),&AcolBrow);
    err|= clSetKernelArg(kernel,2,sizeof(int),&Bcol);
    err|= clSetKernelArg(kernel,3,sizeof(cl_mem),&Ad);
    err|= clSetKernelArg(kernel,4,sizeof(cl_mem),&Bd);
    err|= clSetKernelArg(kernel,5,sizeof(cl_mem),&Cd);

// Write to device buffers. Ad=A and Bd=B   : Equivalent to CUDAmemcpy
// NOTE(review): no buffer-write calls are visible under this heading.


// since we have set the copy as synchronous we will be creating event
    cl_event event;

// Execute the kernel over entire range of C matrix
// NOTE(review): no values are stored in global[] in the visible code before
// it is passed to clEnqueueNDRangeKernel; local[] is declared but not used.

    size_t global[2];
    size_t local[2];
// NOTE(review): ndim is a pointer that is never made to point at any object,
// so the store through it below is undefined behavior and a likely source of
// the reported segmentation fault. A plain `cl_uint ndim = 2;` passed by value
// would avoid the pointer entirely.
    cl_uint * ndim; // no. of dimension in ND range. 3rd parameter in kernel call signifies the dimension.


    * ndim=2; // because we want 2-D multiplication. Gives n

/* no local size declaration because we are not making work groups, i.e. blocks. We are just making one block where every thread takes one element of A,B and computes C */

    err = clEnqueueNDRangeKernel(queue,kernel,*ndim,NULL,global,NULL,0,NULL,&event); // the NULL position after global is for passing local dimension. In this case we don't have one.
    clFinish(queue); // wait for kernel to finish before we begin copying the result back on host

//read back the result


// free all memory


    return 0;

My kernel code is as follows,

/*
 * matmul kernel: receives three int dimensions (Mdim, Ndim, Pdim) and three
 * __global float buffers (A, B, C).
 *
 * In this listing the body only declares `tmp`; the work-item index lookups
 * and the store to C are commented out, so nothing is written to C.
 *
 * NOTE(review): the braces around the body are not visible in the post.
 */
__kernel void matmul(const int Mdim, const int Ndim,const int Pdim,__global float* A,__global float* B,__global float* C)

float tmp;
//*int k;
// int i = get_global_id(0);
//int j = get_global_id(1);
//C[i*Ndim+j] = 3.0; 


I could compile it successfully, and it created a binary in the …/bin/x86/ folder. When I try to run it using ./matmul, it throws the following error:

93 > Sun Mar 17 : 04:22 PM : samkit@samkit:~/AMD/AMD-APP-SDK-v2.8-RC-lnx32/samples/opencl/bin/x86$ ./matmul
Segmentation fault (core dumped)

I know the kernel's functionality is nowhere near that of a matrix multiplication kernel, but I did this just to check whether there is some error in my kernel. Please give any suggestions or advice that could help me get my code running.

Thanks in advance.

I have debugged it. :smiley:

What was the solution to this question? I think I have a similar problem