How to write host code for the following kernel code

I wrote the kernel code below and it compiled successfully. Now I am trying to write the host code. I have made an attempt, but I am not sure about the inputs, the outputs, and their sizes. Could someone please review it?


#define KNN_WEIGHT_COUNT 20

/*
 * Per-feature weights applied to the squared differences.
 * Stored once in __constant memory instead of being rebuilt as a private
 * array on every call. The 'f' suffix keeps the literals single precision,
 * so the program does not depend on double-precision support.
 */
__constant float KNN_FEATURE_WEIGHTS[KNN_WEIGHT_COUNT] = {
    0.0847282f, 0.0408621f, 0.105036f,  0.0619821f, 0.0595455f,
    0.0416739f, 0.0181147f, 0.00592921f, 0.040049f,  0.0766054f,
    0.0441091f, 0.0376111f, 0.0124285f, 0.0733558f, 0.0587338f,
    0.0303001f, 0.0579207f, 0.0449221f, 0.0530462f, 0.0530462f
};

/*
 * Weighted Euclidean distance between two feature vectors.
 *
 * array_point_A, array_point_B: pointers to the first of KNN_WEIGHT_COUNT (20)
 *     consecutive ints in global memory; exactly elements [0, 20) of each are
 *     read and neither is written.
 *
 * Returns sqrt( sum_i  w[i] * (A[i] - B[i])^2 ).
 */
inline float Euclidean_distance(__global const int* restrict array_point_A, __global const int* restrict array_point_B) {
    float sum = 0.0f;
    for (int i = 0; i < KNN_WEIGHT_COUNT; ++i) {
        /* The difference is taken in integer arithmetic (as before) and then
           widened to float for the weighted square. */
        const float diff = (float)(array_point_A[i] - array_point_B[i]);
        sum += KNN_FEATURE_WEIGHTS[i] * (diff * diff);
    }
    return sqrt(sum);
}
#define KNN_TRAIN_COUNT 4344   /* number of training points scanned */
#define KNN_FEATURE_COUNT 20   /* ints per point, as read by Euclidean_distance */

/*
 * k-nearest-neighbour classifier for a single query point and two classes
 * (labels 0 and 1).
 *
 * X_train    : training features. The distance loop below assumes a flat,
 *              row-major layout of KNN_TRAIN_COUNT rows with
 *              KNN_FEATURE_COUNT ints each -- confirm against the host upload.
 * Y_train    : KNN_TRAIN_COUNT class labels; entries equal to 0 or 1 are counted.
 * data_point : KNN_FEATURE_COUNT ints describing the query point.
 * k          : number of nearest neighbours to consider.
 *
 * The body never reads the work-item id, so every work-item performs the same
 * computation on the same data.
 *
 * NOTE(review): the kernel has no output argument, so the computed class label
 * never leaves the kernel. Returning it needs an extra __global output pointer
 * and a matching host-side buffer, which is a signature change and therefore
 * not made here.
 */
__kernel void KNN_classifier(__global int * restrict X_train,__global int * restrict Y_train,__global int * restrict data_point, int k)
{
    /* The voting loops index index_arr[0 .. k-1]; keep that inside the array. */
    if (k > KNN_TRAIN_COUNT) {
        k = KNN_TRAIN_COUNT;
    }

    float array_dist[KNN_TRAIN_COUNT];
    int index_arr[KNN_TRAIN_COUNT];
    for (int i = 0; i < KNN_TRAIN_COUNT; ++i)
    {
        /* Compare the query with the i-th training row. Passing X_train
           unchanged would measure the distance to row 0 every time. */
        array_dist[i] = Euclidean_distance(X_train + i * KNN_FEATURE_COUNT, data_point);
        index_arr[i] = i;
    }

    /* Exchange sort on the index array, ordered by ascending distance.
       Only the first k slots are read afterwards, and pass i never touches
       slots below i, so k passes leave those slots exactly as a full sort
       would. */
    for (int i = 0; i < k; i++)
    {
        for (int j = i + 1; j < KNN_TRAIN_COUNT; j++)
        {
            if (array_dist[index_arr[i]] > array_dist[index_arr[j]])
            {
                const int swapped = index_arr[i];
                index_arr[i] = index_arr[j];
                index_arr[j] = swapped;
            }
        }
    }

    int array_Y_class_target[2] = { 0, 0 };            /* neighbours seen per class */
    float CT[2] = { 0.0f, 0.0f };                       /* per-class score, lower wins */
    float SumOf_Each_class_distances[2] = { 0.0f, 0.0f };
    int min_index = -1;
    int class_label = -1;

    /* NOTE(review): the counters and distance sums above are NOT reset between
       passes of this loop, so each pass adds to the totals of the previous
       ones. That is the original behaviour and is kept as-is -- confirm it is
       intended. */
    for (int i = k; i > 0; --i) {
        for (int c = 0; c < 2; ++c) {
            for (int j = 0; j < i; ++j) {
                const int index = index_arr[j];
                if (Y_train[index] == c)
                {
                    array_Y_class_target[c]++;
                    SumOf_Each_class_distances[c] += array_dist[index];
                }
            }
            if (array_Y_class_target[c] != 0)
            {
                const float count = (float)array_Y_class_target[c];
                CT[c] = ((float)k / count) + (SumOf_Each_class_distances[c] / count);
            }
            else
            {
                /* No neighbour of this class has been seen. */
                CT[c] = 1.5f;
            }
        }

        /* Pick the class with the smallest score that does not exceed 1.8;
           on equal scores the later class is selected here. */
        float best_score = 1.8f;
        for (int r = 0; r < 2; ++r) {
            const float elem = CT[r];
            if (elem <= best_score) {
                best_score = elem;
                min_index = r;
            }
        }

        /* Tie-break: if another class has exactly the winning score, prefer
           the class with the smaller accumulated distance. */
        for (int r = 0; r < 2; ++r) {
            const float elem = CT[r];
            if ((elem == best_score) && (r != min_index)) {
                if (SumOf_Each_class_distances[0] < SumOf_Each_class_distances[1])
                {
                    min_index = 0;
                }
                else
                {
                    min_index = 1;
                }
            }
        }

        /* Computed but never stored to global memory -- see the note in the
           header comment. */
        class_label = min_index;
    }
}

What bindings do you use? C, C++, Python, Java, etc.?

If you use C/C++, then I would recommend using the C++ bindings for OpenCL.

https://github.khronos.org/OpenCL-CLHPP/

For your kernel it would be like this:

		auto kernelf = cl::KernelFunctor<
				cl::Buffer,
				cl::Buffer,
				cl::Buffer,
				int>(kernel, t.kernel);

		std::vector xtrainData(size);
		std::vector ytrainData(size);
		std::vector pointData(size);
		int k = 5;
		
		cl::Buffer xtrainBuffer(begin(xtrainData), end(xtrainData), false);
		cl::Buffer ytrainBuffer(begin(xtrainData), end(xtrainData), false);
		cl::Buffer datapointBuffer(begin(pointData), end(pointData), false);
		
		cl_int error;
		kernelf(
				cl::EnqueueArgs(cl::NDRange(size)),
				xtrainBuffer,
				ytrainBuffer,
				datapointBuffer,
				k,
				error);