# [newbie] LU decompozition, Crout algorithm

I have a simple OpenCL project and I hit a dead end.
I have to parallelize Crout algorithm.

Here is the seqvential algorithm:

``````void crout(double **A, double **L, double **U, int n) {
int i, j, k;
double sum = 0;

for (i = 0; i < n; i++) {
U[i][i] = 1;
}

for (j = 0; j < n; j++) {
for (i = j; i < n; i++) {
sum = 0;
for (k = 0; k < j; k++) {
sum = sum + L[i][k] * U[k][j];
}
L[i][j] = A[i][j] - sum;
}

for (i = j; i < n; i++) {
sum = 0;
for(k = 0; k < j; k++) {
sum = sum + L[j][k] * U[k][i];
}
if (L[j][j] == 0) {
printf("det(L) close to 0!
Can't divide by 0...
");
exit(EXIT_FAILURE);
}
U[j][i] = (A[j][i] - sum) / L[j][j];
}
}
}
``````

Here is the host program:

``````#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <CL/cl.h>

#define M 4

#define CROUT_CL "crout.cl"

/* Allocates a matrix with random float entries. */
void rand_matrix(float *data, int size)
{
int i = 0;
for (; i < size; i++)
data[i] = rand() / (float)RAND_MAX;
}

int
main(int argc, char** argv)
{
/* Local matrix data */
unsigned int i = 0, j = 0;
unsigned int size_U = 0;
unsigned int mem_size_U = 0;
float* host_U = NULL;
unsigned int size_A = 0;
unsigned int mem_size_A = 0;
float* host_A = NULL;
unsigned int size_L = 0;
unsigned int mem_size_L = 0;
float* host_L = NULL;

FILE *log = NULL;

/* OpenCL specific variables */
cl_program program = NULL;
cl_kernel kernel = NULL;

size_t dataBytes;

cl_int result = CL_SUCCESS;
cl_platform_id platform;
cl_device_id device;
cl_context_properties props = {CL_CONTEXT_PLATFORM, 0, 0};
cl_context ctx = NULL;
cl_command_queue queue = NULL;
cl_uint ndevices = 0;

/* OpenCL device memory for matrices */
cl_mem d_A;
cl_mem d_L;
cl_mem d_U;

cl_device_id *clDevices = NULL;
FILE *matrix_1_file = NULL;
size_t srcsz = 0;
char *matrix_1_src = NULL;

size_t localWorkSize, globalWorkSize;

/* XXX: Ugly hack for setting the CL arguments as pointer references */
int m = M;

/* Host memory for matrix A*/
size_A = M * M;
mem_size_A = sizeof(float) * size_A;
host_A = (float*) malloc(mem_size_A);
if (host_A == NULL) {
printf("Matrix A malloc() failed, bailing out!
");
goto err;
}
/* Generate matrices */
srand(2006);
rand_matrix(host_A, size_A);

/* Log A*/
log = fopen("crout.log", "wb");
if (log == NULL) {
printf("Failed to create log file!
");
goto err;
}
fprintf(log, "

Matrix A
");
for(i = 0; i < M; i++) {
for (j = 0; j < M ; j++) {
fprintf(log, "%f ", host_A[j * M + i]);
}
fprintf(log, "
");
}
/* Host memory for the result matrice L and U */
size_U = M * M;
mem_size_U = sizeof(float) * size_U;
host_U = (float*) malloc(mem_size_U);
if (host_U == NULL) {
printf("Matrix U malloc() failed, bailing out!
");
goto err;
}
size_L = M * M;
mem_size_L = sizeof(float) * size_L;
host_L = (float*) malloc(mem_size_L);
if (host_L == NULL) {
printf("Matrix L malloc() failed, bailing out!
");
goto err;
}

/*
* Initialize OpenCL.
*/

result = clGetPlatformIDs(1, &platform, NULL);
if (result != CL_SUCCESS) {
printf ("Failed getting platform ID
");
goto err;
}

result = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL,
&ndevices);
if (result != CL_SUCCESS) {
printf("Failed fetching number of devices
");
goto err;
}
fprintf(log, "Number of devices: %d
", ndevices);
result = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
if(result != CL_SUCCESS) {
printf("Failed fetching devices
");
goto err;
}

props = (cl_context_properties)platform;

ctx = clCreateContext(props, 1, &device, NULL, NULL, &result);
if(result != CL_SUCCESS) {
printf("Failed to create context
");
goto err;
}

/* fetch the list of devices associated with context */
result = clGetContextInfo(ctx, CL_CONTEXT_DEVICES, 0, NULL,
&dataBytes);
clDevices = (cl_device_id *)malloc(dataBytes);
if (clDevices == NULL) {
printf("clDevices malloc() failed!
");
goto err;
}
result |= clGetContextInfo(ctx, CL_CONTEXT_DEVICES, dataBytes,
clDevices, NULL);
if (result != CL_SUCCESS) {
printf("clGetContextInfo() failed!
");
goto err;
}

/* Create a command-queue */
queue = clCreateCommandQueue(ctx, clDevices, 0,
&result);
if (result != CL_SUCCESS) {
printf("clGetContextInfo() failed!
");
goto err;
}

/* Setup device memory */
d_L = clCreateBuffer(ctx, CL_MEM_READ_WRITE, mem_size_L, NULL, &result);
d_U = clCreateBuffer(ctx, CL_MEM_READ_WRITE, mem_size_U, NULL, &result);
d_A = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
mem_size_A, host_A, &result);

/*
* Compile and link the OpenCL kernel.
*/

/* Read-in the source code */
matrix_1_file = fopen(CROUT_CL, "rb");
if (matrix_1_file == NULL) {
printf("fopen() failed!
");
goto err;
}
fseek(matrix_1_file, 0, SEEK_END);
srcsz = ftell(matrix_1_file);
fseek(matrix_1_file, 0, SEEK_SET);
matrix_1_src = (char *)malloc(srcsz + 1);
if (matrix_1_src == NULL) {
printf("matrix_1_src malloc() failed!
");
goto err;
}
matrix_1_src[srcsz] = 0;

fprintf(log, "
FILE DUMP BEGINS
");
fprintf(log, "%s", matrix_1_src);
fprintf(log, "FILE DUMP ENDS
");

program = clCreateProgramWithSource(ctx, 1,
(const char **)&matrix_1_src, &srcsz, &result);
if (result != CL_SUCCESS) {
printf("clCreateProgamWithSource() failed!
");
goto err;
}

/* Build the kernel */
result = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
if (result != CL_SUCCESS) {
char programLog = {0};
printf("clBuildProgram() failed!
");
result = clGetProgramBuildInfo(program, device,
CL_PROGRAM_BUILD_LOG, 10000, programLog, 0);
printf("%s
", programLog);
goto err;
}

/* Fetch the kernel for the crout program */
kernel = clCreateKernel(program, "crout", &result);
if (result != CL_SUCCESS) {
printf("clCreateKernel() failed!
");
goto err;
}

/* Push arguments down the stack*/
result |= clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&d_A);
result = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&d_L);
result = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&d_U);
result |= clSetKernelArg(kernel, 3, sizeof(int), (void *)&m);
if (result != CL_SUCCESS) {
printf("clSetKernelArg() failed!
");
goto err;
}

localWorkSize = 1;
localWorkSize = 1;

globalWorkSize = M;
globalWorkSize = M;

result = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalWorkSize,
localWorkSize, 0, NULL, NULL);
if (result != CL_SUCCESS) {
printf("clEnqueueNDRangeKernel() failed!
");
goto err;
}

/*
* Results.
*/
result = clEnqueueReadBuffer(queue, d_L, CL_TRUE, 0, mem_size_L, host_L,
0, NULL, NULL);
if (result != CL_SUCCESS) {
");
goto err;
}
result = clEnqueueReadBuffer(queue, d_U, CL_TRUE, 0, mem_size_U, host_U,
0, NULL, NULL);
if (result != CL_SUCCESS) {
");
goto err;
}

/* Log the results */
fprintf(log, "

Matrix L (Results)
");
for(i = 0; i < M; i++) {
for (j = 0; j < M; j++) {
fprintf(log, "%f ", host_L[j * M + i]);
}
fprintf(log, "
");
}
fprintf(log, "
");
fprintf(log, "

Matrix U (Results)
");
for(i = 0; i < M; i++) {
for (j = 0; j < M; j++) {
fprintf(log, "%f ", host_U[j * M + i]);
}
fprintf(log, "
");
}
fprintf(log, "
");

err:
if (host_A)
free(host_A);
if (host_L)
free(host_L);
if (host_U)
free(host_U);
if (matrix_1_file)
fclose(matrix_1_file);
if (matrix_1_src)
free(matrix_1_src);
if (clDevices)
free(clDevices);
if (ctx)
clReleaseContext(ctx);
if (kernel)
clReleaseKernel(kernel);
if (program)
clReleaseProgram(program);
if (queue)
clReleaseCommandQueue(queue);
if (log)
fclose(log);

return 0;
}
``````

And here is the kernel. What I have done so far. I only done it for the L matrix because U is almost similar.

``````__kernel void crout (__global const float *a,
__global float *l, __global float *u, const int m)
{
int i = get_global_id(0);
int j = get_global_id(1);
int k=0;
float ll =0;
float uu =0;/*
for (k=0;k<m;k++){
if (i>=k && j>k){
l[i+k*m]=a[i+k*m]-l[i+(k-1)*m]*l[k*m+k];  // this line needs work.
// k desc         k desc
uu-=a[m*j+k];
}

}
/* This is just for the first two columns
if(i>=2)
l[i+2*m]=a[i+2*m]-a[i+1*m]*a[2*m+1]-a[i+0*m]*a[2*m+0];
*/
}
``````

I don’t now how to make the partial sums without a loop.

You can’t, but there’s nothing wrong with a loop in a kernel.

Yes it can be done. I found the CUDA solution here: https://smartech.gatech.edu/bitstream/handle/1853/33980/virk_bikram_j_201005_mast.pdf Page 34. But I don’t know know how to implement it in OpenCL.