Simple OpenCL XOR encryption problem

Hello,
I am trying to write a simple OpenCL program. The program asks for a source file (.txt), a destination path and a password. Then it splits the source text into arrays of the same size as the password and applies XOR encryption.
This is my source code:

#include<iostream>
#include<string>
#include<fstream>
#include <CL/cl.h>


// OpenCL kernel that performs the actual encryption
const char* OpenCLkernel[] = {
"__kernel void sifrovanie (__global char* file, __global char* password, __global char* output)",
"{",
"output[get_global_id(0)] = file[get_global_id(0)] ^ password[get_global_id(0)];",
"}",
};

using namespace std;

int main()
{
    char c;
    ifstream source;
    ofstream final;
    string source_fname;
    string final_fname;
        
	cout<<"Vlozte cestu ku suboru, ktory sa ma zasifrovat / path to the source file: "<<endl;
    cin>>source_fname;
    source.open(source_fname.c_str());

    cout<<endl;
    cout<<"Vlozte cestu a nazov zasifrovaného suboru / path and name of final encrypted file:  "<<endl;
    cin>>final_fname;
    final.open(final_fname.c_str());
    
    

    // Ask the user for a password and store it in an array =============================

    string password;
    cout<<"

Vlozte vase heslo o maximalnej dlzke 32 znakov: / password (max 32 characters) "<<endl;
    cin>>password;
        
    int password_length; // number of characters in the password
    password_length = password.length();
    
    while(password_length>32)
    {
        cout<<"Heslo, ktore ste vlozili ma viac ako 32 znakov, prosim vlozte nove heslo: / password is too long ";
        getline(cin,password);
        password_length = password.length();
    }

    char password_array[32];
    for(int i=0; i<password_length; i++)
    {
        password_array[i] = password[i];
    }

    char file_array[32];
    char final_array[32];
    

    while(!source.eof())
    {

    
    // Copy a chunk of the source file, the same size as the password, into the array ===========
    
    int j=0;
    while(!source.eof() && j<password_length)
    {
        source.get(c);
        file_array[j] = c;
        j++;
        if(source.eof())
            password_length = j;
    }
	
//OPENCL
  
// Query platform ID
  cl_platform_id platform;
  clGetPlatformIDs (1, &platform, NULL);

// Setup context properties
  cl_context_properties props[3];
  props[0] = (cl_context_properties)CL_CONTEXT_PLATFORM;
  props[1] = (cl_context_properties)platform;
  props[2] = (cl_context_properties)0;

// Create a context to run OpenCL on our CUDA-enabled NVIDIA GPU
  cl_context GPUContext = clCreateContextFromType(props, CL_DEVICE_TYPE_GPU,NULL, NULL, NULL);

// Get the list of GPU devices associated with this context
  size_t ParmDataBytes;
  clGetContextInfo(GPUContext, CL_CONTEXT_DEVICES, 0, NULL, &ParmDataBytes);
  cl_device_id* GPUDevices = (cl_device_id*)malloc(ParmDataBytes);
  clGetContextInfo(GPUContext, CL_CONTEXT_DEVICES, ParmDataBytes, GPUDevices, NULL);

// Create a command-queue on the first GPU device
  cl_command_queue GPUCommandQueue = clCreateCommandQueue(GPUContext, GPUDevices[0], 0, NULL);

// Allocate GPU memory for source vectors AND initialize from CPU memory
  cl_mem GPUfile_array = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_int) * password_length, file_array, NULL);
  cl_mem GPUpassword_array = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_int) * password_length, password_array, NULL);

// Allocate output memory on GPU
  cl_mem GPUfinal_array = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY,sizeof(cl_int) * password_length, NULL, NULL);

// Create OpenCL program with source code
  cl_program OpenCLProgram = clCreateProgramWithSource(GPUContext, 7, OpenCLkernel, NULL, NULL);

// Build the program (OpenCL JIT compilation)
  clBuildProgram(OpenCLProgram, 0, NULL, NULL, NULL, NULL);

// Create a handle to the compiled OpenCL function (Kernel)
  cl_kernel OpenCLsifrovanie = clCreateKernel(OpenCLProgram, "sifrovanie", NULL);

// In the next step we associate the GPU memory with the Kernel arguments
  clSetKernelArg(OpenCLsifrovanie, 0, sizeof(cl_mem), (void*)&GPUfinal_array);
  clSetKernelArg(OpenCLsifrovanie, 1, sizeof(cl_mem), (void*)&GPUfile_array);
  clSetKernelArg(OpenCLsifrovanie, 2, sizeof(cl_mem), (void*)&GPUpassword_array);

// Launch the Kernel on the GPU
  size_t WorkSize[1] = {password_length}; // one dimensional Range
  clEnqueueNDRangeKernel(GPUCommandQueue, OpenCLsifrovanie, 1, NULL, WorkSize, NULL, 0, NULL, NULL);

// Copy the output in GPU memory back to CPU memory
  char Hostfinal_array[32];
  clEnqueueReadBuffer(GPUCommandQueue, GPUfinal_array, CL_TRUE, 0, password_length * sizeof(cl_int), Hostfinal_array, 0, NULL, NULL);

// Cleanup
  free(GPUDevices);
  clReleaseKernel(OpenCLsifrovanie);
  clReleaseProgram(OpenCLProgram);
  clReleaseCommandQueue(GPUCommandQueue);
  clReleaseContext(GPUContext);
  clReleaseMemObject(GPUfile_array);
  clReleaseMemObject(GPUpassword_array);
  clReleaseMemObject(GPUfinal_array); 



    // ************************************************

    // Write the encrypted array to the output file ==========
	
    for(int k=0; k<password_length; k++)
    {
        
        final.put(Hostfinal_array[k]);
    }
	}
    
	return 0;
} 

I can compile it, but I always get an error after I type the password. Thank you for your help. Sorry for my English.

Are you compiling this on Windows? If you are using Microsoft Visual Studio, it would be good to use the debugger to find out where the crash happens.

If you are using Linux, you can use GDB or another debugger to find which line of code is causing the crash.
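
Also, almost every OpenCL call returns an error code, and it is worth checking them. For example, if clBuildProgram() fails you can print the compiler log. A minimal sketch, using the variable names from the code above:

cl_int err = clBuildProgram(OpenCLProgram, 0, NULL, NULL, NULL, NULL);
if (err != CL_SUCCESS)
{
    // query the size of the build log, then fetch and print it
    size_t logSize;
    clGetProgramBuildInfo(OpenCLProgram, GPUDevices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
    char* log = (char*)malloc(logSize);
    clGetProgramBuildInfo(OpenCLProgram, GPUDevices[0], CL_PROGRAM_BUILD_LOG, logSize, log, NULL);
    cout << "clBuildProgram failed (" << err << "):" << endl << log << endl;
    free(log);
}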

One thing I noticed on the OpenCL side: the way that kernel works assumes that the input data and the password have the same length. When you call clEnqueueNDRangeKernel you pass a WorkSize of password_length, which means that only the first password_length bytes of the file will be encrypted.
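
If you want the password to repeat over input longer than the password, one option is to cycle it inside the kernel with a modulo. This is just a sketch: it adds a fourth kernel argument (password_length) that you would also have to set with clSetKernelArg, and then you launch one work-item per input byte instead of per password byte:

__kernel void sifrovanie (__global char* file, __global char* password, __global char* output, int password_length)
{
    int i = get_global_id(0);
    // wrap around the password so the input can be longer than the password
    output[i] = file[i] ^ password[i % password_length];
}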

OK, thanks, I figured it out. In the line clCreateProgramWithSource(GPUContext, 4, OpenCLkernel, NULL, NULL); I changed the 7 to 4 and now it is OK. But there is another problem. I have made some modifications, so my algorithm now looks like this:


In the password array I have 256 integers: int password[256]

And in a loop, until end-of-file:
{
1. Take 256 chars from the file
2. Convert them to an int array
3. Now the array holds 256 integers
4.

// Query platform ID
  cl_platform_id platform;
  clGetPlatformIDs (1, &platform, NULL);

// Setup context properties
  cl_context_properties props[3];
  props[0] = (cl_context_properties)CL_CONTEXT_PLATFORM;
  props[1] = (cl_context_properties)platform;
  props[2] = (cl_context_properties)0;

// Create a context to run OpenCL on our CUDA-enabled NVIDIA GPU
  cl_context GPUContext = clCreateContextFromType(props, CL_DEVICE_TYPE_GPU,NULL, NULL, NULL);

// Get the list of GPU devices associated with this context
  size_t ParmDataBytes;
  clGetContextInfo(GPUContext, CL_CONTEXT_DEVICES, 0, NULL, &ParmDataBytes);
  cl_device_id* GPUDevices = (cl_device_id*)malloc(ParmDataBytes);
  clGetContextInfo(GPUContext, CL_CONTEXT_DEVICES, ParmDataBytes, GPUDevices, NULL);

// Create a command-queue on the first GPU device
  cl_command_queue GPUCommandQueue = clCreateCommandQueue(GPUContext, GPUDevices[0], 0, NULL);


  cl_mem GPUfile_array = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * 256, file, NULL);
  cl_mem GPUpassword_array = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * 256, password, NULL);


  cl_mem GPUfinal_array = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY,sizeof(int) * 256, NULL, NULL);

// Create OpenCL program with source code
  cl_program OpenCLProgram = clCreateProgramWithSource(GPUContext, 4, OpenCLkernel, NULL, NULL);

// Build the program (OpenCL JIT compilation)
  clBuildProgram(OpenCLProgram, 0, NULL, NULL, NULL, NULL);

// Create a handle to the compiled OpenCL function (Kernel)
  cl_kernel OpenCLsifrovanie = clCreateKernel(OpenCLProgram, "sifrovanie", NULL);

// In the next step we associate the GPU memory with the Kernel arguments
  clSetKernelArg(OpenCLsifrovanie, 0, sizeof(cl_mem), (void*)&GPUfinal_array);
  clSetKernelArg(OpenCLsifrovanie, 1, sizeof(cl_mem), (void*)&GPUfile_array);
  clSetKernelArg(OpenCLsifrovanie, 2, sizeof(cl_mem), (void*)&GPUpassword_array);


  size_t WorkSize[1] = {256}; // one-dimensional range
  clEnqueueNDRangeKernel(GPUCommandQueue, OpenCLsifrovanie, 1, NULL, WorkSize, NULL, 0, NULL, NULL);


  int enc[256];
  clEnqueueReadBuffer(GPUCommandQueue, GPUfinal_array, CL_TRUE, 0, 256*sizeof(int) , enc, 0, NULL, NULL);


  free(GPUDevices);
  clReleaseKernel(OpenCLsifrovanie);
  clReleaseProgram(OpenCLProgram);
  clReleaseCommandQueue(GPUCommandQueue);
  clReleaseContext(GPUContext);
  clReleaseMemObject(GPUfile_array);
  clReleaseMemObject(GPUpassword_array);
  clReleaseMemObject(GPUfinal_array); 


5. Put the results into the file
  for(int k=0; k<256; k++)
    {   
		
		final<<enc[k]<<" ";
		
    }


So I take the first 256 chars, send them to the kernel, encrypt and print the results; take another 256 chars, send them to the kernel, encrypt… and so on until the end of the file.
It works, but very slowly: it takes 8 minutes to encrypt a 480 KB txt file (plain C++ encryption takes only 10 s). The kernel is the same as above. Where could the problem be?

So I take the first 256 chars, send them to the kernel, encrypt and print the results

That is the reason it’s so slow. Send all the data in a single batch and you will see better performance.

Also, it is enough to call clGetPlatformIDs(), clCreateContextFromType(), clCreateBuffer(), clBuildProgram(), etc. only once. You don’t need to execute those functions inside the loop.
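
As a sketch of what I mean (error checking omitted; fileData, fileSize, inBuf and outBuf are illustrative names, not from your code):

// read the entire file into one host buffer (needs #include <iterator>)
std::string fileData((std::istreambuf_iterator<char>(source)), std::istreambuf_iterator<char>());
size_t fileSize = fileData.size();

// context, queue, program and kernel have all been created ONCE before this point

// one input buffer and one output buffer covering the whole file
cl_mem inBuf  = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, fileSize, (void*)fileData.data(), NULL);
cl_mem outBuf = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY, fileSize, NULL, NULL);

// set the kernel arguments as before, then launch once over every byte
size_t WorkSize[1] = { fileSize };
clEnqueueNDRangeKernel(GPUCommandQueue, OpenCLsifrovanie, 1, NULL, WorkSize, NULL, 0, NULL, NULL);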

Thank you david.garcia, now it takes only a few seconds. Now I am working on the XTEA algorithm.
This is the kernel:

const char* OpenCLkernel[] = {
"__kernel void XTEA (__global unsigned long* f, __global unsigned long* v, __global unsigned long* k)",
"{int n = get_global_id(0);",
"unsigned long v0, v1, sum=0, delta=0x9E3779B9;",
"v0=v[n];",
"v1=v[n+1];",
"for (int i=0; i < 32; i++) {",
"v0 += (((v1 << 4) ^ (v1 >> 5)) + v1) ^ (sum + k[sum & 3]);",
"sum += delta;",
"v1 += (((v0 << 4) ^ (v0 >> 5)) + v0) ^ (sum + k[(sum>>11) & 3]);}",
"f[n]=v0; f[n+1]=v1;",
"}",
};

v is the array of source unsigned longs (e.g. 64 ulongs), k is an array of 4 unsigned longs (the key), and f is the final encrypted array.
The kernel should take two ulongs, v[n] and v[n+1], put them in v0 and v1, do the encryption and then put the results in f[n] and f[n+1]. Then it takes another 2 ulongs, puts them in v0 and v1, encrypts, and so on.
I have already implemented this encryption in C/C++ and it works fine, but the OpenCL kernel results are different from the C/C++ results. The kernel gives different ulongs, so after decryption I do not get the original source ulongs.
I do not know where the problem is.

Thank you david.garcia, now it takes only a few seconds.

That’s great :) Congratulations.

But OpenCL kernel results are different from C/C++ results.

I’m not familiar with XTEA, but it looks like there’s a problem with the way you read the input data and produce output data. This is where I see the problem:


__kernel void XTEA (__global unsigned long* f,__global unsigned long* v, __global unsigned long* k)
{
    int n = get_global_id(0);
    unsigned long v0, v1;
    v0=v[n];
    v1=v[n+1];
    ...
    f[n]=v0;
    f[n+1]=v1;
}

Notice that each work-item reads two 64-bit integers v0 and v1. Now take a piece of paper and draw the contents of buffer v. It will look something like this:

v: [v0, v1, v2, v3, v4, v5, ...]

Then look at the kernel and simulate what each work-item does. In particular, see which pieces of v each work-item is reading. Then you will see the problem :)

The reason I don’t tell you the answer directly is that you will learn more if I only give you some ideas. Let me know if you need more help :) You are doing great so far.

OK, according to your hint I wrote something like this:
1st work-item works with v0 and v1
2nd work-item works with v1 and v2 (but I need v2 and v3)
3rd work-item works with v2 and v3 (I need v4 and v5)
Am I correct?

If so, I can split the v array in two, like:
a {v0, v2, v4…}
b {v1, v3, v5…}

and then in kernel:
__kernel void XTEA (__global unsigned long* a, __global unsigned long* b, __global unsigned long* f, __global unsigned long* g)
{
int n = get_global_id(0);
unsigned long v0, v1;
v0=a[n];
v1=b[n];

f[n]=v0;
g[n]=v1;
}

Is there another way to solve this?

OK, according to your hint I wrote something like this:
1st work-item works with v0 and v1
2nd work-item works with v1 and v2 (but I need v2 and v3)
3rd work-item works with v2 and v3 (I need v4 and v5)
Am I correct?

Yes, that’s right :)

What about this?

__kernel void XTEA (__global unsigned long* f,__global unsigned long* v, __global unsigned long* k)
{
    int n = get_global_id(0);
    unsigned long v0, v1;
    v0=v[2*n];
    v1=v[2*n+1];
    ...
    f[2*n]=v0;
    f[2*n+1]=v1;
}

Would that work? How many work-items do you need now?
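
For reference, the host-side launch would then look something like this (a sketch; total_ulongs is a placeholder for the number of ulongs you read from the file, and OpenCLXTEA for your kernel handle):

size_t WorkSize[1] = { total_ulongs / 2 }; // one work-item per v0/v1 pair
clEnqueueNDRangeKernel(GPUCommandQueue, OpenCLXTEA, 1, NULL, WorkSize, NULL, 0, NULL, NULL);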

Yes, it is working, thank you. But there is another problem with this encryption and I cannot find what is wrong:
In my program I read the chars from a text file, convert them to ulongs, and then encrypt with the kernel above. If the text file is smaller than 340 KB (so the source array for the kernel has about 87,000 ulongs), everything is perfect: I can encrypt and decrypt with no problem. But if the source text file exceeds 340 KB, the result from the kernel is just one repeating ulong value. Any ideas?

Have you checked whether any of the OpenCL API calls returns an error code? Also, have you tried passing a pfn_notify function pointer when you call clCreateContext()? Some errors are only reported through that callback.
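
A minimal sketch of both ideas (the callback signature is the standard one from the OpenCL headers):

// context error callback: some runtime errors are only delivered here
void CL_CALLBACK contextNotify(const char* errinfo, const void* private_info, size_t cb, void* user_data)
{
    cout << "OpenCL context error: " << errinfo << endl;
}

// ...

cl_int err;
cl_context GPUContext = clCreateContextFromType(props, CL_DEVICE_TYPE_GPU, contextNotify, NULL, &err);
if (err != CL_SUCCESS)
    cout << "clCreateContextFromType failed with error " << err << endl;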

clEnqueueReadBuffer returns error -5, which should be CL_OUT_OF_RESOURCES. I tried the new NVIDIA drivers (270.61); with those I got error -36 and a black screen, and after several seconds the VGA driver restarted. So I returned to the old drivers (260.99).

In the kernel above, "k" is an array of 4 ulongs, the same for every work-item. I declare it directly inside the kernel:

__kernel void XTEA (__global unsigned long* f,__global unsigned long* v)
{int n = get_global_id(0);
unsigned long v0, v1, k[4] = {0x4E8E7829, 0xC88BA95E, 0xB84E28AF, 0xA0A47295};
.
.
}

Is it possible that the VGA runs out of memory when I declare this array inside the kernel, and that this is why I get the -5 error?

So I have tried to pass this array in from the main program instead, but I do not know how to do it.

__kernel void XTEA (__global unsigned long* f,__global unsigned long* v, unsigned long k[4])
{int n = get_global_id(0);
unsigned long v0, v1;
.
.
}

and

clSetKernelArg(OpenCLencryption, 2, sizeof(unsigned long)*4, (void*)&k);

but I get an error at clCreateProgramWithSource.

Is it possible that the VGA runs out of memory when I declare this array inside the kernel, and that this is why I get the -5 error?

I don’t think that would be the problem. It’s a very small array.

but I get an error at clCreateProgramWithSource.

If you want to try that, declare the kernel argument as __global unsigned long* k, then create a new small buffer object "myKey" of size sizeof(cl_ulong)*4 using clCreateBuffer(), initialize it with clEnqueueWriteBuffer() and finally call


clSetKernelArg(OpenCLencryption, 2, sizeof(cl_mem), (void*)&myKey);
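
Putting it together, something like this (a sketch; error checking omitted, and the key values are the ones from your kernel above):

// kernel signature becomes:
// __kernel void XTEA (__global unsigned long* f, __global unsigned long* v, __global unsigned long* k)

cl_ulong key[4] = {0x4E8E7829, 0xC88BA95E, 0xB84E28AF, 0xA0A47295};

cl_mem myKey = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY, sizeof(cl_ulong) * 4, NULL, NULL);
clEnqueueWriteBuffer(GPUCommandQueue, myKey, CL_TRUE, 0, sizeof(cl_ulong) * 4, key, 0, NULL, NULL);
clSetKernelArg(OpenCLencryption, 2, sizeof(cl_mem), (void*)&myKey);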