Good afternoon, everyone!

I ask for help in figuring out the reasons for changing the accuracy of calculations within the same code in one call to the kernel program.

There is a simple program to calculate the length of an arc segment on the surface of a sphere:

```
double distanceD(double point1[2], double point2[2])
{
double R = 6371110.0;
double phi1 = point1[0];
double lambda1 = point1[1];
double phi2 = point2[0];
double lambda2 = point2[1];
double delta_lambda = lambda2 - lambda1;
double p1 = sin(delta_lambda) * cos(phi2);
double p2 = cos(phi1) * sin(phi2) - sin(phi1) * cos(phi2) * cos(delta_lambda);
double q = sin(phi1) * sin(phi2) + cos(phi1) * cos(phi2) * cos(delta_lambda);
double res = abs(atan2(sqrt(p1 * p1 + p2 * p2), q) * R);
return res;
}
```

When I was testing the OpenCL port of my large program, I found that in some cases the **distanceD** function value obtained on the GPU differs from the CPU version by * 7* decimal places (i.e., I get 9 correct decimal places, followed by garbage). That is, it is more accurate than

**float**, but also far from

**double**. During some experiments I managed to make the GPU produce an absolutely accurate value, but I cannot use this “invention”.

Let me explain. Here is the code of the control program (Qt 5, CUDA 12.4, OpenCL 3.0 (auto)):

```
#include <QCoreApplication>
#include <QDebug>
#include <QFile>
#include <math.h>
#include <CL/cl.hpp>
double distanceD(double point1[2], double point2[2])
{
double R = 6371110.0;
double phi1 = point1[0];
double lambda1 = point1[1];
double phi2 = point2[0];
double lambda2 = point2[1];
double delta_lambda = lambda2 - lambda1;
double p1 = sin(delta_lambda) * cos(phi2);
double p2 = cos(phi1) * sin(phi2) - sin(phi1) * cos(phi2) * cos(delta_lambda);
double q = sin(phi1) * sin(phi2) + cos(phi1) * cos(phi2) * cos(delta_lambda);
double res = abs(atan2(sqrt(p1 * p1 + p2 * p2), q) * R);
qDebug() << Qt::scientific << qSetRealNumberPrecision(18) << "CPU p2 =" << p2;
qDebug() << Qt::scientific << qSetRealNumberPrecision(18) << "CPU res =" << res;
double p2_h1 = cos(phi1) * sin(phi2);
double p2_h2 = sin(phi1) * cos(phi2) * cos(delta_lambda);
double p2_sum = p2_h1 - p2_h2;
qDebug() << Qt::scientific << qSetRealNumberPrecision(18) << "CPU p2_h1 =" << p2_h1 ;
qDebug() << Qt::scientific << qSetRealNumberPrecision(18) << "CPU p2_h2 =" << p2_h2 ;
qDebug() << Qt::scientific << qSetRealNumberPrecision(18) << "CPU p2_sum =" << p2_sum;
return res;
}
int main(int argc, char *argv[])
{
std::vector<cl::Platform> all_platforms;
cl::Platform::get(&all_platforms);
if(all_platforms.size() == 0){
qDebug() << "No platforms found. Check OpenCL installation.";
return 1;
}
cl::Platform default_platform=all_platforms[0];
qDebug() << "Using platform: " << QString::fromStdString(default_platform.getInfo<CL_PLATFORM_NAME>());
std::vector<cl::Device> all_devices;
default_platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices);
if(all_devices.size() == 0){
qDebug() << "No devices found. Check OpenCL installation or make sure at least one compatible GPU is connected.";
return 1;
}
cl::Device default_device=all_devices[0];
qDebug() << "Using device: " << QString::fromStdString(default_device.getInfo<CL_DEVICE_NAME>());
qDebug() << "Using CL version: " << QString::fromStdString(default_device.getInfo<CL_DEVICE_VERSION>());
cl::Context context(default_device);
cl::Program::Sources sources;
QFile kernelCode1(":/CalcDist.cl");
kernelCode1.open(QFile::ReadOnly | QFile::Text);
std::string codeString = kernelCode1.readAll().toStdString();
unsigned long long codeStringSize = codeString.length();
sources.push_back({codeString.c_str(), codeStringSize});
cl::Program program(context,sources);
if(program.build({default_device})!=CL_SUCCESS){
qDebug() << " Error building: " << QString::fromStdString(program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(default_device));
return 1;
}
cl::CommandQueue queue(context,default_device);
cl::compatibility::make_kernel<> calcDist(cl::Kernel(program, "mainProgram"));
cl::EnqueueArgs eargs(queue, cl::NullRange, cl::NDRange(1), cl::NullRange);
calcDist(eargs).wait();
double p1 [2] = {0.891949793450334649, 0.513485940430910115};
double p2 [2] = {0.891949846176460226, 0.513485940430959964};
double dist = distanceD(p1, p2);
qDebug() << Qt::scientific << qSetRealNumberPrecision(18) << "CPU dist =" << dist;
return 0;
}
```

and the code for the CalcDist.cl file:

```
double distanceD(double2 point1, double2 point2);
void __kernel mainProgram()
{
long long index = get_global_id(0);
if (index == 0)
{
double2 p1 = (double2)(0.891949793450334649, 0.513485940430910115);
double2 p2 = (double2)(0.891949846176460226, 0.513485940430959964);
double dist = distanceD(p1, p2);
printf("NV dist = %.18e\n", dist);
}
}
double distanceD(double2 point1, double2 point2)
{
double R = 6371110.0;
double phi1 = point1[0];
double lambda1 = point1[1];
double phi2 = point2[0];
double lambda2 = point2[1];
double delta_lambda = lambda2 - lambda1;
// variables that eliminate duplication of trigonometric calculations
double cosphi1 = cos(phi1);
double cosphi2 = cos(phi2);
double sinphi1 = sin(phi1);
double sinphi2 = sin(phi2);
double cosdeltalambda = cos(delta_lambda);
//
double p1 = sin(delta_lambda) * cosphi2;
// one-step calculation
double p2 = cosphi1 * sinphi2 - sinphi1 * cosphi2 * cosdeltalambda;
// multi-step calculation
// double p2_h1 = cosphi1 * sinphi2;
// double p2_h2 = sinphi1 * cosphi2 * cosdeltalambda;
// double p2 = p2_h1 - p2_h2;
//
// one-step calculation
double q = sinphi1 * sinphi2 + cosphi1 * cosphi2 * cosdeltalambda;
// multi-step calculation
// double q_h1 = sinphi1 * sinphi2;
// double q_h2 = cosphi1 * cosphi2 * cosdeltalambda;
// double q = q_h1 + q_h2;
//
// one-step calculation
double res = fabs(atan2(sqrt(p1 * p1 + p2 * p2), q) * R);
// multi-step calculation
// double res_spqr = sqrt(p1 * p1 + p2 * p2);
// double res_atan2 = atan2(res_spqr, q);
// double res_atan2_R = res_atan2 * R;
// double res = fabs(res_atan2_R);
//
printf("NV p2 = %.18e\n", p2 );
printf("NV res = %.18e\n", res);
double p2_h1 = cosphi1 * sinphi2;
double p2_h2 = sinphi1 * cosphi2 * cosdeltalambda;
double p2_sum = p2_h1 - p2_h2;
double new_res = fabs(atan2(sqrt(p1 * p1 + p2_sum * p2_sum), q) * R);
printf("NV p2_h1 = %.18e\n", p2_h1 );
printf("NV p2_h2 = %.18e\n", p2_h2 );
printf("NV p2_sum = %.18e\n", p2_sum);
printf("NV new_res = %.18e\n", new_res);
return res;
}
```

If we just run this program and compare the distance values, we can see

```
CPU dist = 3.359239459231472269e-01
NV dist = 3.359239460197458449e-01
```

that only the first 8 digits match.

The main reason for this is that the value of the variable **p2** turns out to be this:

```
CPU p2 = 5.272612557671862987e-08
NV p2 = 5.272612559188061227e-08
```

and there’s a nine-digit match here.

However, if we duplicate the main calculation (inside the kernel) of the distance after the main calculation by splitting the calculation of the variable **p2** into three separate operations, then, OH WONDER, the output is as follows:

```
CPU p2_h1 = 4.886896713611054155e-01
CPU p2_h2 = 4.886896186349798388e-01
CPU p2_sum = 5.272612557671862987e-08
NV p2_h1 = 4.886896713611054155e-01
NV p2_h2 = 4.886896186349798388e-01
NV p2_sum = 5.272612557671862987e-08
```

Nothing has changed for the CPU (reasonably), but **what has happened on the GPU**?

I thought I had found the problem, so I replaced the old single-step code sections with new multi-step code sections. **HOW surprised I was** when I got eight unfortunate correct decimal places out of 15.

Hence my question - what is actually happening? Why, I don’t even know how to call it correctly, the video card doesn’t want to count normally the first time? Why does the correct calculation happen only when there is excessive duplication?

Just in case, I attach the full console output of the above program:

```
Using platform: "NVIDIA CUDA"
Using device: "NVIDIA GeForce RTX 4080"
Using CL version: "OpenCL 3.0 CUDA"
CPU p2 = 5.272612557671862987e-08
CPU res = 3.359239459231472269e-01
CPU p2_h1 = 4.886896713611054155e-01
CPU p2_h2 = 4.886896186349798388e-01
CPU p2_sum = 5.272612557671862987e-08
CPU dist = 3.359239459231472269e-01
NV p2 = 5.272612559188061227e-08
NV res = 3.359239460197458449e-01
NV p2_h1 = 4.886896713611054155e-01
NV p2_h2 = 4.886896186349798388e-01
NV p2_sum = 5.272612557671862987e-08
NV new_res = 3.359239459231471714e-01
NV dist = 3.359239460197458449e-01
```

A HUGE thank you in advance for your help!

P.S. On another forum, it was suggested that I research the answers to this question on stackoverflow. People there conclude that you need to compile the **kernel** with the parameter

```
-fmad=false
```

but such a parameter **does NOT** exist and has never existed, apparently (the first block of page 4 of the OpenCL 3.0 Reference Guide document says exactly that (as does the 600-page specification, though - it also mentions obsolete parameters, and there are none among them **-fmad=false** or similar)).

There’s a parameter

```
-cl-opt-disable
```

which, quote, “disables all code optimizations”, but it doesn’t change anything.

To make sure the compiler is even listening to what I’m saying, I tried the parameter

```
-cl-fast-relaxed-math
```

which is supposed to simplify the math, and it did simplify it - now in the second output of **p2** I also get the value rounded to 9 digits (incorrect), so the compiler applies the options. Perhaps it’s not about optimizations, since disabling them didn’t affect anything.

*P.P.S. Besides, I have to point out that in that discussion the questioner does not quite understand what he wants, because the values output by his programs match by 15 significant digits, and this is the accuracy guaranteed by the double type. In fact, the author of the question had no problems with accuracy in principle. In my case, beavers chew up 6-7 digits out of 15.*