Radeon HD5850 vs 7770, 7850, 6530(APU), or 6450

A while ago, I developed a kernel that runs fine on the 5850 card.

When I had an opportunity to test it on the newer 7770 and 7850 cards, it did not work. The result was wrong: the Mandelbrot set was messed up, even though it was still somewhat recognizable.

Now I can also test it on an APU with a 6530 and on a discrete 6450 card, and it does not work there either, even after making adjustments for the lack of 64-bit support. It does not work at all: the output buffer contents are zeroed, as if an exception kept occurring in every instance. The modified kernel still works fine on the 5850.

I am using AMD OpenCL SDK 3.0.130.136-GA under both Linux and Windows.

First of all, which AMD cards support cl_khr_fp64 and cl_amd_fp64 and which cards do not?

And why would a kernel that runs fine on the 5850 fail on the 7000-series cards, which support the cl_khr_fp64 extension?

This is my original kernel:

/*
 * OpenCL C source of the double-precision Mandelbrot kernel `mbCalc`.
 *
 * Kernel arguments, in the order the host must set them:
 *   v          output buffer; v[id] receives the iteration count of work-item id
 *   _x0, _y0   offsets added to the scaled indices to form c = (creal, cimag)
 *   _sizeCbyP  scale factor applied to (id / 1000) and (id % 1000)
 *   _iterate   upper bound on the number of iterations
 *
 * The kernel uses `double`, so an fp64 extension has to be enabled before the
 * first use of the type.  cl_khr_fp64 is requested exactly as before, except
 * when the compiler defines only the vendor extension cl_amd_fp64; in that
 * case the vendor extension is enabled instead so the kernel still builds.
 *
 * The string literals are simply adjacent (no backslash continuations are
 * needed outside a macro); the compiler concatenates them.
 */
static const char *kernelSource =
"\n"
"#if !defined(cl_khr_fp64) && defined(cl_amd_fp64)\n"
"#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n"
"#else\n"
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
"#endif\n"
"__kernel void mbCalc(\n"
"    __global uint *v,\n"
"    const double _x0,\n"
"    const double _y0,\n"
"    const double _sizeCbyP,\n"
"    const uint _iterate)\n"
"{\n"
"    const uint id = get_global_id(0); /* flat global work-item index */\n"
"\n"
"    /* c is derived from the flat index with a fixed stride of 1000. */\n"
"    const double creal = _x0 + _sizeCbyP * (id / 1000);\n"
"    const double cimag = _y0 + _sizeCbyP * (id % 1000);\n"
"\n"
"    double zreal = 0.0;\n"
"    double zimag = 0.0;\n"
"    double _zreal;\n"
"    uint val = 0;\n"
"\n"
"    while (true) {\n"
"        /* Stop at the iteration cap or once either component exceeds 50. */\n"
"        if (val >= _iterate || zimag > 50.0 || zreal > 50.0)\n"
"            break;\n"
"        /* z = z * z + c; the new real part is held until zimag is updated. */\n"
"        _zreal = zreal * zreal + creal - zimag * zimag;\n"
"        zimag = 2.0 * zreal * zimag + cimag;\n"
"        zreal = _zreal;\n"
"        val++;\n"
"    }\n"
"    v[id] = val;\n"
"}\n";

And this is its 32-bit adaptation:

/*
 * OpenCL C source of the single-precision Mandelbrot kernel `mbCalc`, meant
 * for devices without 64-bit floating-point support.
 *
 * Kernel arguments, in the order the host must set them:
 *   v          output buffer; v[id] receives the iteration count of work-item id
 *   _x0, _y0   offsets added to the scaled indices to form c = (creal, cimag)
 *   _sizeCbyP  scale factor applied to (id / 1000) and (id % 1000)
 *   _iterate   upper bound on the number of iterations
 *
 * Every floating-point literal carries the `f` suffix.  Unsuffixed literals
 * such as 50.0 or 2.0 are not float-typed, so they would pull 64-bit
 * arithmetic into a kernel that has to build and behave the same on hardware
 * lacking fp64; with the suffix all arithmetic stays in 32-bit precision.
 *
 * The string literals are simply adjacent (no backslash continuations are
 * needed outside a macro); the compiler concatenates them.
 */
static const char *kernelSource32 =
"\n"
"__kernel void mbCalc(\n"
"    __global uint *v,\n"
"    const float _x0,\n"
"    const float _y0,\n"
"    const float _sizeCbyP,\n"
"    const uint _iterate)\n"
"{\n"
"    const uint id = get_global_id(0); /* flat global work-item index */\n"
"\n"
"    /* c is derived from the flat index with a fixed stride of 1000. */\n"
"    const float creal = _x0 + _sizeCbyP * (id / 1000);\n"
"    const float cimag = _y0 + _sizeCbyP * (id % 1000);\n"
"\n"
"    float zreal = 0.0f;\n"
"    float zimag = 0.0f;\n"
"    float _zreal;\n"
"    uint val = 0;\n"
"\n"
"    while (true) {\n"
"        /* Stop at the iteration cap or once either component exceeds 50. */\n"
"        if (val >= _iterate || zimag > 50.0f || zreal > 50.0f)\n"
"            break;\n"
"        /* z = z * z + c; the new real part is held until zimag is updated. */\n"
"        _zreal = zreal * zreal + creal - zimag * zimag;\n"
"        zimag = 2.0f * zreal * zimag + cimag;\n"
"        zreal = _zreal;\n"
"        val++;\n"
"    }\n"
"    v[id] = val;\n"
"}\n";