After googling a bit it appears as though open source implementations of small matrix operations for OpenCL are not easy to come by.

I frequently need such functionality, so I have started with 3-by-3 matrix equations. I've tested pivoted Gaussian elimination with back substitution

```
/*
 * Solve the 3x3 linear system M*v = x in place using Gaussian elimination
 * with partial pivoting and back substitution.
 *
 * M: row-major 3x3 matrix packed in a float16 (rows at s0-s2, s4-s6, s8-sa).
 *    Destroyed (overwritten with the elimination result).
 * x: right-hand side in .x/.y/.z; on success overwritten with the solution.
 *
 * Returns false if a pivot magnitude falls below FLOAT_EPSILON (matrix
 * treated as singular); in that case M and x are left partially modified.
 */
bool matrixSolve3x3f(float16 *M, float4 *x)
{
    // Find the first pivot: the row whose first-column entry has the
    // largest MAGNITUDE. (Comparing raw signed values would skip a
    // large negative pivot and pick a near-zero one.)
    if(fabs(M->s4) > fabs(M->s0) && fabs(M->s4) > fabs(M->s8))
    {
        float4 t = {M->s0, M->s1, M->s2, x->x};
        M->s0 = M->s4; M->s1 = M->s5; M->s2 = M->s6; x->x = x->y;
        M->s4 = t.x; M->s5 = t.y; M->s6 = t.z; x->y = t.w;
    }
    else if(fabs(M->s8) > fabs(M->s0))
    {
        float4 t = {M->s0, M->s1, M->s2, x->x};
        M->s0 = M->s8; M->s1 = M->s9; M->s2 = M->sa; x->x = x->z;
        M->s8 = t.x; M->s9 = t.y; M->sa = t.z; x->z = t.w;
    }
    // fabs, not abs: OpenCL abs() is defined for integer types only.
    if(fabs(M->s0) < FLOAT_EPSILON)
        return false;
    // Eliminate column 0 from rows 1 and 2.
    float u = M->s4/M->s0;
    M->s5 -= M->s1*u; M->s6 -= M->s2*u; x->y -= x->x*u;
    u = M->s8/M->s0;
    M->s9 -= M->s1*u; M->sa -= M->s2*u; x->z -= x->x*u;
    // Find the second pivot by magnitude between rows 1 and 2.
    if(fabs(M->s5) < fabs(M->s9))
    {
        float4 t = {M->s5, M->s6, x->y, 0.0f};
        M->s5 = M->s9; M->s6 = M->sa; x->y = x->z;
        M->s9 = t.x; M->sa = t.y; x->z = t.z;
    }
    if(fabs(M->s5) < FLOAT_EPSILON)
        return false;
    // Eliminate column 1 from row 2.
    u = M->s9/M->s5;
    M->sa -= M->s6*u; x->z -= x->y*u;
    if(fabs(M->sa) < FLOAT_EPSILON)
        return false;
    // Back substitution: solve the upper-triangular system bottom-up.
    x->z /= M->sa;
    x->y = (x->y - M->s6*x->z)/M->s5;
    x->x = (x->x - M->s1*x->y - M->s2*x->z)/M->s0;
    return true;
}
```

and Cramer's rule

```
/*
 * Solve the 3x3 linear system M*v = x via Cramer's rule.
 *
 * M: row-major 3x3 matrix packed in a float16 (rows at s0-s2, s4-s6, s8-sa).
 *    Not modified.
 * x: right-hand side in .s0/.s1/.s2; on success overwritten with the solution.
 *
 * Returns false (x unmodified) when |det(M)| < FLOAT_EPSILON.
 */
bool matrixSolve3x3fKramer(const float16 *M, float4 *x)
{
    // det(M) by cofactor expansion along the first row.
    float D = M->s0*(M->s5*M->sa - M->s6*M->s9)
            - M->s1*(M->s4*M->sa - M->s6*M->s8)
            + M->s2*(M->s4*M->s9 - M->s5*M->s8);
    // fabs, not abs: OpenCL abs() is defined for integer types only.
    if(fabs(D) < FLOAT_EPSILON)
        return false;
    // D0/D1/D2: determinants of M with column 0/1/2 replaced by x.
    float D0 = x->s0*(M->s5*M->sa - M->s6*M->s9)
             - M->s1*(x->s1*M->sa - M->s6*x->s2)
             + M->s2*(x->s1*M->s9 - M->s5*x->s2);
    float D1 = M->s0*(x->s1*M->sa - M->s6*x->s2)
             - x->s0*(M->s4*M->sa - M->s6*M->s8)
             + M->s2*(M->s4*x->s2 - x->s1*M->s8);
    float D2 = M->s0*(M->s5*x->s2 - x->s1*M->s9)
             - M->s1*(M->s4*x->s2 - x->s1*M->s8)
             + x->s0*(M->s4*M->s9 - M->s5*M->s8);
    x->s0 = D0/D;
    x->s1 = D1/D;
    x->s2 = D2/D;
    return true;
}
```

I set the singularity tolerance FLOAT_EPSILON to approx 2^-32 (note this is well below the actual single-precision machine epsilon of 2^-23), and matrix multiplication was implemented as follows

```
float4 matrixMult3x3f(const float16 *M, const float4 *v)
{
return {M->s0*v->x + M->s1*v->y + M->s2*v->z, M->s4*v->x + M->s5*v->y + M->s6*v->z, M->s8*v->x + M->s9*v->y + M->sa*v->z, 0.0f};
}
```

To test the accuracy I used the following kernel (with random matrices)

```
/*
 * Accuracy test kernel: for each of the first *n work items, solve
 * A[i]*x = b[i] and write the relative residual |A[i]*x - b[i]| / |b[i]|
 * to r[i]. Writes NAN when the solver reports a singular matrix, so
 * failed solves are not mistaken for (garbage) residuals.
 */
__kernel void vecMathTest(__global const float16 *A, __global const float4 *b, __global const int *n, __global float *r)
{
    int i = get_global_id(0);
    if(i < *n)
    {
        float16 M = A[i];
        float4 x = b[i];
        // The solver's return value must not be ignored: on failure
        // x holds partial results and the residual is meaningless.
        if(!matrixSolve3x3f(&M, &x))
        {
            r[i] = NAN;
            return;
        }
        float16 T = A[i];
        float4 R = matrixMult3x3f(&T, &x) - b[i];
        r[i] = length(R)/length(b[i]);
    }
}
```

Cramer's rule appears to produce more accurate results, but I expect Gaussian elimination to be faster as it has fewer floating point operations. I haven't gotten around to timing it though.

Now I'm quite new to OpenCL, so I'd be interested in any optimization tips or improvements, as well as thoughts on whether to go for Gauss or Cramer. I'm leaning towards Cramer, but although it is more accurate for random matrices, it is not a given that it will be so in all cases.

Also, if anyone notices any bugs, I'd appreciate it if you'd point them out.