The GPU and CPU results agree only when a single thread processes the entire batch of data.
Working (serial) version of the code:
/* Reference (single-thread) version: thread 0 alone copies every raw cost
   for this pixel into the aggregated-cost buffer and tracks the minimum
   cost in pCurrCostMin[pix]; all other work-items are idle. */
if (tid == 0)
{
/* Seed the running minimum with the sentinel "bad" cost before scanning. */
pCurrCostMin[pix] = AggrCostBad;
for (int d = 0; d < dCurCount; ++d)
{
ushort Cost = pCurrCosts[d];
pCurrAggrCosts[d] = Cost;
pCurrCostMin[pix] = min(Cost, pCurrCostMin[pix]);
}
}
Non-working (parallelized) version:
/*
 * Parallelized copy of the raw costs into the aggregated-cost buffer,
 * with a running minimum kept in pCurrCostMin[pix] by thread 0.
 *
 * BUG in the original: the tail iteration overwrote `threadCount`
 * (threadCount = dCurCount - minIdx). The clamped value then survives past
 * this snippet, so any later code — or an enclosing loop over pixels that
 * re-enters this snippet — reuses the truncated tail count instead of the
 * work-group size. Fixed by computing a per-iteration `activeCount` and
 * leaving `threadCount` untouched.
 *
 * NOTE(review): two more things to verify in the enclosing kernel, since
 * either one also produces "serial works, parallel fails":
 *  - If pCurrAggrCosts / pCurrCostMin / pCurrCosts live in __global memory,
 *    the fences must be CLK_GLOBAL_MEM_FENCE — CLK_LOCAL_MEM_FENCE only
 *    orders __local memory. Confirm the buffers' address spaces.
 *  - barrier() must be reached by EVERY work-item of the group the same
 *    number of times; an earlier divergent `return` or `if` around this
 *    code makes the barriers undefined behavior. Confirm no work-item
 *    exits before reaching this point.
 */
int threadCount = get_local_size(0);
if (tid == 0)
    pCurrCostMin[pix] = AggrCostBad; /* seed the running minimum */
/* number of threadCount-sized chunks needed to cover all dCurCount disparities */
int iterCount = (dCurCount + threadCount - 1) / threadCount;
for (int iIter = 0; iIter < iterCount; iIter++)
{
    int minIdx = iIter * threadCount; /* first disparity index of this chunk */
    /* Threads with valid work in this chunk (the tail chunk may be short).
       Kept in its own variable so threadCount is NOT clobbered. */
    int activeCount = threadCount;
    if (minIdx + activeCount > dCurCount)
        activeCount = dCurCount - minIdx;
    /* parallel copy: one disparity per active work-item */
    if (tid < activeCount)
    {
        pCurrAggrCosts[minIdx + tid] = pCurrCosts[minIdx + tid];
    }
    /* serial min-reduction by thread 0 over this chunk; it reads the source
       buffer pCurrCosts, so it does not depend on the copies made above */
    if (tid == 0)
    {
        for (int d = 0; d < activeCount; ++d)
        {
            pCurrCostMin[pix] = min(pCurrCosts[minIdx + d], pCurrCostMin[pix]);
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
barrier(CLK_LOCAL_MEM_FENCE);
Please tell me: what is my mistake?