/*
    Structure from Motion with Deferred Feature Matching and Subset Bundle Adjustment
    Copyright (C) 2015 Andreas Ley <andy-ley@arcor.de>

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

#undef _GLIBCXX_USE_INT128

#include <stdint.h>

// Source image pyramid. Texels are 8-bit RGBA and are returned as normalized floats
// (cudaReadModeNormalizedFloat); the kernels in this file sample it with tex2DLod.
texture<uchar4, 2, cudaReadModeNormalizedFloat> sourceImagePyramid;
// Destination surface of findOrientedBlobs: one float score is written per pixel.
surface<void, 2> orientedBlobScoreOutput;


#if 0

// 4x4 filter weights for the disabled block-per-pixel findOrientedBlobs variant.
// Every row follows a (-, +, +, -) profile; the kernel reads the table both row-major and
// transposed to obtain two orthogonal filter responses.
__constant__ float blobbResponse[4*4] = {
-0.5f, 0.5f, 0.5f, -0.5f,
-1.0f, 1.0f, 1.0f, -1.0f,
-1.0f, 1.0f, 1.0f, -1.0f,
-0.5f, 0.5f, 0.5f, -0.5f
};


// Disabled (compiled out by the surrounding #if 0) variant of findOrientedBlobs.
// One block handles one pixel (x = blockIdx.x, y = yOffset + blockIdx.y); the indexing
// (fullIndex = threadIdx.y*4 + threadIdx.x, 16-element shared arrays) implies a 4x4 thread block.
// The score written to orientedBlobScoreOutput is
//     (max over 5 scales of filterX * filterY)^2 * squared gradient strength at lod + 2.
extern "C" __global__ void findOrientedBlobs(unsigned lod, float scaleX, float scaleY, unsigned yOffset)
{
    const unsigned fullIndex = threadIdx.y*4+threadIdx.x;
    unsigned x = blockIdx.x;
    unsigned y = yOffset+blockIdx.y;

    __shared__ float accuX[4*4];
    __shared__ float accuY[4*4];
    __shared__ float gradScore;
    __shared__ float maxBlobResponse;

    // The first four threads sample the four diagonal neighbours (+-2 pixels) two levels up the pyramid.
    if (fullIndex < 4) {
        int offset1 = -1 + (int)(2*(threadIdx.x&1));
        int offset2 = -1 + (int)((threadIdx.x&2));

        float4 srcColor = tex2DLod(sourceImagePyramid, ((int)x + offset1*2) * scaleX, ((int)y + offset2*2) * scaleY, lod + 2.0f);
        accuX[threadIdx.x] = (srcColor.x+srcColor.y+srcColor.z)*srcColor.w;
    }
    __syncthreads();
    // Thread 0 turns the four samples into a squared gradient magnitude and resets the running maximum.
    // NOTE(review): there is no barrier between these reads of accuX[0..3] and the writes to accuX in
    // the loop below; this appears to rely on all 16 threads running in one warp in lockstep - confirm.
    if (fullIndex == 0) {
        float dx = (accuX[1] + accuX[3]) - (accuX[0] + accuX[2]);
        float dy = (accuX[2] + accuX[3]) - (accuX[0] + accuX[1]);
        float sqrGradStrength = dx*dx+dy*dy;

        gradScore = sqrGradStrength;
        maxBlobResponse = 0.0f;
    }

    //if (gradScore > 1e-10f)
        {

        // Evaluate the 4x4 filter at five scales in [1.0, 1.8].
        for (unsigned scaleIter = 0; scaleIter < 5; scaleIter++) {
            float scale = 1.0f + scaleIter / 5.0f;

            float du = threadIdx.x - 1.5f;
            float dv = threadIdx.y - 1.5f;

            float u = (x + du * scale) * scaleX;
            float v = (y + dv * scale) * scaleY;

            float4 srcColor = tex2DLod(sourceImagePyramid, u, v, lod);
            float value = (srcColor.x+srcColor.y+srcColor.z)*srcColor.w;
            // Row-major and transposed lookups give the horizontal and vertical filter terms.
            accuX[fullIndex] = value * blobbResponse[threadIdx.x + threadIdx.y*4];
            accuY[fullIndex] = value * blobbResponse[threadIdx.x*4 + threadIdx.y];

            // Shared-memory tree reduction of both filter sums over the 16 threads.
            for (unsigned i = 4*2; i > 0; i/=2) {
                __syncthreads();
                if (fullIndex < i) {
                    accuX[fullIndex] += accuX[fullIndex + i];
                    accuY[fullIndex] += accuY[fullIndex + i];
                }
            }

            __syncthreads();

            if (fullIndex == 0) {
                maxBlobResponse = fmax(maxBlobResponse, (accuX[0]*accuY[0]));
            }

            __syncthreads();
        }
    }

    if (fullIndex == 0) {
        float score = maxBlobResponse*maxBlobResponse * gradScore;
        //float score = maxBlobResponse*0.1f;
        // surf2Dwrite takes the x coordinate in bytes, hence x * 4 for a float element.
        surf2Dwrite(score, orientedBlobScoreOutput, x * 4, y);
    }
}

#else

// 6-tap radial filter profile applied along each sampled orientation in findOrientedBlobs.
// The taps sum to zero, so a constant-luminance neighbourhood yields a zero response.
__constant__ float blobbResponse[6] = {
-0.25f, -0.75f, 1.0f, 1.0f, -0.75f, -0.25f
};


// Warp shuffles: the mask-less __shfl/__shfl_xor intrinsics are deprecated since CUDA 9 and are not
// available when compiling for compute capability 7.0 and newer. Use the *_sync variants whenever the
// toolkit provides them and fall back to the legacy spelling on older toolkits.
#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
#define FIND_BLOBS_SHFL(value, srcLane)      __shfl_sync(0xffffffffu, (value), (srcLane), 32)
#define FIND_BLOBS_SHFL_XOR(value, laneMask) __shfl_xor_sync(0xffffffffu, (value), (laneMask), 32)
#else
#define FIND_BLOBS_SHFL(value, srcLane)      __shfl((value), (srcLane), 32)
#define FIND_BLOBS_SHFL_XOR(value, laneMask) __shfl_xor((value), (laneMask), 32)
#endif

/*
 * Computes an oriented-blob score for every pixel of one pyramid level and writes it as a float to
 * orientedBlobScoreOutput.
 *
 * Launch layout implied by the indexing: blockDim = (32, 4, 1). threadIdx.y selects one of four
 * horizontally adjacent pixels handled by the block (x = blockIdx.x*4 + threadIdx.y), threadIdx.x
 * selects one of 32 sampling orientations in [0, pi). Each pixel is therefore processed by exactly one
 * warp, which is what makes the early-out below and the full-mask shuffles safe.
 *
 * Score = (max over 4 scales of |product over the 32 orientations of 10 * radial filter response|)
 *         * squared gradient strength measured two pyramid levels up.
 *
 * lod             pyramid level that is sampled for the filter responses
 * scaleX, scaleY  factors applied to pixel coordinates before sampling the texture
 * yOffset         row offset added to blockIdx.y
 * width, height   bounds; pixels with x >= width or y >= height are skipped
 */
extern "C" __global__ void
__launch_bounds__(128, 16)
findOrientedBlobs(unsigned lod, float scaleX, float scaleY, unsigned yOffset, unsigned width, unsigned height)
{
    unsigned x = blockIdx.x * 4 + threadIdx.y;
    unsigned y = yOffset+blockIdx.y;

    // x and y only depend on threadIdx.y and blockIdx, so a whole warp leaves together.
    if ((x >= width) || (y >= height))
        return;

    // One slot per pixel/warp; each slot is only ever touched by lane 0 of its own warp.
    __shared__ float gradScore[4];
    __shared__ float maxBlobResponse[4];


    {
        // Initialised so that lanes >= 4 do not feed an indeterminate value into the shuffles.
        float lum = 0.0f;

        // Lanes 0..3 sample the four diagonal neighbours (+-2 pixels) at lod + 2.
        if (threadIdx.x < 4) {
            int offset1 = -1 + (int)(2*(threadIdx.x&1));
            int offset2 = -1 + (int)((threadIdx.x&2));

            float4 srcColor = tex2DLod(sourceImagePyramid, ((int)x + offset1*2) * scaleX, ((int)y + offset2*2) * scaleY, lod + 2.0f);
            lum = (srcColor.x+srcColor.y+srcColor.z)*srcColor.w;
        }

        float lum0 = FIND_BLOBS_SHFL(lum, 0);
        float lum1 = FIND_BLOBS_SHFL(lum, 1);
        float lum2 = FIND_BLOBS_SHFL(lum, 2);
        float lum3 = FIND_BLOBS_SHFL(lum, 3);
        if (threadIdx.x == 0) {
            float dx = (lum1 + lum3) - (lum0 + lum2);
            float dy = (lum2 + lum3) - (lum0 + lum1);
            float sqrGradStrength = dx*dx+dy*dy;

            gradScore[threadIdx.y] = sqrGradStrength;
            maxBlobResponse[threadIdx.y] = 0.0f;
        }
    }

    {
        // Each lane owns one sampling direction.
        float alpha = threadIdx.x * (float)M_PI / 32.0f;

        float cosAlpha = cosf(alpha);
        float sinAlpha = sinf(alpha);

        // Four scales in [1.0, 1.75].
        for (unsigned scaleIter = 0; scaleIter < 4; scaleIter++) {
            float scale = 1.0f + scaleIter / 4.0f;

            float sum = 0.0f;

            // Six taps along the lane's direction at radii -1.25 .. +1.25 (times scale).
            for (unsigned ir = 0; ir < 6; ir++) {
                float r = (ir - 2.5f) * 0.5f;

                float u = (x + cosAlpha * r * scale) * scaleX;
                float v = (y + sinAlpha * r * scale) * scaleY;

                float4 srcColor = tex2DLod(sourceImagePyramid, u, v, lod);
                float value = (srcColor.x+srcColor.y+srcColor.z)*srcColor.w;
                sum += value * blobbResponse[ir];
            }

            sum = sum * 10.0f;

            // Butterfly reduction; note that the per-orientation responses are multiplied, not added.
            #pragma unroll
            for (int i=16; i>=1; i/=2)
                sum *= FIND_BLOBS_SHFL_XOR(sum, i);


            if (threadIdx.x == 0) {
                maxBlobResponse[threadIdx.y] = fmaxf(maxBlobResponse[threadIdx.y], fabsf(sum));
            }
        }
    }

    if (threadIdx.x == 0) {
        float score = maxBlobResponse[threadIdx.y] * gradScore[threadIdx.y];
        //float score = maxBlobResponse*0.1f;
        // surf2Dwrite takes the x coordinate in bytes, hence x * 4 for a float element.
        surf2Dwrite(score, orientedBlobScoreOutput, x * 4, y);
    }
}

#undef FIND_BLOBS_SHFL
#undef FIND_BLOBS_SHFL_XOR

#endif

// Float score pyramid sampled by nonMaximumSuppress.
// NOTE(review): presumably holds the scores written by findOrientedBlobs - confirm on the host side.
texture<float, 2, cudaReadModeElementType> stage1OutputPyramid;

// Warp shuffles: the mask-less __shfl_xor intrinsic is deprecated since CUDA 9 and is not available
// when compiling for compute capability 7.0 and newer. Use the *_sync variant whenever the toolkit
// provides it and fall back to the legacy spelling on older toolkits.
#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
#define NMS_SHFL_XOR(value, laneMask) __shfl_xor_sync(0xffffffffu, (value), (laneMask), 32)
#else
#define NMS_SHFL_XOR(value, laneMask) __shfl_xor((value), (laneMask), 32)
#endif

// Samples the score at pixel (x + dx, y + dy) of level `lod` and subtracts a distance penalty:
// neighbours farther than 5 pixels away are reduced by squaredDistance * slackFactor.
static __device__ __forceinline__ float nmsPenalizedNeighbourScore(unsigned x, unsigned y, int dx, int dy,
                                                                   float scaleX, float scaleY, unsigned lod,
                                                                   float slackFactor)
{
    const int sqrDistance = dx*dx+dy*dy;
    const float slack = sqrDistance>5*5?sqrDistance*slackFactor:0.0f;

    const float neighbourScore = tex2DLod(stage1OutputPyramid, (x+0.5f +dx) * scaleX,
                                                               (y+0.5f +dy) * scaleY, lod);
    return neighbourScore - slack;
}

/*
 * Non-maximum suppression of the score pyramid within one level.
 *
 * Launch layout implied by the indexing: one block per pixel (x = blockIdx.x, y = blockIdx.y, the grid
 * dimensions double as the image bounds) and 8x8 threads per block. A pixel becomes a candidate when
 * its score is at least minThresh and strictly greater than every (distance-penalised) score in the
 * surrounding 16x16 neighbourhood. Pixels closer than 4 to the grid border are skipped.
 *
 * candidateData layout: word 0 is an atomic counter of accepted candidates (it keeps counting past
 * maxEntries, so the host has to clamp it); candidate i occupies words 2*i+1 (x | y << 16) and
 * 2*i+2 (lod).
 *
 * maxlod is not used by the active code path. Suppression against the neighbouring pyramid levels
 * (lod - 1 and lod + 1) was prototyped here earlier and is currently disabled; the parameter is kept
 * so the kernel signature stays stable.
 */
extern "C" __global__ void nonMaximumSuppress(unsigned lod, unsigned maxlod, float scaleX, float scaleY, float minThresh, uint32_t *candidateData, unsigned maxEntries)
{
    const unsigned fullIndex = threadIdx.y*8+threadIdx.x;
    unsigned x = blockIdx.x;
    unsigned y = blockIdx.y;

    // Block-uniform early-out, so it is safe with respect to the barriers below.
    if ((x < 4) || (y < 4) || (x+4 >= gridDim.x) || (y+4 >= gridDim.y))
        return;


    __shared__ float centralScore;
    if (fullIndex == 0) {
        centralScore = tex2DLod(stage1OutputPyramid, (x+0.5f) * scaleX, (y+0.5f) * scaleY, lod);
    }

    __syncthreads();

    // Block-uniform as well: every thread reads the same shared value.
    if (centralScore < minThresh)
        return;

    __shared__ float scoreDiffs[8*8];

    const float slackFactor = 0.01f * centralScore;

    {
        const int tx = (int)threadIdx.x;
        const int ty = (int)threadIdx.y;

        // Four 8x8 tiles arranged as a pinwheel around the centre pixel. Together they cover
        // dx, dy in [-8, 8] except the centre itself, one neighbour per thread and tile.
        float maxOtherScore = nmsPenalizedNeighbourScore(x, y, -8 + tx, -7 + ty, scaleX, scaleY, lod, slackFactor);
        maxOtherScore = fmaxf(maxOtherScore,
                              nmsPenalizedNeighbourScore(x, y, -0 + tx, -8 + ty, scaleX, scaleY, lod, slackFactor));
        maxOtherScore = fmaxf(maxOtherScore,
                              nmsPenalizedNeighbourScore(x, y, -7 + tx, +1 + ty, scaleX, scaleY, lod, slackFactor));
        maxOtherScore = fmaxf(maxOtherScore,
                              nmsPenalizedNeighbourScore(x, y, +1 + tx,  0 + ty, scaleX, scaleY, lod, slackFactor));

        // Block-wide maximum: fold the second warp onto the first through shared memory, then finish
        // inside warp 0 with a butterfly shuffle (all 32 lanes of warp 0 take this branch).
        scoreDiffs[fullIndex] = maxOtherScore;
        __syncthreads();
        if (fullIndex < 32) {
            float maximum = fmaxf(scoreDiffs[fullIndex], scoreDiffs[fullIndex + 32]);

            #pragma unroll
            for (int i=16; i>=1; i/=2)
                maximum = fmaxf(maximum, NMS_SHFL_XOR(maximum, i));

            if (fullIndex == 0)
                scoreDiffs[0] = maximum;
        }
        __syncthreads();

        // Ties are rejected: the centre has to be strictly larger than all neighbours.
        if (scoreDiffs[0] >= centralScore)
            return;
    }

    if (fullIndex == 0) {
        unsigned index = atomicAdd(candidateData, 1);
        if (index < maxEntries) {
            candidateData[index*2+1] = x | (y << 16);
            candidateData[index*2+1+1] = lod;
        }
    }

}

#undef NMS_SHFL_XOR


// Per-candidate output record of the patchExtraction kernel. The struct is packed, so the layout is
// exactly five floats followed by 16*16 32-bit words with no padding.
// NOTE(review): field meanings below follow the compiled-out patchExtraction variant in this file;
// confirm against the active kernel.
struct PatchExtractionPatches {
    // Feature score.
    float score;
    // Refined feature position in pixels of the candidate's pyramid level.
    float x;
    float y;
    // Patch orientation in radians.
    float angle;
    // Fractional pyramid level of the feature.
    float lod;
    // 16x16 patch, one packed 32-bit word per texel.
    uint32_t data[16*16];
//    float covarMat[3];
    //float debugPath[10*4];
} __attribute__((packed));


// 8x8 filter weights used by evalPatchLocationScale, which reads the table both row-major and
// transposed. Three alternative profiles are kept; only the first one is compiled.
#if 1
// Active variant: (-, -, +, +, +, +, -, -) column profile, rows weighted 0.1 / 0.3 / 0.8 / 1.0 and
// mirrored. Every row sums to zero.
__constant__ float blobbResponseBig[8*8] = {
-0.1f, -0.1f, 0.1f, 0.1f, 0.1f, 0.1f, -0.1f, -0.1f,
-0.3f, -0.3f, 0.3f, 0.3f, 0.3f, 0.3f, -0.3f, -0.3f,
-0.8f, -0.8f, 0.8f, 0.8f, 0.8f, 0.8f, -0.8f, -0.8f,
-1.0f, -1.0f, 1.0f, 1.0f, 1.0f, 1.0f, -1.0f, -1.0f,
-1.0f, -1.0f, 1.0f, 1.0f, 1.0f, 1.0f, -1.0f, -1.0f,
-0.8f, -0.8f, 0.8f, 0.8f, 0.8f, 0.8f, -0.8f, -0.8f,
-0.3f, -0.3f, 0.3f, 0.3f, 0.3f, 0.3f, -0.3f, -0.3f,
-0.1f, -0.1f, 0.1f, 0.1f, 0.1f, 0.1f, -0.1f, -0.1f
};
#elif 1
// Inactive variant (never selected because the first branch is taken): six positive inner columns
// balanced by a weight of -3 on each border column.
__constant__ float blobbResponseBig[8*8] = {
-3.0f*0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, -3.0f*0.1f,
-3.0f*0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, -3.0f*0.3f,
-3.0f*0.8f, 0.8f, 0.8f, 0.8f, 0.8f, 0.8f, 0.8f, -3.0f*0.8f,
-3.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, -3.0f,
-3.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, -3.0f,
-3.0f*0.8f, 0.8f, 0.8f, 0.8f, 0.8f, 0.8f, 0.8f, -3.0f*0.8f,
-3.0f*0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, -3.0f*0.3f,
-3.0f*0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, -3.0f*0.1f
};
#else
// Inactive variant: two positive centre columns with negative weights (-0.7, -0.2, -0.1) falling
// off towards the borders.
__constant__ float blobbResponseBig[8*8] = {
-0.1f*0.1f, -0.2f*0.1f, -0.7f*0.1f, 0.1f, 0.1f, -0.7f*0.1f, -0.2f*0.1f, -0.1f*0.1f,
-0.1f*0.3f, -0.2f*0.3f, -0.7f*0.3f, 0.3f, 0.3f, -0.7f*0.3f, -0.2f*0.3f, -0.1f*0.3f,
-0.1f*0.8f, -0.2f*0.8f, -0.7f*0.8f, 0.8f, 0.8f, -0.7f*0.8f, -0.2f*0.8f, -0.1f*0.8f,
-0.1f, -0.2f, -0.7f, 1.0f, 1.0f, -0.7f, -0.2f, -0.1f,
-0.1f, -0.2f, -0.7f, 1.0f, 1.0f, -0.7f, -0.2f, -0.1f,
-0.1f*0.8f, -0.2f*0.8f, -0.7f*0.8f, 0.8f, 0.8f, -0.7f*0.8f, -0.2f*0.8f, -0.1f*0.8f,
-0.1f*0.3f, -0.2f*0.3f, -0.7f*0.3f, 0.3f, 0.3f, -0.7f*0.3f, -0.2f*0.3f, -0.1f*0.3f,
-0.1f*0.1f, -0.2f*0.1f, -0.7f*0.1f, 0.1f, 0.1f, -0.7f*0.1f, -0.2f*0.1f, -0.1f*0.1f,
};
#endif

/*
 * Evaluates the 8x8 blob filter for a patch centred at (x, y), rotated by the given angle and
 * stretched by `scale`, and stores |filterX * filterY| in dst.
 *
 * Must be called by all threads of an 8x8 block (it contains block-wide barriers). accuX and accuY are
 * scratch arrays with at least 64 elements that every thread of the block can access. Only the thread
 * with fullIndex == 0 writes dst; the trailing barrier orders that write before the function returns.
 */
__device__ void evalPatchLocationScale(float &scaleX, float &scaleY,
                                       unsigned &lod,
                                       float &cosAngle, float &sinAngle,
                                       float x, float y, float scale,
                                       float *accuX, float *accuY,
                                       float &dst, const unsigned fullIndex)
{
    // Position of this thread's tap in the unrotated patch frame.
    const float u = (threadIdx.x - 3.5f) * scale;
    const float v = (threadIdx.y - 3.5f) * scale;

    // Rotate into image space and convert to texture coordinates.
    const float texU = (x + cosAngle * u - sinAngle * v) * scaleX;
    const float texV = (y + sinAngle * u + cosAngle * v) * scaleY;
    const float4 texel = tex2DLod(sourceImagePyramid, texU, texV, lod);

    const float luminance = (texel.x+texel.y+texel.z)*texel.w;

    // The same table read row-major and transposed yields the two orthogonal filters.
    accuX[fullIndex] = luminance * blobbResponseBig[threadIdx.x + threadIdx.y*8];
    accuY[fullIndex] = luminance * blobbResponseBig[threadIdx.x*8 + threadIdx.y];

    // Tree reduction of both sums across the 64 threads.
    unsigned stride = 8*4;
    while (stride > 0) {
        __syncthreads();
        if (fullIndex < stride) {
            accuX[fullIndex] += accuX[fullIndex + stride];
            accuY[fullIndex] += accuY[fullIndex + stride];
        }
        stride /= 2;
    }

    __syncthreads();

    if (fullIndex == 0)
        dst = fabs(accuX[0]*accuY[0]);

    __syncthreads();
}


/*
 * Estimates the dominant orientation of the patch at (currentX, currentY).
 *
 * Each of the 64 threads probes one direction in [0, pi): it takes the luminance difference between
 * two samples on the positive side (radii 7 and 5, times currentScale) and the two mirrored samples on
 * the negative side, all at lod + 1.2. The differences are accumulated as vectors along the probed
 * direction, and the angle of the summed vector becomes dstAngle (with its cosine and sine in
 * dstCosAngle / dstSinAngle).
 *
 * Must be called by all threads of an 8x8 block (it contains block-wide barriers). accuX and accuY are
 * scratch arrays with at least 64 elements. Only the thread with fullIndex == 0 writes the outputs;
 * the trailing barrier orders those writes before the function returns.
 */
__device__ void evalPatchAngle(float &scaleX, float &scaleY,
                               float &currentX, float &currentY, float &currentScale,
                               unsigned &lod,
                               float *accuX, float *accuY,
                               float &dstAngle, float &dstCosAngle, float &dstSinAngle,
                               const unsigned fullIndex)
{
    {
        const float probeAngle = fullIndex * (float)M_PI / (8*8);
        const float cosAngle = cosf(probeAngle);
        const float sinAngle = sinf(probeAngle);
        const float sampleLod = lod + 1.2f;

        float4 tap;

        // Two taps on the positive side of the probe direction ...
        tap = tex2DLod(sourceImagePyramid,
                       (currentX + 7.0f * cosAngle * currentScale) * scaleX,
                       (currentY + 7.0f * sinAngle * currentScale) * scaleY,
                       sampleLod);
        float diff = (tap.x+tap.y+tap.z)*tap.w;

        tap = tex2DLod(sourceImagePyramid,
                       (currentX + 5.0f * cosAngle * currentScale) * scaleX,
                       (currentY + 5.0f * sinAngle * currentScale) * scaleY,
                       sampleLod);
        diff += (tap.x+tap.y+tap.z)*tap.w;

        // ... minus the two mirrored taps on the negative side.
        tap = tex2DLod(sourceImagePyramid,
                       (currentX - 7.0f * cosAngle * currentScale) * scaleX,
                       (currentY - 7.0f * sinAngle * currentScale) * scaleY,
                       sampleLod);
        diff -= (tap.x+tap.y+tap.z)*tap.w;

        tap = tex2DLod(sourceImagePyramid,
                       (currentX - 5.0f * cosAngle * currentScale) * scaleX,
                       (currentY - 5.0f * sinAngle * currentScale) * scaleY,
                       sampleLod);
        diff -= (tap.x+tap.y+tap.z)*tap.w;

        accuX[fullIndex] = diff * cosAngle;
        accuY[fullIndex] = diff * sinAngle;
    }

    // Tree reduction of the weighted direction vectors across the 64 threads.
    unsigned stride = 8*4;
    while (stride > 0) {
        __syncthreads();
        if (fullIndex < stride) {
            accuX[fullIndex] += accuX[fullIndex + stride];
            accuY[fullIndex] += accuY[fullIndex + stride];
        }
        stride /= 2;
    }
    __syncthreads();

    if (fullIndex == 0) {
        dstAngle = atan2f(accuY[0], accuX[0]);
        dstCosAngle = cosf(dstAngle);
        dstSinAngle = sinf(dstAngle);
    }
    __syncthreads();

}

// Component-wise sum of two float4 values.
__device__ inline float4 operator+(const float4 &op1, const float4 &op2) {
    return make_float4(op1.x + op2.x,
                       op1.y + op2.y,
                       op1.z + op2.z,
                       op1.w + op2.w);
}


#if 0
// Disabled (compiled out by the surrounding #if 0) helper that derives a symmetric 2x2 matrix,
// stored as three floats in dst, from directional frequency content around (x, y).
// The indexing (two warp groups, covarMatrix[3*2]) implies a 64-thread block; each warp processes
// every second of 32 directions, with the 32 lanes sampling along that direction.
// Contains block-wide barriers, so it has to be called by all threads of the block.
// NOTE(review): uses the legacy mask-less __shfl_xor; it would need the *_sync form if re-enabled
// on a current toolkit.
__device__ void computeNormalizationMatrix(float &scaleX, float &scaleY,
                                       unsigned &lod,
                                       float &cosAngle, float &sinAngle,
                                       float x, float y, float scale,
                                       float *dst, const unsigned fullIndex)
{

    float weight;
    unsigned warpIndex = fullIndex % 32;
    unsigned warpGroup = fullIndex / 32;
    // Signed position of this lane along the sampled line, -15.5 .. +15.5.
    float r = (warpIndex - 15.5f);
    {
        // Gaussian window over the line.
        float d = r / 16.0f;
        weight = std::exp(-d*d);
    }

    // One 3-element accumulator (xx, xy, yy) per warp group.
    __shared__ float covarMatrix[3*2];
    if (fullIndex < 6)
        covarMatrix[fullIndex] = 0.0f;

    __syncthreads();

    for (unsigned aIdx = warpGroup; aIdx < 32; aIdx+=2) {
        float angle = aIdx / 32.0f * (float)M_PI;

        float cosA = cosf(angle);
        float sinA = sinf(angle);


        float u = (r * cosA) * scale;
        float v = (r * sinA) * scale;

        float4 srcColor = tex2DLod(sourceImagePyramid, (x + cosAngle * u - sinAngle * v) * scaleX,
                                                       (y + sinAngle * u + cosAngle * v) * scaleY, lod);


        float lum = (srcColor.x + srcColor.y + srcColor.z) * srcColor.w;

        float weightedLum = lum * weight;

        float sum = 0.0f;

        // Project the windowed line onto 15 cosine bands and accumulate the band-weighted magnitudes.
        for (unsigned band = 1; band < 16; band++) {
            float v = weightedLum * cosf((float)M_PI * warpIndex * band * (1.0f / 31.0f));

            // Warp-wide sum of the band projection.
            #pragma unroll
            for (int i=16; i>=1; i/=2)
                v += __shfl_xor(v, i, 32);

            //if (k == 0)
                sum += fabs(v)*band;

        }

        // Lane 0 of each warp adds the outer product of the direction vector scaled by `sum`.
        if (warpIndex == 0) {
            float vx = cosA * sum;
            float vy = sinA * sum;

            covarMatrix[warpGroup*3+0] += vx*vx;
            covarMatrix[warpGroup*3+1] += vx*vy;
            covarMatrix[warpGroup*3+2] += vy*vy;
        }

    }
    __syncthreads();

    if (fullIndex < 3) {
        // Merge the two per-warp accumulators.
        covarMatrix[fullIndex] += covarMatrix[3+fullIndex];


        if (fullIndex == 0) {
#if 0
            float det = covarMatrix[0]*covarMatrix[2] - covarMatrix[1]*covarMatrix[1];

            dst[0] = covarMatrix[0] / sqrtf(det);
            dst[1] = covarMatrix[1] / sqrtf(det);
            dst[2] = covarMatrix[2] / sqrtf(det);
#else
            // Adjugate of the accumulated matrix divided by sqrt(det).
            // NOTE(review): thread 0 reads covarMatrix[1] and covarMatrix[2] while threads 1 and 2
            // update them in the statement above without a barrier - confirm before re-enabling.
            float det = covarMatrix[0]*covarMatrix[2] - covarMatrix[1]*covarMatrix[1];

            dst[2] = covarMatrix[0] / sqrtf(det);
            dst[1] = -covarMatrix[1] / sqrtf(det);
            dst[0] = covarMatrix[2] / sqrtf(det);
#endif
/*
            dst[0] = 1.0f;
            dst[1] = 0.0f;
            dst[2] = 1.0f;
*/
        }
    }
    __syncthreads();

}
#endif

#if 0

// Disabled (compiled out by the surrounding #if 0) variant of patchExtraction.
// One block refines and extracts one candidate; the indexing (fullIndex = threadIdx.x + threadIdx.y*8,
// 64-element shared arrays) implies an 8x8 thread block.
// Steps: load the candidate, run 100 gradient-ascent iterations on position and scale using
// evalPatchLocationScale, estimate the final orientation, then sample a rotated 16x16 patch,
// normalise its luminance and store it packed into patchData[candidateIndex].
extern "C" __global__ void patchExtraction(unsigned lod0Width, unsigned lod0Height, uint32_t *candidateData, PatchExtractionPatches *patchData, unsigned offset)
{
    const unsigned candidateIndex = blockIdx.x + gridDim.x * blockIdx.y + offset;

    const unsigned fullIndex = threadIdx.x + threadIdx.y*8;

    __shared__ float scaleX;
    __shared__ float scaleY;
    __shared__ unsigned lod;

    __shared__ float currentX;
    __shared__ float currentY;
    __shared__ float currentScale;

    // Thread 0 decodes the candidate: word 0 holds x | y << 16, word 1 the pyramid level.
    if (fullIndex == 0) {
        lod = candidateData[candidateIndex*2+1];
        scaleX = 1.0f / max(1, lod0Width >> lod);
        scaleY = 1.0f / max(1, lod0Height >> lod);

        uint32_t v = candidateData[candidateIndex*2+0];
        currentX = (v & 0xFFFF)*2 + 0.5f;
        currentY = ((v >> 16) & 0xFFFF)*2 + 0.5f;

        currentScale = 1.0f;
    }
    __syncthreads();

    __shared__ float accuX[8*8];
    __shared__ float accuY[8*8];

    // Scores at -/+ offsets along x, y and scale, used for central differences.
    __shared__ float samples[3*2];

    __shared__ float angle;
    __shared__ float cosAngle;
    __shared__ float sinAngle;

    __shared__ float score;

    PatchExtractionPatches *patchDst = &patchData[candidateIndex];

#if 1
    for (unsigned iter = 0; iter < 100; iter++) {
        evalPatchAngle(scaleX, scaleY, currentX, currentY, currentScale, lod, accuX, accuY, angle, cosAngle, sinAngle, fullIndex);

        evalPatchLocationScale(scaleX, scaleY, lod, cosAngle, sinAngle,
                               currentX, currentY, currentScale, accuX, accuY,
                               score, fullIndex);


        evalPatchLocationScale(scaleX, scaleY, lod, cosAngle, sinAngle,
                               currentX-1.0f, currentY+0.0f, currentScale+0.0f, accuX, accuY,
                               samples[0*2+0], fullIndex);
        evalPatchLocationScale(scaleX, scaleY, lod, cosAngle, sinAngle,
                               currentX+1.0f, currentY+0.0f, currentScale+0.0f, accuX, accuY,
                               samples[0*2+1], fullIndex);

        evalPatchLocationScale(scaleX, scaleY, lod, cosAngle, sinAngle,
                               currentX+0.0f, currentY-1.0f, currentScale+0.0f, accuX, accuY,
                               samples[1*2+0], fullIndex);
        evalPatchLocationScale(scaleX, scaleY, lod, cosAngle, sinAngle,
                               currentX+0.0f, currentY+1.0f, currentScale+0.0f, accuX, accuY,
                               samples[1*2+1], fullIndex);

        evalPatchLocationScale(scaleX, scaleY, lod, cosAngle, sinAngle,
                               currentX+0.0f, currentY+0.0f, currentScale-0.1f, accuX, accuY,
                               samples[2*2+0], fullIndex);
        evalPatchLocationScale(scaleX, scaleY, lod, cosAngle, sinAngle,
                               currentX+0.0f, currentY+0.0f, currentScale+0.1f, accuX, accuY,
                               samples[2*2+1], fullIndex);

        // Thread 0 takes a clamped gradient step on position and scale.
        // NOTE(review): the shared currentX/currentY/currentScale are updated here without a barrier
        // before the next iteration's evalPatchAngle reads them from the other warp - confirm before
        // re-enabling.
        if (fullIndex == 0) {
                /*
            if (iter % 10 == 0) {
                patchDst->debugPath[iter/10*4+0] = currentX;
                patchDst->debugPath[iter/10*4+1] = currentY;
                patchDst->debugPath[iter/10*4+2] = score;
                patchDst->debugPath[iter/10*4+3] = currentScale;
            }
*/

            float Dx = samples[0*2+1] - samples[0*2+0];
            float Dy = samples[1*2+1] - samples[1*2+0];
            float Ds = (samples[2*2+1] - samples[2*2+0]) * 0.5f;

            Dx = fmin(fmax(Dx, -10.0f), 10.0f);
            Dy = fmin(fmax(Dy, -10.0f), 10.0f);
            Ds = fmin(fmax(Ds, -2.0f), 2.0f);

            currentX += Dx * 0.01f;
            currentY += Dy * 0.01f;
            currentScale += Ds * 0.1f;
            // Keep the scale within one octave of the candidate's level.
            currentScale = fmin(fmax(currentScale, 1.0f), 1.99f);
        }
    }
#endif
    evalPatchAngle(scaleX, scaleY, currentX, currentY, currentScale, lod, accuX, accuY, angle, cosAngle, sinAngle, fullIndex);

/*
    __shared__ float covarMat[3];
    computeNormalizationMatrix(scaleX, scaleY, lod, cosAngle, sinAngle,
                               currentX, currentY, currentScale,
                               covarMat, fullIndex);

    if (fullIndex < 3) {
        patchDst->covarMat[fullIndex] = covarMat[fullIndex];
    }
*/
    if (fullIndex == 0) {
        patchDst->x = currentX;
        patchDst->y = currentY;
        patchDst->angle = angle;
        // The refined scale is folded into a fractional pyramid level.
        patchDst->lod = lod + (currentScale - 1.0f);
        patchDst->score = 0.0f;
    }

#if 0
    for (int oy = 0; oy < 16; oy += 8)
        for (int ox = 0; ox < 16; ox += 8) {
            float patch_r;
            float patch_g;
            float patch_b;
            {
                float u = (threadIdx.x - 7.5f + ox) * currentScale;
                float v = (threadIdx.y - 7.5f + oy) * currentScale;

                float4 srcColor = tex2DLod(sourceImagePyramid, (currentX + cosAngle * u - sinAngle * v) * scaleX,
                                                               (currentY + sinAngle * u + cosAngle * v) * scaleY, lod);

                patch_r = srcColor.x * srcColor.w;
                patch_g = srcColor.y * srcColor.w;
                patch_b = srcColor.z * srcColor.w;
            }

            float m = fmax(patch_r, fmax(patch_g, patch_b));

            unsigned w = std::min((int)(m * 255+1), 255);

            float rcpM = 255.0f*255.0f / w;

            uint32_t data = patch_r * rcpM;
            data |= ((unsigned)(patch_g * rcpM)) << 8;
            data |= ((unsigned)(patch_b * rcpM)) << 16;
            data |= w << 24;

            patchDst->data[threadIdx.x + ox + (threadIdx.y + oy)*16] = data;
        }
#else
    // First pass over the 16x16 patch (four 8x8 tiles): accumulate luminance and squared luminance.
    __syncthreads();
    accuX[fullIndex] = 0.0f;
    accuY[fullIndex] = 0.0f;
    for (int oy = 0; oy < 16; oy += 8)
        for (int ox = 0; ox < 16; ox += 8) {
            float s = (threadIdx.x - 7.5f + ox);
            float t = (threadIdx.y - 7.5f + oy);
#if 1
            float u = s * currentScale;
            float v = t * currentScale;
#else
            float u = (s * covarMat[0] + t * covarMat[1]) * currentScale;
            float v = (s * covarMat[1] + t * covarMat[2]) * currentScale;
#endif

            // Squared distance from the patch centre; it raises the sampled level towards the border.
            float r = s*s+t*t;
#if 1
            float4 srcColor = tex2DLod(sourceImagePyramid, (currentX + cosAngle * u - sinAngle * v) * scaleX,
                                                           (currentY + sinAngle * u + cosAngle * v) * scaleY, lod + r/32.0f);
#else
            float4 srcColor = tex2DLod(sourceImagePyramid, (currentX + cosAngle * u * 0.9 - sinAngle * v * 0.9) * scaleX,
                                                           (currentY + sinAngle * u * 0.9 + cosAngle * v * 0.9) * scaleY, lod + r/64.0f) +
                              tex2DLod(sourceImagePyramid, (currentX + cosAngle * u * 0.95 - sinAngle * v * 0.95) * scaleX,
                                                           (currentY + sinAngle * u * 0.95 + cosAngle * v * 0.95) * scaleY, lod + r/64.0f) +
                              tex2DLod(sourceImagePyramid, (currentX + cosAngle * u * 1.05 - sinAngle * v * 1.05) * scaleX,
                                                           (currentY + sinAngle * u * 1.05 + cosAngle * v * 1.05) * scaleY, lod + r/64.0f) +
                              tex2DLod(sourceImagePyramid, (currentX + cosAngle * u * 1.1 - sinAngle * v * 1.1) * scaleX,
                                                           (currentY + sinAngle * u * 1.1 + cosAngle * v * 1.1) * scaleY, lod + r/64.0f);
#endif

            float lum = (srcColor.x + srcColor.y + srcColor.z) * srcColor.w;

            accuX[fullIndex] += lum;
            accuY[fullIndex] += lum*lum;
        }


    // Block-wide tree reduction of both sums.
    for (unsigned i = 8*4; i > 0; i/=2) {
        __syncthreads();
        if (fullIndex < i) {
            accuX[fullIndex] += accuX[fullIndex + i];
            accuY[fullIndex] += accuY[fullIndex + i];
        }
    }
    __syncthreads();
    __shared__ float lumScale, lumOffset;
    if (fullIndex == 0) {
        float mean = accuX[0] / (16.0f*16.0f);
        // Despite its name this is the standard deviation (square root of the variance).
        float var = sqrtf(accuY[0] / (16.0f*16.0f) - mean*mean);

        // Map mean +- 2.5 standard deviations onto the output range.
        float min = mean - 2.5f * var;
        float max = mean + 2.5f * var;

        lumScale = 3.0f / fmax(max-min, 0.0001f);
        lumOffset = -(min * lumScale) * 0.333f;

       // lumScale = (0.2f*16.0f*16.0f) / fmax(accuX[0], 0.01f);
    }
    __syncthreads();

    // Second pass: resample the patch, apply the normalisation and store the packed texels.
    for (int oy = 0; oy < 16; oy += 8)
        for (int ox = 0; ox < 16; ox += 8) {
            float patch_r;
            float patch_g;
            float patch_b;
            {
                float s = (threadIdx.x - 7.5f + ox);
                float t = (threadIdx.y - 7.5f + oy);
#if 1
                float u = s * currentScale;
                float v = t * currentScale;
#else
                float u = (s * covarMat[0] + t * covarMat[1]) * currentScale;
                float v = (s * covarMat[1] + t * covarMat[2]) * currentScale;
#endif


                float r = s*s+t*t;

#if 1
                float4 srcColor = tex2DLod(sourceImagePyramid, (currentX + cosAngle * u - sinAngle * v) * scaleX,
                                                               (currentY + sinAngle * u + cosAngle * v) * scaleY, lod + r/32.0f);
#else
                float4 srcColor = tex2DLod(sourceImagePyramid, (currentX + cosAngle * u * 0.9 - sinAngle * v * 0.9) * scaleX,
                                                               (currentY + sinAngle * u * 0.9 + cosAngle * v * 0.9) * scaleY, lod + r/64.0f) +
                                  tex2DLod(sourceImagePyramid, (currentX + cosAngle * u * 0.95 - sinAngle * v * 0.95) * scaleX,
                                                               (currentY + sinAngle * u * 0.95 + cosAngle * v * 0.95) * scaleY, lod + r/64.0f) +
                                  tex2DLod(sourceImagePyramid, (currentX + cosAngle * u * 1.05 - sinAngle * v * 1.05) * scaleX,
                                                               (currentY + sinAngle * u * 1.05 + cosAngle * v * 1.05) * scaleY, lod + r/64.0f) +
                                  tex2DLod(sourceImagePyramid, (currentX + cosAngle * u * 1.1 - sinAngle * v * 1.1) * scaleX,
                                                               (currentY + sinAngle * u * 1.1 + cosAngle * v * 1.1) * scaleY, lod + r/64.0f);
#endif
                float fac = lumScale * srcColor.w;

                patch_r = fmin(fmax(srcColor.x * fac + lumOffset, 0.0f), 1.0f);
                patch_g = fmin(fmax(srcColor.y * fac + lumOffset, 0.0f), 1.0f);
                patch_b = fmin(fmax(srcColor.z * fac + lumOffset, 0.0f), 1.0f);
#if 0
float avgLum = (patch_r + patch_g + patch_b) * 0.3333f;
patch_r = patch_r * 0.3f + avgLum * 0.7f;
patch_g = patch_g * 0.3f + avgLum * 0.7f;
patch_b = patch_b * 0.3f + avgLum * 0.7f;
#endif
            }

            // Pack as three 8-bit channels relative to a shared 8-bit magnitude w stored in the top byte.
            float m = fmax(patch_r, fmax(patch_g, patch_b));

            unsigned w = min((int)(m * 255+1), 255);

            float rcpM = 255.0f*255.0f / w;

            uint32_t data = patch_r * rcpM;
            data |= ((unsigned)(patch_g * rcpM)) << 8;
            data |= ((unsigned)(patch_b * rcpM)) << 16;
            data |= w << 24;

            patchDst->data[threadIdx.x + ox + (threadIdx.y + oy)*16] = data;
        }
#endif
}


#else



// Sums `value` over the 32 lanes of the calling warp using an XOR butterfly;
// every lane returns the complete total. All 32 lanes of the warp must call
// this together. The per-variable order of additions (lane masks 16, 8, 4, 2, 1)
// is the same as in the hand-written reduction loops it replaces.
static __device__ __forceinline__ float patchExtractionWarpSum(float value)
{
    #pragma unroll
    for (int laneMask = 16; laneMask >= 1; laneMask /= 2) {
#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
        // The mask-less shuffle is not available for compute_70 and newer.
        value += __shfl_xor_sync(0xffffffffu, value, laneMask, 32);
#else
        value += __shfl_xor(value, laneMask, 32);
#endif
    }
    return value;
}

// Makes the shared-memory values just written by lane 0 visible to the other
// lanes of the same warp before they read them. All 32 lanes must call this.
static __device__ __forceinline__ void patchExtractionWarpSync()
{
#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
    // Required with independent thread scheduling: lanes of a warp are no
    // longer guaranteed to run in lockstep.
    __syncwarp();
#else
    // Older toolkits have no __syncwarp(); the fence at least keeps the
    // compiler from reordering or caching the shared-memory accesses.
    __threadfence_block();
#endif
}

// Refines position, scale and orientation of feature candidates and writes a
// normalized 16x16 RGB patch for each of them.
//
// Launch layout required by the code below:
//   - blockDim.x must be 32: the lanes of one warp cooperate on one candidate
//     (width-32 shuffles, threadIdx.x used as angle index and as patch column/row).
//   - blockDim.y must be at most 4: the per-candidate shared arrays have four
//     slots and the candidate index is blockIdx.x*4 + threadIdx.y + offset.
//   - Static shared memory only; no dynamic shared memory is used.
//
// Parameters:
//   lod0Width, lod0Height  size used (shifted right by the candidate's lod) to
//                          turn pixel positions into the coordinates passed to
//                          tex2DLod on sourceImagePyramid.
//   candidateData          two words per candidate: word 0 packs x in bits 0..15
//                          and y in bits 16..31 (each is doubled and offset by
//                          0.5 before use), word 1 is the lod.
//   patchData              output array, indexed by the candidate index; x, y,
//                          angle, lod, score and data[16*16] are written.
//   offset                 added to the candidate index derived from the block/thread ids.
//   numCandidates          candidates with an index >= numCandidates are skipped.
//
// Each data[] word holds r, g, b in bits 0..7, 8..15, 16..23 and a shared
// weight w in bits 24..31, where w = min((int)(m*255+1), 255) for
// m = max(r, g, b) and each channel is stored as channel * 255*255 / w.
extern "C" __global__ void patchExtraction(unsigned lod0Width, unsigned lod0Height, uint32_t *candidateData, PatchExtractionPatches *patchData, unsigned offset, unsigned numCandidates)
{
    const unsigned candidateIndex = blockIdx.x*4 + threadIdx.y + offset;

    // candidateIndex does not depend on threadIdx.x, so all threads that share
    // a threadIdx.y leave together and the warp-level operations below never
    // see a partially exited warp (given blockDim.x == 32).
    if (candidateIndex >= numCandidates)
        return;


    __shared__ float scaleX[4];
    __shared__ float scaleY[4];
    __shared__ unsigned lod[4];

    __shared__ float currentX[4];
    __shared__ float currentY[4];
    __shared__ float currentScale[4];

    // Lane 0 unpacks the candidate and publishes it to the rest of the warp.
    if (threadIdx.x == 0) {
        lod[threadIdx.y] = candidateData[candidateIndex*2+1];
        scaleX[threadIdx.y] = 1.0f / max(1, lod0Width >> lod[threadIdx.y]);
        scaleY[threadIdx.y] = 1.0f / max(1, lod0Height >> lod[threadIdx.y]);

        uint32_t v = candidateData[candidateIndex*2+0];
        currentX[threadIdx.y] = (v & 0xFFFF)*2 + 0.5f;
        currentY[threadIdx.y] = ((v >> 16) & 0xFFFF)*2 + 0.5f;

        currentScale[threadIdx.y] = 1.5f;
    }
    patchExtractionWarpSync();

    __shared__ float blobOffset[4];
    __shared__ float blobScale[4];

    // Each lane owns one column of the 16x16 patch and every second row.
    unsigned offsetX = threadIdx.x % 16;
    unsigned offsetY = threadIdx.x / 16;

    // Compare the luminance close to the center (radius 0.1*scale) with the
    // luminance on a ring (radius 6*scale, one mip level coarser), one sample
    // pair per lane, to derive the sign and threshold used for the moments below.
    {
        float angle = threadIdx.x * ((float)M_PI * 2.0f / 32.0f);

        float4 srcColor = tex2DLod(sourceImagePyramid, (currentX[threadIdx.y] + cosf(angle) * currentScale[threadIdx.y] * 0.1f) * scaleX[threadIdx.y],
                                                       (currentY[threadIdx.y] + sinf(angle) * currentScale[threadIdx.y] * 0.1f) * scaleY[threadIdx.y],
                                                        lod[threadIdx.y] + 0.0f);

        float v1 = (srcColor.x + srcColor.y + srcColor.z) * srcColor.w * 0.33f;

        srcColor = tex2DLod(sourceImagePyramid, (currentX[threadIdx.y] + cosf(angle) * currentScale[threadIdx.y] * 6.0f) * scaleX[threadIdx.y],
                                                (currentY[threadIdx.y] + sinf(angle) * currentScale[threadIdx.y] * 6.0f) * scaleY[threadIdx.y],
                                                lod[threadIdx.y] + 1.0f);

        float v2 = (srcColor.x + srcColor.y + srcColor.z) * srcColor.w * 0.33f;

        float avg = patchExtractionWarpSum(v1+v2);
        float diff = patchExtractionWarpSum(v1-v2);

        if (threadIdx.x == 0) {
            if (diff > 0.0f) {
                // Center brighter than the ring: keep the luminance sign.
                blobScale[threadIdx.y] = 1.0f;
                blobOffset[threadIdx.y] = -avg * (1.0f/64.0f) - fabs(diff) * (1.0f / 32.0f) * 0.2f;
            } else {
                // Center not brighter than the ring: negate the luminance.
                blobScale[threadIdx.y] = -1.0f;
                blobOffset[threadIdx.y] = avg * (1.0f/64.0f) - fabs(diff) * (1.0f / 32.0f) * 0.2f;
            }
        }
    }
    patchExtractionWarpSync();


    // Five rounds of moment-based refinement: the thresholded luminance is
    // used as a weight, its centroid shifts the position and the determinant
    // of its second moments drives the scale.
    for (unsigned iter = 0; iter < 5; iter++) {
        float sumWeights = 0.0f;
        float sumX = 0.0f;
        float sumY = 0.0f;
        float sumSQRX = 0.0f;
        float sumSQRY = 0.0f;
        float sumXY = 0.0f;

        const float subSamplingFactor = 0.3f;
        for (int oy = offsetY; oy < 16; oy += 2) {
            float s = (offsetX - 7.5f);
            float t = (oy - 7.5f);
            float u = s * currentScale[threadIdx.y] * subSamplingFactor;
            float v = t * currentScale[threadIdx.y] * subSamplingFactor;

            float4 srcColor = tex2DLod(sourceImagePyramid, (currentX[threadIdx.y] + u) * scaleX[threadIdx.y],
                                                           (currentY[threadIdx.y] + v) * scaleY[threadIdx.y], lod[threadIdx.y]);

            float lum = (srcColor.x + srcColor.y + srcColor.z) * srcColor.w * 0.33f;

            lum = fmax(lum * blobScale[threadIdx.y] + blobOffset[threadIdx.y], 0.0f);

            sumX += s*lum;
            sumY += t*lum;

            sumSQRX += s*s*lum;
            sumSQRY += t*t*lum;
            sumXY += s*t*lum;

            sumWeights += lum;
        }

        sumX = patchExtractionWarpSum(sumX);
        sumY = patchExtractionWarpSum(sumY);
        sumSQRX = patchExtractionWarpSum(sumSQRX);
        sumSQRY = patchExtractionWarpSum(sumSQRY);
        sumXY = patchExtractionWarpSum(sumXY);
        sumWeights = patchExtractionWarpSum(sumWeights);

        if (threadIdx.x == 0) {
            // NOTE(review): sumWeights is 0 when every sample falls below the
            // threshold; the divisions below then yield NaN for the position.
            // Left unchanged here - confirm whether such candidates can occur.
            float meanX = sumX * (1.0f / sumWeights);
            float meanY = sumY * (1.0f / sumWeights);

            float varX = fmax(sumSQRX * (1.0f / sumWeights) - meanX*meanX, 0.0f);
            float varY = fmax(sumSQRY * (1.0f / sumWeights) - meanY*meanY, 0.0f);
            // NOTE(review): this is a covariance and may legitimately be
            // negative; clamping it at 0 looks copied from the variances above.
            // Kept as is because it changes the estimated scale - TODO confirm.
            float varXY = fmax(sumXY * (1.0f / sumWeights) - meanX*meanY, 0.0f);

            currentX[threadIdx.y] += meanX * currentScale[threadIdx.y] * subSamplingFactor;
            currentY[threadIdx.y] += meanY * currentScale[threadIdx.y] * subSamplingFactor;

            float det = varX*varY - varXY*varXY;

            //currentScale[threadIdx.y] = fmin(fmax(currentScale[threadIdx.y] * sqrtf((varX + varY) * 0.5f) / 3.0f, 1.0f), 1.99f);
            //currentScale[threadIdx.y] = fmin(fmax(currentScale[threadIdx.y] * sqrtf(sqrtf(det)) / 4.0f, 1.0f), 1.99f);
            //currentScale[threadIdx.y] = currentScale[threadIdx.y] * subSamplingFactor * sqrtf(sqrtf(det)) / 1.0f;
            currentScale[threadIdx.y] = fmin(fmax(currentScale[threadIdx.y] * subSamplingFactor * sqrtf(sqrtf(det)) / 1.0f, 1.0f), 1.99f);
        }
        // The next iteration (and the code after the loop) reads the updated
        // position and scale from every lane.
        patchExtractionWarpSync();
    }

    __shared__ float angle[4];
    __shared__ float cosAngle[4];
    __shared__ float sinAngle[4];


    // Orientation: every lane probes one direction in [0, pi), takes the
    // luminance difference between the two sides of the center (radii 5 and 7
    // times the scale) and contributes it as a vector along its direction.
    {
        float testAngle = threadIdx.x * ((float)M_PI / 32.0f);

        float testCosAngle = cosf(testAngle);
        float testSinAngle = sinf(testAngle);

        float diff;
        {
            float4 sample;
            sample = tex2DLod(sourceImagePyramid,
                              (currentX[threadIdx.y] + 7.0f * testCosAngle * currentScale[threadIdx.y]) * scaleX[threadIdx.y],
                              (currentY[threadIdx.y] + 7.0f * testSinAngle * currentScale[threadIdx.y]) * scaleY[threadIdx.y], lod[threadIdx.y] + 1.2f);
            diff = (sample.x+sample.y+sample.z)*sample.w;

            sample = tex2DLod(sourceImagePyramid,
                              (currentX[threadIdx.y] + 5.0f * testCosAngle * currentScale[threadIdx.y]) * scaleX[threadIdx.y],
                              (currentY[threadIdx.y] + 5.0f * testSinAngle * currentScale[threadIdx.y]) * scaleY[threadIdx.y], lod[threadIdx.y] + 1.2f);
            diff += (sample.x+sample.y+sample.z)*sample.w;

            sample = tex2DLod(sourceImagePyramid,
                              (currentX[threadIdx.y] - 7.0f * testCosAngle * currentScale[threadIdx.y]) * scaleX[threadIdx.y],
                              (currentY[threadIdx.y] - 7.0f * testSinAngle * currentScale[threadIdx.y]) * scaleY[threadIdx.y], lod[threadIdx.y] + 1.2f);
            diff -= (sample.x+sample.y+sample.z)*sample.w;

            sample = tex2DLod(sourceImagePyramid,
                              (currentX[threadIdx.y] - 5.0f * testCosAngle * currentScale[threadIdx.y]) * scaleX[threadIdx.y],
                              (currentY[threadIdx.y] - 5.0f * testSinAngle * currentScale[threadIdx.y]) * scaleY[threadIdx.y], lod[threadIdx.y] + 1.2f);
            diff -= (sample.x+sample.y+sample.z)*sample.w;
        }

        float X = patchExtractionWarpSum(diff * testCosAngle);
        float Y = patchExtractionWarpSum(diff * testSinAngle);


        if (threadIdx.x == 0) {
            angle[threadIdx.y] = atan2f(Y, X);
            cosAngle[threadIdx.y] = cosf(angle[threadIdx.y]);
            sinAngle[threadIdx.y] = sinf(angle[threadIdx.y]);
        }
    }
    patchExtractionWarpSync();
/*
    currentScale[threadIdx.y] *= 2.0f;
    lod[threadIdx.y] += 1;
*/
    PatchExtractionPatches *patchDst = &patchData[candidateIndex];

    // Lane 0 stores the refined geometry; the scale in [1, 1.99] is folded
    // into the reported lod as a fractional part.
    if (threadIdx.x == 0) {
        patchDst->x = currentX[threadIdx.y];
        patchDst->y = currentY[threadIdx.y];
        patchDst->angle = angle[threadIdx.y];
        patchDst->lod = lod[threadIdx.y] + (currentScale[threadIdx.y] - 1.0f);
        patchDst->score = 0.0f;
    }

    // Luminance statistics of the rotated 16x16 patch, used to stretch
    // mean +/- 2.5 standard deviations onto the output range.
    __shared__ float lumScale[4], lumOffset[4];
    {
        float sumLum = 0.0f;
        float sumSQRLum = 0.0f;
        for (int oy = offsetY; oy < 16; oy += 2) {
            float s = (offsetX - 7.5f);
            float t = (oy - 7.5f);
            float u = s * currentScale[threadIdx.y];
            float v = t * currentScale[threadIdx.y];

            // Samples further from the center are read from coarser mip levels.
            float r = s*s+t*t;
            float4 srcColor = tex2DLod(sourceImagePyramid, (currentX[threadIdx.y] + cosAngle[threadIdx.y] * u - sinAngle[threadIdx.y] * v) * scaleX[threadIdx.y],
                                                           (currentY[threadIdx.y] + sinAngle[threadIdx.y] * u + cosAngle[threadIdx.y] * v) * scaleY[threadIdx.y], lod[threadIdx.y] + r/32.0f);

            float lum = (srcColor.x + srcColor.y + srcColor.z) * srcColor.w;

            sumLum += lum;
            sumSQRLum += lum*lum;
        }

        sumLum = patchExtractionWarpSum(sumLum);
        sumSQRLum = patchExtractionWarpSum(sumSQRLum);

        if (threadIdx.x == 0) {
            float mean = sumLum / (16.0f*16.0f);
            // E[x^2] - E[x]^2 can come out slightly negative through rounding
            // on (almost) flat patches; clamp it so sqrtf cannot return NaN.
            float stdDev = sqrtf(fmax(sumSQRLum / (16.0f*16.0f) - mean*mean, 0.0f));

            float lumMin = mean - 2.5f * stdDev;
            float lumMax = mean + 2.5f * stdDev;

            lumScale[threadIdx.y] = 3.0f / fmax(lumMax-lumMin, 0.0001f);
            lumOffset[threadIdx.y] = -(lumMin * lumScale[threadIdx.y]) * 0.333f;

           // lumScale = (0.2f*16.0f*16.0f) / fmax(accuX[0], 0.01f);
        }
    }
    patchExtractionWarpSync();

    // Resample the rotated patch, normalize it and store the packed pixels.
    for (int oy = offsetY; oy < 16; oy += 2) {
        float patch_r;
        float patch_g;
        float patch_b;
        {
            float s = (offsetX - 7.5f);
            float t = (oy - 7.5f);

            float u = s * currentScale[threadIdx.y];
            float v = t * currentScale[threadIdx.y];

            float r = s*s+t*t;

            float4 srcColor = tex2DLod(sourceImagePyramid, (currentX[threadIdx.y] + cosAngle[threadIdx.y] * u - sinAngle[threadIdx.y] * v) * scaleX[threadIdx.y],
                                                           (currentY[threadIdx.y] + sinAngle[threadIdx.y] * u + cosAngle[threadIdx.y] * v) * scaleY[threadIdx.y], lod[threadIdx.y] + r/32.0f);

#if 0
            srcColor = tex2DLod(sourceImagePyramid, (currentX[threadIdx.y] + cosAngle[threadIdx.y] * u - sinAngle[threadIdx.y] * v) * scaleX[threadIdx.y],
                                                    (currentY[threadIdx.y] + sinAngle[threadIdx.y] * u + cosAngle[threadIdx.y] * v) * scaleY[threadIdx.y], lod[threadIdx.y]);

            float lum1 = (srcColor.x + srcColor.y + srcColor.z) * srcColor.w * 0.33f;


            float fac = blobScale[threadIdx.y];
            patch_r = fmin(fmax(lum1 * fac + blobOffset[threadIdx.y], 0.0f) * 5.0f, 1.0f);
            patch_g = fmin(fmax(lum1 * fac + blobOffset[threadIdx.y], 0.0f) * 5.0f, 1.0f);
            patch_b = fmin(fmax(lum1 * fac + blobOffset[threadIdx.y], 0.0f) * 5.0f, 1.0f);
/*
            if (offsetX + oy*16 == 0) {
                patch_r = 0.0f;
                patch_g = fmin(fmax(-fac, 0.0f), 1.0f);;
                patch_b = fmin(fmax(fac, 0.0f), 1.0f);
            }

            if (offsetX == 15) {
                patch_r =
                patch_g =
                patch_b = fmin(fmax(fabs(blobOffset[threadIdx.y]), 0.0f), 1.0f);
            }

            if (oy == 15) {

                patch_r = 0.0f;
                patch_g = fmin(fmax(-differences[threadIdx.y], 0.0f), 1.0f);
                patch_b = fmin(fmax(differences[threadIdx.y], 0.0f), 1.0f);
            }
*/
#else
            float fac = lumScale[threadIdx.y] * srcColor.w;

            patch_r = fmin(fmax(srcColor.x * fac + lumOffset[threadIdx.y], 0.0f), 1.0f);
            patch_g = fmin(fmax(srcColor.y * fac + lumOffset[threadIdx.y], 0.0f), 1.0f);
            patch_b = fmin(fmax(srcColor.z * fac + lumOffset[threadIdx.y], 0.0f), 1.0f);
#endif
#if 0
float avgLum = (patch_r + patch_g + patch_b) * 0.3333f;
patch_r = patch_r * 0.3f + avgLum * 0.7f;
patch_g = patch_g * 0.3f + avgLum * 0.7f;
patch_b = patch_b * 0.3f + avgLum * 0.7f;
#endif
        }

        // Pack the pixel: w is the (rounded-up) largest channel in 1..255 and
        // each channel is stored relative to it, so channel * w / 255 / 255
        // recovers the normalized value.
        float m = fmax(patch_r, fmax(patch_g, patch_b));

        unsigned w = min((int)(m * 255+1), 255);

        float rcpM = 255.0f*255.0f / w;

        uint32_t data = patch_r * rcpM;
        data |= ((unsigned)(patch_g * rcpM)) << 8;
        data |= ((unsigned)(patch_b * rcpM)) << 16;
        data |= w << 24;

        patchDst->data[offsetX + oy*16] = data;
    }

}


#endif
