/*
    Structure from Motion with Deferred Feature Matching and Subset Bundle Adjustment
    Copyright (C) 2015 Andreas Ley <andy-ley@arcor.de>

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

#include "CudaSift.cuh"

#include <stdio.h>

// Texture and surface references shared by the kernels in this file.
// NOTE(review): the host code that binds these to CUDA arrays is not part of this
// file -- the binding comments below describe only how the kernels here use them.

// RGBA input image; fetched by convertRGBMImage with coordinates scaled by rcpWidth/rcpHeight.
texture<uchar4, cudaTextureType2D, cudaReadModeNormalizedFloat> sourceRGBAImage;
// Layered luminance / Gaussian stack; read by the horizontal filter pass and by extractFeaturePoints.
texture<float, cudaTextureType2DLayered, cudaReadModeElementType> sourceLumImage;
// Layered stack read by the vertical filter pass; its value is subtracted from the new
// Gaussian to form the difference-of-Gaussian output.
texture<float, cudaTextureType2DLayered, cudaReadModeElementType> sourcePrevGaussLumImage;

// Input of the vertical filter pass (presumably the array the horizontal pass writes through outputTmp).
texture<float, cudaTextureType2D, cudaReadModeElementType> sourceTmpImage;

// Output of the horizontal filter pass.
surface<void, cudaSurfaceType2D> outputTmp;
// Written by convertRGBMImage (layer 0) and by the vertical filter pass (destinationLayer).
surface<void, cudaSurfaceType2DLayered> outputGaussian;
// Written by the vertical filter pass (DoGDestinationLayer).
surface<void, cudaSurfaceType2DLayered> outputDifferenceOfGaussian;

// Source and destination of the downsample kernel.
surface<void, cudaSurfaceType2DLayered> downsampleInputSurface;
surface<void, cudaSurfaceType2DLayered> downsampleOutputSurface;


// Difference-of-Gaussian stack read by locatePossibleFeaturePoints and extractFeaturePoints.
texture<float, cudaTextureType2DLayered, cudaReadModeElementType> sourceDoGImage;

/*
texture<float, cudaTextureType2D, cudaReadModeElementType> sourcePointFilteredDiffImage;
texture<float2, cudaTextureType2D, cudaReadModeElementType> sourceHMaxImage;


texture<float, cudaTextureType2D, cudaReadModeElementType> sourceDiffImage0;
texture<float, cudaTextureType2D, cudaReadModeElementType> sourceDoGImage;
texture<float, cudaTextureType2D, cudaReadModeElementType> sourceDiffImage2;

texture<float2, cudaTextureType2D, cudaReadModeElementType> sourceAbsMaxImage0;
texture<float2, cudaTextureType2D, cudaReadModeElementType> sourceAbsMaxImage1;
texture<float2, cudaTextureType2D, cudaReadModeElementType> sourceAbsMaxImage2;
*/




// Layered texture inspected by debugExtractHalfFloatFromLayeredArray.
texture<float, cudaTextureType2DLayered, cudaReadModeElementType> debugLayeredArray;

/*
    Debug helper: copies one layer of debugLayeredArray into a linear buffer.

    One thread per texel. Threads outside the width x height rectangle do nothing.
    The texel at (column, row) of kernelParams.layer is stored at
    kernelParams.dst[row * width + column].
*/
extern "C" __global__ void debugExtractHalfFloatFromLayeredArray(CudaSiftDebugExtractHalfFloatFromLayeredArrayKernelParams kernelParams)
{
    const unsigned column = threadIdx.x + blockIdx.x * blockDim.x;
    const unsigned row = threadIdx.y + blockIdx.y * blockDim.y;

    // Guard the grid tail.
    if ((column >= kernelParams.width) || (row >= kernelParams.height))
        return;

    const unsigned linearIndex = row * kernelParams.width + column;
    kernelParams.dst[linearIndex] = tex2DLayered(debugLayeredArray, column, row, kernelParams.layer);
}


//////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////


/*
    Converts the RGBA input texture (sourceRGBAImage) to luminance and stores it as
    half floats in layer 0 of outputGaussian.

    In the active variant every thread handles four pixels of one row, spaced
    blockDim.x columns apart. The texture is sampled at pixel centres using
    coordinates scaled by rcpWidth / rcpHeight; only pixels with
    column < width and row < height are written.
*/
extern "C" __global__ void convertRGBMImage(CudaSiftConvertRGBMImageKernelParams kernelParams)
{
#if 0
    // Disabled one-pixel-per-thread variant.
    unsigned x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned y = blockIdx.y * blockDim.y + threadIdx.y;

    if ((x < kernelParams.width) && (y < kernelParams.height)) {
        float4 data = tex2D(sourceRGBAImage, (x+0.5f) * kernelParams.rcpWidth, (y+0.5f) * kernelParams.rcpHeight);

        float r = data.x;
        float g = data.y;
        float b = data.z;

        float lum = 0.299f * r + 0.587f * g + 0.114f * b; // Mimic JPEG compression

        surf2Dwrite(__float2half_rn(lum), outputImage, x * 2, y);
    }
#else
    const unsigned columnsPerThread = 4;
    const unsigned firstColumn = threadIdx.x + blockIdx.x * (blockDim.x * columnsPerThread);
    const unsigned row = threadIdx.y + blockIdx.y * blockDim.y;

    if (row >= kernelParams.height)
        return;

    const float v = (row + 0.5f) * kernelParams.rcpHeight;

    // Issue all texture fetches first, then convert and store.
    float4 texel[columnsPerThread];
    #pragma unroll
    for (unsigned k = 0; k < columnsPerThread; k++)
        texel[k] = tex2D(sourceRGBAImage, (firstColumn + blockDim.x*k + 0.5f) * kernelParams.rcpWidth, v);

    #pragma unroll
    for (unsigned k = 0; k < columnsPerThread; k++) {
        const unsigned column = firstColumn + blockDim.x*k;

        const float r = texel[k].x;
        const float g = texel[k].y;
        const float b = texel[k].z;

        // Same luma weights as the JPEG colour transform.
        const float luminance = 0.299f * r + 0.587f * g + 0.114f * b;

        // Surface x coordinates are in bytes; each element is a 2-byte half float.
        if (column < kernelParams.width)
            surf2DLayeredwrite(__float2half_rn(luminance), outputGaussian, column * 2, row, 0);
    }
#endif
}


//////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////

// Tap weights and texture-coordinate offsets of the separable filter.
// Entry 0 is the centre tap; entries 1..size-1 are applied symmetrically on both sides.
__constant__ float filterCoeffs[8];
__constant__ float filterOffsets[8];

/*
    One pass of a separable, symmetric filter with `size` taps per side (centre included).

    horizontal == true : reads layer sourceLayer of sourceLumImage, filters along x and
                         writes half floats to outputTmp.
    horizontal == false: reads sourceTmpImage, filters along y and writes the result to
                         layer destinationLayer of outputGaussian. It also writes
                         (result - sourcePrevGaussLumImage[sourceLayer]) to layer
                         DoGDestinationLayer of outputDifferenceOfGaussian.

    Every thread produces four pixels of one row, spaced blockDim.x columns apart.
    Texture coordinates are pixel centres scaled by rcpDstWidth / rcpDstHeight.
*/
template<bool horizontal, unsigned size>
__device__ void gaussianFilter(CudaSiftGaussianFilterKernelParams kernelParams)
{
    const unsigned columnsPerThread = 4;
    const unsigned firstColumn = threadIdx.x + blockIdx.x * (blockDim.x * columnsPerThread);
    const unsigned row = threadIdx.y + blockIdx.y * blockDim.y;

    if (row >= kernelParams.dstHeight)
        return;

    const float v = (row+0.5f) * kernelParams.rcpDstHeight;

    float u[columnsPerThread];
    float acc[columnsPerThread];

    // Centre tap.
    #pragma unroll
    for (unsigned k = 0; k < columnsPerThread; k++) {
        u[k] = (firstColumn+k*blockDim.x+0.5f) * kernelParams.rcpDstWidth;

        if (horizontal)
            acc[k] = tex2DLayered(sourceLumImage, u[k], v, kernelParams.sourceLayer) * filterCoeffs[0];
        else
            acc[k] = tex2D(sourceTmpImage, u[k], v) * filterCoeffs[0];
    }

    // Symmetric tap pairs.
    for (unsigned tap = 1; tap < size; tap++) {
        #pragma unroll
        for (unsigned k = 0; k < columnsPerThread; k++) {
            if (horizontal) {
                acc[k] += (
                            tex2DLayered(sourceLumImage, u[k] + filterOffsets[tap], v, kernelParams.sourceLayer) +
                            tex2DLayered(sourceLumImage, u[k] - filterOffsets[tap], v, kernelParams.sourceLayer)
                          ) * filterCoeffs[tap];
            } else {
                acc[k] += (
                            tex2D(sourceTmpImage, u[k], v + filterOffsets[tap]) +
                            tex2D(sourceTmpImage, u[k], v - filterOffsets[tap])
                          ) * filterCoeffs[tap];
            }
        }
    }

    #pragma unroll
    for (unsigned k = 0; k < columnsPerThread; k++) {
        const unsigned column = firstColumn+k*blockDim.x;
        if (column >= kernelParams.dstWidth)
            continue;

        // Surface x coordinates are in bytes; each element is a 2-byte half float.
        if (horizontal) {
            surf2Dwrite(__float2half_rn(acc[k]), outputTmp, column * 2, row);
        } else {
            surf2DLayeredwrite(__float2half_rn(acc[k]), outputGaussian, column * 2, row, kernelParams.destinationLayer);
            const float previousGaussian = tex2DLayered(sourcePrevGaussLumImage, u[k], v, kernelParams.sourceLayer);
            surf2DLayeredwrite(__float2half_rn(acc[k] - previousGaussian), outputDifferenceOfGaussian, column * 2, row, kernelParams.DoGDestinationLayer);
        }
    }
}

// Defines an extern "C" kernel `name` that forwards to gaussianFilter<horizontal, size>.
// The kernels are compiled with __launch_bounds__(256, 8).
#define InstanceGaussianFilterKernel(name, horizontal, size) \
extern "C" __global__ void  __launch_bounds__(256, 8) name(CudaSiftGaussianFilterKernelParams kernelParams) \
{ \
    gaussianFilter<horizontal, size>(kernelParams); \
}


// One horizontal (H) and one vertical (V) kernel for every supported tap count 2..8.
// NOTE(review): "guassian" is misspelt, but these are exported symbol names that are
// presumably looked up by name on the host side -- do not rename without updating the callers.
InstanceGaussianFilterKernel(guassianFilterH_2, true, 2)
InstanceGaussianFilterKernel(guassianFilterV_2, false, 2)
InstanceGaussianFilterKernel(guassianFilterH_3, true, 3)
InstanceGaussianFilterKernel(guassianFilterV_3, false, 3)
InstanceGaussianFilterKernel(guassianFilterH_4, true, 4)
InstanceGaussianFilterKernel(guassianFilterV_4, false, 4)
InstanceGaussianFilterKernel(guassianFilterH_5, true, 5)
InstanceGaussianFilterKernel(guassianFilterV_5, false, 5)
InstanceGaussianFilterKernel(guassianFilterH_6, true, 6)
InstanceGaussianFilterKernel(guassianFilterV_6, false, 6)
InstanceGaussianFilterKernel(guassianFilterH_7, true, 7)
InstanceGaussianFilterKernel(guassianFilterV_7, false, 7)
InstanceGaussianFilterKernel(guassianFilterH_8, true, 8)
InstanceGaussianFilterKernel(guassianFilterV_8, false, 8)


//////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////


/*
    Halves the resolution of one layer by point sampling: destination pixel (x, y) of
    layer destinationLayer receives the 16-bit value of source pixel (2x, 2y) of layer
    sourceLayer. No filtering is applied here.

    Every thread copies four pixels of one row, spaced blockDim.x columns apart.
    Only pixels with x < dstWidth and y < dstHeight are written.
*/
extern "C" __global__ void downsample(CudaSiftDownsampleKernelParams kernelParams)
{
    const unsigned columnsPerThread = 4;
    const unsigned firstColumn = threadIdx.x + blockIdx.x * (blockDim.x * columnsPerThread);
    const unsigned row = threadIdx.y + blockIdx.y * blockDim.y;

    if (row >= kernelParams.dstHeight)
        return;

    #pragma unroll
    for (unsigned k = 0; k < columnsPerThread; k++) {
        const unsigned column = firstColumn+k*blockDim.x;
        if (column >= kernelParams.dstWidth)
            continue;

        // Surface x coordinates are in bytes (2 bytes per element); the source
        // column is twice the destination column.
        unsigned short texel;
        surf2DLayeredread(&texel, downsampleInputSurface, column*2 * 2, row*2, kernelParams.sourceLayer);
        surf2DLayeredwrite(texel, downsampleOutputSurface, column * 2, row, kernelParams.destinationLayer);
    }
}



//////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////


/*
    Finds candidate feature points: local extrema of the difference-of-Gaussian stack
    (sourceDoGImage) over the 3x3x3 neighbourhood in x, y and layer.

    Layout: the indexing below assumes 16x16 thread blocks (shared arrays of 16*16
    entries, row stride 16, copy loop stride 16*16). Consecutive blocks are 14 pixels
    apart, so the outermost ring of threads of each block only supplies neighbour
    values; candidates are reported by the inner 14x14 threads only.

    For every layer i in [1, CUDA_SIFT_NUM_OCTAVE_SUBSTEPS] a pixel is a candidate if
      - it is at least 16 pixels away from every image border,
      - |DoG| exceeds kernelParams.minimalDoGThreshold, and
      - its value is >= (positive values) or <= (other values) the per-pixel
        max/min over layers i-1, i, i+1 of itself and of its 8 spatial neighbours.

    Candidates are first collected in shared memory (at most 300 per block, surplus
    is dropped). Thread 0 then reserves a range in kernelParams.featurePoints by adding
    the block's count to *kernelParams.atomicCounter, and the block copies as many
    entries as still fit below kernelParams.maxFeatures. The counter itself may end up
    larger than maxFeatures.

    Stored x / y are the texture coordinates (pixel centre times rcpWidth / rcpHeight).
*/
extern "C" __global__ void __launch_bounds__(256, 8) locatePossibleFeaturePoints(CudaSiftLocatePossibleFeaturePointsKernelParams kernelParams)
{
    unsigned x = blockIdx.x * 14 + threadIdx.x;
    unsigned y = blockIdx.y * 14 + threadIdx.y;

    const unsigned maxPossibleLocations = 300;
    __shared__ unsigned numFeaturePointsInBlock;
    __shared__ CudaSiftPossibleFeaturePointLocation locations[maxPossibleLocations];

    const unsigned fullIndex = threadIdx.x + 16 * threadIdx.y;

    if (fullIndex == 0)
        numFeaturePointsInBlock = 0;

    __syncthreads();

    // First 16*16 entries: per-pixel minimum over three layers; second 16*16: maximum.
    __shared__ float minimumsMaximums[16*16+16*16];

    // Written as "x + 16 >= width" instead of "x >= width - 16" so that the test does not
    // wrap around for images narrower / lower than 16 pixels.
    bool outOfBounds = (x < 16) || (y < 16) || (x + 16 >= kernelParams.width) || (y + 16 >= kernelParams.height);

    float fx = (x+0.5f) * kernelParams.rcpWidth;
    float fy = (y+0.5f) * kernelParams.rcpHeight;

    float values[CUDA_SIFT_NUM_OCTAVE_SUBSTEPS+2];

    #pragma unroll
    for (unsigned i = 0; i < CUDA_SIFT_NUM_OCTAVE_SUBSTEPS+2; i++)
        values[i] = tex2DLayered(sourceDoGImage, fx, fy, i);

    // Neighbour indices, clamped to the block; the clamped values are only ever used by
    // border threads, which cannot report a candidate anyway.
    const unsigned leftIdx = max((int)threadIdx.x-1, 0);
    const unsigned rightIdx = min((int)threadIdx.x+1, 15);

    const unsigned topIdx = max((int)threadIdx.y-1, 0) * 16;
    const unsigned bottomIdx = min((int)threadIdx.y+1, 15) * 16;

    #pragma unroll
    for (unsigned i = 1; i < CUDA_SIFT_NUM_OCTAVE_SUBSTEPS+1; i++) {
        float scaleMin = fmin(values[i+1], values[i-1]);
        float scaleMax = fmax(values[i+1], values[i-1]);
        minimumsMaximums[fullIndex + 16*16*0] = fmin(scaleMin, values[i]);
        minimumsMaximums[fullIndex + 16*16*1] = fmax(scaleMax, values[i]);

        __syncthreads();

        if ((threadIdx.y > 0) && (threadIdx.y < 15)) {
            // Positive values are tested against the maxima, all others (negated)
            // against the minima, so both cases reduce to "sv >= sign * neighbour".
            unsigned offset = values[i] > 0.0f?16*16*1:16*16*0;
            float sign = values[i] > 0.0f?1.0f:-1.0f;

            float sv = values[i] * sign;

            bool isExtremPoint = (!outOfBounds) && ((threadIdx.x > 0) && (threadIdx.x < 15)) && (sv > kernelParams.minimalDoGThreshold);

            isExtremPoint &= sv >= sign*minimumsMaximums[fullIndex + offset];

            isExtremPoint &= sv >= sign*minimumsMaximums[leftIdx + topIdx + offset];
            isExtremPoint &= sv >= sign*minimumsMaximums[threadIdx.x + topIdx + offset];
            isExtremPoint &= sv >= sign*minimumsMaximums[rightIdx + topIdx + offset];

            isExtremPoint &= sv >= sign*minimumsMaximums[leftIdx + threadIdx.y*16 + offset];
            isExtremPoint &= sv >= sign*minimumsMaximums[rightIdx + threadIdx.y*16 + offset];

            isExtremPoint &= sv >= sign*minimumsMaximums[leftIdx + bottomIdx + offset];
            isExtremPoint &= sv >= sign*minimumsMaximums[threadIdx.x + bottomIdx + offset];
            isExtremPoint &= sv >= sign*minimumsMaximums[rightIdx + bottomIdx + offset];

            if (isExtremPoint) {
                // The counter keeps counting past the capacity; only the store is guarded.
                unsigned localIndex = atomicAdd(&numFeaturePointsInBlock, 1);
                if (localIndex < maxPossibleLocations) {
                    locations[localIndex].x = fx;
                    locations[localIndex].y = fy;
                    locations[localIndex].layer = i;
                }
            }
        }
        // The shared min/max arrays are overwritten in the next iteration.
        __syncthreads();
    }

    unsigned numFeatures = min(numFeaturePointsInBlock, maxPossibleLocations);

    __shared__ unsigned globalIndex;
    if (fullIndex == 0) {
        globalIndex = atomicAdd(kernelParams.atomicCounter, numFeatures);
    }

    __syncthreads();

    // Remaining capacity of the output buffer. Computed in unsigned arithmetic behind an
    // explicit guard: once the global counter has passed maxFeatures the remaining
    // capacity is zero. The previous "min(numFeatures, (int)maxFeatures - (int)globalIndex)"
    // resolved to the unsigned overload of min, so a negative capacity became a huge
    // value and the block wrote all of its candidates past the end of the buffer.
    unsigned numLocationsThatFit = 0;
    if (globalIndex < kernelParams.maxFeatures)
        numLocationsThatFit = min(numFeatures, (unsigned)kernelParams.maxFeatures - globalIndex);

    // The candidate records are copied as a flat run of 32-bit words by the whole block.
    float *src = (float*)locations;
    float *dst = (float*)(kernelParams.featurePoints + globalIndex);

    for (unsigned i = fullIndex; i < numLocationsThatFit * sizeof(CudaSiftPossibleFeaturePointLocation)/4; i+=16*16)
        dst[i] = src[i];

}


//////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////


/*
    Turns candidate locations into feature points with position, scale, orientation
    and descriptor.

    Layout: every row of threads (same threadIdx.y) processes one candidate,
    four candidates per block (candidate index = blockIdx.x * 4 + threadIdx.y).
    The threads of a row exchange data with warp shuffles, using threadIdx.x as lane.
    NOTE(review): this presumably requires blockDim == (32, 4) so that a row is exactly
    one warp -- confirm against the launch code. The legacy mask-less __shfl / __shfl_xor
    intrinsics are used throughout.

    Steps:
      1. Lane 0 estimates first and second derivatives of the DoG stack by finite
         differences, rejects edge-like points (trace^2 / det of the 2x2 spatial Hessian
         above maxElongation_Rth), solves the 3x3 system for the sub-pixel / sub-layer
         offset and rejects offsets outside [-0.85, 0.85].
      2. The 32 lanes each probe one direction of the half circle; the direction with
         the largest absolute response defines the orientation.
      3. Lane 0 reserves an output slot via atomicAdd on *kernelParams.atomicCounter and
         writes position, scale, layer, octave, angle and locationPrecision.
      4. All lanes build a 4x4 cells x 8 bins gradient histogram on a rotated 16x16
         sample grid and store it as 128 bytes (32 packed words) in `descriptor`.
      5. Optionally (CudaSift_EXTRACT_PATCH_DATA) a 64x64 byte patch is stored.
*/
extern "C" __global__ void  __launch_bounds__(128, 16) extractFeaturePoints(CudaSiftExtractFeaturePointsKernelParams kernelParams)
{
    unsigned possibleFeatureLocationIndex = blockIdx.x * 4 + threadIdx.y;

    // Whole rows beyond the end of the candidate list leave here.
    if (possibleFeatureLocationIndex >= kernelParams.numPossibleLocations)
        return;

    // fx, fy, scale and locationPrecision are only assigned by lane 0 below; fx, fy and
    // scale are broadcast to the other lanes afterwards.
    float fx;
    float fy;
    unsigned featureIndex;
    bool bail = false;
    unsigned layer = kernelParams.possibleLocations[possibleFeatureLocationIndex].layer;
    float scale;
    float locationPrecision;
    if (threadIdx.x == 0) {
        fx = kernelParams.possibleLocations[possibleFeatureLocationIndex].x;
        fy = kernelParams.possibleLocations[possibleFeatureLocationIndex].y;

        // Naming: D<x><y><s> with 0/1/2 meaning -1/0/+1 step in x, y and layer.
        float D111 = tex2DLayered(sourceDoGImage, fx, fy, layer);

        const float stretch = 1.0f;
        float D001 = tex2DLayered(sourceDoGImage, fx - kernelParams.rcpWidth*stretch, fy - kernelParams.rcpHeight*stretch, layer);
        float D201 = tex2DLayered(sourceDoGImage, fx + kernelParams.rcpWidth*stretch, fy - kernelParams.rcpHeight*stretch, layer);
        float D021 = tex2DLayered(sourceDoGImage, fx - kernelParams.rcpWidth*stretch, fy + kernelParams.rcpHeight*stretch, layer);
        float D221 = tex2DLayered(sourceDoGImage, fx + kernelParams.rcpWidth*stretch, fy + kernelParams.rcpHeight*stretch, layer);

        float Dx0 = (D201 - D001) / (2.0f * stretch);
        float Dx2 = (D221 - D021) / (2.0f * stretch);

        float Dxy = (Dx2 - Dx0) / (2.0f * stretch);

        float D011 = tex2DLayered(sourceDoGImage, fx - kernelParams.rcpWidth*stretch, fy, layer);
        float D211 = tex2DLayered(sourceDoGImage, fx + kernelParams.rcpWidth*stretch, fy, layer);
        float D101 = tex2DLayered(sourceDoGImage, fx, fy - kernelParams.rcpHeight*stretch, layer);
        float D121 = tex2DLayered(sourceDoGImage, fx, fy + kernelParams.rcpHeight*stretch, layer);

        float Dx = (D211 - D011) / (2.0f * stretch);
        float Dxx = ((D011 + D211) - D111 * 2.0f) / (stretch * stretch);
        float Dy = (D121 - D101) / (2.0f * stretch);
        float Dyy = ((D101 + D121) - D111 * 2.0f) / (stretch * stretch);
#if 1
        // Edge rejection based on the 2x2 spatial Hessian.
        // NOTE(review): a negative detH gives a negative R, which passes this test; the
        // disabled SiftGPU variant below rejects non-positive determinants. Confirm that
        // accepting such points is intended.
        float traceH = Dxx + Dyy;
        float detH = Dxx*Dyy - Dxy*Dxy;

        if (fabs(detH) < 1e-20f)
            bail = true;

        float R = traceH*traceH / detH;

        if (R > kernelParams.maxElongation_Rth)
            bail = true;

#else
        // from SiftGPU
        float temp1 = Dxx * Dyy - Dxy * Dxy;
        float temp2 = (Dxx + Dyy) * (Dxx + Dyy);
        float rth = 10.0f;
        if (temp1 <= 0.0f || temp2 > rth * temp1)
            bail = true;
#endif
        float D110 = tex2DLayered(sourceDoGImage, fx, fy, layer-1);
        float D112 = tex2DLayered(sourceDoGImage, fx, fy, layer+1);

        float Ds = (D112 - D110) * 0.5f;
        float Dss = (D112 + D110) - D111 * 2.0f;


        float D100 = tex2DLayered(sourceDoGImage, fx, fy - kernelParams.rcpHeight*stretch, layer-1);
        float D120 = tex2DLayered(sourceDoGImage, fx, fy + kernelParams.rcpHeight*stretch, layer-1);

        float D010 = tex2DLayered(sourceDoGImage, fx - kernelParams.rcpWidth*stretch, fy, layer-1);
        float D210 = tex2DLayered(sourceDoGImage, fx + kernelParams.rcpWidth*stretch, fy, layer-1);

        float D102 = tex2DLayered(sourceDoGImage, fx, fy - kernelParams.rcpHeight*stretch, layer+1);
        float D122 = tex2DLayered(sourceDoGImage, fx, fy + kernelParams.rcpHeight*stretch, layer+1);

        float D012 = tex2DLayered(sourceDoGImage, fx - kernelParams.rcpWidth*stretch, fy, layer+1);
        float D212 = tex2DLayered(sourceDoGImage, fx + kernelParams.rcpWidth*stretch, fy, layer+1);

        float Dxs = ((D212 - D012) - (D210 - D010)) / (4.0f * stretch*stretch);
        float Dys = ((D122 - D102) - (D120 - D100)) / (4.0f * stretch*stretch);

        // Determinant of the symmetric 3x3 Hessian [Dxx Dxy Dxs; Dxy Dyy Dys; Dxs Dys Dss].
        float determinant = Dxx*Dyy*Dss + 2.0f*Dxy*Dys*Dxs - Dxx*Dys*Dys - Dxy*Dxy*Dss - Dxs*Dxs*Dyy;

        if (fabs(determinant) < 1e-20f)
            bail = true;

        float rcpDet = 1.0f / determinant;

        // Offset of the extremum: -(Hessian^-1) * gradient, written out with cofactors.
        float optX = rcpDet * (-Dx * (Dyy*Dss - Dys*Dys)
                               -Dy * (Dxs*Dys - Dxy*Dss)
                               -Ds * (Dxy*Dys - Dxs*Dyy));
        float optY = rcpDet * (-Dx * (Dys*Dxs - Dxy*Dss)
                               -Dy * (Dxx*Dss - Dxs*Dxs)
                               -Ds * (Dxs*Dxy - Dxx*Dys));
        float optS = rcpDet * (-Dx * (Dxy*Dys - Dyy*Dxs)
                               -Dy * (Dxy*Dxs - Dxx*Dys)
                               -Ds * (Dxx*Dyy - Dxy*Dxy));

//optX = optY = optS = 0.0f;
//optX = 0.0f;
        //optS = 0.0f;
        //optX = 0.0f;
/*
        optS = 0.0f;

        determinant = Dxx*Dyy - Dxy*Dxy;
        optX = (-Dx * Dyy + Dy * Dxy) / determinant;
        optY = (Dx * Dxy - Dy * Dxx) / determinant;
*/
        //printf("%f %f %f\n", optX, optY, optS);

        // Reject candidates whose refined position lies too far from the sampled one.
        if ((optX < -0.85f) || (optX > 0.85f))
            bail = true;

        if ((optY < -0.85f) || (optY > 0.85f))
            bail = true;

        if ((optS < -0.85f) || (optS > 0.85f))
            bail = true;

        fx = fx + optX * kernelParams.rcpWidth;
        fy = fy + optY * kernelParams.rcpHeight;

        scale = powf(2.0f, (layer+optS) / CUDA_SIFT_NUM_OCTAVE_SUBSTEPS);
        locationPrecision = (Dxx*Dyy - Dxy*Dxy);
    }

    // NOTE(review): this kernel declares no shared memory, and rows that returned at the
    // top never reach this block-wide barrier. It looks unnecessary -- confirm before removing.
    __syncthreads();

    // Broadcast lane 0's verdict and refined values to the whole row.
    bail = __shfl(bail, 0);

    if (bail)
        return;

    fx = __shfl(fx, 0);
    fy = __shfl(fy, 0);
    scale = __shfl(scale, 0);

    float angle;
#if 1
    {
        // Lane k probes direction k * pi / 32, i.e. the 32 lanes cover half a circle.
        float a = threadIdx.x * (float)M_PI / 32.0f;
        float cosAngle = cosf(a);
        float sinAngle = sinf(a);

        float stepX = cosAngle * kernelParams.rcpWidth * scale;
        float stepY = sinAngle * kernelParams.rcpHeight * scale;

        const float scale1 = 1.0f;
        const float scale2 = 2.0f;
        const float scale3 = 3.0f;

        // Sum of three samples at increasing distance, taken from layers layer-1, layer
        // and layer+1, once along the direction (A) and once against it (B).
        float responseA = tex2DLayered(sourceLumImage, fx + stepX * scale1,
                                                    fy + stepY * scale1, layer-1) +
                         tex2DLayered(sourceLumImage, fx + stepX * scale2,
                                                    fy + stepY * scale2, layer+0) +
                         tex2DLayered(sourceLumImage, fx + stepX * scale3,
                                                    fy + stepY * scale3, layer+1);

        float responseB = tex2DLayered(sourceLumImage, fx - stepX * scale1,
                                               fy - stepY * scale1, layer-1) +
                    tex2DLayered(sourceLumImage, fx - stepX * scale2,
                                               fy - stepY * scale2, layer+0) +
                    tex2DLayered(sourceLumImage, fx - stepX * scale3,
                                               fy - stepY * scale3, layer+1);

        float response = responseA - responseB;

#if 0
        // Disabled variant. NOTE(review): it uses featureIndex before it is assigned.
        float rX = cosAngle * response;
        float rY = sinAngle * response;

        #pragma unroll
        for (int i=16; i>=1; i/=2) {
            rX += __shfl_xor(rX, i, 32);
            rY += __shfl_xor(rY, i, 32);
        }

        if (threadIdx.x == 0) {
            angle = atan2(rY, rX);

            kernelParams.featurePoints[featureIndex].angle = angle;
        }
        angle = __shfl(angle, 0);
#else
        // Butterfly reduction over the 32 lanes: keep the response with the largest
        // absolute value together with the lane (bucket) it came from.
        unsigned maxBucket = threadIdx.x;
        float maxResponse = response;
        //float sqrAvgResponse = response*response;

        #pragma unroll
        for (int i=16; i>=1; i/=2) {
            float otherResponse = __shfl_xor(maxResponse, i);
            unsigned otherMaxBucket = __shfl_xor((int)maxBucket, i);
            if (fabs(otherResponse) > fabs(maxResponse)) {
                maxResponse = otherResponse;
                maxBucket = otherMaxBucket;
            }

          //  sqrAvgResponse += __shfl_xor(sqrAvgResponse, i);
        }
        //sqrAvgResponse *= 1.0f / 32.0f;
/*
        float closeness = fmax(0.0f, fabs(response) - maxResponse * 0.9f);
        for (int i=16; i>=1; i/=2)
            closeness += __shfl_xor(closeness, i);
*/

        if (threadIdx.x == 0) {
            // A negative winning response means the opposite direction is the dominant one.
            angle = maxBucket / 32.0f * (float)M_PI;
            if (maxResponse < 0.0f)
                angle += (float) M_PI;

            // Reject points without a clear directional response.
            //if (maxResponse*maxResponse < sqrAvgResponse*(1.5f*1.5f))
            if (fabs(maxResponse) < 0.0001f)
            //if (closeness > 0.5f)
                bail = true;
        }
        angle = __shfl(angle, 0);
        bail = __shfl(bail, 0);

#endif
    }
#else
    // Disabled alternative orientation estimate. NOTE(review): it uses featureIndex
    // before it is assigned and contains a double literal (0.001).
    {
        float a = threadIdx.x * (float)M_PI / 16.0f;
        float cosAngle = cosf(a);
        float sinAngle = sinf(a);

        float buckets = 0.0f;
#if 0
        for (unsigned i = 1; i < 33; i++) {
            float sampleX = fx + cosAngle * kernelParams.rcpWidth * i * scale;
            float sampleY = fy + sinAngle * kernelParams.rcpHeight * i * scale;

            float gradX = tex2DLayered(sourceLumImage, sampleX + kernelParams.rcpWidth * 1.5f * scale,
                                                    sampleY, layer+1) -
                          tex2DLayered(sourceLumImage, sampleX - kernelParams.rcpWidth * 1.5f * scale,
                                                    sampleY, layer+1);

            float gradY = tex2DLayered(sourceLumImage, sampleX,
                                                    sampleY + kernelParams.rcpHeight * 1.5f * scale, layer+1) -
                          tex2DLayered(sourceLumImage, sampleX,
                                                    sampleY - kernelParams.rcpHeight * 1.5f * scale, layer+1);
#else
        for (unsigned i = 0; i < 32; i++) {
            float offsetX = ((int)threadIdx.x - 16);
            float offsetY = ((int)i - 16);
            float sampleX = ((int)(fx * kernelParams.width) + 0.5f + offsetX) * kernelParams.rcpWidth;
            float sampleY = ((int)(fy * kernelParams.height) + 0.5f + offsetY) * kernelParams.rcpHeight;

            float gradX = tex2DLayered(sourceLumImage, sampleX + kernelParams.rcpWidth,
                                                    sampleY, layer-1) -
                          tex2DLayered(sourceLumImage, sampleX - kernelParams.rcpWidth,
                                                    sampleY, layer-1);

            float gradY = tex2DLayered(sourceLumImage, sampleX,
                                                    sampleY + kernelParams.rcpHeight, layer-1) -
                          tex2DLayered(sourceLumImage, sampleX,
                                                    sampleY - kernelParams.rcpHeight, layer-1);

            float weight = scale*scale / (1.0f + (offsetX*offsetX + offsetY*offsetY) * 0.001);
            gradX *= weight;
            gradY *= weight;
#endif
            float sqrLen = gradX*gradX + gradY*gradY;
            float len = sqrtf(sqrLen);
            float dirX;
            float dirY;
            if (sqrLen < 1e-10f) {
                dirX = 0.0f;
                dirY = 0.0f;
            } else {
                dirX = gradX / len;
                dirY = gradY / len;
            }


            for (unsigned j = 0; j < 32; j++) {
                float dot = cosAngle * __shfl(dirX, j) +
                            sinAngle * __shfl(dirY, j);

                float resp = powf(fmax(dot, 0.0f), 16.0f) * __shfl(sqrLen, j);
                buckets += resp;
            }
        }

/*
        if (possibleFeatureLocationIndex < 5)
            printf("%i: %i  -  %f\n", possibleFeatureLocationIndex, threadIdx.x, buckets);
*/
        unsigned maxBucket = threadIdx.x;
        float maxResponse = buckets;

        #pragma unroll
        for (int i=16; i>=1; i/=2) {
            float otherRespone = __shfl_xor(maxResponse, i);
            unsigned otherMaxBucket = __shfl_xor((int)maxBucket, i);
            if (otherRespone > maxResponse) {
                maxResponse = otherRespone;
                maxBucket = otherMaxBucket;
            }
        }

        if (threadIdx.x == 0) {
            angle = maxBucket / 16.0f * (float)M_PI;

            kernelParams.featurePoints[featureIndex].angle = angle;
        }
        angle = __shfl(angle, 0);
    }
#endif

    if (bail)
        return;

    // Lane 0 reserves an output slot. The counter is incremented even when the buffer is
    // already full, so it may end up larger than maxFeatures.
    if (threadIdx.x == 0) {
        featureIndex = atomicAdd(kernelParams.atomicCounter, 1);
        if (featureIndex >= kernelParams.maxFeatures)
            bail = true;
        else {
            kernelParams.featurePoints[featureIndex].x = fx;
            kernelParams.featurePoints[featureIndex].y = fy;
            kernelParams.featurePoints[featureIndex].scale = scale;
            kernelParams.featurePoints[featureIndex].layer = layer;
            kernelParams.featurePoints[featureIndex].octave = kernelParams.octave;
            kernelParams.featurePoints[featureIndex].angle = angle;
            kernelParams.featurePoints[featureIndex].locationPrecision = locationPrecision;
        }
    }

    featureIndex = __shfl((int)featureIndex, 0);

    bail = __shfl(bail, 0);
    if (bail)
        return;

    {
        float cosAngle = cosf(angle);
        float sinAngle = sinf(angle);


        // 4x4 cells of 4x4 samples each. Lanes k and k+16 share a cell: the lower lane
        // handles sample rows 0-1 of the cell, the upper lane rows 2-3.
        unsigned cellX = threadIdx.x % 4;
        unsigned cellY = (threadIdx.x / 4) % 4;
        unsigned intraCellY = (threadIdx.x / 16) * 2;


        float hist[8];
        for (unsigned i = 0; i < 8; i++) {
            hist[i] = 0.0f;
        }

        for (unsigned j = 0; j < 2; j++)
            for (unsigned i = 0; i < 4; i++) {
                // Sample position on the 16x16 grid, centred on the feature.
                float u = (cellX*4 + i) - 7.5f;
                float v = (cellY*4 + j + intraCellY) - 7.5f;

                // Rotate by the feature orientation and scale into texture coordinates.
                float s = fx + (cosAngle * u - sinAngle * v) * scale * kernelParams.rcpWidth;
                float t = fy + (sinAngle * u + cosAngle * v) * scale * kernelParams.rcpHeight;

                float sampleWeight = 1.0f / (1.0f + (u*u+v*v) * 0.03f);

                float ddx = (tex2DLayered(sourceLumImage, s + kernelParams.rcpWidth * 0.5f,
                                                      t, layer-1) -
                            tex2DLayered(sourceLumImage, s - kernelParams.rcpWidth * 0.5f,
                                                      t, layer-1)) * sampleWeight;

                float ddy = (tex2DLayered(sourceLumImage, s,
                                                      t + kernelParams.rcpHeight * 0.5f, layer-1) -
                            tex2DLayered(sourceLumImage, s,
                                                      t - kernelParams.rcpHeight * 0.5f, layer-1)) * sampleWeight;

                // Express the gradient in the rotated frame of the feature.
                float gradX = cosAngle * ddx + sinAngle * ddy;
                float gradY = -sinAngle * ddx + cosAngle * ddy;

                float sqrGradMagnitude = gradX*gradX + gradY*gradY;
                float gradDirX = 0.0f;
                float gradDirY = 0.0f;
                if (sqrGradMagnitude > 1e-30f) {
                    gradDirX = gradX * (1.0f / sqrtf(sqrGradMagnitude));
                    gradDirY = gradY * (1.0f / sqrtf(sqrGradMagnitude));
                }

                // The first assignment is immediately overwritten: the value actually
                // used is the square root of the gradient magnitude.
                float mag = sqrtf(sqrGradMagnitude);
                mag = powf(sqrGradMagnitude, 0.25f);
                const float exp = 4.0f;

                // Soft assignment to 8 direction bins: +x, -x, +y, -y and the four diagonals.
                hist[0] += powf(fmax(gradDirX, 0.0f), exp) * mag;
                hist[1] += powf(fmax(-gradDirX, 0.0f), exp) * mag;
                hist[2] += powf(fmax(gradDirY, 0.0f), exp) * mag;
                hist[3] += powf(fmax(-gradDirY, 0.0f), exp) * mag;

                hist[4] += powf(fmax(gradDirX + gradDirY, 0.0f) * sqrtf(1.0f/2.0f), exp) * mag;
                hist[5] += powf(fmax(-gradDirX + gradDirY, 0.0f) * sqrtf(1.0f/2.0f), exp) * mag;
                hist[6] += powf(fmax(gradDirX - gradDirY, 0.0f) * sqrtf(1.0f/2.0f), exp) * mag;
                hist[7] += powf(fmax(-gradDirX - gradDirY, 0.0f) * sqrtf(1.0f/2.0f), exp) * mag;
            }


        // Merge the two half-cell histograms; afterwards lanes k and k+16 hold the same values.
        for (unsigned i = 0; i < 8; i++) {
            hist[i] += __shfl_xor(hist[i], 16);
        }

        #if 1
        {
            // Clamp every bin to 0.2 times the Euclidean length of the whole descriptor
            // (summed over the 16 cells via the xor steps 8, 4, 2, 1).
            float sqrLen = 0.0f;
            for (unsigned i = 0; i < 8; i++) {
                sqrLen += hist[i]*hist[i];
            }

            for (int i=8; i>=1; i/=2)
                sqrLen += __shfl_xor(sqrLen, i, 32);

            float maxValue = sqrtf(sqrLen) * 0.2f;
            for (unsigned i = 0; i < 8; i++) {
                hist[i] = fmin(hist[i], maxValue);
            }
        }
        #endif


        // Largest bin of the whole descriptor, used to quantise to bytes.
        float maxLen = 0.0f;
        for (unsigned i = 0; i < 8; i++)
            maxLen = fmax(maxLen, hist[i]);

        #pragma unroll
        for (int i=8; i>=1; i/=2) {
            maxLen = fmax(maxLen, __shfl_xor(maxLen, i, 32));
        }
        // This local `scale` shadows the feature scale declared at function scope.
        // NOTE(review): if maxLen is 0 this is a division by zero (infinite factor);
        // confirm whether an all-zero histogram can reach this point.
        float scale = 256.0f / maxLen;

        // One lane per cell writes two words: bins 0-3 and bins 4-7, one byte per bin.
        if (threadIdx.x < 16) {
            for (unsigned i = 0; i < 2; i++) {
                unsigned a = fmin(hist[0+i*4] * scale, 255.0f);
                unsigned b = fmin(hist[1+i*4] * scale, 255.0f);
                unsigned c = fmin(hist[2+i*4] * scale, 255.0f);
                unsigned d = fmin(hist[3+i*4] * scale, 255.0f);

                unsigned packed = (a << 0) |
                                  (b << 8) |
                                  (c << 16) |
                                  (d << 24);

                unsigned dstIndex = threadIdx.x + i*16;
                kernelParams.featurePoints[featureIndex].descriptor[dstIndex] = packed;
            }
        }
    }

#ifdef CudaSift_EXTRACT_PATCH_DATA

    // Optional: store an unrotated 64x64 patch around the feature, one byte per pixel,
    // sampled from layer-1 of sourceLumImage. Each lane writes two adjacent pixels per row.
    {
        int iCenterX = fx * kernelParams.width + 0.5f;
        int iCenterY = fy * kernelParams.height + 0.5f;

        if (threadIdx.x == 0) {
            kernelParams.featurePoints[featureIndex].patchOctave = kernelParams.octave;
            kernelParams.featurePoints[featureIndex].patchX = iCenterX;
            kernelParams.featurePoints[featureIndex].patchY = iCenterY;
        }

        for (unsigned y = 0; y < 64; y++) {
            float a = tex2DLayered(sourceLumImage,
                                   (iCenterX + (threadIdx.x*2+0 - 31.5f)*1) * kernelParams.rcpWidth,
                                   (iCenterY + (y - 31.5f)*1) * kernelParams.rcpHeight, layer-1);
            float b = tex2DLayered(sourceLumImage,
                                   (iCenterX + (threadIdx.x*2+1 - 31.5f)*1) * kernelParams.rcpWidth,
                                   (iCenterY + (y - 31.5f)*1) * kernelParams.rcpHeight, layer-1);

            unsigned ia = fmin(fmax(a*256.0f, 0.0f), 255.0f);
            unsigned ib = fmin(fmax(b*256.0f, 0.0f), 255.0f);


            unsigned short packed = (ia << 0) |
                                    (ib << 8);

            (unsigned short&)(kernelParams.featurePoints[featureIndex].patch[threadIdx.x*2 + y*64]) = packed;
        }
    }
#endif

}
