/*
    Structure from Motion with Deferred Feature Matching and Subset Bundle Adjustment
    Copyright (C) 2015 Andreas Ley <andy-ley@arcor.de>

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

#include "CudaSift.h"
#include <cmath>

#include "../cudaKernels/CudaSift.cuh"
#include "../config/FeatureExtractionConfig.h"
#include "../config/CudaConfig.h"
#include "../tools/HalfFloat.hpp"
#include <sstream>
#include <iomanip>
#include <string.h>
#include <assert.h>

extern const unsigned char PTX_CudaSift[];

/**
 * Loads the embedded PTX module (PTX_CudaSift) and looks up every kernel, texture
 * reference, surface reference and constant-memory symbol used by this class.
 * Texture references are also given their sampling configuration here.
 *
 * @param config      Feature extraction settings; stored in m_config.
 * @param cudaConfig  Not referenced by this constructor.
 */
CudaSift::CudaSift(const SFM::config::FeatureExtractionConfig &config, const SFM::config::CudaConfig &cudaConfig) : m_config(config)
{
    typedef std::unique_ptr<CudaUtils::CudaKernel> KernelPtr;
    typedef std::unique_ptr<CudaUtils::CudaTextureReference> TexRefPtr;
    typedef std::unique_ptr<CudaUtils::CudaSurfaceReference> SurfRefPtr;
    typedef std::unique_ptr<CudaUtils::CudaConstantMemory> ConstMemPtr;

    m_codeModule.loadFromMemory(PTX_CudaSift);

    // Thin wrappers around the module lookups; each one takes ownership of the
    // object returned by the module.
    const auto kernel = [this](const char *name) {
        return KernelPtr(m_codeModule.getKernel(name));
    };
    const auto texRef = [this](const char *name) {
        return TexRefPtr(m_codeModule.getTexReference(name));
    };
    const auto surfRef = [this](const char *name) {
        return SurfRefPtr(m_codeModule.getSurfReference(name));
    };
    const auto constMem = [this](const char *name) {
        return ConstMemPtr(m_codeModule.getConstantMemory(name));
    };

    // Applies the sampling setup shared by all texture references: the texel filter
    // is either linear or nearest, the mipmap filter is always nearest.
    const auto setupSampling = [](CudaUtils::CudaTextureReference &ref, bool linearTexels, bool normalizedCoords) {
        ref.setTexelFilterMode(linearTexels ? CudaUtils::CudaTextureReference::FILTER_MODE_LINEAR
                                            : CudaUtils::CudaTextureReference::FILTER_MODE_NEAREST);
        ref.setMipmapFilterMode(CudaUtils::CudaTextureReference::FILTER_MODE_NEAREST);
        ref.setCoordinateNormalization(normalizedCoords);
    };

    // --- Kernels for the initial image conversion and the separable Gaussian filters.
    m_extractLuminanceKernel = kernel("convertRGBMImage");

    m_gaussFilterKernel[guassianFilterH_2] = kernel("guassianFilterH_2");
    m_gaussFilterKernel[guassianFilterV_2] = kernel("guassianFilterV_2");
    m_gaussFilterKernel[guassianFilterH_3] = kernel("guassianFilterH_3");
    m_gaussFilterKernel[guassianFilterV_3] = kernel("guassianFilterV_3");
    m_gaussFilterKernel[guassianFilterH_4] = kernel("guassianFilterH_4");
    m_gaussFilterKernel[guassianFilterV_4] = kernel("guassianFilterV_4");
    m_gaussFilterKernel[guassianFilterH_5] = kernel("guassianFilterH_5");
    m_gaussFilterKernel[guassianFilterV_5] = kernel("guassianFilterV_5");
    m_gaussFilterKernel[guassianFilterH_6] = kernel("guassianFilterH_6");
    m_gaussFilterKernel[guassianFilterV_6] = kernel("guassianFilterV_6");
    m_gaussFilterKernel[guassianFilterH_7] = kernel("guassianFilterH_7");
    m_gaussFilterKernel[guassianFilterV_7] = kernel("guassianFilterV_7");
    m_gaussFilterKernel[guassianFilterH_8] = kernel("guassianFilterH_8");
    m_gaussFilterKernel[guassianFilterV_8] = kernel("guassianFilterV_8");

    // --- Texture references (kernel inputs).
    m_sourceRGBAImageTexRef = texRef("sourceRGBAImage");
    m_sourceLumImageTexRef = texRef("sourceLumImage");
    m_sourcePrevGaussImageTexRef = texRef("sourcePrevGaussLumImage");
    m_sourceTmpImageTexRef = texRef("sourceTmpImage");
    m_sourceDoGImageTexRef = texRef("sourceDoGImage");

    // --- Surface references (kernel outputs).
    m_outputGaussianImageSurfRef = surfRef("outputGaussian");
    m_outputDifferenceOfGaussianImageSurfRef = surfRef("outputDifferenceOfGaussian");
    m_outputTmpSurfRef = surfRef("outputTmp");

    // --- Sampling setup. Only the luminance and tmp inputs use linear texel
    // filtering; all of these use normalized coordinates.
    setupSampling(*m_sourceRGBAImageTexRef, false, true);
    setupSampling(*m_sourceLumImageTexRef, true, true);
    setupSampling(*m_sourceTmpImageTexRef, true, true);
    setupSampling(*m_sourcePrevGaussImageTexRef, false, true);
    setupSampling(*m_sourceDoGImageTexRef, false, true);

    // --- Constant memory holding the current filter taps.
    m_filterKernelScales = constMem("filterCoeffs");
    m_filterKernelOffsets = constMem("filterOffsets");

    // --- Downsampling between octaves.
    m_downsampleKernel = kernel("downsample");
    m_downsampleInputSurfRef = surfRef("downsampleInputSurface");
    m_downsampleOutputSurfRef = surfRef("downsampleOutputSurface");

    // --- Feature detection and extraction.
    m_locatePossibleFeaturePointsKernel = kernel("locatePossibleFeaturePoints");
    m_extractFeaturePointsKernel = kernel("extractFeaturePoints");

    // --- Debug readback of layered arrays; this is the only texture reference that
    // uses unnormalized coordinates.
    m_debugExtractHalfFloatFromLayeredArrayKernel = kernel("debugExtractHalfFloatFromLayeredArray");
    m_debugLayeredArrayTexRef = texRef("debugLayeredArray");
    setupSampling(*m_debugLayeredArrayTexRef, false, false);
}


/**
 * Builds one half of a symmetric, normalized 1D Gaussian filter that raises the
 * blur of a signal from srcSigma to dstSigma.
 *
 * Gaussian variances add under convolution, so the incremental filter has the
 * variance dstSigma^2 - srcSigma^2.
 *
 * @param srcSigma  Standard deviation of the blur already present in the signal.
 * @param dstSigma  Standard deviation the signal should have after filtering.
 *                  Both values are expected to be finite.
 * @return The taps for the offsets 0, 1, 2, ... (center tap first). The number of
 *         taps is always odd. The taps are normalized such that
 *         result[0] + 2 * (result[1] + result[2] + ...) == 1, i.e. the mirrored
 *         full filter sums to one.
 *         If dstSigma is not larger than srcSigma (no additional blur possible, or
 *         NaN input), the identity filter {1.0f} is returned.
 */
std::vector<float> constructBlurFilterKernel(float srcSigma, float dstSigma)
{
    // Same value as the common M_PI macro, which is not part of standard C++.
    const double pi = 3.14159265358979323846;

    const float sqrSigma = dstSigma*dstSigma - srcSigma*srcSigma;

    // A zero variance would produce NaN taps (inf * exp(NaN)) and a negative one
    // would feed NaN into the float -> unsigned conversion below, which is
    // undefined behaviour. The negated comparison also catches NaN.
    if (!(sqrSigma > 0.0f))
        return std::vector<float>(1, 1.0f);

    // Cover about 3.5 standard deviations; "| 1u" forces an odd tap count, which
    // buildInterpolationFilter() requires.
    const unsigned width = static_cast<unsigned>(sqrtf(sqrSigma) * 3.5f) | 1u;

    std::vector<float> result(width);
    const float peak = 1.0f/sqrtf(2.0f*pi*sqrSigma);
    for (unsigned i = 0; i < width; i++)
        result[i] = peak * expf(-0.5f*(i*i)/sqrSigma);

    // Renormalize the truncated filter. Every tap except the center one appears
    // twice in the mirrored full filter.
    float sum = result[0];
    for (unsigned i = 1; i < width; i++)
        sum += 2.0f*result[i];
    const float rcpSum = 1.0f/sum;
    for (unsigned i = 0; i < width; i++)
        result[i] *= rcpSum;

    return result;
}

/**
 * Converts a half filter (center tap followed by the taps at offsets 1, 2, ...)
 * into weight/offset pairs in which every two neighbouring taps are merged into
 * a single sample located between them.
 *
 * For a pair of taps (a at offset 2i-1, b at offset 2i) the merged weight is a+b
 * and the sample position is shifted from offset 2i-1 towards 2i by b/(a+b), so
 * that one linearly interpolated fetch at that position reproduces a*x + b*y.
 *
 * @param src        Half filter; must have an odd number of entries.
 * @param scale      Output: merged weights, src.size()/2 + 1 entries. Entry 0 is
 *                   the unmodified center tap.
 * @param offset     Output: sample positions in units scaled by texelSize, same
 *                   length as scale. Entry 0 is 0.
 * @param texelSize  Size of one texel in the coordinate system of the offsets.
 */
void buildInterpolationFilter(const std::vector<float> &src, std::vector<float> &scale, std::vector<float> &offset, float texelSize)
{
    assert(src.size() & 1);

    const unsigned tapCount = src.size()/2 + 1;
    scale.resize(tapCount);
    offset.resize(tapCount);

    // The center tap has no partner and is sampled exactly at the center.
    offset[0] = 0.0f;
    scale[0] = src[0];

    // nearTap walks over the odd source indices 1, 3, 5, ...; the partner of each
    // is the following even index.
    unsigned nearTap = 1;
    for (unsigned i = 1; i < tapCount; ++i, nearTap += 2) {
        const float nearWeight = src[nearTap];
        const float farWeight = src[nearTap + 1];

        scale[i] = nearWeight + farWeight;
        assert(scale[i] > 1e-20f);

        // Fraction of the way from the near tap to the far tap.
        const float lambda = farWeight/scale[i];
        assert((lambda >= 0.0f) && (lambda <= 1.0f));

        offset[i] = nearTap*texelSize + lambda*texelSize;
    }
}

/**
 * Returns a copy of srcImage with twice the width and twice the height.
 *
 * Every source pixel (i, j) produces a 2x2 block in the result:
 *   - (2i,   2j)   copy of the source pixel,
 *   - (2i+1, 2j)   average of the pixel and its right neighbour,
 *   - (2i,   2j+1) average of the pixel and its bottom neighbour,
 *   - (2i+1, 2j+1) average of the pixel and its right, bottom and bottom-right
 *                  neighbours.
 * Neighbours beyond the last column/row are clamped to the edge, so the last
 * source column and row are upsampled as well and an empty image is handled
 * without any memory access.
 *
 * Pixels are treated as 4 independent 8-bit channels. This relies on getData()
 * pointing to 4-byte pixels, as the per-pixel indexing below already requires.
 *
 * @param srcImage  Image to upsample.
 * @return The upsampled image.
 */
RasterImage upsampleRasterImage(const RasterImage &srcImage)
{
    const unsigned srcWidth = srcImage.getWidth();
    const unsigned srcHeight = srcImage.getHeight();

    RasterImage image;
    image.resize(srcWidth*2, srcHeight*2);
    const unsigned dstWidth = image.getWidth();

    for (unsigned j = 0; j < srcHeight; j++) {
        // Clamp to the last row instead of reading past the end of the image.
        const unsigned jBelow = (j+1 < srcHeight) ? j+1 : j;

        for (unsigned i = 0; i < srcWidth; i++) {
            // Clamp to the last column for the same reason.
            const unsigned iRight = (i+1 < srcWidth) ? i+1 : i;

            const unsigned char *topLeft = reinterpret_cast<const unsigned char*>(srcImage.getData() + j*srcWidth + i);
            const unsigned char *topRight = reinterpret_cast<const unsigned char*>(srcImage.getData() + j*srcWidth + iRight);
            const unsigned char *bottomLeft = reinterpret_cast<const unsigned char*>(srcImage.getData() + jBelow*srcWidth + i);
            const unsigned char *bottomRight = reinterpret_cast<const unsigned char*>(srcImage.getData() + jBelow*srcWidth + iRight);

            image.getData()[j*2*dstWidth + i*2] = srcImage.getData()[j*srcWidth + i];

            unsigned char *dstRight = reinterpret_cast<unsigned char*>(image.getData() + j*2*dstWidth + i*2 + 1);
            unsigned char *dstBottom = reinterpret_cast<unsigned char*>(image.getData() + (j*2+1)*dstWidth + i*2);
            unsigned char *dstBottomRight = reinterpret_cast<unsigned char*>(image.getData() + (j*2+1)*dstWidth + i*2 + 1);

            for (unsigned c = 0; c < 4; c++) {
                dstRight[c] = ((unsigned)topLeft[c] + (unsigned)topRight[c]) / 2;
                dstBottom[c] = ((unsigned)topLeft[c] + (unsigned)bottomLeft[c]) / 2;
                dstBottomRight[c] = ((unsigned)topLeft[c] +
                                     (unsigned)topRight[c] +
                                     (unsigned)bottomLeft[c] +
                                     (unsigned)bottomRight[c]
                                    ) / 4;
            }
        }
    }
    return image;
}

/**
 * Detects feature points in an image on the GPU and returns them together with
 * their descriptors.
 *
 * Steps, as implemented below:
 *  1. Upload the image and launch m_extractLuminanceKernel with the Gaussian layer
 *     array of the first octave bound as output surface.
 *  2. Blur layer 0 of the first octave in place with a separable filter built for
 *     sigma 0.75 -> 1.0 (horizontal pass into tmpTexture, vertical pass back).
 *  3. For each of the numOctaves octaves: blur layer by layer (the kernel
 *     parameters also name a DoG destination layer per step), collect candidate
 *     locations from the DoG layers, turn the candidates into feature points, and
 *     downsample one Gaussian layer into layer 0 of the next octave at half size.
 *  4. Download the accumulated feature points and convert position, scale and
 *     location precision to the coordinate system of the full-resolution image.
 *
 * @param image          Source image; uploaded as 4 x 8-bit channels per pixel.
 * @param featurePoints  Output. Resized to the number of feature points found
 *                       (at most m_config.maxFPs) and overwritten.
 *
 * NOTE(review): the return values of cuMemsetD32 are not checked.
 * NOTE(review): numOctaves is fixed at 6, so for images with a dimension below 32
 * pixels the octave size (dimension >> octave) reaches 0, which makes the
 * reciprocal sizes infinite and requests zero-sized textures.
 */
void CudaSift::gatherFeaturePoints(const RasterImage &image, std::vector<FeaturePoint> &featurePoints)
{
//RasterImage image = upsampleRasterImage(image_);


    // Two sets of textures that are swapped after every octave: the downsample step
    // reads from the "current" Gaussian array and writes into the "next" one.
    CudaUtils::CudaMipmappedTexture octaveGauss1, octaveGauss2;
    CudaUtils::CudaMipmappedTexture octaveDoG1, octaveDoG2;
    // Intermediate target between the horizontal and the vertical filter pass.
    CudaUtils::CudaMipmappedTexture tmpTexture;

    CudaUtils::CudaMipmappedTexture *currentOctaveGauss = &octaveGauss1;
    CudaUtils::CudaMipmappedTexture *nextOctaveGauss = &octaveGauss2;

    CudaUtils::CudaMipmappedTexture *currentOctaveDoG = &octaveDoG1;
    CudaUtils::CudaMipmappedTexture *nextOctaveDoG = &octaveDoG2;

    // Upload the source image: 4 channels of unsigned 8 bit, row pitch of width*4 bytes.
    m_sourceRGBAImage.resize(image.getWidth(), image.getHeight(), 0, CU_AD_FORMAT_UNSIGNED_INT8, 4, 0);
    m_sourceRGBAImage.syncUploadAll(image.getData(), image.getWidth()*4);

    // Gaussian layers of the first octave: single-channel half floats in a layered
    // array with CUDA_SIFT_NUM_OCTAVE_SUBSTEPS+3 layers.
    currentOctaveGauss->resize(image.getWidth(), image.getHeight(), CUDA_SIFT_NUM_OCTAVE_SUBSTEPS+3,
                    CU_AD_FORMAT_HALF, 1, CUDA_ARRAY3D_SURFACE_LDST | CUDA_ARRAY3D_LAYERED, 1);
    {
        m_sourceRGBAImageTexRef->bindTexture(&m_sourceRGBAImage);
        m_outputGaussianImageSurfRef->bindTexture(&currentOctaveGauss->getLevel(0));

        CudaSiftConvertRGBMImageKernelParams kernelParams;
        kernelParams.width = image.getWidth();
        kernelParams.height = image.getHeight();
        kernelParams.rcpWidth = 1.0f / kernelParams.width;
        kernelParams.rcpHeight = 1.0f / kernelParams.height;

        // 16x16 threads per block. The grid is sized for 16*4 pixels per block in x,
        // so each thread presumably handles 4 pixels horizontally - confirm against
        // the kernel source.
        m_extractLuminanceKernel->launch(LinAlg::Fill(16u, 16u, 1u),
                                          LinAlg::Fill((kernelParams.width+16u*4u-1u)/(16u*4u), (kernelParams.height+15u)/16u, 1u),
                                          &kernelParams, sizeof(CudaSiftConvertRGBMImageKernelParams));
    }


    tmpTexture.resize(image.getWidth(), image.getHeight(), 1,
                    CU_AD_FORMAT_HALF, 1, CUDA_ARRAY3D_SURFACE_LDST | CUDA_ARRAY3D_LAYERED, 1);

    // Allocated before the octave loop because the vertical pass of the initial blur
    // below already binds it as DoG output surface.
    currentOctaveDoG->resize(image.getWidth(), image.getHeight(), CUDA_SIFT_NUM_OCTAVE_SUBSTEPS+2,
                    CU_AD_FORMAT_HALF, 1, CUDA_ARRAY3D_SURFACE_LDST | CUDA_ARRAY3D_LAYERED, 1);

    // Initial blur of layer 0 (source and destination layer are both 0).
    {
        // Filter that raises the blur from srcSigma = 0.75 to dstSigma = 1.0
        // (0.75 is presumably the blur assumed to be present in the input image -
        // TODO confirm).
        std::vector<float> filter = constructBlurFilterKernel(0.75f,
                                                              1.0f);

        std::vector<float> scale, offset;

        CudaSiftGaussianFilterKernelParams kernelParams;
        kernelParams.dstWidth = image.getWidth();
        kernelParams.dstHeight = image.getHeight();
        kernelParams.rcpDstWidth = 1.0f / kernelParams.dstWidth;
        kernelParams.rcpDstHeight = 1.0f / kernelParams.dstHeight;
        kernelParams.sourceLayer = 0;
        kernelParams.destinationLayer = 0;
        kernelParams.DoGDestinationLayer = 0;

        // Horizontal pass: sample offsets are expressed in units of 1/width.
        buildInterpolationFilter(filter, scale, offset, kernelParams.rcpDstWidth);

        // The sizes are in bytes (4 bytes per float).
        m_filterKernelScales->upload(&scale[0], scale.size()*4);
        m_filterKernelOffsets->upload(&offset[0], offset.size()*4);

        m_sourceLumImageTexRef->bindTexture(&currentOctaveGauss->getLevel(0));
        m_outputTmpSurfRef->bindTexture(&tmpTexture.getLevel(0));

        // Presumably selects the horizontal kernel variant matching the number of
        // merged taps - see getGaussianFilterKernel.
        CudaUtils::CudaKernel *hKernel = getGaussianFilterKernel(true, scale.size());
        hKernel->launch(LinAlg::Fill<unsigned>(16, 16, 1),
                        LinAlg::Fill<unsigned>((kernelParams.dstWidth + 16*4-1) / (16*4), (kernelParams.dstHeight + 15)/16, 1),
                        &kernelParams, sizeof(kernelParams));

        /////////////////////////////////////////////////////////////////////////////////

        // Vertical pass: same filter, but sample offsets in units of 1/height.
        buildInterpolationFilter(filter, scale, offset, kernelParams.rcpDstHeight);
        m_filterKernelScales->upload(&scale[0], scale.size()*4);
        m_filterKernelOffsets->upload(&offset[0], offset.size()*4);

        m_sourcePrevGaussImageTexRef->bindTexture(&currentOctaveGauss->getLevel(0));
        m_sourceTmpImageTexRef->bindTexture(&tmpTexture.getLevel(0));

        m_outputGaussianImageSurfRef->bindTexture(&currentOctaveGauss->getLevel(0));
        m_outputDifferenceOfGaussianImageSurfRef->bindTexture(&currentOctaveDoG->getLevel(0));

        CudaUtils::CudaKernel *vKernel = getGaussianFilterKernel(false, scale.size());
        vKernel->launch(LinAlg::Fill<unsigned>(16, 16, 1),
                        LinAlg::Fill<unsigned>((kernelParams.dstWidth + 16*4-1) / (16*4), (kernelParams.dstHeight + 15)/16, 1),
                        &kernelParams, sizeof(kernelParams));
    }

   // dumpLayer("gaussian_O_0_L_0.png", currentOctaveGauss, 0, 1.0f);

    // Both device arrays reserve one extra element at the front: its first 32 bit
    // word is used as atomic counter, the actual entries start at element 1.
    const unsigned maxPossibleFPLocations = m_config.maxPossibleFPLocations;
    m_possibleFeaturepointLocationArray.resize((maxPossibleFPLocations+1) * sizeof(CudaSiftPossibleFeaturePointLocation));

    const unsigned maxFPs = m_config.maxFPs;
    m_featurePointArray.resize((maxFPs+1) * sizeof(CudaSiftFeaturePoint));
    // Reset the feature point counter once; it is not reset inside the octave loop,
    // so feature points of all octaves accumulate in the same array.
    cuMemsetD32((CUdeviceptr)m_featurePointArray.getPtr(), 0, 1);

    // Only read by the disabled (#if 0) debug dump inside the octave loop.
    std::vector<CudaSiftPossibleFeaturePointLocation> cpuPossibleFeaturepointLocations;
    cpuPossibleFeaturepointLocations.resize(maxPossibleFPLocations+1);

    const unsigned numOctaves = 6;
    for (unsigned octave = 0; octave < numOctaves; octave++) {
        // Each octave has half the resolution of the previous one (rounded down).
        unsigned octaveWidth = image.getWidth() >> octave;
        unsigned octaveHeight = image.getHeight() >> octave;

        currentOctaveDoG->resize(octaveWidth, octaveHeight, CUDA_SIFT_NUM_OCTAVE_SUBSTEPS+2,
                        CU_AD_FORMAT_HALF, 1, CUDA_ARRAY3D_SURFACE_LDST | CUDA_ARRAY3D_LAYERED, 1);

        tmpTexture.resize(octaveWidth, octaveHeight, 1,
                        CU_AD_FORMAT_HALF, 1, CUDA_ARRAY3D_SURFACE_LDST | CUDA_ARRAY3D_LAYERED, 1);

        // Step "layer" reads Gaussian layer "layer" and writes Gaussian layer
        // "layer+1"; the DoG destination layer is "layer".
        for (unsigned layer = 0; layer < CUDA_SIFT_NUM_OCTAVE_SUBSTEPS+2; layer++) {
            // Incremental filter from sigma 2^(layer/S) to sigma 2^((layer+1)/S) with
            // S = CUDA_SIFT_NUM_OCTAVE_SUBSTEPS; sigmas are relative to the pixel size
            // of the current octave.
            std::vector<float> filter = constructBlurFilterKernel(1.0f*powf(2.0f, layer/(float)CUDA_SIFT_NUM_OCTAVE_SUBSTEPS),
                                                                  1.0f*powf(2.0f, (layer+1)/(float)CUDA_SIFT_NUM_OCTAVE_SUBSTEPS));

            std::vector<float> scale, offset;

            CudaSiftGaussianFilterKernelParams kernelParams;
            kernelParams.dstWidth = octaveWidth;
            kernelParams.dstHeight = octaveHeight;
            kernelParams.rcpDstWidth = 1.0f / kernelParams.dstWidth;
            kernelParams.rcpDstHeight = 1.0f / kernelParams.dstHeight;
            kernelParams.sourceLayer = layer;
            kernelParams.destinationLayer = layer+1;
            kernelParams.DoGDestinationLayer = layer;

            // Horizontal pass into tmpTexture.
            buildInterpolationFilter(filter, scale, offset, kernelParams.rcpDstWidth);
            /*
            std::cout << "layer: " << layer << " filter size: " << scale.size() << std::endl;
            for (unsigned i = 0; i < scale.size(); i++)
                std::cout << scale[i] << "   " << offset[i] << std::endl;
            */
            m_filterKernelScales->upload(&scale[0], scale.size()*4);
            m_filterKernelOffsets->upload(&offset[0], offset.size()*4);

            m_sourceLumImageTexRef->bindTexture(&currentOctaveGauss->getLevel(0));
            m_outputTmpSurfRef->bindTexture(&tmpTexture.getLevel(0));

            CudaUtils::CudaKernel *hKernel = getGaussianFilterKernel(true, scale.size());
            hKernel->launch(LinAlg::Fill<unsigned>(16, 16, 1),
                            LinAlg::Fill<unsigned>((kernelParams.dstWidth + 16*4-1) / (16*4), (kernelParams.dstHeight + 15)/16, 1),
                            &kernelParams, sizeof(kernelParams));



/*
            {
                std::stringstream filename;
                filename << "halfgaussian_O_"<<octave<<"_L_"<<layer<<".png";
                dumpTexture(filename.str(), &tmpTexture, 1.0f);
            }
*/
            /////////////////////////////////////////////////////////////////////////////////

            // Vertical pass from tmpTexture back into the Gaussian array. The previous
            // Gaussian layer and the DoG array are bound as well, so this pass
            // presumably also writes the difference of Gaussians - confirm against the
            // kernel source.
            buildInterpolationFilter(filter, scale, offset, kernelParams.rcpDstHeight);
            m_filterKernelScales->upload(&scale[0], scale.size()*4);
            m_filterKernelOffsets->upload(&offset[0], offset.size()*4);

            m_sourcePrevGaussImageTexRef->bindTexture(&currentOctaveGauss->getLevel(0));
            m_sourceTmpImageTexRef->bindTexture(&tmpTexture.getLevel(0));

            m_outputGaussianImageSurfRef->bindTexture(&currentOctaveGauss->getLevel(0));
            m_outputDifferenceOfGaussianImageSurfRef->bindTexture(&currentOctaveDoG->getLevel(0));

            CudaUtils::CudaKernel *vKernel = getGaussianFilterKernel(false, scale.size());
            vKernel->launch(LinAlg::Fill<unsigned>(16, 16, 1),
                            LinAlg::Fill<unsigned>((kernelParams.dstWidth + 16*4-1) / (16*4), (kernelParams.dstHeight + 15)/16, 1),
                            &kernelParams, sizeof(kernelParams));
        }

        // Collect candidate locations from the DoG layers of this octave.
        {
            // The candidate counter is reset for every octave.
            cuMemsetD32((CUdeviceptr)m_possibleFeaturepointLocationArray.getPtr(), 0, 1);

            CudaSiftLocatePossibleFeaturePointsKernelParams kernelParams;
            kernelParams.width = octaveWidth;
            kernelParams.height = octaveHeight;
            kernelParams.rcpWidth = 1.0f / octaveWidth;
            kernelParams.rcpHeight = 1.0f / octaveHeight;
            kernelParams.atomicCounter = (unsigned*) m_possibleFeaturepointLocationArray.getPtr();
            kernelParams.maxFeatures = maxPossibleFPLocations;
            // Skip element 0, which holds the counter.
            kernelParams.featurePoints = ((CudaSiftPossibleFeaturePointLocation *)m_possibleFeaturepointLocationArray.getPtr())+1;
            kernelParams.minimalDoGThreshold = m_config.minimalDoGThreshold;

            m_sourceDoGImageTexRef->bindTexture(&currentOctaveDoG->getLevel(0));

            // 16x16 threads per block, but the grid advances by 14 pixels per block;
            // presumably each block covers a 14x14 tile plus a one pixel border for
            // the neighbourhood test - confirm against the kernel source.
            m_locatePossibleFeaturePointsKernel->launch(LinAlg::Fill<unsigned>(16, 16, 1),
                            LinAlg::Fill<unsigned>((kernelParams.width + 13) / 14, (kernelParams.height + 13)/14, 1),
                            &kernelParams, sizeof(kernelParams));

        }

        // Read back only the counter (first 4 bytes) and clamp it to the capacity of
        // the array; the device-side counter can presumably exceed maxFeatures.
        unsigned numPossibleFeaturePoints;
        m_possibleFeaturepointLocationArray.download(&numPossibleFeaturePoints, 4);
        numPossibleFeaturePoints = std::min(numPossibleFeaturePoints, maxPossibleFPLocations);

        // NOTE(review): "octave >= 0" is always true for an unsigned value; it looks
        // like a leftover switch for skipping the first octaves during debugging.
        if ((numPossibleFeaturePoints > 0) && (octave >= 0)) {
            CudaSiftExtractFeaturePointsKernelParams kernelParams;

            kernelParams.width = octaveWidth;
            kernelParams.height = octaveHeight;
            kernelParams.rcpWidth = 1.0f / octaveWidth;
            kernelParams.rcpHeight = 1.0f / octaveHeight;
            kernelParams.octave = octave;
            kernelParams.numPossibleLocations = numPossibleFeaturePoints;
            kernelParams.possibleLocations = ((CudaSiftPossibleFeaturePointLocation *)m_possibleFeaturepointLocationArray.getPtr())+1;
            kernelParams.atomicCounter = (unsigned*) m_featurePointArray.getPtr();
            kernelParams.maxFeatures = maxFPs;
            kernelParams.featurePoints = ((CudaSiftFeaturePoint *)m_featurePointArray.getPtr())+1;
            // (r+1)^2 / r for r = maxElongation_rth; presumably the threshold of the
            // principal curvature ratio test from Lowe's SIFT paper - TODO confirm.
            kernelParams.maxElongation_Rth = (m_config.maxElongation_rth+1)*(m_config.maxElongation_rth+1) / m_config.maxElongation_rth;

            m_sourceLumImageTexRef->bindTexture(&currentOctaveGauss->getLevel(0));
            m_sourceDoGImageTexRef->bindTexture(&currentOctaveDoG->getLevel(0));

            // Blocks of 32x4 threads and one block per 4 candidates; presumably one
            // row of 32 threads works on one candidate - confirm against the kernel.
            m_extractFeaturePointsKernel->launch(LinAlg::Fill<unsigned>(32, 4, 1),
                            LinAlg::Fill<unsigned>((numPossibleFeaturePoints + 3) / 4, 1, 1),
                            &kernelParams, sizeof(kernelParams));

        }

#if 0
        m_possibleFeaturepointLocationArray.download(&cpuPossibleFeaturepointLocations[0], cpuPossibleFeaturepointLocations.size() * sizeof(CudaSiftPossibleFeaturePointLocation));

        for (unsigned layer = 0; layer < CUDA_SIFT_NUM_OCTAVE_SUBSTEPS+2; layer++) {
            {
                std::stringstream filename;
                filename << "gaussian_O_"<<octave<<"_L_"<<layer+1<<".jpg";
                dumpLayer(filename.str(), currentOctaveGauss, layer+1, 1.0f, numPossibleFeaturePoints, &cpuPossibleFeaturepointLocations[1]);
            }

            {
                std::stringstream filename;
                filename << "DoG_O_"<<octave<<"_L_"<<layer<<".jpg";
                dumpLayer(filename.str(), currentOctaveDoG, layer, 5.0f, numPossibleFeaturePoints, &cpuPossibleFeaturepointLocations[1]);
            }
        }
#endif
        // Seed the next octave, unless this was the last one.
        if (octave+1 < numOctaves) {
            nextOctaveGauss->resize(octaveWidth/2, octaveHeight/2, CUDA_SIFT_NUM_OCTAVE_SUBSTEPS+3,
                    CU_AD_FORMAT_HALF, 1, CUDA_ARRAY3D_SURFACE_LDST | CUDA_ARRAY3D_LAYERED, 1);

            m_downsampleInputSurfRef->bindTexture(&currentOctaveGauss->getLevel(0));
            m_downsampleOutputSurfRef->bindTexture(&nextOctaveGauss->getLevel(0));

            CudaSiftDownsampleKernelParams kernelParams;
            kernelParams.dstWidth = octaveWidth/2;
            kernelParams.dstHeight = octaveHeight/2;
            // By the filter schedule above, layer S was built for sigma 2^(S/S) = 2.
            // At half the resolution this corresponds to sigma 1 again, which is what
            // the schedule assumes for layer 0.
            kernelParams.sourceLayer = CUDA_SIFT_NUM_OCTAVE_SUBSTEPS;
            kernelParams.destinationLayer = 0;

            m_downsampleKernel->launch(LinAlg::Fill<unsigned>(16, 16, 1),
                            LinAlg::Fill<unsigned>((kernelParams.dstWidth + 16*4-1) / (16*4), (kernelParams.dstHeight + 15)/16, 1),
                            &kernelParams, sizeof(kernelParams));

            std::swap(currentOctaveGauss, nextOctaveGauss);
            std::swap(currentOctaveDoG, nextOctaveDoG);
        }
    }



    // Download the complete feature point array, including the counter element.
    // NOTE(review): this always transfers maxFPs+1 elements, independent of how many
    // feature points were actually found.
    std::vector<CudaSiftFeaturePoint> cpuFPs;
    cpuFPs.resize(maxFPs+1);
    m_featurePointArray.download(&cpuFPs[0], cpuFPs.size()*sizeof(CudaSiftFeaturePoint));

    // The first 4 bytes of element 0 are the counter; clamp it to the capacity.
    unsigned numFeaturePoints;
    memcpy(&numFeaturePoints, &cpuFPs[0], 4);
    //std::cout << "total: " << numFeaturePoints << std::endl;
    numFeaturePoints = std::min(numFeaturePoints, maxFPs);
    featurePoints.resize(numFeaturePoints);
    for (unsigned i = 0; i < numFeaturePoints; i++) {
        unsigned octave = (int)(cpuFPs[i+1].octave);
        unsigned octaveWidth = image.getWidth() >> octave;
        unsigned octaveHeight = image.getHeight() >> octave;
        // With p = x * octaveWidth this evaluates to (p - 0.5) * 2^octave + 0.5.
        // x/y appear to be normalized coordinates within the octave, with pixel
        // centers at half-integer positions - TODO confirm against the kernel.
        featurePoints[i].x = (cpuFPs[i+1].x - 0.5f / octaveWidth) * (octaveWidth << octave) + 0.5f;
        featurePoints[i].y = (cpuFPs[i+1].y - 0.5f / octaveHeight) * (octaveHeight << octave) + 0.5f;
        // Scale is multiplied by 2^octave, location precision is divided by 4^octave.
        featurePoints[i].scale = cpuFPs[i+1].scale * (1 << octave);
        featurePoints[i].angle = cpuFPs[i+1].angle;
        featurePoints[i].layer = cpuFPs[i+1].layer;

        featurePoints[i].locationPrecision = cpuFPs[i+1].locationPrecision / (1 << (2*octave));

#ifdef CudaSift_EXTRACT_PATCH_DATA
        // Size of one patch texel as a fraction of the image width/height.
        featurePoints[i].packedPatch.imageTexelSizeX = (1 << octave) / (float) image.getWidth();
        featurePoints[i].packedPatch.imageTexelSizeY = (1 << octave) / (float) image.getHeight();
        /*
        featurePoints[i].packedPatch.imageCenterX = (cpuFPs[i+1].patchX << octave) / (float) image.getWidth(); // no .5f offset because the patch is pixel aligned and thus the center lies between pixels.
        featurePoints[i].packedPatch.imageCenterY = (cpuFPs[i+1].patchY << octave) / (float) image.getHeight();
        */
        // Same (p - 0.5) * 2^octave + 0.5 mapping as for x/y above, then divided by
        // the image size.
        featurePoints[i].packedPatch.imageCenterX = ((cpuFPs[i+1].patchX - 0.5f) * (1 << octave) + 0.5f) / (float) image.getWidth();
        featurePoints[i].packedPatch.imageCenterY = ((cpuFPs[i+1].patchY - 0.5f) * (1 << octave) + 0.5f) / (float) image.getHeight();

        // Both screen texel sizes are derived from the image width.
        featurePoints[i].packedPatch.screenTexelSizeX =
        featurePoints[i].packedPatch.screenTexelSizeY = 2*(1 << octave) / (float) image.getWidth();

        // Maps [0, 1] to [-1, 1]; y is additionally scaled by height/width.
        featurePoints[i].packedPatch.screenCenterX = featurePoints[i].packedPatch.imageCenterX * 2.0f - 1.0f;
        featurePoints[i].packedPatch.screenCenterY = (featurePoints[i].packedPatch.imageCenterY * 2.0f - 1.0f) / image.getWidth() * image.getHeight();

        memcpy(featurePoints[i].packedPatch.data, cpuFPs[i+1].patch, PackedPatch::PACKED_PATCH_DATA_SIZE);
#endif

/*
featurePoints[i].x *= 0.5f;
featurePoints[i].y *= 0.5f;
*/
        // The descriptor is copied as 128 raw bytes.
        memcpy(featurePoints[i].descriptor, cpuFPs[i+1].descriptor, 128);
    }



}


/**
 * Debug helper: writes one layer of a layered texture to an image file.
 *
 * The layer is first copied into a linear float buffer on the device by the
 * debugExtractHalfFloatFromLayeredArray kernel and then downloaded. Each value
 * is multiplied by scale and gamma-encoded (exponent 1/2.2, applied to the
 * magnitude so that the sign is kept). Positive values end up in byte 1 of the
 * pixel, negative values in byte 0; byte 2 is 0 and byte 3 is 255.
 *
 * @param filename   Destination file, passed to RasterImage::writeToFile.
 * @param texture    Texture to read; only level 0 is used.
 * @param layer      Index of the layer to dump.
 * @param scale      Factor applied to every value before the gamma encoding.
 * @param numFP      Only used by the disabled overlay code.
 * @param locations  Only used by the disabled overlay code.
 */
void CudaSift::dumpLayer(const std::string &filename, CudaUtils::CudaMipmappedTexture *texture, unsigned layer, float scale, unsigned numFP, CudaSiftPossibleFeaturePointLocation *locations)
{
    CudaUtils::CudaMipmappedTextureLevelMemory &texLevel = texture->getLevel(0);

    // Linear device buffer with one 4-byte float per texel.
    CudaUtils::CudaDeviceMemory vRam;
    vRam.resize(texLevel.getWidth() * texLevel.getHeight() * 4);

    m_debugLayeredArrayTexRef->bindTexture(&texLevel);

    CudaSiftDebugExtractHalfFloatFromLayeredArrayKernelParams kernelParams;
    kernelParams.width = texLevel.getWidth();
    kernelParams.height = texLevel.getHeight();
    kernelParams.layer = layer;
    kernelParams.dst = (float*) vRam.getPtr();

    m_debugExtractHalfFloatFromLayeredArrayKernel->launch(
                    LinAlg::Fill<unsigned>(16, 16, 1),
                    LinAlg::Fill<unsigned>((kernelParams.width + 15) / 16, (kernelParams.height + 15)/16, 1),
                    &kernelParams, sizeof(kernelParams));

    // Fetch the extracted layer.
    std::vector<float> cpuData;
    cpuData.resize(texLevel.getWidth() * texLevel.getHeight());
    vRam.download(&cpuData[0], cpuData.size()*4);

    RasterImage dstImage;
    dstImage.resize(texLevel.getWidth(), texLevel.getHeight());

    // Gamma encoding of the magnitude; the sign is preserved.
    const auto signedGamma = [](float v) -> float {
        return (v >= 0.0f) ? pow(v, 1.0f/2.2f) : -pow(-v, 1.0f/2.2f);
    };
    // Maps a value to the byte range [0, 255] with clamping on both ends.
    const auto toByte = [](float v) -> int {
        return std::max<int>(std::min<int>(v * 256, 255), 0);
    };

    const auto numPixels = dstImage.getWidth() * dstImage.getHeight();
    for (unsigned i = 0; i < numPixels; i++) {
        const float f = signedGamma(cpuData[i] * scale);

        unsigned char *pixel = (unsigned char*)&dstImage.getData()[i];
#if 1
        // Negative values in byte 0, positive values in byte 1.
        pixel[0] = toByte(-f);
        pixel[1] = toByte(f);
        pixel[2] = 0;
#else
        // Grayscale alternative.
        pixel[0] = toByte(f);
        pixel[1] = toByte(f);
        pixel[2] = toByte(f);
#endif
        pixel[3] = 255;
    }

#if 0
    for (unsigned j = 0; j < numFP; j++) {
        if (locations[j].layer != layer) continue;

        float x = locations[j].x * dstImage.getWidth();
        float y = locations[j].y * dstImage.getHeight();

        dstImage.drawCircle(LinAlg::Fill<int>(x, y), 8, 0xFFFF0000);
        /*
        dstImage.drawLine(LinAlg::Fill<int>(x, y),
                            LinAlg::Fill<int>(x + std::cos((*overlay)[j].angle) * 8,
                                              y + std::sin((*overlay)[j].angle) * 8), 0xFFFF0000);
        */
    }
#endif
    dstImage.writeToFile(filename.c_str());
}

/**
 * Debug helper: writes level 0 of a half-float texture to an image file.
 *
 * The texture is downloaded directly with a row pitch of 2 bytes per texel and
 * converted on the CPU. Each value is multiplied by scale and gamma-encoded
 * (exponent 1/2.2, applied to the magnitude so that the sign is kept). Positive
 * values end up in byte 1 of the pixel, negative values in byte 0; byte 2 is 0
 * and byte 3 is 255.
 *
 * @param filename  Destination file, passed to RasterImage::writeToFile.
 * @param texture   Texture to read; only level 0 is used.
 * @param scale     Factor applied to every value before the gamma encoding.
 */
void CudaSift::dumpTexture(const std::string &filename, CudaUtils::CudaMipmappedTexture *texture, float scale)
{
    CudaUtils::CudaMipmappedTextureLevelMemory &texLevel = texture->getLevel(0);

    // Raw half-float bits, one 16 bit value per texel.
    std::vector<uint16_t> cpuData;
    cpuData.resize(texLevel.getWidth() * texLevel.getHeight());
    texLevel.syncDownloadAll(&cpuData[0], texLevel.getWidth()*2);

    RasterImage dstImage;
    dstImage.resize(texLevel.getWidth(), texLevel.getHeight());

    // Gamma encoding of the magnitude; the sign is preserved.
    const auto signedGamma = [](float v) -> float {
        return (v >= 0.0f) ? pow(v, 1.0f/2.2f) : -pow(-v, 1.0f/2.2f);
    };
    // Maps a value to the byte range [0, 255] with clamping on both ends.
    const auto toByte = [](float v) -> int {
        return std::max<int>(std::min<int>(v * 256, 255), 0);
    };

    const auto numPixels = dstImage.getWidth() * dstImage.getHeight();
    for (unsigned i = 0; i < numPixels; i++) {
        const float f = signedGamma(HalfFloat::halfFloatToFloat(cpuData[i]) * scale);

        unsigned char *pixel = (unsigned char*)&dstImage.getData()[i];
        pixel[0] = toByte(-f);
        pixel[1] = toByte(f);
        pixel[2] = 0;
        pixel[3] = 255;
    }

    dstImage.writeToFile(filename.c_str());
}


// The two dump routines below are compiled out. They reference m_level, Level and
// FeaturePoint::level, none of which is used by the active code in the visible part
// of this file.
// NOTE(review): presumably left over from an earlier pyramid layout - confirm before
// re-enabling.
#if 0
// Writes one image per entry of m_level, named baseFilename + 3-digit index + ".png".
// Values are scaled, gamma-encoded with preserved sign and stored like in
// dumpTexture(); feature points of the matching level can be drawn on top.
void CudaSift::dumpPyramid(const std::string &baseFilename, CudaUtils::CudaMipmappedTexture *pyramid, float scale, std::vector<FeaturePoint> *overlay)
{
    RasterImage dstImage;
    std::vector<uint16_t> cpuData;
    for (unsigned i = 0; i < m_level.size(); i++) {
        const Level &level = m_level[i];

        CudaUtils::CudaMipmappedTextureLevelMemory &texLevel = pyramid[level.texture].getLevel(level.lod);

        // One 16 bit half float per texel, row pitch of width*2 bytes.
        cpuData.resize(texLevel.getWidth() * texLevel.getHeight());
        texLevel.syncDownloadAll(&cpuData[0], texLevel.getWidth()*2);

        dstImage.resize(texLevel.getWidth(), texLevel.getHeight());

        // Note: this inner "i" shadows the level index of the outer loop.
        for (unsigned i = 0; i < dstImage.getWidth() * dstImage.getHeight(); i++) {
            float f = HalfFloat::halfFloatToFloat(cpuData[i]) * scale;
            if (f >= 0.0f)
                f = pow(f, 1.0f/2.2f);
            else
                f = -pow(-f, 1.0f/2.2f);

            unsigned char *pixel = (unsigned char*)&dstImage.getData()[i];
            pixel[0] = std::max<int>(std::min<int>(-f * 256, 255), 0);
            pixel[1] = std::max<int>(std::min<int>(f * 256, 255), 0);
            pixel[2] = 0;
            pixel[3] = 255;
        }

        if (overlay != NULL) {
            for (unsigned j = 0; j < overlay->size(); j++) {
                // "i" is the level index of the outer loop again.
                if ((*overlay)[j].level != i) continue;

                // Rescale from the size of level 1 of the first pyramid texture to
                // the size of the dumped level.
                float x = (*overlay)[j].x / pyramid[0].getLevel(1).getWidth() * dstImage.getWidth();
                float y = (*overlay)[j].y / pyramid[0].getLevel(1).getHeight() * dstImage.getHeight();

                // Circle at the feature position plus a line indicating its angle.
                dstImage.drawCircle(LinAlg::Fill<int>(x, y), 8, 0xFFFF0000);
                dstImage.drawLine(LinAlg::Fill<int>(x, y),
                                    LinAlg::Fill<int>(x + std::cos((*overlay)[j].angle) * 8,
                                                      y + std::sin((*overlay)[j].angle) * 8), 0xFFFF0000);

            }
        }


        std::stringstream str;
        str << baseFilename << std::setw(3) << std::setfill('0')<<i<<".png";

        dstImage.writeToFile(str.str().c_str());
    }
}

// Like dumpPyramid(), but for textures with two half floats per texel, which are
// read as (min, max) pairs; no overlay.
void CudaSift::dumpMinMaxPyramid(const std::string &baseFilename, CudaUtils::CudaMipmappedTexture *pyramid, float scale)
{
    RasterImage dstImage;
    std::vector<uint16_t> cpuData;
    for (unsigned i = 0; i < m_level.size(); i++) {
        const Level &level = m_level[i];

        CudaUtils::CudaMipmappedTextureLevelMemory &texLevel = pyramid[level.texture].getLevel(level.lod);

        // Two 16 bit half floats per texel, row pitch of width*4 bytes.
        cpuData.resize(texLevel.getWidth() * texLevel.getHeight()*2);
        texLevel.syncDownloadAll(&cpuData[0], texLevel.getWidth()*4);

        dstImage.resize(texLevel.getWidth(), texLevel.getHeight());

        for (unsigned i = 0; i < dstImage.getWidth() * dstImage.getHeight(); i++) {
            float fmin = HalfFloat::halfFloatToFloat(cpuData[i*2+0]) * scale;
            float fmax = HalfFloat::halfFloatToFloat(cpuData[i*2+1]) * scale;

            // NOTE(review): unlike the other dump routines the sign is not handled
            // here; pow() of a negative base with this exponent yields NaN, although
            // fmin is negated below as if negative values were expected.
            fmin = pow(fmin, 1.0f/2.2f);
            fmax = pow(fmax, 1.0f/2.2f);

            unsigned char *pixel = (unsigned char*)&dstImage.getData()[i];
            pixel[0] = std::max<int>(std::min<int>(-fmin * 256, 255), 0);
            pixel[1] = std::max<int>(std::min<int>(fmax * 256, 255), 0);
            pixel[2] = 0;
            pixel[3] = 255;
        }

        std::stringstream str;
        str << baseFilename << std::setw(3) << std::setfill('0')<<i<<".png";

        dstImage.writeToFile(str.str().c_str());
    }
}

#endif
