/*
    Structure from Motion with Deferred Feature Matching and Subset Bundle Adjustment
    Copyright (C) 2015 Andreas Ley <andy-ley@arcor.de>

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

#include "PatchDescriptorDB.h"

#include "../tools/TaskScheduler.h"

#include <cstdio>
#include <stdexcept>

#include <xmmintrin.h>
#include <immintrin.h>


namespace SFM {


// Creates a descriptor database that buckets patches into a grid with gridDim
// dimensions and gridRes buckets along each dimension.
//
// The projection basis is read from the file "GridDBBasis.bin" (opened by relative
// path) and rearranged into the layout that computeProjectionIndicesSubrange() walks
// through linearly.
//
// gridDim  Number of grid dimensions. Must not exceed 8: only eight projections are
//          computed per patch, and the neighbourhood searches in this file unpack cell
//          coordinates into an eight element scratch array.
// gridRes  Number of buckets per dimension. Must be non-zero, it is used as a divisor
//          when cell indices are unpacked.
//
// Throws std::invalid_argument for an out-of-range gridDim/gridRes and
// std::runtime_error if the basis file cannot be opened or is too short.
PatchDescriptorDB::PatchDescriptorDB(unsigned gridDim, unsigned gridRes)
{
    if (gridDim > 8)
        throw std::invalid_argument("PatchDescriptorDB: gridDim must not exceed 8");
    if (gridRes == 0)
        throw std::invalid_argument("PatchDescriptorDB: gridRes must not be zero");

    FILE *file = fopen("GridDBBasis.bin", "rb");
    if (!file)
        throw std::runtime_error("PatchDescriptorDB: could not open GridDBBasis.bin");

    const size_t bytesRead = fread(&m_basis, 1, sizeof(m_basis), file);
    fclose(file);
    if (bytesRead != sizeof(m_basis))
        throw std::runtime_error("PatchDescriptorDB: GridDBBasis.bin is truncated or unreadable");

    m_gridDim = gridDim;
    m_gridResolution = gridRes;

    for (unsigned i = 0; i < 8; i++) {
        // Each projection starts at 0.5 plus the sum of all offset entries of component i.
        m_projectionOffset[i] = 0.5f;
        for (unsigned y = 0; y < 16; y++) {
            for (unsigned x = 0; x < 16; x++) {
                // Source layout: three consecutive channel entries per pixel, row major.
                const unsigned src = (y*16+x)*3;

                m_projectionOffset[i] += m_basis.offset[i][src+0];
                m_projectionOffset[i] += m_basis.offset[i][src+1];
                m_projectionOffset[i] += m_basis.offset[i][src+2];

                // Destination layout: per row, per group of four pixels, per channel,
                // per pixel inside the group, the eight component weights are contiguous.
                const unsigned bx = x / 4;
                const unsigned rx = x % 4;
                const unsigned dst = y*16*3*8 + bx*3*8*4 + rx * 8 + i;

                m_projectionBasis[dst + 0*8*4] = m_basis.direction[i][src+0];
                m_projectionBasis[dst + 1*8*4] = m_basis.direction[i][src+1];
                m_projectionBasis[dst + 2*8*4] = m_basis.direction[i][src+2];
            }
        }
    }
}

// Destructor. The body is intentionally empty; no explicit cleanup is performed here.
PatchDescriptorDB::~PatchDescriptorDB()
{
    // nothing to do
}


// Adds the contribution of one colour channel of four neighbouring pixels to the
// eight running projection sums.
//
// channel   channel value of the four pixels, one pixel per lane
// basis     4 x 8 weights: for each of the four pixels in turn, the eight projection
//           weights (16 byte aligned, read with aligned loads)
// lowSums   running sums of projections 0..3
// highSums  running sums of projections 4..7
//
// Pixels are processed strictly in lane order so the floating point summation order
// is fixed.
static inline void accumulateChannelProjection(const float *basis, const __m128 &channel,
                                               __m128 &lowSums, __m128 &highSums)
{
    const __m128 pixel0 = _mm_shuffle_ps(channel, channel, _MM_SHUFFLE(0, 0, 0, 0));
    lowSums  = _mm_add_ps(lowSums,  _mm_mul_ps(_mm_load_ps(basis + 0*8 + 0), pixel0));
    highSums = _mm_add_ps(highSums, _mm_mul_ps(_mm_load_ps(basis + 0*8 + 4), pixel0));

    const __m128 pixel1 = _mm_shuffle_ps(channel, channel, _MM_SHUFFLE(1, 1, 1, 1));
    lowSums  = _mm_add_ps(lowSums,  _mm_mul_ps(_mm_load_ps(basis + 1*8 + 0), pixel1));
    highSums = _mm_add_ps(highSums, _mm_mul_ps(_mm_load_ps(basis + 1*8 + 4), pixel1));

    const __m128 pixel2 = _mm_shuffle_ps(channel, channel, _MM_SHUFFLE(2, 2, 2, 2));
    lowSums  = _mm_add_ps(lowSums,  _mm_mul_ps(_mm_load_ps(basis + 2*8 + 0), pixel2));
    highSums = _mm_add_ps(highSums, _mm_mul_ps(_mm_load_ps(basis + 2*8 + 4), pixel2));

    const __m128 pixel3 = _mm_shuffle_ps(channel, channel, _MM_SHUFFLE(3, 3, 3, 3));
    lowSums  = _mm_add_ps(lowSums,  _mm_mul_ps(_mm_load_ps(basis + 3*8 + 0), pixel3));
    highSums = _mm_add_ps(highSums, _mm_mul_ps(_mm_load_ps(basis + 3*8 + 4), pixel3));
}

// Computes and stores the projectionIndex of descriptors[0 .. count-1].
//
// For every patch the eight projections are evaluated as
//     m_projectionOffset[k] + sum over pixels and channels of weight * value,
// scaled by the grid resolution, converted to integers and clamped to
// [0, m_gridResolution-1]. The eight bucket numbers are then packed into a single
// number in base m_gridResolution, projection 0 being the least significant digit.
//
// NOTE(review): the loads assume that PatchDescriptor::data is 16 byte aligned and holds
// 16*16 packed 32 bit pixels - confirm against the header.
void PatchDescriptorDB::computeProjectionIndicesSubrange(PatchDescriptor *descriptors, unsigned count) const
{
    const __m128i byteMask = _mm_set1_epi32(0xFF);
    const __m128 colourScale = _mm_set1_ps(1.0f/(255.0f*255.0f));

    const __m128 resolution = _mm_set1_ps(m_gridResolution);
    const __m128i lowestBucket = _mm_setzero_si128();
    const __m128i highestBucket = _mm_set1_epi32(m_gridResolution-1);

    // Weights consumed per group of four pixels: 3 channels * 4 pixels * 8 projections.
    const unsigned weightsPerChannel = 4*8;
    const unsigned weightsPerBlock = 3*weightsPerChannel;

    for (unsigned patchIndex = 0; patchIndex < count; patchIndex++) {
        PatchDescriptor &patch = descriptors[patchIndex];

        __m128 lowSums = _mm_load_ps(m_projectionOffset);
        __m128 highSums = _mm_load_ps(m_projectionOffset+4);

        // 16 rows with 4 groups of four pixels each, visited in memory order.
        for (unsigned block = 0; block < 16*4; block++) {
            const __m128i packed = _mm_load_si128((const __m128i*)(patch.data + block*4));

            // The top byte scales the three colour bytes; both are normalised by 1/255.
            const __m128 scale = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(packed, 24), byteMask)), colourScale);
            const __m128 red   = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(packed, byteMask)), scale);
            const __m128 green = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(packed, 8), byteMask)), scale);
            const __m128 blue  = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(packed, 16), byteMask)), scale);

            const float *blockBasis = m_projectionBasis + block*weightsPerBlock;
            accumulateChannelProjection(blockBasis + 0*weightsPerChannel, red, lowSums, highSums);
            accumulateChannelProjection(blockBasis + 1*weightsPerChannel, green, lowSums, highSums);
            accumulateChannelProjection(blockBasis + 2*weightsPerChannel, blue, lowSums, highSums);
        }

        lowSums = _mm_mul_ps(lowSums, resolution);
        highSums = _mm_mul_ps(highSums, resolution);

        // Convert to integers and clamp every projection into the valid bucket range.
        const __m128i lowBuckets = _mm_min_epi32(_mm_max_epi32(_mm_cvtps_epi32(lowSums), lowestBucket), highestBucket);
        const __m128i highBuckets = _mm_min_epi32(_mm_max_epi32(_mm_cvtps_epi32(highSums), lowestBucket), highestBucket);

        int buckets[8];
        _mm_storeu_si128((__m128i*)(buckets + 0), lowBuckets);
        _mm_storeu_si128((__m128i*)(buckets + 4), highBuckets);

        // Horner scheme from the most significant digit (projection 7) downwards.
        unsigned projectionIndex = buckets[7];
        for (int digit = 6; digit >= 0; digit--)
            projectionIndex = projectionIndex * m_gridResolution + buckets[digit];

        patch.projectionIndex = projectionIndex;
    }
}

// Computes the projectionIndex of descriptors[0 .. count-1].
// Up to 100 descriptors are handled directly on the calling thread; larger inputs are
// cut into chunks of 100 that are handed to the task scheduler, and the call returns
// once all chunks have been processed.
void PatchDescriptorDB::computeProjectionIndices(PatchDescriptor *descriptors, unsigned count) const
{
    const unsigned chunkSize = 100;

    if (count <= chunkSize) {
        computeProjectionIndicesSubrange(descriptors, count);
        return;
    }

    TaskGroup group;
    for (unsigned first = 0; first < count; first += chunkSize) {
        group.add(
                  boost::bind(&PatchDescriptorDB::computeProjectionIndicesSubrange, this, descriptors + first,
                              std::min<unsigned>(chunkSize, count-first)),
                  TaskScheduler::get());
    }
    TaskScheduler::get().waitFor(&group);
}



// Associates a descriptor (by its index) with the grid cell it belongs to.
// Entries are ordered by grid cell only; the patch index does not take part in the
// comparison.
struct PatchGridEntry {
    unsigned patchIndex;
    unsigned gridIndex;

    inline bool operator<(const PatchGridEntry &rhs) const
    {
        return rhs.gridIndex > gridIndex;
    }
};


// Replaces the content of the database with copies of descriptors[0 .. count-1].
//
// The descriptors are copied, their projection indices are computed, and the copies
// are reordered so that all descriptors of one grid cell are stored contiguously.
// m_grid then holds, for every cell, the start position and number of its descriptors
// inside m_descriptors (both zero for empty cells).
//
// An empty input (count == 0) is valid and yields an empty database with an all-zero
// grid; descriptors is not dereferenced in that case.
void PatchDescriptorDB::compile(PatchDescriptor *descriptors, unsigned count)
{
    m_descriptors.resize(count);

    // Taking &m_descriptors[0] of an empty vector is undefined, so skip the copy entirely.
    if (count > 0) {
        memcpy(&m_descriptors[0], descriptors, count*sizeof(PatchDescriptor));
        computeProjectionIndices(&m_descriptors[0], count);
    }

    // Number of cells: m_gridResolution to the power of m_gridDim.
    unsigned gridCellCount = 1;
    for (unsigned i = 0; i < m_gridDim; i++)
        gridCellCount *= m_gridResolution;

    // Pair every descriptor with its cell and order the pairs by cell.
    std::vector<PatchGridEntry> entries;
    entries.resize(m_descriptors.size());
    for (unsigned i = 0; i < m_descriptors.size(); i++) {
        entries[i].patchIndex = i;
        entries[i].gridIndex = m_descriptors[i].projectionIndex % gridCellCount;
    }
    std::sort(entries.begin(), entries.end());

    m_grid.resize(gridCellCount);
    if (!m_grid.empty())
        memset(&m_grid[0], 0, m_grid.size()*sizeof(GridEntry));

    std::vector<PatchDescriptor> sortedDescriptors;
    sortedDescriptors.resize(m_descriptors.size());
    for (unsigned i = 0; i < entries.size(); i++) {
        sortedDescriptors[i] = m_descriptors[entries[i].patchIndex];

        // A change of cell closes the range of the previous cell and opens the next one.
        // The very first cell starts at 0, which the zero fill above already provides.
        if ((i > 0) && (entries[i-1].gridIndex != entries[i].gridIndex)) {
            m_grid[entries[i-1].gridIndex].count = i-m_grid[entries[i-1].gridIndex].start;
            m_grid[entries[i].gridIndex].start = i;
        }
    }
    // Close the range of the last occupied cell.
    if (!entries.empty()) {
        const unsigned lastCell = entries[entries.size()-1].gridIndex;
        m_grid[lastCell].count = entries.size() - m_grid[lastCell].start;
    }

    sortedDescriptors.swap(m_descriptors);
}

// Returns lane `index` (0..3) of v as a scalar float.
// The requested lane is shuffled into lane 0, which is then read out.
template<unsigned index>
inline float extract_ps(const __m128 &v)
{
    const __m128 selected = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, index));
    return _mm_cvtss_f32(selected);
}


// Constants shared by PatchDescriptor::computeSquaredDifference() below.
// singleByteMask keeps the lowest byte of each 32 bit lane.
// factor is 1/(255*255): it is applied to the top byte of a packed pixel, and the
// result is then multiplied onto the raw 0..255 colour bytes.
static const __m128i singleByteMask = _mm_set1_epi32(0xFF);
static const __m128 factor = _mm_set1_ps(1.0f/(255.0f*255.0f));

// Splits four packed 32 bit pixels into three float channel vectors.
// Byte 3 of each pixel, multiplied by scaleFactor, acts as a per pixel scale that is
// applied to bytes 0, 1 and 2 (written to red, green and blue respectively).
static inline void unpackScaledChannels(const __m128i &packed, const __m128i &byteMask, const __m128 &scaleFactor,
                                        __m128 &red, __m128 &green, __m128 &blue)
{
    const __m128 scale = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(packed, 24), byteMask)), scaleFactor);

    red   = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(packed, byteMask)), scale);
    green = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(packed, 8), byteMask)), scale);
    blue  = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(packed, 16), byteMask)), scale);
}

// Returns the sum of squared differences between the scaled colour channels of this
// patch and of other, taken over all 16*4 groups of four pixels.
//
// NOTE(review): the aligned loads assume that data is 16 byte aligned - confirm against
// the declaration of PatchDescriptor.
float PatchDescriptorDB::PatchDescriptor::computeSquaredDifference(const PatchDescriptor &other) const
{
    __m128 accumulated = _mm_setzero_ps();

    for (unsigned block = 0; block < 16*4; block++) {
        const __m128i otherPacked = _mm_load_si128((const __m128i*)(other.data + block*4));
        const __m128i ownPacked = _mm_load_si128((const __m128i*)(data + block*4));

        __m128 otherR, otherG, otherB;
        unpackScaledChannels(otherPacked, singleByteMask, factor, otherR, otherG, otherB);

        __m128 ownR, ownG, ownB;
        unpackScaledChannels(ownPacked, singleByteMask, factor, ownR, ownG, ownB);

        const __m128 diffR = _mm_sub_ps(otherR, ownR);
        const __m128 diffG = _mm_sub_ps(otherG, ownG);
        const __m128 diffB = _mm_sub_ps(otherB, ownB);

        accumulated = _mm_add_ps(accumulated, _mm_mul_ps(diffR, diffR));
        accumulated = _mm_add_ps(accumulated, _mm_mul_ps(diffG, diffG));
        accumulated = _mm_add_ps(accumulated, _mm_mul_ps(diffB, diffB));
    }

    // Horizontal sum of the four lanes.
    float lanes[4];
    _mm_storeu_ps(lanes, accumulated);
    return lanes[0] + lanes[1] + lanes[2] + lanes[3];
}

void PatchDescriptorDB::findMatchesSubrange(const PatchDescriptor *descriptors, unsigned count, unsigned offset, Match *matches) const
{
    unsigned numGridCellsToVisit = 1;
    for (unsigned i = 0; i < m_gridDim; i++)
        numGridCellsToVisit *= 3;

    //unsigned totalNumCompares = 0;

    for (unsigned patchIndex = 0; patchIndex < count; patchIndex++) {
        unsigned bestMatch = -1;
        float bestMatchSqrDiff = 1e30f;
        float bestMatchNormDiff = 1e30f;

        const PatchDescriptor &otherPatch = descriptors[patchIndex];

        for (unsigned cell = 0; cell < numGridCellsToVisit; cell++) {

            bool validCell = true;

            unsigned cellIndex;
            {
                unsigned bucketsIndices[8];

                unsigned srcCellIndex = otherPatch.projectionIndex % m_grid.size();
                unsigned destrCell = cell;
                for (unsigned i = 0; i < m_gridDim; i++) {
                    int bucket = srcCellIndex % m_gridResolution;
                    int bucketOffset = (int)(destrCell % 3) - 1;

                    bucket += bucketOffset;

                    if ((bucket < 0) || (bucket >= (int)m_gridResolution)) {
                        validCell = false;
                        break;
                    }

                    bucketsIndices[i] = bucket;

                    srcCellIndex /= m_gridResolution;
                    destrCell /= 3;
                }

                if (!validCell)
                    continue;
                cellIndex = 0;
                for (unsigned i = 0; i < m_gridDim; i++) {
                    cellIndex = cellIndex * m_gridResolution + bucketsIndices[m_gridDim-1-i];
                }
            }

            for (unsigned selfPatchIndex = m_grid[cellIndex].start; selfPatchIndex < m_grid[cellIndex].start+m_grid[cellIndex].count; selfPatchIndex++) {
            //for (unsigned selfPatchIndex = 0; selfPatchIndex < m_candidates.size(); selfPatchIndex++) {

                const PatchDescriptor &thisPatch = m_descriptors[selfPatchIndex];
/*
                {
                    if ((thisPatch.x-otherPatch.x)*(thisPatch.x-otherPatch.x) + (thisPatch.y-otherPatch.y)*(thisPatch.y-otherPatch.y) < 1e-6f)
                        continue;
                }
*/

                float sum = thisPatch.computeSquaredDifference(otherPatch);
                //totalNumCompares++;

                if (sum < bestMatchSqrDiff) {
                    bestMatchSqrDiff = sum;
                    bestMatchNormDiff = sum / ((otherPatch.score + thisPatch.score) * 0.5f);
                    bestMatch = selfPatchIndex;
                }
            }
        }

        matches[patchIndex].srcIndex = offset+patchIndex;
        matches[patchIndex].closestMatchIndex = bestMatch;
        matches[patchIndex].sqrDiff = bestMatchSqrDiff;
        matches[patchIndex].normalizedDiff = bestMatchNormDiff;
    }
    //std::cout << "Average number of compares per patch: " << totalNumCompares / (float)count << std::endl;
}


// Runs a constrained neighbourhood search for the queries descriptors[offset] ..
// descriptors[offset+count-1].
//
// For every stored descriptor in the query's grid cell or one of its direct neighbour
// cells, constraint->preMatchTest(queryIndex, storedIndex) decides whether the pair is
// compared at all. For accepted pairs the squared difference, divided by the mean score
// of both patches, is reported through constraint->postMatchOperation().
void PatchDescriptorDB::findMatchesConstrainedSubrange(MatchConstraint *constraint, const PatchDescriptor **descriptors, unsigned offset, unsigned count) const
{
    unsigned neighbourhoodSize = 1;
    for (unsigned dim = 0; dim < m_gridDim; dim++)
        neighbourhoodSize *= 3;

    const unsigned endIndex = offset+count;
    for (unsigned queryIndex = offset; queryIndex < endIndex; queryIndex++) {
        const PatchDescriptor &query = *descriptors[queryIndex];

        for (unsigned neighbour = 0; neighbour < neighbourhoodSize; neighbour++) {
            // Unpack the query cell (base m_gridResolution) and the neighbour number
            // (base 3, digit - 1 = offset) digit by digit.
            unsigned coords[8];
            unsigned cellDigits = query.projectionIndex % m_grid.size();
            unsigned offsetDigits = neighbour;

            unsigned dim = 0;
            for (; dim < m_gridDim; dim++) {
                const int coord = (int)(cellDigits % m_gridResolution) + ((int)(offsetDigits % 3) - 1);
                if ((coord < 0) || (coord >= (int)m_gridResolution))
                    break;

                coords[dim] = coord;
                cellDigits /= m_gridResolution;
                offsetDigits /= 3;
            }
            // Leaving the loop early means the neighbour lies outside of the grid.
            if (dim != m_gridDim)
                continue;

            unsigned cellIndex = 0;
            for (unsigned d = m_gridDim; d > 0; d--)
                cellIndex = cellIndex * m_gridResolution + coords[d-1];

            const GridEntry &cellRange = m_grid[cellIndex];
            for (unsigned candidateIndex = cellRange.start; candidateIndex < cellRange.start+cellRange.count; candidateIndex++) {
                const PatchDescriptor &candidate = m_descriptors[candidateIndex];

                if (!constraint->preMatchTest(queryIndex, candidateIndex))
                    continue;

                const float sqrDiff = candidate.computeSquaredDifference(query);
                constraint->postMatchOperation(queryIndex, candidateIndex, sqrDiff / ((query.score + candidate.score) * 0.5f));
            }
        }
    }
}



// Finds the closest stored descriptor for each of descriptors[0 .. count-1].
//
// matches is resized to count, filled with one Match per query (see
// findMatchesSubrange()) and finally sorted with Match's operator<.
// More than 100 queries are processed in chunks of 100 through the task scheduler.
//
// An empty query set (count == 0) leaves matches empty; descriptors is not touched.
void PatchDescriptorDB::findMatches(const PatchDescriptor *descriptors, unsigned count, std::vector<Match> &matches) const
{
    const unsigned chunkSize = 100;

    matches.resize(count);
    // Nothing to search for, and &matches[0] must not be formed on an empty vector.
    if (count == 0)
        return;

    if (count > chunkSize) {
        TaskGroup group;
        for (unsigned i = 0; i < count; i += chunkSize) {
            group.add(boost::bind(&PatchDescriptorDB::findMatchesSubrange, this,
                                  descriptors + i, std::min<unsigned>(chunkSize, count-i), i, &matches[i]), TaskScheduler::get());
        }
        TaskScheduler::get().waitFor(&group);
    } else {
        findMatchesSubrange(descriptors, count, 0, &matches[0]);
    }

    std::sort(matches.begin(), matches.end());
}


void PatchDescriptorDB::findMatches(const PatchDescriptorDB &other, std::vector<Match> &matches) const
{
    findMatches(&other.m_descriptors[0], other.m_descriptors.size(), matches);
}


// Runs the constrained search for all descriptors supplied by the constraint object.
// The query descriptors and their number are obtained from constraint; up to 100
// queries are processed on the calling thread, larger sets are split into chunks of
// 100 that run through the task scheduler. Returns when all chunks are finished.
void PatchDescriptorDB::findMatchesConstrained(MatchConstraint *constraint) const
{
    const unsigned chunkSize = 100;

    const unsigned count = constraint->getNumDescriptors();
    const PatchDescriptor **descriptors = constraint->getDescriptors();

    if (count <= chunkSize) {
        findMatchesConstrainedSubrange(constraint, descriptors, 0, count);
        return;
    }

    TaskGroup group;
    for (unsigned first = 0; first < count; first += chunkSize) {
        group.add(boost::bind(&PatchDescriptorDB::findMatchesConstrainedSubrange, this,
                              constraint, descriptors, first, std::min<unsigned>(chunkSize, count-first)), TaskScheduler::get());
    }
    TaskScheduler::get().waitFor(&group);
}



// Recomputes the score of the stored descriptors offset .. offset+count-1.
//
// The score of a descriptor is the smallest squared difference to any other stored
// descriptor in its own or a directly neighbouring grid cell, where descriptors whose
// screen positions (looked up through userData) are closer than 1e-6 in squared
// distance are skipped. The result is limited to the range [1e-20, 20].
void PatchDescriptorDB::recomputeScoresSubrange(const std::vector<LinAlg::Vector2f> &screenPositions, unsigned offset, unsigned count)
{
    unsigned neighbourhoodSize = 1;
    for (unsigned dim = 0; dim < m_gridDim; dim++)
        neighbourhoodSize *= 3;

    const unsigned endIndex = offset+count;
    for (unsigned subjectIndex = offset; subjectIndex < endIndex; subjectIndex++) {
        PatchDescriptor &subject = m_descriptors[subjectIndex];

        // Upper limit of the score; only smaller differences replace it.
        float smallestSqrDiff = 20.0f;

        for (unsigned neighbour = 0; neighbour < neighbourhoodSize; neighbour++) {
            // Unpack the subject's cell (base m_gridResolution) and the neighbour number
            // (base 3, digit - 1 = offset) digit by digit.
            unsigned coords[8];
            unsigned cellDigits = subject.projectionIndex % m_grid.size();
            unsigned offsetDigits = neighbour;

            unsigned dim = 0;
            for (; dim < m_gridDim; dim++) {
                const int coord = (int)(cellDigits % m_gridResolution) + ((int)(offsetDigits % 3) - 1);
                if ((coord < 0) || (coord >= (int)m_gridResolution))
                    break;

                coords[dim] = coord;
                cellDigits /= m_gridResolution;
                offsetDigits /= 3;
            }
            // Leaving the loop early means the neighbour lies outside of the grid.
            if (dim != m_gridDim)
                continue;

            unsigned cellIndex = 0;
            for (unsigned d = m_gridDim; d > 0; d--)
                cellIndex = cellIndex * m_gridResolution + coords[d-1];

            const GridEntry &cellRange = m_grid[cellIndex];
            for (unsigned candidateIndex = cellRange.start; candidateIndex < cellRange.start+cellRange.count; candidateIndex++) {
                // A descriptor is never compared against itself.
                if (candidateIndex == subjectIndex)
                    continue;

                const PatchDescriptor &candidate = m_descriptors[candidateIndex];

                // Skip descriptors that sit at (practically) the same screen position.
                if ((screenPositions[candidate.userData] - screenPositions[subject.userData]).SQRLen() < 1e-6f)
                    continue;

                const float sqrDiff = candidate.computeSquaredDifference(subject);
                if (sqrDiff < smallestSqrDiff)
                    smallestSqrDiff = sqrDiff;
            }
        }

        subject.score = std::max(smallestSqrDiff, 1e-20f);
    }
}

// Recomputes the score of every stored descriptor (see recomputeScoresSubrange()).
// Up to 200 descriptors are processed on the calling thread; larger databases are
// split into chunks of 200 that run through the task scheduler. The call returns
// after all chunks have finished.
void PatchDescriptorDB::recomputeScores(const std::vector<LinAlg::Vector2f> &screenPositions)
{
    const unsigned chunkSize = 200;
    const unsigned total = m_descriptors.size();

    if (total <= chunkSize) {
        recomputeScoresSubrange(screenPositions, 0, total);
        return;
    }

    TaskGroup group;
    for (unsigned first = 0; first < total; first += chunkSize) {
        // boost::bind stores its arguments by value, which would copy the whole position
        // vector once per task. boost::cref stores a reference instead; this is safe
        // because screenPositions outlives the waitFor() call below.
        group.add(boost::bind(&PatchDescriptorDB::recomputeScoresSubrange, this,
                              boost::cref(screenPositions), first, std::min<unsigned>(chunkSize, total-first)), TaskScheduler::get());
    }
    TaskScheduler::get().waitFor(&group);
}


}
