/*
    Structure from Motion with Deferred Feature Matching and Subset Bundle Adjustment
    Copyright (C) 2015 Andreas Ley <andy-ley@arcor.de>

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/


#if 0

#undef _GLIBCXX_USE_INT128

#include "cudaKernelUtils/WarpLinAlg.hpp"
#include "cudaKernelUtils/WarpLoadStore.hpp"
#include "cub/util_ptx_reduced.cuh"

// Linear-algebra helper types used by the solver variants of the kernel below
// (presumably declared in cudaKernelUtils/WarpLinAlg.hpp -- confirm).
using cuUtils::WarpVector;
using cuUtils::WarpMatrix;

// File-scope 2D texture reference over uchar4 texels. The read mode is
// cudaReadModeNormalizedFloat, so fetches yield floating-point components
// instead of raw 8-bit integers. It is sampled by the first (active) variant
// of the kernel below. Nothing in this file binds the texture to memory;
// that has to happen on the host side (not visible here).
// NOTE(review): texture references are the legacy API and were removed in
// CUDA 12; re-enabling this file on a current toolkit would require switching
// to texture objects (cudaTextureObject_t) -- confirm the target toolkit.
texture<uchar4, cudaTextureType2D, cudaReadModeNormalizedFloat> sourceRGBAImage;

// Scratch kernel holding several mutually exclusive experiments. Exactly one
// body is compiled, chosen by the #if / #elif chain below; as written the
// first `#if 1` wins, so only the single texture fetch is built. (The whole
// translation unit is additionally disabled by the enclosing `#if 0`.)
//
// Launch configuration: launch as 32x4 threads per block, processes 4 elements
// per block. __launch_bounds__(128, 16) declares at most 128 threads per block
// and asks for at least 16 resident blocks per multiprocessor.
//
// Parameters:
//   src    - active variant: src[0] and src[1] are used as the texture
//            coordinates. Solver variants: `dimension` floats per element,
//            loaded as the input vector.
//   matrix - solver variants: `dimension * dimension` floats per element.
//            Unused by the first two variants.
//   dst    - solver variants: `dimension` floats per element receive the
//            result. Never written by the first two or the last variant.
//
// NOTE(review): no variant takes an element count or bounds-checks
// elementIndex, so the grid presumably has to match the buffer sizes exactly
// -- verify against the launching code.
extern "C" __global__ void __launch_bounds__(128, 16) kernel(const float *__restrict__ src, const float * __restrict__ matrix, float * __restrict__ dst)
{
#if 1
    // Variant 1 (active): one texture fetch per thread at (src[0], src[1]).
    // The result is never used; `volatile` presumably keeps the compiler from
    // discarding the fetch -- confirm intent.
    volatile float a = tex2D(sourceRGBAImage, src[0], src[1]).x;
#elif 1
    // Variant 2: a branch on cub::WarpId() (presumably the warp's index within
    // the block, so the condition is uniform per warp -- confirm).
    // NOTE(review): `a` and `b` are read without ever being initialized, and
    // fullIndex / elementIndex are computed but unused; this looks like a
    // code-generation probe rather than a meaningful computation.
    unsigned fullIndex = threadIdx.x + threadIdx.y * 32;
    const unsigned elementIndex = blockIdx.x * 4 + threadIdx.y;
    volatile float a;
    volatile float b;
    volatile float r;

    if (cub::WarpId() == 0)
        r = a-b;
    else
        r = a+b;

#elif 1
    // Variant 3: one 16x16 system per threadIdx.y row, four rows per block.
    // Loads vector `a` and matrix `M` for this element, solves, stores to dst.
    const unsigned elementIndex = blockIdx.x * 4 + threadIdx.y;
    const unsigned dimension = 16;

    WarpVector<dimension> a;
    a.load(src + elementIndex * dimension);

    WarpMatrix<dimension, dimension> M;
    M.load(matrix + elementIndex * dimension*dimension);

    // Third argument is presumably the iteration count/limit of the solver
    // -- confirm against conjugateGradientSolve's declaration.
    WarpVector<dimension> b = conjugateGradientSolve(M, a, 16);
//    WarpVector<dimension> b = M * a;


    b.store(dst + elementIndex * dimension);

#elif 0
    // Variant 4 (disabled): four independent solves per threadIdx.y row, with
    // a block stride of 16 elements.
    // NOTE(review): the per-solve offsets are +0/+2/+4/+6. With the documented
    // 32x4 launch (threadIdx.y in 0..3) different rows would touch the same
    // elements (e.g. row 0's second solve and row 2's first), and elements
    // 10..15 of each block's range are never processed. Offsets of +4/+8/+12
    // or a different launch shape may have been intended -- confirm.
    const unsigned elementIndex = blockIdx.x * 16 + threadIdx.y;
    const unsigned dimension = 16;

    WarpVector<dimension> a1, a2, a3, a4;
    a1.load(src + elementIndex * dimension);
    a2.load(src + (elementIndex+2) * dimension);
    a3.load(src + (elementIndex+4) * dimension);
    a4.load(src + (elementIndex+6) * dimension);

    WarpMatrix<dimension, dimension> M1, M2, M3, M4;
    M1.load(matrix + elementIndex * dimension*dimension);
    M2.load(matrix + (elementIndex+2) * dimension*dimension);
    M3.load(matrix + (elementIndex+4) * dimension*dimension);
    M4.load(matrix + (elementIndex+6) * dimension*dimension);

    WarpVector<dimension> b1 = conjugateGradientSolve(M1, a1, 16);
    WarpVector<dimension> b2 = conjugateGradientSolve(M2, a2, 16);
    WarpVector<dimension> b3 = conjugateGradientSolve(M3, a3, 16);
    WarpVector<dimension> b4 = conjugateGradientSolve(M4, a4, 16);
//    WarpVector<dimension> b = M * a;


    b1.store(dst + elementIndex * dimension);
    b2.store(dst + (elementIndex+2) * dimension);
    b3.store(dst + (elementIndex+4) * dimension);
    b4.store(dst + (elementIndex+6) * dimension);

#else
    // Variant 5 (fallback): copies into a per-block shared array of four
    // Matrix slots, one destination slot per threadIdx.y row.
    // NOTE(review): `Matrix` is not declared anywhere in the visible file, and
    // the helper lives in `cuWarpUtils` whereas the rest of the file uses
    // `cuUtils` -- this branch likely does not compile as-is. The source
    // pointer also lacks a `+ threadIdx.y` term, so every row appears to copy
    // the same source matrix -- confirm. Nothing reads sharedM afterwards.
    __shared__ Matrix sharedM[4];
    const Matrix *srcMatrices = (const Matrix*)matrix;
    cuWarpUtils::warpCopy<Matrix>(sharedM + threadIdx.y, srcMatrices + blockIdx.x * 4);
#endif
}

#endif
