/*
    Structure from Motion with Deferred Feature Matching and Subset Bundle Adjustment
    Copyright (C) 2015 Andreas Ley <andy-ley@arcor.de>

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/


#include "../cudaInterface/PatchAtlasConstants.h"

#include "sourceImageToPatchAtlasTransfer.h"

// 2D texture reference over uchar4 texels; fetches return the channels converted to
// normalized floats (cudaReadModeNormalizedFloat). Both kernels in this file sample it
// with tex2DLod, i.e. with an explicit mip level.
// NOTE(review): texture/surface references are the legacy binding API (deprecated in
// CUDA 11, removed in CUDA 12) - confirm the toolkit version this file is built with.
texture<uchar4, 2, cudaReadModeNormalizedFloat> sourceImage;
// Layered 2D surface reference that transferLayerLevel writes its uchar4 results to.
// Presumably bound by the host to one mip level of the patch atlas - verify against caller.
surface<void, cudaSurfaceType2DLayered> outputLayerLevel;


// Launch-wide parameters in constant memory. The kernels in this file read
// stepX/stepY, w/h, wHalf/hHalf and mipLevel from it; the struct itself is declared
// elsewhere (presumably in sourceImageToPatchAtlasTransfer.h - TODO confirm).
__constant__ TransferGlobals transferGlobals;

// Computes the luminance normalization parameters (scale, offset) of one patch per block.
//
// Block b copies transferInfo[b] into shared memory, samples a 16x16 grid of texels from
// sourceImage centered on (sourceX, sourceY) at mip level sourceBaseMipLevel + 2, and
// reduces the sum and the sum of squares of the per-texel luminance (x + y + z) * w
// across the block. Thread 0 then derives mean and standard deviation and writes
//     params[2*b+0] = 3 / max(upper - lower, 0.0001)   with lower/upper = mean -/+ 4.5 * stdDev
//     params[2*b+1] = -(lower * params[2*b+0]) * 0.333
//
// Launch layout: the indexing below (threadIdx.y * 8 + threadIdx.x, 64-entry accumulators,
// loop stride of 8) is written for 8x8 thread blocks, with one block per transferInfo
// entry along blockIdx.x. The cooperative copy additionally relies on sizeof(TransferInfo)
// being a multiple of 4 bytes and at most 64 words; TransferInfo is declared elsewhere,
// so verify that there.
//
// transferInfo: device array of patch descriptors, indexed by blockIdx.x.
// params:       device output array, two floats per block as described above.
extern "C" __global__ void computeNormalizationParams(TransferInfo *transferInfo, float *params)
{
    const unsigned fullIndex = threadIdx.y * 8 + threadIdx.x;

    __shared__ TransferInfo info;

    // Cooperative word-wise copy of this block's descriptor into shared memory.
    if (fullIndex < sizeof(TransferInfo)/4) {
        uint32_t *dstPtr = (uint32_t *) &info;
        const uint32_t *srcPtr = (const uint32_t *) &transferInfo[blockIdx.x];
        dstPtr[fullIndex] = srcPtr[fullIndex];
    }
    __syncthreads();

    // Per-thread partial sums: [.]1 accumulates luminance, [.]2 accumulates luminance^2.
    __shared__ float avgLumAcc1[8*8];
    __shared__ float avgLumAcc2[8*8];
    avgLumAcc1[fullIndex] = 0.0f;
    avgLumAcc2[fullIndex] = 0.0f;

    // Each thread visits 2x2 of the 16x16 sample positions (stride 8 in both directions).
    for (unsigned y = threadIdx.y; y < 16; y+=8) {
        float sourceY = info.sourceY + (y - 7.5f) * transferGlobals.stepY * info.stepScale;
        for (unsigned x = threadIdx.x; x < 16; x+=8) {
            float sourceX = info.sourceX + (x - 7.5f) * transferGlobals.stepX * info.stepScale;

            // Sampled two mip levels above the patch's base level.
            float4 color = tex2DLod(sourceImage, sourceX, sourceY, info.sourceBaseMipLevel + 2.0f);

            float lum = (color.x + color.y + color.z) * color.w;
            avgLumAcc1[fullIndex] += lum;
            avgLumAcc2[fullIndex] += lum*lum;
        }
    }

    // Tree reduction of the 64 partial sums into element 0. The barrier sits outside the
    // conditional so that every thread of the block reaches it.
    __syncthreads();
    for (unsigned i = 8*8/2; i > 0; i >>= 1) {
        if (fullIndex < i) {
            avgLumAcc1[fullIndex] += avgLumAcc1[fullIndex+i];
            avgLumAcc2[fullIndex] += avgLumAcc2[fullIndex+i];
        }
        __syncthreads();
    }
    if (fullIndex == 0) {
        float mean = avgLumAcc1[0] / (16*16);

        // E[x^2] - E[x]^2 can come out slightly negative through rounding when the patch
        // is (nearly) uniform. Without the clamp sqrtf() yields NaN, which the fmax()
        // below hides in the scale but which still propagates into the offset.
        float variance = fmax(avgLumAcc2[0] / (16*16) - mean*mean, 0.0f);
        float stdDev = sqrtf(variance);

        float lower = mean - 4.5f * stdDev;
        float upper = mean + 4.5f * stdDev;

        // The lower bound on the range keeps the scale finite for flat patches.
        float scale = 3.0f / fmax(upper - lower, 0.0001f);

        params[blockIdx.x*2+0] = scale;
        params[blockIdx.x*2+1] = -(lower * scale) * 0.333f;
    }

}

// Resamples one patch per block from sourceImage into the layered output surface.
//
// Block b copies transferInfo[b] into shared memory. Its threads then walk the
// transferGlobals.w x transferGlobals.h texels of the patch with a stride of 8 in both
// directions, fetch the source texel at mip level sourceBaseMipLevel + mipLevel, clamp
// the w-weighted colour to [0, 1] and store it as a uchar4 whose w component holds the
// quantized maximum channel and whose x/y/z components are rescaled relative to it.
//
// Launch layout: written for 8x8 thread blocks (threadIdx.y * 8 + threadIdx.x, loop
// stride 8), with one block per transferInfo entry along blockIdx.x.
//
// transferInfo: device array of patch descriptors, indexed by blockIdx.x.
// scaleParams:  currently unused; the per-patch scale/offset lookup is disabled and the
//               kernel runs with scale 1 and offset 0.
extern "C" __global__ void transferLayerLevel(TransferInfo *transferInfo, float *scaleParams)
{
    const unsigned linearTid = threadIdx.y * 8 + threadIdx.x;

    __shared__ TransferInfo entry;

    // Cooperative word-wise copy of this block's descriptor into shared memory.
    if (linearTid < sizeof(TransferInfo)/4) {
        uint32_t *entryWords = (uint32_t *) &entry;
        uint32_t *globalWords = (uint32_t *) &transferInfo[blockIdx.x];
        entryWords[linearTid] = globalWords[linearTid];
    }
    __syncthreads();

    __shared__ float gain, bias;
    if (linearTid == 0) {
        // Normalization switched off; the disabled lookups were
        // scaleParams[blockIdx.x*2+0] (scale) and scaleParams[blockIdx.x*2+1] (offset).
        gain = 1.0f;
        bias = 0.0f;
    }
    __syncthreads();

    for (unsigned row = threadIdx.y; row < transferGlobals.h; row+=8) {
        const float srcY = entry.sourceY + (row - transferGlobals.hHalf) * transferGlobals.stepY * entry.stepScale;

        for (unsigned col = threadIdx.x; col < transferGlobals.w; col+=8) {
            const float srcX = entry.sourceX + (col - transferGlobals.wHalf) * transferGlobals.stepX * entry.stepScale;

            const float4 texel = tex2DLod(sourceImage, srcX, srcY, entry.sourceBaseMipLevel + transferGlobals.mipLevel);

            // Weight every channel by the texel's w component, then clamp to [0, 1].
            const float weight = gain * texel.w;
            const float red   = fmin(fmax(texel.x * weight + bias, 0.0f), 1.0f);
            const float green = fmin(fmax(texel.y * weight + bias, 0.0f), 1.0f);
            const float blue  = fmin(fmax(texel.z * weight + bias, 0.0f), 1.0f);

            // Quantize the brightest channel to 1..255; it becomes the shared multiplier.
            const float peak = fmax(red, fmax(green, blue));
            const unsigned peakQuantized = min((int)(peak * 255+1), 255);

            // Channels are stored relative to the quantized peak.
            const float toStored = 255.0f*255.0f / peakQuantized;

            uchar4 packed;
            packed.x = min((int)(red * toStored), 255);
            packed.y = min((int)(green * toStored), 255);
            packed.z = min((int)(blue * toStored), 255);
            packed.w = peakQuantized;

            // Surface x coordinates are byte offsets, hence the factor of 4 per uchar4.
            const int dstByteX = ((entry.destinationX >> transferGlobals.mipLevel)+col)*4;
            const int dstY = (entry.destinationY >> transferGlobals.mipLevel)+row;

            surf2DLayeredwrite(packed, outputLayerLevel, dstByteX, dstY, entry.destinationLayer);
        }
    }
}
