/*
    Structure from Motion with Deferred Feature Matching and Subset Bundle Adjustment
    Copyright (C) 2015 Andreas Ley <andy-ley@arcor.de>

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

#include <stdint.h>

// Source image read by sample() via tex2D(). Texels are uchar4; because of
// cudaReadModeNormalizedFloat every fetch returns a float4 whose channels are
// mapped from [0, 255] to [0.0, 1.0].
// NOTE(review): binding, addressing mode and filter mode are set by host code
// that is not visible in this file - confirm there before changing the
// sampling coordinates used by the kernels.
// NOTE(review): texture/surface *references* are the legacy CUDA API; moving
// to texture/surface objects would also require host-side changes.
texture<uchar4, 2, cudaReadModeNormalizedFloat> sourceImage;
// Destination surface. downsampleRGBMImage writes uchar4 pixels to it and
// convertRGBMImage reads and rewrites its pixels in place.
surface<void, 2> outputImage;


/**
 * Decodes a packed 32-bit RGBM pixel into three float channels.
 *
 * Bit layout of @p color: bits 0-7 red, 8-15 green, 16-23 blue and
 * 24-31 the shared multiplier. Each output is
 * channel * multiplier / (255 * 255), i.e. it lies in [0, 1].
 *
 * @param color packed pixel value
 * @param r     receives the decoded red channel
 * @param g     receives the decoded green channel
 * @param b     receives the decoded blue channel
 */
__device__ void decodeColor(uint32_t color, float &r, float &g, float &b)
{
    const uint32_t redByte        = (color >> 0) & 0xFF;
    const uint32_t greenByte      = (color >> 8) & 0xFF;
    const uint32_t blueByte       = (color >> 16) & 0xFF;
    const uint32_t multiplierByte = (color >> 24) & 0xFF;

    // Folds both 1/255 normalisations (channel and multiplier) into one factor.
    const float scale = multiplierByte / (255.0f*255.0f);

    r = redByte * scale;
    g = greenByte * scale;
    b = blueByte * scale;
}

/**
 * Decodes an RGBM sample that is already in float form.
 *
 * The x/y/z components carry the color and w carries the shared multiplier;
 * each output channel is the corresponding component scaled by w.
 *
 * @param srcColor sample with color in x/y/z and multiplier in w
 * @param r        receives srcColor.x * srcColor.w
 * @param g        receives srcColor.y * srcColor.w
 * @param b        receives srcColor.z * srcColor.w
 */
__device__ void decodeColor(float4 srcColor, float &r, float &g, float &b)
{
    const float multiplier = srcColor.w;

    r = multiplier * srcColor.x;
    g = multiplier * srcColor.y;
    b = multiplier * srcColor.z;
}

/**
 * Encodes a float RGB triple into the 8-bit RGBM layout.
 *
 * data.w receives a shared multiplier derived from the largest channel
 * (floor(max * 255) + 1, capped at 255) and data.x/y/z receive the channels
 * rescaled by 255 * 255 / multiplier, so that
 * channel ~= byte * multiplier / (255 * 255) when decoded.
 *
 * Inputs are clamped to [0, 1] first. Without the clamp a negative or NaN
 * maximum produces a zero or wrapped multiplier (division by zero), and
 * values above 1 overflow the float -> unsigned char conversion. For inputs
 * already inside [0, 1] the clamp is a no-op and the result is unchanged.
 *
 * @param r    red channel, expected in [0, 1]
 * @param g    green channel, expected in [0, 1]
 * @param b    blue channel, expected in [0, 1]
 * @param data receives the encoded pixel (x = R, y = G, z = B, w = multiplier)
 */
__device__ void encodeColor(float r, float g, float b, uchar4 &data)
{
    // fmaxf() returns the non-NaN operand, so a NaN channel becomes 0 here.
    r = fminf(fmaxf(r, 0.0f), 1.0f);
    g = fminf(fmaxf(g, 0.0f), 1.0f);
    b = fminf(fmaxf(b, 0.0f), 1.0f);

    const float maxChannel = fmaxf(r, fmaxf(g, b));

    // The "+ 1" keeps the multiplier strictly above max * 255 (and never 0),
    // so the rescaled channels below cannot exceed 255.
    const int multiplier = min((int)(maxChannel * 255.0f + 1.0f), 255);
    data.w = (unsigned char)multiplier;

    const float scale = 255.0f*255.0f / multiplier;

    // Truncating conversion, as before; values are in [0, 255] by construction.
    data.x = (unsigned char)(r * scale);
    data.y = (unsigned char)(g * scale);
    data.z = (unsigned char)(b * scale);
}

/**
 * Fetches one texel from sourceImage, decodes it and adds the decoded
 * channels to the running sums.
 *
 * @param u     texture x coordinate passed to tex2D()
 * @param v     texture y coordinate passed to tex2D()
 * @param sum_r accumulator incremented by the decoded red channel
 * @param sum_g accumulator incremented by the decoded green channel
 * @param sum_b accumulator incremented by the decoded blue channel
 */
__device__ void sample(float u, float v, float &sum_r, float &sum_g, float &sum_b)
{
    const float4 texel = tex2D(sourceImage, u, v);

    float red, green, blue;
    decodeColor(texel, red, green, blue);

    sum_r += red;
    sum_g += green;
    sum_b += blue;
}

/**
 * Downsamples the RGBM image bound to sourceImage into outputImage.
 *
 * One thread produces one destination pixel: it takes four texture taps at
 * the corners of the destination pixel's footprint, (x, y), (x+1, y),
 * (x, y+1) and (x+1, y+1), each scaled into source coordinates. It then
 * averages the decoded colors and writes the re-encoded RGBM pixel.
 *
 * Launch layout: a 2D grid with at least dstWidth x dstHeight threads in
 * total; threads outside the destination rectangle do nothing.
 * NOTE(review): the effective filter footprint also depends on the filter
 * mode configured for sourceImage by the host, which is not visible here.
 *
 * @param dstWidth  width of the destination image in pixels
 * @param dstHeight height of the destination image in pixels
 * @param scaleX    factor mapping destination x coordinates to source texture x coordinates
 * @param scaleY    factor mapping destination y coordinates to source texture y coordinates
 */
extern "C" __global__ void downsampleRGBMImage(unsigned dstWidth, unsigned dstHeight, float scaleX, float scaleY)
{
    const unsigned x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned y = blockIdx.y * blockDim.y + threadIdx.y;

    // The grid rarely divides the image evenly; surplus threads exit early.
    if ((x >= dstWidth) || (y >= dstHeight))
        return;

    float sum_r = 0.0f;
    float sum_g = 0.0f;
    float sum_b = 0.0f;

    // 2x2 taps, rows outer and columns inner, i.e. the same accumulation
    // order as the previous hand-unrolled version.
    #pragma unroll
    for (unsigned dy = 0; dy < 2; ++dy) {
        const float v = (float)(y + dy) * scaleY;
        #pragma unroll
        for (unsigned dx = 0; dx < 2; ++dx) {
            const float u = (float)(x + dx) * scaleX;
            sample(u, v, sum_r, sum_g, sum_b);
        }
    }

    // Average of the four taps.
    sum_r *= 0.25f;
    sum_g *= 0.25f;
    sum_b *= 0.25f;

    uchar4 data;
    encodeColor(sum_r, sum_g, sum_b, data);

    // Surface x coordinates are byte offsets, hence the pixel-size factor.
    surf2Dwrite(data, outputImage, x * (unsigned)sizeof(uchar4), y);
}


/**
 * Re-encodes every pixel of outputImage in place into the RGBM layout.
 *
 * Each thread reads one uchar4 pixel, maps its x/y/z bytes to floats by
 * dividing by 255, and writes back the result of encodeColor(). The w byte
 * that was read is not used; encodeColor() overwrites it.
 *
 * Launch layout: a 2D grid with at least dstWidth x dstHeight threads in
 * total; threads outside that rectangle do nothing.
 *
 * @param dstWidth  width of the image in pixels
 * @param dstHeight height of the image in pixels
 */
extern "C" __global__ void convertRGBMImage(unsigned dstWidth, unsigned dstHeight)
{
    const unsigned x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned y = blockIdx.y * blockDim.y + threadIdx.y;

    if ((x >= dstWidth) || (y >= dstHeight))
        return;

    // Surface x coordinates are byte offsets: 4 bytes per uchar4 pixel.
    const unsigned byteX = x * 4;

    uchar4 pixel;
    surf2Dread(&pixel, outputImage, byteX, y);

    float red   = pixel.x / 255.0f;
    float green = pixel.y / 255.0f;
    float blue  = pixel.z / 255.0f;
#if 0
    // Gamma expansion, currently compiled out.
    red   = powf(red, 2.2f);
    green = powf(green, 2.2f);
    blue  = powf(blue, 2.2f);
#endif
    encodeColor(red, green, blue, pixel);
    surf2Dwrite(pixel, outputImage, byteX, y);
}


