/*
    Structure from Motion with Deferred Feature Matching and Subset Bundle Adjustment
    Copyright (C) 2015 Andreas Ley <andy-ley@arcor.de>

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

#include "PFJacobi.h"

#include <string.h>
#include <assert.h>
#include <immintrin.h>
#include "../../tools/RasterImage.h"
#include "../../tools/TaskScheduler.h"


namespace SFM {

namespace PFBundleAdjustment {




template<unsigned NumCameraUpdateParameters, unsigned NumCalibrationUpdateParameters>
void PFJacobiTransposed<NumCameraUpdateParameters, NumCalibrationUpdateParameters>::resize(unsigned numObservationBlocks)
{
    m_tracks.resize(numObservationBlocks);
    m_cameraHeaders.resize(numObservationBlocks);
    m_cameraData.resize(numObservationBlocks);
    m_calibHeaders.resize(numObservationBlocks);
    m_calibData.resize(numObservationBlocks);

    memset(&m_tracks[0], 0, m_tracks.size() * sizeof(PFJacobiTransposedTrackBlock));
    memset(&m_cameraHeaders[0], 0, m_cameraHeaders.size() * sizeof(PFJacobiTransposedCameraBlockHeader));
    memset(&m_cameraData[0], 0, m_cameraData.size() * sizeof(SizedPFJacobiTransposedCameraBlockData));
    memset(&m_calibHeaders[0], 0, m_calibHeaders.size() * sizeof(PFJacobiTransposedCalibBlockHeader));
    memset(&m_calibData[0], 0, m_calibData.size() * sizeof(SizedPFJacobiTransposedCalibBlockData));
}

template<unsigned NumCameraUpdateParameters, unsigned NumCalibrationUpdateParameters>
void PFJacobiTransposed<NumCameraUpdateParameters, NumCalibrationUpdateParameters>::setupObservationBlock(unsigned observationBlockIndex, unsigned trackParameterIndices[32], unsigned cameraParameterIndex, unsigned calibParameterIndex)
{
    for (unsigned j = 0; j < 32; j++)
        m_tracks[observationBlockIndex].trackParameterIndices[j] = trackParameterIndices[j];

    m_cameraHeaders[observationBlockIndex].cameraParameterIndex = cameraParameterIndex;
    m_calibHeaders[observationBlockIndex].calibParameterIndex = calibParameterIndex;
}

template<unsigned NumCameraUpdateParameters, unsigned NumCalibrationUpdateParameters>
void PFJacobiTransposed<NumCameraUpdateParameters, NumCalibrationUpdateParameters>::setSingleTrackData(unsigned observationIdx, float *data)
{
    unsigned obsBlockIdx = observationIdx / 32;
    unsigned obsLaneIdx = observationIdx % 32;
    for (unsigned i = 0; i < NUM_TRACK_PARAMS; i++) {
        m_tracks[obsBlockIdx].xDerivatives[obsLaneIdx + i*32] = data[i];
        m_tracks[obsBlockIdx].yDerivatives[obsLaneIdx + i*32] = data[i + NUM_TRACK_PARAMS];
    }
}

template<unsigned NumCameraUpdateParameters, unsigned NumCalibrationUpdateParameters>
void PFJacobiTransposed<NumCameraUpdateParameters, NumCalibrationUpdateParameters>::setSingleCameraData(unsigned observationIdx, float *data)
{
    unsigned obsBlockIdx = observationIdx / 32;
    unsigned obsLaneIdx = observationIdx % 32;
    for (unsigned i = 0; i < NumCameraUpdateParameters; i++) {
        m_cameraData[obsBlockIdx].xDerivatives[obsLaneIdx + i*32] = data[i];
        m_cameraData[obsBlockIdx].yDerivatives[obsLaneIdx + i*32] = data[i + NumCameraUpdateParameters];
    }
}

template<unsigned NumCameraUpdateParameters, unsigned NumCalibrationUpdateParameters>
void PFJacobiTransposed<NumCameraUpdateParameters, NumCalibrationUpdateParameters>::setSingleInternalCalibData(unsigned observationIdx, float *data)
{
    unsigned obsBlockIdx = observationIdx / 32;
    unsigned obsLaneIdx = observationIdx % 32;
    for (unsigned i = 0; i < NumCalibrationUpdateParameters; i++) {
        m_calibData[obsBlockIdx].xDerivatives[obsLaneIdx + i*32] = data[i*2+0];
        m_calibData[obsBlockIdx].yDerivatives[obsLaneIdx + i*32] = data[i*2+1];
    }
}

template<unsigned NumCameraUpdateParameters, unsigned NumCalibrationUpdateParameters>
void PFJacobiTransposed<NumCameraUpdateParameters, NumCalibrationUpdateParameters>::multiplyWithVector(const std::vector<float, AlignedAllocator<float>> &src,
                                                                                                       std::vector<float, AlignedAllocator<float>> &dst,
                                                                                                       unsigned batchSize) const
{
    dst.resize(m_tracks.size()*32*2);
    if (m_tracks.size() > batchSize) {
        TaskGroup group;
        for (unsigned i = 0; i < m_tracks.size(); i+=batchSize)
            group.add(boost::bind(&PFJacobiTransposed::multiplyWithVectorSubrange,
                                  this, &src[0], &dst[0], i, std::min<unsigned>(batchSize, m_tracks.size()-i)),
                                TaskScheduler::get());

        TaskScheduler::get().waitFor(&group);
    } else
        multiplyWithVectorSubrange(&src[0], &dst[0], 0, m_tracks.size());
}


template<unsigned NumCameraUpdateParameters, unsigned NumCalibrationUpdateParameters>
void PFJacobiTransposed<NumCameraUpdateParameters, NumCalibrationUpdateParameters>::multiplyWithVectorSubrange(const float *src, float *dst, unsigned start, unsigned count) const
{
    for (unsigned i = start; i < start+count; i++) {
        CpuWarpReg X;
        CpuWarpReg Y;

        {
            const PFJacobiTransposedTrackBlock &block = m_tracks[i];

            CpuWarpReg T;
            CpuWarpReg Dx, Dy;

            T.gather(&src[0] + 0, block.trackParameterIndices);
            Dx.loadAligned(block.xDerivatives + 0*32);
            Dy.loadAligned(block.yDerivatives + 0*32);
            X = Dx * T;
            Y = Dy * T;

            T.gather(&src[0] + 1, block.trackParameterIndices);
            Dx.loadAligned(block.xDerivatives + 1*32);
            Dy.loadAligned(block.yDerivatives + 1*32);
            X += Dx * T;
            Y += Dy * T;

            T.gather(&src[0] + 2, block.trackParameterIndices);
            Dx.loadAligned(block.xDerivatives + 2*32);
            Dy.loadAligned(block.yDerivatives + 2*32);
            X += Dx * T;
            Y += Dy * T;

            T.gather(&src[0] + 3, block.trackParameterIndices);
            Dx.loadAligned(block.xDerivatives + 3*32);
            Dy.loadAligned(block.yDerivatives + 3*32);
            X += Dx * T;
            Y += Dy * T;

        }

        {
            const SizedPFJacobiTransposedCameraBlockData &block = m_cameraData[i];
            const unsigned cameraParameterIndex = m_cameraHeaders[i].cameraParameterIndex;

            CpuWarpReg Dx, Dy;

            static_assert(NumCameraUpdateParameters <= 6, "This code is a loop that is manually unrolled for up to 6 camera parameters!");
            {
                unsigned offset = 0;
                switch (NumCameraUpdateParameters) {
                    case 6:
                        Dx.loadAligned(block.xDerivatives + offset*32);
                        Dy.loadAligned(block.yDerivatives + offset*32);
                        X += Dx * (&src[cameraParameterIndex] + offset);
                        Y += Dy * (&src[cameraParameterIndex] + offset);
                        offset++;
                    case 5:
                        Dx.loadAligned(block.xDerivatives + offset*32);
                        Dy.loadAligned(block.yDerivatives + offset*32);
                        X += Dx * (&src[cameraParameterIndex] + offset);
                        Y += Dy * (&src[cameraParameterIndex] + offset);
                        offset++;
                    case 4:
                        Dx.loadAligned(block.xDerivatives + offset*32);
                        Dy.loadAligned(block.yDerivatives + offset*32);
                        X += Dx * (&src[cameraParameterIndex] + offset);
                        Y += Dy * (&src[cameraParameterIndex] + offset);
                        offset++;
                    case 3:
                        Dx.loadAligned(block.xDerivatives + offset*32);
                        Dy.loadAligned(block.yDerivatives + offset*32);
                        X += Dx * (&src[cameraParameterIndex] + offset);
                        Y += Dy * (&src[cameraParameterIndex] + offset);
                        offset++;
                    case 2:
                        Dx.loadAligned(block.xDerivatives + offset*32);
                        Dy.loadAligned(block.yDerivatives + offset*32);
                        X += Dx * (&src[cameraParameterIndex] + offset);
                        Y += Dy * (&src[cameraParameterIndex] + offset);
                        offset++;
                    case 1:
                        Dx.loadAligned(block.xDerivatives + offset*32);
                        Dy.loadAligned(block.yDerivatives + offset*32);
                        X += Dx * (&src[cameraParameterIndex] + offset);
                        Y += Dy * (&src[cameraParameterIndex] + offset);
                        offset++;
                    case 0: {}
                }
            }
        }
        {
            const SizedPFJacobiTransposedCalibBlockData &block = m_calibData[i];
            const unsigned calibParameterIndex = m_calibHeaders[i].calibParameterIndex;

            CpuWarpReg Dx, Dy;


            static_assert(NumCalibrationUpdateParameters <= 8, "This code is a loop that is manually unrolled for up to 8 calibration parameters!");
            {
                unsigned offset = 0;
                switch (NumCalibrationUpdateParameters) {
                    case 8:
                        Dx.loadAligned(block.xDerivatives + offset*32);
                        Dy.loadAligned(block.yDerivatives + offset*32);
                        X += Dx * (&src[calibParameterIndex] + offset);
                        Y += Dy * (&src[calibParameterIndex] + offset);
                        offset++;
                    case 7:
                        Dx.loadAligned(block.xDerivatives + offset*32);
                        Dy.loadAligned(block.yDerivatives + offset*32);
                        X += Dx * (&src[calibParameterIndex] + offset);
                        Y += Dy * (&src[calibParameterIndex] + offset);
                        offset++;
                    case 6:
                        Dx.loadAligned(block.xDerivatives + offset*32);
                        Dy.loadAligned(block.yDerivatives + offset*32);
                        X += Dx * (&src[calibParameterIndex] + offset);
                        Y += Dy * (&src[calibParameterIndex] + offset);
                        offset++;
                    case 5:
                        Dx.loadAligned(block.xDerivatives + offset*32);
                        Dy.loadAligned(block.yDerivatives + offset*32);
                        X += Dx * (&src[calibParameterIndex] + offset);
                        Y += Dy * (&src[calibParameterIndex] + offset);
                        offset++;
                    case 4:
                        Dx.loadAligned(block.xDerivatives + offset*32);
                        Dy.loadAligned(block.yDerivatives + offset*32);
                        X += Dx * (&src[calibParameterIndex] + offset);
                        Y += Dy * (&src[calibParameterIndex] + offset);
                        offset++;
                    case 3:
                        Dx.loadAligned(block.xDerivatives + offset*32);
                        Dy.loadAligned(block.yDerivatives + offset*32);
                        X += Dx * (&src[calibParameterIndex] + offset);
                        Y += Dy * (&src[calibParameterIndex] + offset);
                        offset++;
                    case 2:
                        Dx.loadAligned(block.xDerivatives + offset*32);
                        Dy.loadAligned(block.yDerivatives + offset*32);
                        X += Dx * (&src[calibParameterIndex] + offset);
                        Y += Dy * (&src[calibParameterIndex] + offset);
                        offset++;
                    case 1:
                        Dx.loadAligned(block.xDerivatives + offset*32);
                        Dy.loadAligned(block.yDerivatives + offset*32);
                        X += Dx * (&src[calibParameterIndex] + offset);
                        Y += Dy * (&src[calibParameterIndex] + offset);
                        offset++;
                    case 0: {}
                }
            }
        }
        X.storeAligned(&dst[i*32*2+0*32]);
        Y.storeAligned(&dst[i*32*2+1*32]);
    }
}


template<unsigned NumCameraUpdateParameters, unsigned NumCalibrationUpdateParameters>
void PFJacobiTransposed<NumCameraUpdateParameters, NumCalibrationUpdateParameters>::writeLayoutToImage(const char *filename)
{
    unsigned numCols = m_tracks.size()*32*2;


    unsigned numRows = 0;
    for (unsigned i = 0; i < m_calibHeaders.size(); i++) {
        numRows = std::max(numRows, m_calibHeaders[i].calibParameterIndex + NumCalibrationUpdateParameters);
    }

    RasterImage image(numCols, numRows);
    image.clear(0xFFFFFFFF);

    for (unsigned i = 0; i < m_tracks.size(); i++) {
        for (unsigned j = 0; j < 32; j++) {
            unsigned row = m_tracks[i].trackParameterIndices[j];

            image.getData()[(i*32*2 + 0*32 + j) + (row+0)*numCols] = 0xFF101010;
            image.getData()[(i*32*2 + 0*32 + j) + (row+1)*numCols] = 0xFF101010;
            image.getData()[(i*32*2 + 0*32 + j) + (row+2)*numCols] = 0xFF101010;
            image.getData()[(i*32*2 + 0*32 + j) + (row+3)*numCols] = 0xFF101010;

            image.getData()[(i*32*2 + 1*32 + j) + (row+0)*numCols] = 0xFF101010;
            image.getData()[(i*32*2 + 1*32 + j) + (row+1)*numCols] = 0xFF101010;
            image.getData()[(i*32*2 + 1*32 + j) + (row+2)*numCols] = 0xFF101010;
            image.getData()[(i*32*2 + 1*32 + j) + (row+3)*numCols] = 0xFF101010;
        }
    }

    for (unsigned i = 0; i < m_cameraHeaders.size(); i++) {
        unsigned row = m_cameraHeaders[i].cameraParameterIndex;
        for (unsigned j = 0; j < 32; j++) {
            for (unsigned k = 0; k < NumCameraUpdateParameters; k++) {
                image.getData()[(i*32*2 + 0*32 + j) + (row+k)*numCols] = 0xFF101010;
                image.getData()[(i*32*2 + 1*32 + j) + (row+k)*numCols] = 0xFF101010;
            }
        }
    }

    for (unsigned i = 0; i < m_calibHeaders.size(); i++) {
        unsigned row = m_calibHeaders[i].calibParameterIndex;
        for (unsigned j = 0; j < 32; j++) {
            for (unsigned k = 0; k < NumCalibrationUpdateParameters; k++) {
                image.getData()[(i*32*2 + 0*32 + j) + (row+k)*numCols] = 0xFF101010;
                image.getData()[(i*32*2 + 1*32 + j) + (row+k)*numCols] = 0xFF101010;
            }
        }
    }

    image.writeToFile(filename);
}
/*
template<unsigned NumCameraUpdateParameters, unsigned NumCalibrationUpdateParameters>
std::ostream &operator<<(std::ostream &stream, const PFJacobiTransposed<NumCameraUpdateParameters, NumCalibrationUpdateParameters> &jacobi)
{
    unsigned numCols = jacobi.m_tracks.size()*32*2;


    unsigned numRows = 0;
    for (unsigned i = 0; i < jacobi.m_calibHeaders.size(); i++) {
        numRows = std::max(numRows, jacobi.m_calibHeaders[i].calibParameterIndex + NUM_CALIB_PARAMS);
    }

    std::vector<float> fullMatrix;
    fullMatrix.resize(numCols * numRows);

    for (unsigned i = 0; i < jacobi.m_tracks.size(); i++) {
        for (unsigned j = 0; j < 32; j++) {
            unsigned row = jacobi.m_tracks[i].trackParameterIndices[j];
            fullMatrix[(i*32*2 + 0*32 + j) + (row+0)*numCols] = jacobi.m_tracks[i].xDerivatives[j + 0*32];
            fullMatrix[(i*32*2 + 0*32 + j) + (row+1)*numCols] = jacobi.m_tracks[i].xDerivatives[j + 1*32];
            fullMatrix[(i*32*2 + 0*32 + j) + (row+2)*numCols] = jacobi.m_tracks[i].xDerivatives[j + 2*32];
            fullMatrix[(i*32*2 + 0*32 + j) + (row+3)*numCols] = jacobi.m_tracks[i].xDerivatives[j + 3*32];

            fullMatrix[(i*32*2 + 1*32 + j) + (row+0)*numCols] = jacobi.m_tracks[i].yDerivatives[j + 0*32];
            fullMatrix[(i*32*2 + 1*32 + j) + (row+1)*numCols] = jacobi.m_tracks[i].yDerivatives[j + 1*32];
            fullMatrix[(i*32*2 + 1*32 + j) + (row+2)*numCols] = jacobi.m_tracks[i].yDerivatives[j + 2*32];
            fullMatrix[(i*32*2 + 1*32 + j) + (row+3)*numCols] = jacobi.m_tracks[i].yDerivatives[j + 3*32];
        }
    }

    for (unsigned i = 0; i < jacobi.m_cameraHeaders.size(); i++) {
        unsigned row = jacobi.m_cameraHeaders[i].cameraParameterIndex;
        for (unsigned j = 0; j < 32; j++) {
            for (unsigned k = 0; k < NUM_CAMERA_PARAMS; k++) {
                fullMatrix[(i*32*2 + 0*32 + j) + (row+k)*numCols] = jacobi.m_cameraData[i].xDerivatives[j + k*32];
                fullMatrix[(i*32*2 + 1*32 + j) + (row+k)*numCols] = jacobi.m_cameraData[i].yDerivatives[j + k*32];
            }
        }
    }

    for (unsigned i = 0; i < jacobi.m_calibHeaders.size(); i++) {
        unsigned row = jacobi.m_calibHeaders[i].calibParameterIndex;
        for (unsigned j = 0; j < 32; j++) {
            for (unsigned k = 0; k < NUM_CALIB_PARAMS; k++) {
                fullMatrix[(i*32*2 + 0*32 + j) + (row+k)*numCols] = jacobi.m_calibData[i].xDerivatives[j + k*32];
                fullMatrix[(i*32*2 + 1*32 + j) + (row+k)*numCols] = jacobi.m_calibData[i].yDerivatives[j + k*32];
            }
        }
    }

    stream << "[ ..." << std::endl;

    for (unsigned i = 0; i < numRows; i++) {
        stream << "   ";
        for (unsigned j = 0; j < numCols; j++) {
            stream << " " << fullMatrix[i*numCols + j];
        }
        if (i+1 < numRows)
            stream << "; ..." << std::endl;
        else
            stream << std::endl;
    }


    stream << "];" << std::endl;
    return stream;
}

*/









template<unsigned NumCameraUpdateParameters, unsigned NumCalibrationUpdateParameters>
void PFJacobi<NumCameraUpdateParameters, NumCalibrationUpdateParameters>::resize(unsigned numTracksBlockRows, unsigned numCameras, unsigned numCalibs,
                    unsigned *numTrackBlocks, unsigned *numCameraBlocks, unsigned *numCalibBlocks)
{
    m_trackHeaders.resize(numTracksBlockRows);
    unsigned trackBlockIdx = 0;
    for (unsigned i = 0; i < m_trackHeaders.size(); i++) {
        m_trackHeaders[i].blockStart = trackBlockIdx;
        m_trackHeaders[i].blockCount = numTrackBlocks[i];
        trackBlockIdx += numTrackBlocks[i];
    }
    m_trackData.resize(trackBlockIdx);


    m_cameraHeaders.resize(numCameras);
    unsigned camBlockIdx = 0;
    for (unsigned i = 0; i < m_cameraHeaders.size(); i++) {
        m_cameraHeaders[i].blockStart = camBlockIdx;
        m_cameraHeaders[i].blockCount = numCameraBlocks[i];
        camBlockIdx += numCameraBlocks[i];
    }
    m_cameraData.resize(camBlockIdx);

    m_calibHeaders.resize(numCalibs);
    unsigned calibBlockIdx = 0;
    for (unsigned i = 0; i < m_calibHeaders.size(); i++) {
        m_calibHeaders[i].blockStart = calibBlockIdx;
        m_calibHeaders[i].blockCount = numCalibBlocks[i];
        calibBlockIdx += numCalibBlocks[i];
    }
    m_calibData.resize(calibBlockIdx);

    memset(&m_trackData[0], 0, m_trackData.size() * sizeof(PFJacobiTrackBlockData));
    memset(&m_cameraData[0], 0, m_cameraData.size() * sizeof(SizedPFJacobiCameraBlockData));
    memset(&m_calibData[0], 0, m_calibData.size() * sizeof(SizedPFJacobiCalibBlockData));

}

template<unsigned NumCameraUpdateParameters, unsigned NumCalibrationUpdateParameters>
void PFJacobi<NumCameraUpdateParameters, NumCalibrationUpdateParameters>::setupSingleTrack(unsigned trackIndex, unsigned intraTrackObsIndex, unsigned obsResidualIndex)
{
    unsigned blockRowIdx = trackIndex / 8;
    unsigned laneIdx = trackIndex % 8;

    PFJacobiTrackBlockData &block = m_trackData[m_trackHeaders[blockRowIdx].blockStart+intraTrackObsIndex];
    block.residualIndices[laneIdx] = obsResidualIndex;
}


template<unsigned NumCameraUpdateParameters, unsigned NumCalibrationUpdateParameters>
void PFJacobi<NumCameraUpdateParameters, NumCalibrationUpdateParameters>::setSingleTrackData(unsigned trackIndex, unsigned intraTrackObsIndex, float *data)
{
    unsigned blockRowIdx = trackIndex / 8;
    unsigned laneIdx = trackIndex % 8;

    PFJacobiTrackBlockData &block = m_trackData[m_trackHeaders[blockRowIdx].blockStart+intraTrackObsIndex];
    for (unsigned i = 0; i < 4; i++) {
        block.xDerivatives[laneIdx*4 + i] = data[i + 0*4];
        block.yDerivatives[laneIdx*4 + i] = data[i + 1*4];
    }
}

template<unsigned NumCameraUpdateParameters, unsigned NumCalibrationUpdateParameters>
void PFJacobi<NumCameraUpdateParameters, NumCalibrationUpdateParameters>::setSingleCameraData(unsigned cameraIdx, unsigned intraCameraObsIndex, float *data)
{
    unsigned intraBlockIdx = intraCameraObsIndex/32;
    unsigned intraBlockLane = intraCameraObsIndex%32;

    SizedPFJacobiCameraBlockData &block = m_cameraData[m_cameraHeaders[cameraIdx].blockStart+intraBlockIdx];
    for (unsigned i = 0; i < NumCameraUpdateParameters; i++) {
        block.xDerivatives[intraBlockLane + i*32] = data[i + 0*NumCameraUpdateParameters];
        block.yDerivatives[intraBlockLane + i*32] = data[i + 1*NumCameraUpdateParameters];
    }
}

template<unsigned NumCameraUpdateParameters, unsigned NumCalibrationUpdateParameters>
void PFJacobi<NumCameraUpdateParameters, NumCalibrationUpdateParameters>::setSingleCalibData(unsigned calibIdx, unsigned intraCalibObsIndex, float *data)
{
    unsigned intraBlockIdx = intraCalibObsIndex/32;
    unsigned intraBlockLane = intraCalibObsIndex%32;

    SizedPFJacobiCalibBlockData &block = m_calibData[m_calibHeaders[calibIdx].blockStart+intraBlockIdx];
    for (unsigned i = 0; i < NumCalibrationUpdateParameters; i++) {
        block.xDerivatives[intraBlockLane + i*32] = data[i*2+0];
        block.yDerivatives[intraBlockLane + i*32] = data[i*2+1];
    }
}

template<unsigned NumCameraUpdateParameters, unsigned NumCalibrationUpdateParameters>
void PFJacobi<NumCameraUpdateParameters, NumCalibrationUpdateParameters>::multiplyWithVector(const std::vector<float, AlignedAllocator<float>> &src,
                                                                                             std::vector<float, AlignedAllocator<float>> &dst,
                                                                                             unsigned batchSize) const
{
    dst.resize(m_trackHeaders.size()*8*4 +
               m_cameraHeaders.size()*NumCameraUpdateParameters +
               m_calibHeaders.size()*NumCalibrationUpdateParameters);
    {
        for (unsigned i = 0; i < m_trackHeaders.size(); i++) {
            CpuWarpReg sum;
            sum.setZero();

            for (unsigned k = 0; k < m_trackHeaders[i].blockCount; k++) {
                const PFJacobiTrackBlockData &block = m_trackData[m_trackHeaders[i].blockStart+k];

                CpuWarpReg residualsX;
                CpuWarpReg residualsY;

                residualsX.gatherBroadcast4(&src[0], block.residualIndices);
                residualsY.gatherBroadcast4(&src[0] + 32, block.residualIndices);

                CpuWarpReg xDerivatives;
                CpuWarpReg yDerivatives;
                xDerivatives.loadAligned(block.xDerivatives);
                yDerivatives.loadAligned(block.yDerivatives);
                sum += residualsX * xDerivatives + residualsY * yDerivatives;
            }

            sum.storeAligned(&dst[i*4*8]);
        }
    }
    {
        for (unsigned i = 0; i < m_cameraHeaders.size(); i++) {
            alignas(32) float sums[32*NumCameraUpdateParameters];
            for (unsigned j = 0; j < 32*NumCameraUpdateParameters; j+=8) {
                _mm256_store_ps(sums+j, _mm256_setzero_ps());
            }

            for (unsigned k = 0; k < m_cameraHeaders[i].blockCount; k++) {

                const SizedPFJacobiCameraBlockData &block = m_cameraData[m_cameraHeaders[i].blockStart+k];
                const unsigned residualStart = (m_cameraHeaders[i].blockStart+k)*32*2;

                CpuWarpReg residualsX;
                CpuWarpReg residualsY;
                residualsX.loadAligned(&src[0] + residualStart + 0*32);
                residualsY.loadAligned(&src[0] + residualStart + 1*32);
                for (unsigned l = 0; l < NumCameraUpdateParameters; l++) {
                    CpuWarpReg sum;
                    sum.loadAligned(sums+l*32);

                    CpuWarpReg xDerivatives;
                    CpuWarpReg yDerivatives;

                    xDerivatives.loadAligned(block.xDerivatives + l*32);
                    yDerivatives.loadAligned(block.yDerivatives + l*32);

                    sum += residualsX * xDerivatives + residualsY * yDerivatives;

                    sum.storeAligned(sums+l*32);
                }
            }

            for (unsigned l = 0; l < NumCameraUpdateParameters; l++) {
                CpuWarpReg sum;
                sum.loadAligned(sums+l*32);
                sum.reduce(&dst[m_trackHeaders.size()*8*4 + i*NumCameraUpdateParameters + l]);
            }
        }
    }
    {
        for (unsigned i = 0; i < m_calibHeaders.size(); i++) {
            alignas(32) float sums[32*NumCalibrationUpdateParameters];
            for (unsigned j = 0; j < 32*NumCalibrationUpdateParameters; j+=8) {
                _mm256_store_ps(sums+j, _mm256_setzero_ps());
            }

            for (unsigned k = 0; k < m_calibHeaders[i].blockCount; k++) {

                const SizedPFJacobiCalibBlockData &block = m_calibData[m_calibHeaders[i].blockStart+k];
                const unsigned residualStart = (m_calibHeaders[i].blockStart+k)*32*2;

                CpuWarpReg residualsX;
                CpuWarpReg residualsY;
                residualsX.loadAligned(&src[0] + residualStart + 0*32);
                residualsY.loadAligned(&src[0] + residualStart + 1*32);

                for (unsigned l = 0; l < NumCalibrationUpdateParameters; l++) {
                    CpuWarpReg sum;
                    sum.loadAligned(sums+l*32);

                    CpuWarpReg xDerivatives;
                    CpuWarpReg yDerivatives;

                    xDerivatives.loadAligned(block.xDerivatives + l*32);
                    yDerivatives.loadAligned(block.yDerivatives + l*32);

                    sum += residualsX * xDerivatives + residualsY * yDerivatives;

                    sum.storeAligned(sums+l*32);
                }
            }

            for (unsigned l = 0; l < NumCalibrationUpdateParameters; l++) {
                CpuWarpReg sum;
                sum.loadAligned(sums+l*32);
                sum.reduce(&dst[m_trackHeaders.size()*8*4 +
                                m_cameraHeaders.size()*NumCameraUpdateParameters +
                                i*NumCalibrationUpdateParameters + l]);
            }
        }
    }
}

template<unsigned NumCameraUpdateParameters, unsigned NumCalibrationUpdateParameters>
void PFJacobi<NumCameraUpdateParameters, NumCalibrationUpdateParameters>::computeDiagonalOfJTJ(std::vector<float, AlignedAllocator<float>> &dst) const
{
    dst.resize(m_trackHeaders.size()*8*4 +
               m_cameraHeaders.size()*NumCameraUpdateParameters +
               m_calibHeaders.size()*NumCalibrationUpdateParameters);
    {
        for (unsigned i = 0; i < m_trackHeaders.size(); i++) {
            CpuWarpReg sum;
            sum.setZero();

            for (unsigned k = 0; k < m_trackHeaders[i].blockCount; k++) {
                const PFJacobiTrackBlockData &block = m_trackData[m_trackHeaders[i].blockStart+k];
                CpuWarpReg xDerivatives;
                CpuWarpReg yDerivatives;
                xDerivatives.loadAligned(block.xDerivatives);
                yDerivatives.loadAligned(block.yDerivatives);
                sum += xDerivatives * xDerivatives + yDerivatives * yDerivatives;
            }

            sum.storeAligned(&dst[i*4*8]);
        }
    }
    {
        for (unsigned i = 0; i < m_cameraHeaders.size(); i++) {
            alignas(32) float sums[32*NumCameraUpdateParameters];
            for (unsigned j = 0; j < 32*NumCameraUpdateParameters; j+=8) {
                _mm256_store_ps(sums+j, _mm256_setzero_ps());
            }

            for (unsigned k = 0; k < m_cameraHeaders[i].blockCount; k++) {

                const SizedPFJacobiCameraBlockData &block = m_cameraData[m_cameraHeaders[i].blockStart+k];
                for (unsigned l = 0; l < NumCameraUpdateParameters; l++) {
                    CpuWarpReg sum;
                    sum.loadAligned(sums+l*32);

                    CpuWarpReg xDerivatives;
                    CpuWarpReg yDerivatives;

                    xDerivatives.loadAligned(block.xDerivatives + l*32);
                    yDerivatives.loadAligned(block.yDerivatives + l*32);

                    sum += xDerivatives * xDerivatives + yDerivatives * yDerivatives;

                    sum.storeAligned(sums+l*32);
                }
            }

            for (unsigned l = 0; l < NumCameraUpdateParameters; l++) {
                CpuWarpReg sum;
                sum.loadAligned(sums+l*32);
                sum.reduce(&dst[m_trackHeaders.size()*8*4 + i*NumCameraUpdateParameters + l]);
            }
        }
    }
    {
        for (unsigned i = 0; i < m_calibHeaders.size(); i++) {
            alignas(32) float sums[32*NumCalibrationUpdateParameters];
            for (unsigned j = 0; j < 32*NumCalibrationUpdateParameters; j+=8) {
                _mm256_store_ps(sums+j, _mm256_setzero_ps());
            }

            for (unsigned k = 0; k < m_calibHeaders[i].blockCount; k++) {

                const SizedPFJacobiCalibBlockData &block = m_calibData[m_calibHeaders[i].blockStart+k];
                for (unsigned l = 0; l < NumCalibrationUpdateParameters; l++) {
                    CpuWarpReg sum;
                    sum.loadAligned(sums+l*32);

                    CpuWarpReg xDerivatives;
                    CpuWarpReg yDerivatives;

                    xDerivatives.loadAligned(block.xDerivatives + l*32);
                    yDerivatives.loadAligned(block.yDerivatives + l*32);

                    sum += xDerivatives * xDerivatives + yDerivatives * yDerivatives;

                    sum.storeAligned(sums+l*32);
                }
            }

            for (unsigned l = 0; l < NumCalibrationUpdateParameters; l++) {
                CpuWarpReg sum;
                sum.loadAligned(sums+l*32);
                sum.reduce(&dst[m_trackHeaders.size()*8*4 +
                            m_cameraHeaders.size()*NumCameraUpdateParameters +
                            i*NumCalibrationUpdateParameters + l]);
            }
        }
    }
}

template<unsigned NumCameraUpdateParameters, unsigned NumCalibrationUpdateParameters>
void PFJacobi<NumCameraUpdateParameters, NumCalibrationUpdateParameters>::writeLayoutToImage(const char *filename)
{
    unsigned numCols = m_calibData.size()*32*2;
    unsigned numRows = m_trackHeaders.size()*8*4 +
                       m_cameraHeaders.size()*NumCameraUpdateParameters +
                       m_calibHeaders.size()*NumCalibrationUpdateParameters;

    RasterImage image(numCols, numRows);
    image.clear(0xFFFFFFFF);

    for (unsigned i = 0; i < m_trackHeaders.size(); i++) {
        for (unsigned k = 0; k < m_trackHeaders[i].blockCount; k++) {
            const PFJacobiTrackBlockData &block = m_trackData[m_trackHeaders[i].blockStart+k];
            for (unsigned j = 0; j < 32; j++) {
                unsigned col = block.residualIndices[j / 4];
                unsigned row = i*32+j;

                assert(col + 1*32 + row*numCols < numCols*numRows);

                image.getData()[col + 0*32 + row*numCols] = 0xFF101010;
                image.getData()[col + 1*32 + row*numCols] = 0xFF101010;
            }
        }
    }

    for (unsigned i = 0; i < m_cameraHeaders.size(); i++) {
        unsigned row = m_trackHeaders.size()*8*4 + i*NumCameraUpdateParameters;
        for (unsigned l = 0; l < m_cameraHeaders[i].blockCount; l++) {
            unsigned col = (m_cameraHeaders[i].blockStart+l)*64;
            for (unsigned j = 0; j < 32; j++) {
                for (unsigned k = 0; k < NumCameraUpdateParameters; k++) {
                    assert(col + 1*32 + j + (row+k)*numCols < numCols*numRows);
                    image.getData()[col + 0*32 + j + (row+k)*numCols] = 0xFF101010;
                    image.getData()[col + 1*32 + j + (row+k)*numCols] = 0xFF101010;
                }
            }
        }
    }

    for (unsigned i = 0; i < m_calibHeaders.size(); i++) {
        unsigned row = m_trackHeaders.size()*8*4 + m_cameraHeaders.size()*NumCameraUpdateParameters + i*NumCalibrationUpdateParameters;
        for (unsigned l = 0; l < m_calibHeaders[i].blockCount; l++) {
            unsigned col = (m_calibHeaders[i].blockStart+l)*64;
            for (unsigned j = 0; j < 32; j++) {
                for (unsigned k = 0; k < NumCalibrationUpdateParameters; k++) {
                    assert(col + 1*32 + j + (row+k)*numCols < numCols*numRows);
                    image.getData()[col + 0*32 + j + (row+k)*numCols] = 0xFF101010;
                    image.getData()[col + 1*32 + j + (row+k)*numCols] = 0xFF101010;
                }
            }
        }
    }


    image.writeToFile(filename);
}

/*
template<unsigned NumCameraUpdateParameters, unsigned NumCalibrationUpdateParameters>
std::ostream &operator<<(std::ostream &stream, const PFJacobi<NumCameraUpdateParameters, NumCalibrationUpdateParameters> &jacobi)
{
    unsigned numCols = jacobi.m_calibData.size()*32*2;
    unsigned numRows = jacobi.m_trackHeaders.size()*8*4 + jacobi.m_cameraHeaders.size()*NUM_CAMERA_PARAMS + jacobi.m_calibHeaders.size()*NUM_CALIB_PARAMS;

    std::vector<float> fullMatrix;
    fullMatrix.resize(numCols * numRows);

    for (unsigned i = 0; i < jacobi.m_trackHeaders.size(); i++) {
        for (unsigned k = 0; k < jacobi.m_trackHeaders[i].blockCount; k++) {
            const PFJacobiTrackBlockData &block = jacobi.m_trackData[jacobi.m_trackHeaders[i].blockStart+k];
            for (unsigned j = 0; j < 32; j++) {
                unsigned col = block.residualIndices[j / 4];
                unsigned row = i*32+j;

                assert(col + 1*32 + row*numCols < numCols*numRows);

                fullMatrix[col + 0*32 + row*numCols] = block.xDerivatives[j];
                fullMatrix[col + 1*32 + row*numCols] = block.yDerivatives[j];
            }
        }
    }

    for (unsigned i = 0; i < jacobi.m_cameraHeaders.size(); i++) {
        unsigned row = jacobi.m_trackHeaders.size()*8*4 + i*NUM_CAMERA_PARAMS;
        for (unsigned l = 0; l < jacobi.m_cameraHeaders[i].blockCount; l++) {
            unsigned col = (jacobi.m_cameraHeaders[i].blockStart+l)*64;
            const PFJacobiCameraBlockData &block = jacobi.m_cameraData[jacobi.m_cameraHeaders[i].blockStart+l];
            for (unsigned j = 0; j < 32; j++) {
                for (unsigned k = 0; k < NUM_CAMERA_PARAMS; k++) {
                    assert(col + 1*32 + j + (row+k)*numCols < numCols*numRows);
                    fullMatrix[col + 0*32 + j + (row+k)*numCols] = block.xDerivatives[j + k*32];
                    fullMatrix[col + 1*32 + j + (row+k)*numCols] = block.yDerivatives[j + k*32];
                }
            }
        }
    }

    for (unsigned i = 0; i < jacobi.m_calibHeaders.size(); i++) {
        unsigned row = jacobi.m_trackHeaders.size()*8*4 + jacobi.m_cameraHeaders.size()*NUM_CAMERA_PARAMS + i*NUM_CALIB_PARAMS;
        for (unsigned l = 0; l < jacobi.m_calibHeaders[i].blockCount; l++) {
            unsigned col = (jacobi.m_calibHeaders[i].blockStart+l)*64;
            const PFJacobiCalibBlockData &block = jacobi.m_calibData[jacobi.m_calibHeaders[i].blockStart+l];
            for (unsigned j = 0; j < 32; j++) {
                for (unsigned k = 0; k < NUM_CALIB_PARAMS; k++) {
                    assert(col + 1*32 + j + (row+k)*numCols < numCols*numRows);
                    fullMatrix[col + 0*32 + j + (row+k)*numCols] = block.xDerivatives[j + k*32];
                    fullMatrix[col + 1*32 + j + (row+k)*numCols] = block.yDerivatives[j + k*32];
                }
            }
        }
    }
    stream << "[ ..." << std::endl;

    for (unsigned i = 0; i < numRows; i++) {
        stream << "   ";
        for (unsigned j = 0; j < numCols; j++) {
            stream << " " << fullMatrix[i*numCols + j];
        }
        if (i+1 < numRows)
            stream << "; ..." << std::endl;
        else
            stream << std::endl;
    }


    stream << "];" << std::endl;
    return stream;
}
*/


template class PFJacobiTransposed<6u,0u>;
template class PFJacobi<6u,0u>;

template class PFJacobiTransposed<6u,7u>;
template class PFJacobi<6u,7u>;
template class PFJacobiTransposed<6u,4u>;
template class PFJacobi<6u,4u>;

template class PFJacobiTransposed<6u,8u>;
template class PFJacobi<6u,8u>;
template class PFJacobiTransposed<6u,5u>;
template class PFJacobi<6u,5u>;


}
}



