/*
    Structure from Motion with Deferred Feature Matching and Subset Bundle Adjustment
    Copyright (C) 2015 Andreas Ley <andy-ley@arcor.de>

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

#include "PFBundleAdjustment.h"
#include <assert.h>
#include "../../tools/RasterImage.h"
#include "../../tools/CPUStopWatch.h"
#include "PFJacobi.h"
#include <algorithm>
#include <string.h>

#ifdef NDEBUG
#define PFBA_FORCE_INLINE __attribute__((always_inline))
#else
#define PFBA_FORCE_INLINE
#endif

namespace SFM {

namespace PFBundleAdjustment {

namespace detail {

/** @addtogroup BundleAdjustment_Group
 *  @{
 */

struct Empty {
};

template<typename DistortionParametrization>
struct ShuffledCalibration {
    LinAlg::Matrix4x4f projectionMatrix;
    DistortionParametrization distortionParams;
    unsigned firstObservationShuffledIndex;
};
struct ShuffledCamera {
    unsigned calibrationShuffledIndex;
    unsigned firstObservationShuffledIndex;
    unsigned numObservations;

    LinAlg::Matrix4x4f viewMatrix;
    LinAlg::Matrix4x4f projectionViewMatrix;
};
struct ShuffledTrackBlock {
    float xyzw[8*4];
};
struct ShuffledObservationBlock {
    unsigned shuffledTrackIndex[32];
    unsigned shuffledIntraTrackObsIndex[32];

    float weight[32];
    float x[32];
    float y[32];
};



/// Base class implementing the polynomial radial distortion model with the coefficients kappa_2 through kappa_4.
template<bool fixed>
class DistortionModel_PolynomialRadial234
{
    public:
        typedef LinAlg::Vector3f DistortionParametrization;

        inline static void copyDataFromStructure(ShuffledCalibration<DistortionParametrization> &dst, const PFBundleAdjustment::Calibration &src) PFBA_FORCE_INLINE {
            assert(src.radialDistortion.type == config::BundleAdjustmentStructureConfig::RadialDistortionType::Polynomial_234);
            dst.distortionParams[0] = src.radialDistortion.polynomial234.kappa[0];
            dst.distortionParams[1] = src.radialDistortion.polynomial234.kappa[1];
            dst.distortionParams[2] = src.radialDistortion.polynomial234.kappa[2];
        }

        inline static void copyDataToStructure(PFBundleAdjustment::Calibration &dst, const ShuffledCalibration<DistortionParametrization> &src) PFBA_FORCE_INLINE {
            assert(dst.radialDistortion.type == config::BundleAdjustmentStructureConfig::RadialDistortionType::Polynomial_234);
            if (!fixed) {
                dst.radialDistortion.polynomial234.kappa[0] = src.distortionParams[0];
                dst.radialDistortion.polynomial234.kappa[1] = src.distortionParams[1];
                dst.radialDistortion.polynomial234.kappa[2] = src.distortionParams[2];
            }
        }

        inline static void computeDistortionForResiduals(CpuWarpReg &S, CpuWarpReg &T, const DistortionParametrization &parameters) PFBA_FORCE_INLINE {
            CpuWarpReg R2 = S*S + T*T;
            CpuWarpReg R = R2.approxSqrt();
            CpuWarpReg fac = R2 * &parameters[0] + R2*R * &parameters[1] + R2*R2 * &parameters[2] + _mm256_set1_ps(1.0f);
            S = S * fac;
            T = T * fac;
        }

        inline static void performUpdateStep(ShuffledCalibration<DistortionParametrization> &dst,
                                     const ShuffledCalibration<DistortionParametrization> &src,
                                     const float *update) PFBA_FORCE_INLINE {
            dst.distortionParams[0] = src.distortionParams[0] + update[0];
            dst.distortionParams[1] = src.distortionParams[1] + update[1];
            dst.distortionParams[2] = src.distortionParams[2] + update[2];

            assert(std::isfinite(update[0]));
            assert(std::isfinite(update[1]));
            assert(std::isfinite(update[2]));
        }

        class JacobianComputations
        {
            public:
                typedef LinAlg::Matrix2x2f DistortionJacobian;

                JacobianComputations(const DistortionParametrization &parameters) : m_kappa(parameters) { }

                inline DistortionJacobian compute(float *calibData,
                             const LinAlg::Vector2f &euclideanProjectedPos,
                             float weight, float errorFirstDerivative) const PFBA_FORCE_INLINE {
                    const float r2 = euclideanProjectedPos.SQRLen();
                    const float r1 = std::sqrt(r2);
                    const float r3 = r2 * r1;
                    const float r4 = r2 * r2;

                    if (!fixed) {
                        calibData[0*2+0] = euclideanProjectedPos[0] * r2 * weight * errorFirstDerivative;
                        calibData[0*2+1] = euclideanProjectedPos[1] * r2 * weight * errorFirstDerivative;

                        calibData[1*2+0] = euclideanProjectedPos[0] * r3 * weight * errorFirstDerivative;
                        calibData[1*2+1] = euclideanProjectedPos[1] * r3 * weight * errorFirstDerivative;

                        calibData[2*2+0] = euclideanProjectedPos[0] * r4 * weight * errorFirstDerivative;
                        calibData[2*2+1] = euclideanProjectedPos[1] * r4 * weight * errorFirstDerivative;
                    }
                    DistortionJacobian radialDistJacobi(DistortionJacobian::NO_INITIALIZATION);

                    radialDistJacobi[0][0] =
                        euclideanProjectedPos[0] * euclideanProjectedPos[0] *
                                    (2.0f*m_kappa[0] + 3.0f*m_kappa[1]*r1 + 4.0f*m_kappa[2]*r2) +
                        r2*m_kappa[0] +
                        r3*m_kappa[1] +
                        r4*m_kappa[2] +
                        1.0f;

                    radialDistJacobi[1][0] =
                    radialDistJacobi[0][1] =
                        euclideanProjectedPos[0] * euclideanProjectedPos[1] *
                                    (2.0f*m_kappa[0] + 3.0f*m_kappa[1]*r1 + 4.0f*m_kappa[2]*r2);


                    radialDistJacobi[1][1] =
                        euclideanProjectedPos[1] * euclideanProjectedPos[1] *
                                    (2.0f*m_kappa[0] + 3.0f*m_kappa[1]*r1 + 4.0f*m_kappa[2]*r2) +
                        r2*m_kappa[0] +
                        r3*m_kappa[1] +
                        r4*m_kappa[2] +
                        1.0f;

                    return radialDistJacobi;
                }

                inline void applyDistortionJacobian(LinAlg::Vector2f &grad, const DistortionJacobian &jacobian) const PFBA_FORCE_INLINE {
                    grad = jacobian * grad;
                }
            private:
                const DistortionParametrization m_kappa;
        };
};

/// Implementation of the polynomial radial distortion model with the coefficients kappa_2 through kappa_4 with fixed parameters.
class DistortionModel_PolynomialRadial234Fixed : public DistortionModel_PolynomialRadial234<true>
{
    public:
        enum {
            NUM_UPDATE_PARAMETERS = 0
        };
};

/// Implementation of the polynomial radial distortion model with the coefficients kappa_2 through kappa_4 with dynamic parameters.
class DistortionModel_PolynomialRadial234Dynamic : public DistortionModel_PolynomialRadial234<false>
{
    public:
        enum {
            NUM_UPDATE_PARAMETERS = 3
        };
};


/// Implementation of the "no distortion" distortion model.
class DistortionModel_NoDistortion
{
    public:
        enum {
            NUM_UPDATE_PARAMETERS = 0
        };

        typedef Empty DistortionParametrization;

        inline static void copyDataFromStructure(ShuffledCalibration<DistortionParametrization> &dst, const PFBundleAdjustment::Calibration &src) PFBA_FORCE_INLINE {
            assert(src.radialDistortion.type == config::BundleAdjustmentStructureConfig::RadialDistortionType::NoRadialDistortion);
        }

        inline static void copyDataToStructure(PFBundleAdjustment::Calibration &dst, const ShuffledCalibration<DistortionParametrization> &src) PFBA_FORCE_INLINE {
            assert(dst.radialDistortion.type == config::BundleAdjustmentStructureConfig::RadialDistortionType::NoRadialDistortion);
        }

        inline static void computeDistortionForResiduals(CpuWarpReg &S, CpuWarpReg &T, const DistortionParametrization &parameters) PFBA_FORCE_INLINE {

        }

        inline static void performUpdateStep(ShuffledCalibration<DistortionParametrization> &dst,
                                     const ShuffledCalibration<DistortionParametrization> &src,
                                     const float *update) PFBA_FORCE_INLINE {

        }

        class JacobianComputations
        {
            public:
                typedef Empty DistortionJacobian;

                JacobianComputations(const DistortionParametrization &parameters) { }

                inline DistortionJacobian compute(float *calibData,
                             const LinAlg::Vector2f &euclideanProjectedPos,
                             float weight, float errorFirstDerivative) const PFBA_FORCE_INLINE {
                    return DistortionJacobian();
                }

                inline void applyDistortionJacobian(LinAlg::Vector2f &grad, const DistortionJacobian &jacobian) const PFBA_FORCE_INLINE {
                }
        };
};


/// Implementation of the fully dynamic internal calibration model.
template<class DistortionModel>
class ProjectionModel_Full
{
    public:
        enum {
            NUM_UPDATE_PARAMETERS = 5
        };

        inline static void copyDataFromStructure(ShuffledCalibration<typename DistortionModel::DistortionParametrization> &dst, const PFBundleAdjustment::Calibration &src) PFBA_FORCE_INLINE {
            dst.projectionMatrix = src.projectionMatrix;
        }

        inline static void copyDataToStructure(PFBundleAdjustment::Calibration &dst, const ShuffledCalibration<typename DistortionModel::DistortionParametrization> &src) PFBA_FORCE_INLINE {
            dst.projectionMatrix = src.projectionMatrix;
        }

        inline static void performUpdateStep(ShuffledCalibration<typename DistortionModel::DistortionParametrization> &dst,
                                     const ShuffledCalibration<typename DistortionModel::DistortionParametrization> &src,
                                     const float *update) PFBA_FORCE_INLINE {
            dst.projectionMatrix[0][0] = src.projectionMatrix[0][0] + update[0];
            dst.projectionMatrix[0][1] = src.projectionMatrix[0][1] + update[1];
            dst.projectionMatrix[0][2] = src.projectionMatrix[0][2] + update[2];
            dst.projectionMatrix[1][1] = src.projectionMatrix[1][1] + update[3];
            dst.projectionMatrix[1][2] = src.projectionMatrix[1][2] + update[4];

            assert(std::isfinite(update[0]));
            assert(std::isfinite(update[1]));
            assert(std::isfinite(update[2]));
            assert(std::isfinite(update[3]));
            assert(std::isfinite(update[4]));
        }

        inline static void computeJacobian(float *calibData, const LinAlg::Vector4f &projectiveEyeSpacePosition,
                            const LinAlg::Vector3f &projectedPos, float weight, float errorFirstDerivative,
                            const typename DistortionModel::JacobianComputations &distortionModel,
                            const typename DistortionModel::JacobianComputations::DistortionJacobian &distortionJacobian) PFBA_FORCE_INLINE {

            LinAlg::Vector2f pointGrad;

            // d/dC11
            pointGrad[0] = projectiveEyeSpacePosition[0] / projectedPos[2];
            pointGrad[1] = 0.0f;

            pointGrad *= errorFirstDerivative;

            distortionModel.applyDistortionJacobian(pointGrad, distortionJacobian);

            assert(std::isfinite(pointGrad[0]));
            assert(std::isfinite(pointGrad[1]));

            calibData[0*2+0] = pointGrad[0] * weight;
            calibData[0*2+1] = pointGrad[1] * weight;


            // d/dC12
            pointGrad[0] = projectiveEyeSpacePosition[1] / projectedPos[2];
            pointGrad[1] = 0.0f;

            distortionModel.applyDistortionJacobian(pointGrad, distortionJacobian);

            pointGrad *= errorFirstDerivative;

            assert(std::isfinite(pointGrad[0]));
            assert(std::isfinite(pointGrad[1]));

            calibData[1*2+0] = pointGrad[0] * weight;
            calibData[1*2+1] = pointGrad[1] * weight;

            // d/dC13
            pointGrad[0] = projectiveEyeSpacePosition[2] / projectedPos[2];
            pointGrad[1] = 0.0f;

            distortionModel.applyDistortionJacobian(pointGrad, distortionJacobian);

            pointGrad *= errorFirstDerivative;

            assert(std::isfinite(pointGrad[0]));
            assert(std::isfinite(pointGrad[1]));

            calibData[2*2+0] = pointGrad[0] * weight;
            calibData[2*2+1] = pointGrad[1] * weight;

            // d/dC22
            pointGrad[0] = 0.0f;
            pointGrad[1] = projectiveEyeSpacePosition[1] / projectedPos[2];

            distortionModel.applyDistortionJacobian(pointGrad, distortionJacobian);

            pointGrad *= errorFirstDerivative;

            assert(std::isfinite(pointGrad[0]));
            assert(std::isfinite(pointGrad[1]));

            calibData[3*2+0] = pointGrad[0] * weight;
            calibData[3*2+1] = pointGrad[1] * weight;

            // d/dC23
            pointGrad[0] = 0.0f;
            pointGrad[1] = projectiveEyeSpacePosition[2] / projectedPos[2];

            distortionModel.applyDistortionJacobian(pointGrad, distortionJacobian);

            pointGrad *= errorFirstDerivative;

            assert(std::isfinite(pointGrad[0]));
            assert(std::isfinite(pointGrad[1]));

            calibData[4*2+0] = pointGrad[0] * weight;
            calibData[4*2+1] = pointGrad[1] * weight;
        }
};

/// Implementation of the internal calibration model where the two focal lengths are linked together.
template<class DistortionModel>
class ProjectionModel_SharedFocalLength
{
    public:
        enum {
            NUM_UPDATE_PARAMETERS = 4
        };


        inline static void copyDataFromStructure(ShuffledCalibration<typename DistortionModel::DistortionParametrization> &dst, const PFBundleAdjustment::Calibration &src) PFBA_FORCE_INLINE {
            dst.projectionMatrix = src.projectionMatrix;
        }

        inline static void copyDataToStructure(PFBundleAdjustment::Calibration &dst, const ShuffledCalibration<typename DistortionModel::DistortionParametrization> &src) PFBA_FORCE_INLINE {
            dst.projectionMatrix = src.projectionMatrix;
        }

        inline static void performUpdateStep(ShuffledCalibration<typename DistortionModel::DistortionParametrization> &dst,
                                     const ShuffledCalibration<typename DistortionModel::DistortionParametrization> &src,
                                     const float *update) PFBA_FORCE_INLINE {
            dst.projectionMatrix[0][0] = src.projectionMatrix[0][0] + update[0];
            dst.projectionMatrix[0][1] = src.projectionMatrix[0][1] + update[1];
            dst.projectionMatrix[0][2] = src.projectionMatrix[0][2] + update[2];
            dst.projectionMatrix[1][1] = src.projectionMatrix[1][1] + update[0];
            dst.projectionMatrix[1][2] = src.projectionMatrix[1][2] + update[3];

            assert(std::isfinite(update[0]));
            assert(std::isfinite(update[1]));
            assert(std::isfinite(update[2]));
            assert(std::isfinite(update[3]));
        }

        inline static void computeJacobian(float *calibData, const LinAlg::Vector4f &projectiveEyeSpacePosition,
                            const LinAlg::Vector3f &projectedPos, float weight, float errorFirstDerivative,
                            const typename DistortionModel::JacobianComputations &distortionModel,
                            const typename DistortionModel::JacobianComputations::DistortionJacobian &distortionJacobian) PFBA_FORCE_INLINE {

            LinAlg::Vector2f pointGrad;

            // d/dC11
            pointGrad[0] = projectiveEyeSpacePosition[0] / projectedPos[2];
            pointGrad[1] = projectiveEyeSpacePosition[1] / projectedPos[2];

            pointGrad *= errorFirstDerivative;

            distortionModel.applyDistortionJacobian(pointGrad, distortionJacobian);

            assert(std::isfinite(pointGrad[0]));
            assert(std::isfinite(pointGrad[1]));

            calibData[0*2+0] = pointGrad[0] * weight;
            calibData[0*2+1] = pointGrad[1] * weight;


            // d/dC12
            pointGrad[0] = projectiveEyeSpacePosition[1] / projectedPos[2];
            pointGrad[1] = 0.0f;

            distortionModel.applyDistortionJacobian(pointGrad, distortionJacobian);

            pointGrad *= errorFirstDerivative;

            assert(std::isfinite(pointGrad[0]));
            assert(std::isfinite(pointGrad[1]));

            calibData[1*2+0] = pointGrad[0] * weight;
            calibData[1*2+1] = pointGrad[1] * weight;

            // d/dC13
            pointGrad[0] = projectiveEyeSpacePosition[2] / projectedPos[2];
            pointGrad[1] = 0.0f;

            distortionModel.applyDistortionJacobian(pointGrad, distortionJacobian);

            pointGrad *= errorFirstDerivative;

            assert(std::isfinite(pointGrad[0]));
            assert(std::isfinite(pointGrad[1]));

            calibData[2*2+0] = pointGrad[0] * weight;
            calibData[2*2+1] = pointGrad[1] * weight;


            // d/dC23
            pointGrad[0] = 0.0f;
            pointGrad[1] = projectiveEyeSpacePosition[2] / projectedPos[2];

            distortionModel.applyDistortionJacobian(pointGrad, distortionJacobian);

            pointGrad *= errorFirstDerivative;

            assert(std::isfinite(pointGrad[0]));
            assert(std::isfinite(pointGrad[1]));

            calibData[3*2+0] = pointGrad[0] * weight;
            calibData[3*2+1] = pointGrad[1] * weight;

        }
};

/// Implementation of the completely fixed internal calibration model.
template<class DistortionModel>
class ProjectionModel_Fixed
{
    public:
        enum {
            NUM_UPDATE_PARAMETERS = 0
        };

        inline static void copyDataFromStructure(ShuffledCalibration<typename DistortionModel::DistortionParametrization> &dst, const PFBundleAdjustment::Calibration &src) PFBA_FORCE_INLINE {
            dst.projectionMatrix = src.projectionMatrix;
        }

        inline static void copyDataToStructure(PFBundleAdjustment::Calibration &dst, const ShuffledCalibration<typename DistortionModel::DistortionParametrization> &src) PFBA_FORCE_INLINE {

        }

        inline static void performUpdateStep(ShuffledCalibration<typename DistortionModel::DistortionParametrization> &dst,
                                     const ShuffledCalibration<typename DistortionModel::DistortionParametrization> &src,
                                     const float *update) PFBA_FORCE_INLINE {

        }

        inline static void computeJacobian(float *calibData, const LinAlg::Vector4f &projectiveEyeSpacePosition,
                            const LinAlg::Vector3f &projectedPos, float weight, float errorFirstDerivative,
                            const typename DistortionModel::JacobianComputations &distortionModel,
                            const typename DistortionModel::JacobianComputations::DistortionJacobian &distortionJacobian) PFBA_FORCE_INLINE {
        }
};



/**
 * @brief Templated implementation of the bundle adjustment state.
 * @details This class stores the current state of all parameters and supplies
 * methods for computing the residuals and Jacobian, as well as for performing update steps.
 * The internal calibration and radial distortion implementations are injected through template parameters
 * and, in the case of the Jacobian, are computed independently of each other by abusing the chain rule.
 */
template<class ProjectionModel, class DistortionModel>
struct BAState : public BAStateInterface {

    enum {
        NUM_TOTAL_CALIBRATION_UPDATE_PARAMETERS = ProjectionModel::NUM_UPDATE_PARAMETERS + DistortionModel::NUM_UPDATE_PARAMETERS,
        NUM_CAMERA_UPDATE_PARAMETERS = 6
    };

    typedef BAState<ProjectionModel, DistortionModel> ThisType;

    std::vector<ShuffledCalibration<typename DistortionModel::DistortionParametrization> > m_calibrations;
    std::vector<ShuffledCamera> m_cameras;
    std::vector<ShuffledTrackBlock, AlignedAllocator<ShuffledTrackBlock>> m_tracks;
    std::vector<ShuffledObservationBlock, AlignedAllocator<ShuffledObservationBlock>> m_observations;

    typedef PFJacobi<NUM_CAMERA_UPDATE_PARAMETERS, NUM_TOTAL_CALIBRATION_UPDATE_PARAMETERS> SizedJacobi;
    typedef PFJacobiTransposed<NUM_CAMERA_UPDATE_PARAMETERS, NUM_TOTAL_CALIBRATION_UPDATE_PARAMETERS> SizedJacobiTransposed;

    inline void copyFrom(const ThisType &rhs) {
        m_calibrations = rhs.m_calibrations;
        m_cameras = rhs.m_cameras;
        m_tracks = rhs.m_tracks;
        m_observations = rhs.m_observations;
    }

    void operator=(const BAStateInterface &rhs) override {
        assert(dynamic_cast<const ThisType*>(&rhs) != nullptr);
        copyFrom(static_cast<const ThisType&>(rhs));
    }

    void initializeFromStructure(PFBundleAdjustment::BAStructure &structure) override;

    void initializeJacobian(std::unique_ptr<PFJacobiInterface> &J, std::unique_ptr<PFJacobiTransposedInterface> &JT) const override;

    void copyDataFromStructure(const PFBundleAdjustment::BAStructure &structure) override;

    void copyDataToStructure(PFBundleAdjustment::BAStructure &structure) const override;

    void computeResiduals(std::vector<float, AlignedAllocator<float>> &dst) const override;

    void computeJacobi(PFJacobiInterface &J, PFJacobiTransposedInterface &JT) const override {
        assert(typeid(J) == typeid(SizedJacobi));
        assert(typeid(JT) == typeid(SizedJacobiTransposed));
        computeJacobiImpl(static_cast<SizedJacobi&>(J), static_cast<SizedJacobiTransposed&>(JT));
    }

    void performUpdateStep(const std::vector<float, AlignedAllocator<float>> &parameterStep, BAStateInterface &dst) const override {
        assert(dynamic_cast<const ThisType*>(&dst) != nullptr);
        performUpdateStepImpl(parameterStep, static_cast<ThisType&>(dst));
    }

    protected:
        inline void initializeJacobianImpl(SizedJacobi &J, SizedJacobiTransposed &JT) const PFBA_FORCE_INLINE;

        inline void performUpdateStepImpl(const std::vector<float, AlignedAllocator<float>> &parameterStep,
                                    ThisType &dst) const PFBA_FORCE_INLINE;

        inline void computeJacobiImpl(SizedJacobi &J, SizedJacobiTransposed &JT) const PFBA_FORCE_INLINE;
};


/// Bundle adjustement state implementation for a fully dynamic internal calibration and a fully dynamic polynomial_234 radial distortion model
typedef BAState<ProjectionModel_Full<DistortionModel_PolynomialRadial234Dynamic>,
                    DistortionModel_PolynomialRadial234Dynamic> BAState_Polynomial234_FullProjection;
/// Bundle adjustement state implementation for a dynamic internal calibration with shared focal lengths and a fully dynamic polynomial_234 radial distortion model
typedef BAState<ProjectionModel_SharedFocalLength<DistortionModel_PolynomialRadial234Dynamic>,
                    DistortionModel_PolynomialRadial234Dynamic> BAState_Polynomial234_SharedFocalLength;
/// Bundle adjustement state implementation for a fixed internal calibration and a fixed polynomial_234 radial distortion model
typedef BAState<ProjectionModel_Fixed<DistortionModel_PolynomialRadial234Fixed>,
                    DistortionModel_PolynomialRadial234Fixed> BAState_Polynomial234_FixedProjection;

/// Bundle adjustement state implementation for a fully dynamic internal calibration and no radial distortion model
typedef BAState<ProjectionModel_Full<DistortionModel_NoDistortion>,
                    DistortionModel_NoDistortion> BAState_NoDistortion_FullProjection;
/// Bundle adjustement state implementation for a dynamic internal calibration with shared focal lengths and no radial distortion model
typedef BAState<ProjectionModel_SharedFocalLength<DistortionModel_NoDistortion>,
                    DistortionModel_NoDistortion> BAState_NoDistortion_SharedFocalLength;
/// Bundle adjustement state implementation for a fixed internal calibration and no radial distortion model
typedef BAState<ProjectionModel_Fixed<DistortionModel_NoDistortion>,
                    DistortionModel_NoDistortion> BAState_NoDistortion_FixedProjection;


template<class ProjectionModel, class DistortionModel>
void BAState<ProjectionModel, DistortionModel>::
    initializeFromStructure(PFBundleAdjustment::BAStructure &structure)
{
    m_tracks.resize((structure.m_numTracks+7)/8);
    m_cameras.resize(structure.m_numCameras);
    m_calibrations.resize(structure.m_numCalibrations);
    {
        std::vector<uint64_t> tags;
        tags.reserve(structure.m_numTracks);
        for (unsigned i = 0; i < structure.m_tracks.reservedSize(); i++) {
            if (!structure.m_tracks.inUse(i)) continue;

            uint64_t prio = (1 << 31) - structure.m_tracks[i].numObservations;
            uint64_t idx = i;

            tags.push_back((prio << 32) | idx);
        }
        std::sort(tags.begin(), tags.end());

        for (unsigned i = 0; i < tags.size(); i++) {
            unsigned index = tags[i] & 0xFFFFFFFF;
            structure.m_tracks[index].shuffledIndex = i;
        }
        assert(tags.size() == structure.m_numTracks);
    }
    {
        unsigned shuffleIndex = 0;
        for (unsigned i = 0; i < structure.m_calibrations.reservedSize(); i++) {
            if (!structure.m_calibrations.inUse(i)) continue;

            structure.m_calibrations[i].shuffledIndex = shuffleIndex++;
        }
        assert(shuffleIndex == structure.m_numCalibrations);
    }
    unsigned numObservationBlocks = 0;
    {
        std::vector<uint64_t> tags;
        tags.reserve(structure.m_numCameras);
        for (unsigned i = 0; i < structure.m_cameras.reservedSize(); i++) {
            if (!structure.m_cameras.inUse(i)) continue;

            uint64_t prio = structure.m_calibrations[structure.m_cameras[i].calibrationIndex].shuffledIndex;
            uint64_t idx = i;

            tags.push_back((prio << 32) | idx);
        }
        std::sort(tags.begin(), tags.end());
        assert(tags.size() == structure.m_numCameras);

        for (unsigned i = 0; i < tags.size(); i++) {
            unsigned index = tags[i] & 0xFFFFFFFF;
            structure.m_cameras[index].shuffledIndex = i;

            m_cameras[i].firstObservationShuffledIndex = numObservationBlocks*32;
            m_cameras[i].numObservations = structure.m_cameras[index].numObservations;
            m_cameras[i].calibrationShuffledIndex = structure.m_calibrations[structure.m_cameras[index].calibrationIndex].shuffledIndex;

            if ((i == 0) || (m_cameras[i-1].calibrationShuffledIndex != m_cameras[i].calibrationShuffledIndex))
                m_calibrations[m_cameras[i].calibrationShuffledIndex].firstObservationShuffledIndex = numObservationBlocks*32;

            numObservationBlocks += (structure.m_cameras[index].numObservations + 31)/32;
        }
    }
    m_observations.resize(numObservationBlocks);
    memset(&m_observations[0], 0, m_observations.size() * sizeof(ShuffledObservationBlock));
    for (unsigned i = 0; i < m_observations.size(); i++) {
        for (unsigned k = 0; k < 32; k++) {
            m_observations[i].shuffledTrackIndex[k] = -1;
            m_observations[i].shuffledIntraTrackObsIndex[k] = -1;
        }
    }
    {
        std::vector<std::pair<uint64_t, uint32_t>> tags;
        tags.reserve(structure.m_numObservations);
        for (unsigned i = 0; i < structure.m_observations.reservedSize(); i++) {
            if (!structure.m_observations.inUse(i)) continue;

            uint64_t calibShflIdx = structure.m_calibrations[structure.m_observations[i].calibrationIndex].shuffledIndex;
            uint64_t cameraShflIdx = structure.m_cameras[structure.m_observations[i].cameraIndex].shuffledIndex;
            uint64_t trackShflIdx = structure.m_tracks[structure.m_observations[i].trackIndex].shuffledIndex;
            uint64_t idx = i;

            tags.push_back(std::pair<uint64_t, uint32_t>(
                           (calibShflIdx << 48) |
                           (cameraShflIdx << 32) |
                           (trackShflIdx << 0), idx));
        }
        std::sort(tags.begin(), tags.end());

        unsigned shuffleIndex = 0;
        for (unsigned i = 0; i < tags.size(); i++) {
            unsigned index = tags[i].second;
            PFBundleAdjustment::Observation &structureObs = structure.m_observations[index];

            if ((i == 0) || (structure.m_observations[tags[i-1].second].cameraIndex != structureObs.cameraIndex)) {
                shuffleIndex = m_cameras[structure.m_cameras[structureObs.cameraIndex].shuffledIndex].firstObservationShuffledIndex;
            }

            structureObs.shuffledIndex = shuffleIndex++;

            unsigned block = structure.m_observations[index].shuffledIndex / 32;
            unsigned lane = structure.m_observations[index].shuffledIndex % 32;
            m_observations[block].shuffledTrackIndex[lane] = structure.m_tracks[structureObs.trackIndex].shuffledIndex;
        }
    }

    std::vector<unsigned> numTrackRefs;
    numTrackRefs.resize(m_tracks.size()*8);

    for (unsigned i = 0; i < m_observations.size(); i++) {
        ShuffledObservationBlock &block = m_observations[i];
        for (unsigned k = 0; k < 32; k++)
            if (block.shuffledTrackIndex[k] != (unsigned)-1)
                block.shuffledIntraTrackObsIndex[k] = numTrackRefs[block.shuffledTrackIndex[k]]++;
    }
}


template<class ProjectionModel, class DistortionModel>
void BAState<ProjectionModel, DistortionModel>::
            copyDataFromStructure(const PFBundleAdjustment::BAStructure &structure)
{
    for (const PFBundleAdjustment::Calibration &structureCalib : structure.m_calibrations) {
        assert(structureCalib.shuffledIndex != (unsigned)-1);

        ProjectionModel::copyDataFromStructure(m_calibrations[structureCalib.shuffledIndex], structureCalib);
        DistortionModel::copyDataFromStructure(m_calibrations[structureCalib.shuffledIndex], structureCalib);
    }

    for (const PFBundleAdjustment::Camera &structureCamera : structure.m_cameras) {
        assert(structureCamera.shuffledIndex != (unsigned)-1);
        m_cameras[structureCamera.shuffledIndex].viewMatrix = structureCamera.viewMatrix;
        m_cameras[structureCamera.shuffledIndex].projectionViewMatrix =
                structure.m_calibrations[structureCamera.calibrationIndex].projectionMatrix *
                structureCamera.viewMatrix;
    }


    for (const PFBundleAdjustment::Track &structureTrack : structure.m_tracks) {
        assert(structureTrack.shuffledIndex != (unsigned)-1);

        const unsigned block = structureTrack.shuffledIndex / 8;
        const unsigned offset = structureTrack.shuffledIndex % 8;

        m_tracks[block].xyzw[offset*4+0] = structureTrack.position[0];
        m_tracks[block].xyzw[offset*4+1] = structureTrack.position[1];
        m_tracks[block].xyzw[offset*4+2] = structureTrack.position[2];
        m_tracks[block].xyzw[offset*4+3] = structureTrack.position[3];
    }

    for (const PFBundleAdjustment::Observation &structureObservation : structure.m_observations) {
        assert(structureObservation.shuffledIndex != (unsigned)-1);
        const unsigned block = structureObservation.shuffledIndex / 32;
        const unsigned offset = structureObservation.shuffledIndex % 32;

        m_observations[block].weight[offset] = structureObservation.weight;
        m_observations[block].x[offset] = structureObservation.screenPos[0];
        m_observations[block].y[offset] = structureObservation.screenPos[1];
    }
}

template<class ProjectionModel, class DistortionModel>
void BAState<ProjectionModel, DistortionModel>::
            copyDataToStructure(PFBundleAdjustment::BAStructure &structure) const
{
    for (PFBundleAdjustment::Calibration &structureCalib : structure.m_calibrations) {
        assert(structureCalib.shuffledIndex != (unsigned)-1);

        ProjectionModel::copyDataToStructure(structureCalib, m_calibrations[structureCalib.shuffledIndex]);
        DistortionModel::copyDataToStructure(structureCalib, m_calibrations[structureCalib.shuffledIndex]);
    }

    for (PFBundleAdjustment::Camera &structureCamera : structure.m_cameras) {
        assert(structureCamera.shuffledIndex != (unsigned)-1);
        structureCamera.viewMatrix = m_cameras[structureCamera.shuffledIndex].viewMatrix;

        for (unsigned k = 0; k < 4; k++)
            for (unsigned l = 0; l < 4; l++)
                assert(std::isfinite(structureCamera.viewMatrix[k][l]));
    }

    for (PFBundleAdjustment::Track &structureTrack : structure.m_tracks) {
        assert(structureTrack.shuffledIndex != (unsigned)-1);

        unsigned block = structureTrack.shuffledIndex / 8;
        unsigned offset = structureTrack.shuffledIndex % 8;

        structureTrack.position[0] = m_tracks[block].xyzw[offset*4+0];
        structureTrack.position[1] = m_tracks[block].xyzw[offset*4+1];
        structureTrack.position[2] = m_tracks[block].xyzw[offset*4+2];
        structureTrack.position[3] = m_tracks[block].xyzw[offset*4+3];

        assert(std::isfinite(structureTrack.position[0]));
        assert(std::isfinite(structureTrack.position[1]));
        assert(std::isfinite(structureTrack.position[2]));
        assert(std::isfinite(structureTrack.position[3]));
    }
}


template<class ProjectionModel, class DistortionModel>
void BAState<ProjectionModel, DistortionModel>::
            initializeJacobianImpl(SizedJacobi &J, SizedJacobiTransposed &JT) const
{
    JT.resize(m_observations.size());

    unsigned trackParameterIndices[32];
    for (unsigned i = 0; i < m_cameras.size(); i++) {
        unsigned calibrationParamOffset = m_tracks.size()*4*8 +
                                          m_cameras.size()*NUM_CAMERA_UPDATE_PARAMETERS +
                                          m_cameras[i].calibrationShuffledIndex * NUM_TOTAL_CALIBRATION_UPDATE_PARAMETERS;
        unsigned cameraParamOffset = m_tracks.size()*4*8 + i*NUM_CAMERA_UPDATE_PARAMETERS;

        unsigned firstBlock = m_cameras[i].firstObservationShuffledIndex / 32;
        unsigned numBlocks = (m_cameras[i].numObservations+31) / 32;
        for (unsigned j = 0; j < numBlocks; j++) {
            const ShuffledObservationBlock &block = m_observations[firstBlock+j];
            for (unsigned k = 0; k < 32; k++)
                if (block.shuffledTrackIndex[k] == (unsigned)-1)
                    trackParameterIndices[k] = 0;
                else
                    trackParameterIndices[k] = block.shuffledTrackIndex[k]*4;

            JT.setupObservationBlock(firstBlock+j, trackParameterIndices, cameraParamOffset, calibrationParamOffset);
        }
    }

    std::vector<unsigned> numTrackRefs;
    numTrackRefs.resize(m_tracks.size()*8);

    for (unsigned i = 0; i < m_observations.size(); i++) {
        const ShuffledObservationBlock &block = m_observations[i];
        for (unsigned k = 0; k < 32; k++)
            if (block.shuffledTrackIndex[k] != (unsigned)-1)
                numTrackRefs[block.shuffledTrackIndex[k]]++;
    }
    std::vector<unsigned> numTrackBlocks;
    numTrackBlocks.resize(m_tracks.size());
    for (unsigned i = 0; i < numTrackBlocks.size(); i++) {
        unsigned num = 0;
        for (unsigned k = 0; k < 8; k++)
            num = std::max(num, numTrackRefs[i*8+k]);
        numTrackBlocks[i] = num;
    }

    std::vector<unsigned> numCamBlocks;
    numCamBlocks.resize(m_cameras.size());
    std::vector<unsigned> numCalibBlocks;
    numCalibBlocks.resize(m_calibrations.size());

    for (unsigned i = 0; i < m_cameras.size(); i++) {
        unsigned numBlocks = (m_cameras[i].numObservations+31) / 32;
        numCamBlocks[i] = numBlocks;
        numCalibBlocks[m_cameras[i].calibrationShuffledIndex] += numBlocks;
    }

    J.resize(numTrackBlocks.size(), numCamBlocks.size(), numCalibBlocks.size(),
             &numTrackBlocks[0], &numCamBlocks[0], &numCalibBlocks[0]);

    for (unsigned i = 0; i < m_cameras.size(); i++) {
        unsigned firstBlock = m_cameras[i].firstObservationShuffledIndex / 32;
        unsigned numBlocks = (m_cameras[i].numObservations+31) / 32;
        for (unsigned j = 0; j < numBlocks; j++) {
            const ShuffledObservationBlock &block = m_observations[firstBlock+j];
            for (unsigned k = 0; k < 32; k++)
                if (block.shuffledTrackIndex[k] != (unsigned)-1) {
                    J.setupSingleTrack(block.shuffledTrackIndex[k],
                                         block.shuffledIntraTrackObsIndex[k],
                                         (firstBlock+j)*32*2+k);
                }
        }
    }
}

template<class ProjectionModel, class DistortionModel>
void BAState<ProjectionModel, DistortionModel>::
            initializeJacobian(std::unique_ptr<PFJacobiInterface> &J, std::unique_ptr<PFJacobiTransposedInterface> &JT) const
{
    if ((J == NULL) || (typeid(*J) != typeid(SizedJacobi)))
        J.reset(new SizedJacobi());

    if ((JT == NULL) || (typeid(*JT) != typeid(SizedJacobi)))
        JT.reset(new SizedJacobiTransposed());

    initializeJacobianImpl(static_cast<SizedJacobi&>(*J), static_cast<SizedJacobiTransposed&>(*JT));
}


template<class ProjectionModel, class DistortionModel>
void BAState<ProjectionModel, DistortionModel>::
            computeResiduals(std::vector<float, AlignedAllocator<float>> &dst) const
{
    dst.resize(m_observations.size()*32*2);
memset(&dst[0], 0, dst.size()*4);

    for (unsigned i = 0; i < m_cameras.size(); i++) {
        const unsigned firstBlock = m_cameras[i].firstObservationShuffledIndex / 32;
        const unsigned numBlocks = (m_cameras[i].numObservations+31) / 32;
        for (unsigned j = 0; j < numBlocks; j++) {
            const ShuffledObservationBlock &block = m_observations[firstBlock+j];
            const LinAlg::Matrix4x4f &projectionViewMatrix = m_cameras[i].projectionViewMatrix;

            alignas(32) unsigned trackParameterOffsets[32];
            alignas(32) float fValids[32];
            for (unsigned i = 0; i < 8; i++) {
                __m128i a = _mm_load_si128((const __m128i*)&block.shuffledTrackIndex[i*4]);
                __m128i invalids = _mm_cmpeq_epi32(a, _mm_set1_epi32(-1));
                a = _mm_andnot_si128(invalids, a);
                a = _mm_mullo_epi32(a, _mm_set1_epi32(4));
                _mm_store_si128((__m128i*)&trackParameterOffsets[i*4], a);
                _mm_store_si128((__m128i*)&fValids[i*4], _mm_xor_si128(invalids, _mm_set1_epi32(0xFFFFFFFF)));
            }

            CpuWarpReg Track;

            CpuWarpReg U;
            CpuWarpReg V;
            CpuWarpReg W;

            Track.gather(&m_tracks[0].xyzw[0] + 0, trackParameterOffsets);
            U = Track * (&projectionViewMatrix[0][0]);
            V = Track * (&projectionViewMatrix[1][0]);
            W = Track * (&projectionViewMatrix[3][0]);

            Track.gather(&m_tracks[0].xyzw[0] + 1, trackParameterOffsets);
            U += Track * (&projectionViewMatrix[0][1]);
            V += Track * (&projectionViewMatrix[1][1]);
            W += Track * (&projectionViewMatrix[3][1]);

            Track.gather(&m_tracks[0].xyzw[0] + 2, trackParameterOffsets);
            U += Track * (&projectionViewMatrix[0][2]);
            V += Track * (&projectionViewMatrix[1][2]);
            W += Track * (&projectionViewMatrix[3][2]);

            Track.gather(&m_tracks[0].xyzw[0] + 3, trackParameterOffsets);
            U += Track * (&projectionViewMatrix[0][3]);
            V += Track * (&projectionViewMatrix[1][3]);
            W += Track * (&projectionViewMatrix[3][3]);

            //CpuWarpReg rcpW = W.approxRcp();
            CpuWarpReg rcpW = W.rcp();
            CpuWarpReg valids = W.abs().compare<_CMP_GT_OQ>(1e-10f);

            CpuWarpReg S = U*rcpW;
            CpuWarpReg T = V*rcpW;

            DistortionModel::computeDistortionForResiduals(S, T, m_calibrations[m_cameras[i].calibrationShuffledIndex].distortionParams);

            CpuWarpReg weights;
            weights.loadAligned(block.weight);

            CpuWarpReg obsX;
            obsX.loadAligned(block.x);
            CpuWarpReg deltaS = (obsX - S) * weights;

            CpuWarpReg obsY;
            obsY.loadAligned(block.y);
            CpuWarpReg deltaT = (obsY - T) * weights;

            deltaS &= valids;
            deltaT &= valids;
            valids.loadAligned(fValids);
            deltaS &= valids;
            deltaT &= valids;

            deltaS.storeAligned(&dst[(firstBlock+j)*32*2 + 0*32]);
            deltaT.storeAligned(&dst[(firstBlock+j)*32*2 + 1*32]);
        }
    }

}



template<class ProjectionModel, class DistortionModel>
void BAState<ProjectionModel, DistortionModel>::
            performUpdateStepImpl(const std::vector<float, AlignedAllocator<float>> &parameterStep, ThisType &dst) const
{
    for (unsigned i = 0; i < m_tracks.size(); i++) {

        CpuWarpReg params;
        params.loadAligned(m_tracks[i].xyzw);
        CpuWarpReg updateStep;
        updateStep.loadAligned(&parameterStep[i*8*4]);
        params = params + updateStep;

        CpuWarpReg f = params * params;
        //f.reduce4ApproxRsqrtAndBroadcast();
        f.reduce4RsqrtAndBroadcast();
        params = params * f;

        params.storeAligned(dst.m_tracks[i].xyzw);

        for (unsigned j = 0; j < 32; j++)
            assert(std::isfinite(dst.m_tracks[i].xyzw[j]));
/*
        for (unsigned j = 0; j < 8; j++) {
            dst.m_tracks[i].xyzw[j*4+0] /= dst.m_tracks[i].xyzw[j*4+3];
            dst.m_tracks[i].xyzw[j*4+1] /= dst.m_tracks[i].xyzw[j*4+3];
            dst.m_tracks[i].xyzw[j*4+2] /= dst.m_tracks[i].xyzw[j*4+3];
            dst.m_tracks[i].xyzw[j*4+3] /= dst.m_tracks[i].xyzw[j*4+3];
        }
*/
    }

    for (unsigned i = 0; i < m_calibrations.size(); i++) {
        const unsigned paramOffset = m_tracks.size() * 8*4 +
                                     m_cameras.size() * NUM_CAMERA_UPDATE_PARAMETERS +
                                     i * NUM_TOTAL_CALIBRATION_UPDATE_PARAMETERS;

        ProjectionModel::performUpdateStep(dst.m_calibrations[i], m_calibrations[i], &parameterStep[paramOffset]);
        DistortionModel::performUpdateStep(dst.m_calibrations[i], m_calibrations[i], &parameterStep[paramOffset+ProjectionModel::NUM_UPDATE_PARAMETERS]);
    }

    for (unsigned i = 0; i < m_cameras.size(); i++) {
        const unsigned paramOffset = m_tracks.size() * 8*4 +
                                     i * NUM_CAMERA_UPDATE_PARAMETERS;

        const ShuffledCamera &srcCamera = m_cameras[i];
        ShuffledCamera &dstCamera = dst.m_cameras[i];

        LinAlg::Matrix4x4f worldToEyeRotation = srcCamera.viewMatrix;
        worldToEyeRotation[0][3] = 0.0f;
        worldToEyeRotation[1][3] = 0.0f;
        worldToEyeRotation[2][3] = 0.0f;

        LinAlg::Vector3f cameraLocation = (worldToEyeRotation.T() * LinAlg::Fill(-srcCamera.viewMatrix[0][3], -srcCamera.viewMatrix[1][3], -srcCamera.viewMatrix[2][3], 1.0f)).StripHom();

        worldToEyeRotation = LinAlg::RotateX<float>(parameterStep[paramOffset+3])
                                        * LinAlg::RotateY<float>(parameterStep[paramOffset+4])
                                        * LinAlg::RotateZ<float>(parameterStep[paramOffset+5]) * worldToEyeRotation;

        LinAlg::Vector3f a, b, c;
        a = worldToEyeRotation[0].StripHom();
        b = worldToEyeRotation[1].StripHom();
        c = worldToEyeRotation[2].StripHom();

        LinAlg::GramSchmidt(a, b, c);

        worldToEyeRotation[0] = a.AddHom(0.0f);
        worldToEyeRotation[1] = b.AddHom(0.0f);
        worldToEyeRotation[2] = c.AddHom(0.0f);
        worldToEyeRotation[3] = LinAlg::Fill<float>(0.0f, 0.0f, 0.0f, 1.0f);


        cameraLocation = cameraLocation + LinAlg::Fill(parameterStep[paramOffset+0], parameterStep[paramOffset+1], parameterStep[paramOffset+2]);

        dstCamera.viewMatrix = worldToEyeRotation
                        * LinAlg::Translation3D<float>(cameraLocation.negated());

        dstCamera.projectionViewMatrix = dst.m_calibrations[srcCamera.calibrationShuffledIndex].projectionMatrix * dstCamera.viewMatrix;

        for (unsigned k = 0; k < 4; k++)
            for (unsigned l = 0; l < 4; l++)
                assert(std::isfinite(dstCamera.viewMatrix[k][l]));
    }
}


template<class ProjectionModel, class DistortionModel>
void BAState<ProjectionModel, DistortionModel>::
                computeJacobiImpl(SizedJacobi &J, SizedJacobiTransposed &JT) const
{
    for (unsigned i = 0; i < m_cameras.size(); i++) {
        const ShuffledCamera &camera = m_cameras[i];

        const LinAlg::Matrix3x4f projectionViewMatrix = camera.projectionViewMatrix.dropRow(2);
        const LinAlg::Matrix3x4f projectionMatrix = m_calibrations[camera.calibrationShuffledIndex].projectionMatrix.dropRow(2);

        LinAlg::Matrix4x4f worldToEyeRotation = camera.viewMatrix;
        worldToEyeRotation[0][3] =
        worldToEyeRotation[1][3] =
        worldToEyeRotation[2][3] = 0.0f;

        typename DistortionModel::JacobianComputations distortionModel(m_calibrations[camera.calibrationShuffledIndex].distortionParams);

        const unsigned calibFirstObservationShuffledIndex = m_calibrations[camera.calibrationShuffledIndex].firstObservationShuffledIndex;

        const unsigned firstBlock = camera.firstObservationShuffledIndex / 32;
        const unsigned numBlocks = (camera.numObservations+31) / 32;
        for (unsigned j = 0; j < numBlocks; j++) {
            const unsigned observationBlockIndex = firstBlock+j;
            const ShuffledObservationBlock &block = m_observations[observationBlockIndex];


            for (unsigned lane = 0; lane < 32; lane++) {
                if (block.shuffledTrackIndex[lane] == (unsigned)-1)
                    continue;

                const float weight = block.weight[lane];

                const float *trackPos = &m_tracks[block.shuffledTrackIndex[lane] / 8].xyzw[(block.shuffledTrackIndex[lane] % 8)*4];

                const float errorFirstDerivative = 1.0f;


                LinAlg::Vector4f trackWorldSpacePosition = LinAlg::Fill(trackPos[0], trackPos[1], trackPos[2], trackPos[3]);

                LinAlg::Vector4f projectiveEyeSpacePosition = camera.viewMatrix * trackWorldSpacePosition;

                LinAlg::Vector3f projectedPos = projectionViewMatrix * trackWorldSpacePosition;
                if (std::abs(projectedPos[2]*projectedPos[2]) <= 1e-30f)
                    continue;

                LinAlg::Vector2f euclideanProjectedPos = projectedPos.StripHom() * (1.0f / projectedPos[2]);

                float trackData[2*4];
                float cameraData[NUM_CAMERA_UPDATE_PARAMETERS*2];
                float calibData[NUM_TOTAL_CALIBRATION_UPDATE_PARAMETERS*2];

                typename DistortionModel::JacobianComputations::DistortionJacobian distortionJacobian =
                            distortionModel.compute(calibData + ProjectionModel::NUM_UPDATE_PARAMETERS*2,
                                                    euclideanProjectedPos,
                                                    weight,
                                                    errorFirstDerivative);

                LinAlg::Vector2f pointGrad;

                // d/dX1
                pointGrad[0] = projectionViewMatrix[0][0] / projectedPos[2] -
                               projectionViewMatrix[2][0] * euclideanProjectedPos[0] / projectedPos[2];
                pointGrad[1] = projectionViewMatrix[1][0] / projectedPos[2] -
                               projectionViewMatrix[2][0] * euclideanProjectedPos[1] / projectedPos[2];

                pointGrad *= errorFirstDerivative;

                distortionModel.applyDistortionJacobian(pointGrad, distortionJacobian);

                assert(std::isfinite(pointGrad[0]));
                assert(std::isfinite(pointGrad[1]));

                trackData[0*4+0] = pointGrad[0] * weight;
                trackData[1*4+0] = pointGrad[1] * weight;
                cameraData[0*6+0] = -pointGrad[0] * weight * trackPos[3];
                cameraData[1*6+0] = -pointGrad[1] * weight * trackPos[3];

                // d/dX2
                pointGrad[0] = projectionViewMatrix[0][1] / projectedPos[2] -
                               projectionViewMatrix[2][1] * euclideanProjectedPos[0] / projectedPos[2];
                pointGrad[1] = projectionViewMatrix[1][1] / projectedPos[2] -
                               projectionViewMatrix[2][1] * euclideanProjectedPos[1] / projectedPos[2];

                pointGrad *= errorFirstDerivative;

                distortionModel.applyDistortionJacobian(pointGrad, distortionJacobian);

                assert(std::isfinite(pointGrad[0]));
                assert(std::isfinite(pointGrad[1]));

                trackData[0*4+1] = pointGrad[0] * weight;
                trackData[1*4+1] = pointGrad[1] * weight;
                cameraData[0*6+1] = -pointGrad[0] * weight * trackPos[3];
                cameraData[1*6+1] = -pointGrad[1] * weight * trackPos[3];

                // d/dX3
                pointGrad[0] = projectionViewMatrix[0][2] / projectedPos[2] -
                               projectionViewMatrix[2][2] * euclideanProjectedPos[0] / projectedPos[2];
                pointGrad[1] = projectionViewMatrix[1][2] / projectedPos[2] -
                               projectionViewMatrix[2][2] * euclideanProjectedPos[1] / projectedPos[2];

                pointGrad *= errorFirstDerivative;

                distortionModel.applyDistortionJacobian(pointGrad, distortionJacobian);

                assert(std::isfinite(pointGrad[0]));
                assert(std::isfinite(pointGrad[1]));

                trackData[0*4+2] = pointGrad[0] * weight;
                trackData[1*4+2] = pointGrad[1] * weight;
                cameraData[0*6+2] = -pointGrad[0] * weight * trackPos[3];
                cameraData[1*6+2] = -pointGrad[1] * weight * trackPos[3];

                // d/dX4
                pointGrad[0] = projectionViewMatrix[0][3] / projectedPos[2] -
                               projectionViewMatrix[2][3] * euclideanProjectedPos[0] / projectedPos[2];
                pointGrad[1] = projectionViewMatrix[1][3] / projectedPos[2] -
                               projectionViewMatrix[2][3] * euclideanProjectedPos[1] / projectedPos[2];

                pointGrad *= errorFirstDerivative;

                distortionModel.applyDistortionJacobian(pointGrad, distortionJacobian);

                assert(std::isfinite(pointGrad[0]));
                assert(std::isfinite(pointGrad[1]));

                trackData[0*4+3] = pointGrad[0] * weight;
                trackData[1*4+3] = pointGrad[1] * weight;

                LinAlg::Vector3f Y = projectionMatrix * projectiveEyeSpacePosition;

                // d/dRx
                pointGrad[0] = (projectionMatrix[0][2] * projectiveEyeSpacePosition[1] -
                                projectionMatrix[0][1] * projectiveEyeSpacePosition[2]) / Y[2] -
                               (projectionMatrix[2][2] * projectiveEyeSpacePosition[1] -
                                projectionMatrix[2][1] * projectiveEyeSpacePosition[2]) * Y[0] / (Y[2]*Y[2]);

                pointGrad[1] = (projectionMatrix[1][2] * projectiveEyeSpacePosition[1] -
                                projectionMatrix[1][1] * projectiveEyeSpacePosition[2]) / Y[2] -
                               (projectionMatrix[2][2] * projectiveEyeSpacePosition[1] -
                                projectionMatrix[2][1] * projectiveEyeSpacePosition[2]) * Y[1] / (Y[2]*Y[2]);

                pointGrad *= errorFirstDerivative;

                distortionModel.applyDistortionJacobian(pointGrad, distortionJacobian);

                assert(std::isfinite(pointGrad[0]));
                assert(std::isfinite(pointGrad[1]));

                cameraData[0*6+3] = pointGrad[0] * weight;
                cameraData[1*6+3] = pointGrad[1] * weight;

                // d/dRy
                pointGrad[0] = (projectionMatrix[0][2] * projectiveEyeSpacePosition[0] -
                                projectionMatrix[0][0] * projectiveEyeSpacePosition[2]) / Y[2] -
                               (projectionMatrix[2][2] * projectiveEyeSpacePosition[0] -
                                projectionMatrix[2][0] * projectiveEyeSpacePosition[2]) * Y[0] / (Y[2]*Y[2]);

                pointGrad[1] = (projectionMatrix[1][2] * projectiveEyeSpacePosition[0] -
                                projectionMatrix[1][0] * projectiveEyeSpacePosition[2]) / Y[2] -
                               (projectionMatrix[2][2] * projectiveEyeSpacePosition[0] -
                                projectionMatrix[2][0] * projectiveEyeSpacePosition[2]) * Y[1] / (Y[2]*Y[2]);

                pointGrad *= errorFirstDerivative;

                distortionModel.applyDistortionJacobian(pointGrad, distortionJacobian);

                assert(std::isfinite(pointGrad[0]));
                assert(std::isfinite(pointGrad[1]));

                cameraData[0*6+4] = pointGrad[0] * weight;
                cameraData[1*6+4] = pointGrad[1] * weight;

                // d/dRz
                pointGrad[0] = (projectionMatrix[0][1] * projectiveEyeSpacePosition[0] -
                                projectionMatrix[0][0] * projectiveEyeSpacePosition[1]) / Y[2] -
                               (projectionMatrix[2][1] * projectiveEyeSpacePosition[0] -
                                projectionMatrix[2][0] * projectiveEyeSpacePosition[1]) * Y[0] / (Y[2]*Y[2]);

                pointGrad[1] = (projectionMatrix[1][1] * projectiveEyeSpacePosition[0] -
                                projectionMatrix[1][0] * projectiveEyeSpacePosition[1]) / Y[2] -
                               (projectionMatrix[2][1] * projectiveEyeSpacePosition[0] -
                                projectionMatrix[2][0] * projectiveEyeSpacePosition[1]) * Y[1] / (Y[2]*Y[2]);

                pointGrad *= errorFirstDerivative;

                distortionModel.applyDistortionJacobian(pointGrad, distortionJacobian);

                assert(std::isfinite(pointGrad[0]));
                assert(std::isfinite(pointGrad[1]));

                cameraData[0*6+5] = pointGrad[0] * weight;
                cameraData[1*6+5] = pointGrad[1] * weight;

                ProjectionModel::computeJacobian(calibData, projectiveEyeSpacePosition,
                            projectedPos, weight, errorFirstDerivative,
                            distortionModel, distortionJacobian);

                JT.setSingleTrackData(observationBlockIndex*32+lane, trackData);
                JT.setSingleCameraData(observationBlockIndex*32+lane, cameraData);
                JT.setSingleInternalCalibData(observationBlockIndex*32+lane, calibData);

                J.setSingleTrackData(block.shuffledTrackIndex[lane], block.shuffledIntraTrackObsIndex[lane], trackData);
                J.setSingleCameraData(i, j*32+lane, cameraData);
                J.setSingleCalibData(camera.calibrationShuffledIndex, observationBlockIndex*32+lane - calibFirstObservationShuffledIndex, calibData);
            }
        }
    }
}

/// @}

}


PFBundleAdjustment::PFBundleAdjustment(const config::BundleAdjustmentStructureConfig &structureConfig)
{
    m_lambda = m_parameterConfig.LevenbergMarquardt_InitialLambda;
    m_baStructure.m_structureConfig = structureConfig;
}

PFBundleAdjustment::~PFBundleAdjustment()
{
    //dtor
}

void PFBundleAdjustment::changeParameterConfig(const config::BundleAdjustmentParameterConfig &config)
{
    m_parameterConfig = config;
}


void PFBundleAdjustment::clear(const config::BundleAdjustmentStructureConfig *config)
{
    m_baStructure.clear();
    if (config)
        m_baStructure.m_structureConfig = *config;
}

void PFBundleAdjustment::reserveInternalCamCalibs(unsigned count)
{

}

void PFBundleAdjustment::reserveCameras(unsigned count)
{

}

void PFBundleAdjustment::reserveObservations(unsigned count)
{

}

void PFBundleAdjustment::reserveTracks(unsigned count)
{

}

PFBundleAdjustment::InternalCamCalibHandle PFBundleAdjustment::addIntCamCalib()
{
    return m_baStructure.addIntCamCalib();
}

PFBundleAdjustment::CameraHandle PFBundleAdjustment::addCamera(InternalCamCalibHandle intCamCalib)
{
    return m_baStructure.addCamera(intCamCalib);
}

PFBundleAdjustment::TrackHandle PFBundleAdjustment::addTrack()
{
    return m_baStructure.addTrack();
}

PFBundleAdjustment::ObservationHandle PFBundleAdjustment::addObservation(CameraHandle camHandle, TrackHandle trackHandle)
{
    return m_baStructure.addObservation(camHandle, trackHandle);
}


void PFBundleAdjustment::setInternalCalibProjectionMatrix(InternalCamCalibHandle intCamCalibHandle, const LinAlg::Matrix4x4f &projectionMatrix)
{
    m_baStructure.setInternalCalibProjectionMatrix(intCamCalibHandle, projectionMatrix);
}

void PFBundleAdjustment::setInternalCalibRadialDistortion(InternalCamCalibHandle intCamCalibHandle, const RadialDistortionParametrization &radialDistortion)
{
    m_baStructure.setInternalCalibRadialDistortion(intCamCalibHandle, radialDistortion);
}



void PFBundleAdjustment::setCamera(CameraHandle camHandle, const LinAlg::Matrix4x4f &viewMatrix)
{
    m_baStructure.setCamera(camHandle, viewMatrix);
}

void PFBundleAdjustment::setObservation(ObservationHandle obsHandle, const LinAlg::Vector2f &screenSpacePosition, float weight)
{
    m_baStructure.setObservation(obsHandle, screenSpacePosition, weight);
}

void PFBundleAdjustment::setTrack(TrackHandle trackHandle, const LinAlg::Vector4f &position)
{
    m_baStructure.setTrack(trackHandle, position);
}


void PFBundleAdjustment::removeIntCamCalib(InternalCamCalibHandle intCamCalib)
{
    m_baStructure.removeIntCamCalib(intCamCalib);
}

void PFBundleAdjustment::removeCamera(CameraHandle camHandle)
{
    m_baStructure.removeCamera(camHandle);
}


void PFBundleAdjustment::removeObservation(ObservationHandle obsHandle)
{
    m_baStructure.removeObservation(obsHandle);
}

void PFBundleAdjustment::removeTrack(TrackHandle trackHandle)
{
    m_baStructure.removeTrack(trackHandle);
}

PFBundleAdjustment::BAStructure::BAStructure()
{
    clear();
}

void PFBundleAdjustment::BAStructure::clear()
{
    m_structureNeedsUpdate = true;

    m_calibrations.clear();
    m_cameras.clear();
    m_tracks.clear();
    m_observations.clear();
    m_numCalibrations = 0;
    m_numCameras = 0;
    m_numTracks = 0;
    m_numObservations = 0;
}


PFBundleAdjustment::InternalCamCalibHandle PFBundleAdjustment::BAStructure::addIntCamCalib()
{
    m_structureNeedsUpdate = true;
    m_numCalibrations++;
    return m_calibrations.allocate(Calibration());
}

PFBundleAdjustment::CameraHandle PFBundleAdjustment::BAStructure::addCamera(InternalCamCalibHandle intCamCalib)
{
    m_structureNeedsUpdate = true;
    m_numCameras++;
    m_calibrations[intCamCalib].numCameras++;
    return m_cameras.allocate(Camera(intCamCalib));
}

PFBundleAdjustment::TrackHandle PFBundleAdjustment::BAStructure::addTrack()
{
    m_structureNeedsUpdate = true;
    m_numTracks++;
    return m_tracks.allocate(Track());
}

PFBundleAdjustment::ObservationHandle PFBundleAdjustment::BAStructure::addObservation(CameraHandle camHandle, TrackHandle trackHandle)
{
    m_structureNeedsUpdate = true;
    m_numObservations++;

    unsigned calibIdx = m_cameras[camHandle].calibrationIndex;
    m_tracks[trackHandle].numObservations++;
    m_cameras[camHandle].numObservations++;
    m_calibrations[calibIdx].numObservations++;
    return m_observations.allocate(Observation(camHandle, calibIdx, trackHandle));

}


void PFBundleAdjustment::BAStructure::setInternalCalibProjectionMatrix(InternalCamCalibHandle intCamCalibHandle,
                                                                       const LinAlg::Matrix4x4f &projectionMatrix)
{
    m_calibrations[intCamCalibHandle].projectionMatrix = projectionMatrix;
}

void PFBundleAdjustment::BAStructure::setInternalCalibRadialDistortion(InternalCamCalibHandle intCamCalibHandle,
                                                                       const RadialDistortionParametrization &radialDistortion)
{
    m_calibrations[intCamCalibHandle].radialDistortion = radialDistortion;
}


void PFBundleAdjustment::BAStructure::setCamera(CameraHandle camHandle, const LinAlg::Matrix4x4f &viewMatrix)
{
    m_cameras[camHandle].viewMatrix = viewMatrix;
}

void PFBundleAdjustment::BAStructure::setObservation(ObservationHandle obsHandle, const LinAlg::Vector2f &screenSpacePosition, float weight)
{
    m_observations[obsHandle].weight = weight;
    m_observations[obsHandle].screenPos = screenSpacePosition;
}

void PFBundleAdjustment::BAStructure::setTrack(TrackHandle trackHandle, const LinAlg::Vector4f &position)
{
    m_tracks[trackHandle].position = position;
}


void PFBundleAdjustment::BAStructure::removeIntCamCalib(InternalCamCalibHandle intCamCalib)
{
    m_structureNeedsUpdate = true;

    assert(m_calibrations[intCamCalib].numObservations == 0);
    assert(m_calibrations[intCamCalib].numCameras == 0);

    m_calibrations.free(intCamCalib);
    m_numCalibrations--;
}

void PFBundleAdjustment::BAStructure::removeCamera(CameraHandle camHandle)
{
    m_structureNeedsUpdate = true;

    assert(m_cameras[camHandle].numObservations == 0);
    m_calibrations[m_cameras[camHandle].calibrationIndex].numCameras--;
    m_cameras.free(camHandle);
    m_numCameras--;
}


void PFBundleAdjustment::BAStructure::removeObservation(ObservationHandle obsHandle)
{
    m_structureNeedsUpdate = true;


    m_tracks[m_observations[obsHandle].trackIndex].numObservations--;
    m_cameras[m_observations[obsHandle].cameraIndex].numObservations--;
    m_calibrations[m_observations[obsHandle].calibrationIndex].numObservations--;

    m_observations.free(obsHandle);
    m_numObservations--;
}

void PFBundleAdjustment::BAStructure::removeTrack(TrackHandle trackHandle)
{
    m_structureNeedsUpdate = true;

    assert(m_tracks[trackHandle].numObservations == 0);
    m_tracks.free(trackHandle);
    m_numTracks--;
}


/*
typedef BAState<DistortionModel_PolynomialRadial234Dynamic,
            ProjectionModel_Full<typename DistortionModel_PolynomialRadial234Dynamic::JacobianComputations> > BAState_Polynomial234_FullProjection;
typedef BAState<DistortionModel_PolynomialRadial234Dynamic,
            ProjectionModel_SharedFocalLength<typename DistortionModel_PolynomialRadial234Dynamic::JacobianComputations> > BAState_Polynomial234_SharedFocalLength;
typedef BAState<DistortionModel_PolynomialRadial234Fixed,
            ProjectionModel_Fixed<typename DistortionModel_PolynomialRadial234Fixed::JacobianComputations> > BAState_Polynomial234_FixedProjection;

typedef BAState<DistortionModel_NoDistortion,
            ProjectionModel_Full<typename DistortionModel_NoDistortion::JacobianComputations> > BAState_NoDistortion_FullProjection;
typedef BAState<DistortionModel_NoDistortion,
            ProjectionModel_SharedFocalLength<typename DistortionModel_NoDistortion::JacobianComputations> > BAState_NoDistortion_SharedFocalLength;
typedef BAState<DistortionModel_NoDistortion,
            ProjectionModel_Fixed<typename DistortionModel_NoDistortion::JacobianComputations> > BAState_NoDistortion_FixedProjection;

*/

template<typename Type>
void ensureBaStateType(std::unique_ptr<detail::BAStateInterface> &state)
{
    if ((state == nullptr) || (typeid(*state) != typeid(Type)))
        state.reset(new Type());
}

void PFBundleAdjustment::BAStructure::initializeStates(std::unique_ptr<detail::BAStateInterface> *states)
{
    switch (m_structureConfig.internalCalibrationType) {
        case config::BundleAdjustmentStructureConfig::InternalCalibrationType::Fixed:
            switch (m_structureConfig.radialDistortionType) {
                case config::BundleAdjustmentStructureConfig::RadialDistortionType::NoRadialDistortion:
                    ensureBaStateType<detail::BAState_NoDistortion_FixedProjection>(states[0]);
                    ensureBaStateType<detail::BAState_NoDistortion_FixedProjection>(states[1]);
                break;
                case config::BundleAdjustmentStructureConfig::RadialDistortionType::Polynomial_234:
                    ensureBaStateType<detail::BAState_Polynomial234_FixedProjection>(states[0]);
                    ensureBaStateType<detail::BAState_Polynomial234_FixedProjection>(states[1]);
                break;
                default:
                    throw std::runtime_error("Unknown radial distortion type!");
                break;
            }
        break;
        case config::BundleAdjustmentStructureConfig::InternalCalibrationType::FullCalibration:
            switch (m_structureConfig.radialDistortionType) {
                case config::BundleAdjustmentStructureConfig::RadialDistortionType::NoRadialDistortion:
                    ensureBaStateType<detail::BAState_NoDistortion_FullProjection>(states[0]);
                    ensureBaStateType<detail::BAState_NoDistortion_FullProjection>(states[1]);
                break;
                case config::BundleAdjustmentStructureConfig::RadialDistortionType::Polynomial_234:
                    ensureBaStateType<detail::BAState_Polynomial234_FullProjection>(states[0]);
                    ensureBaStateType<detail::BAState_Polynomial234_FullProjection>(states[1]);
                break;
                default:
                    throw std::runtime_error("Unknown radial distortion type!");
                break;
            }
        break;
        case config::BundleAdjustmentStructureConfig::InternalCalibrationType::SharedFocalLength:
            switch (m_structureConfig.radialDistortionType) {
                case config::BundleAdjustmentStructureConfig::RadialDistortionType::NoRadialDistortion:
                    ensureBaStateType<detail::BAState_NoDistortion_SharedFocalLength>(states[0]);
                    ensureBaStateType<detail::BAState_NoDistortion_SharedFocalLength>(states[1]);
                break;
                case config::BundleAdjustmentStructureConfig::RadialDistortionType::Polynomial_234:
                    ensureBaStateType<detail::BAState_Polynomial234_SharedFocalLength>(states[0]);
                    ensureBaStateType<detail::BAState_Polynomial234_SharedFocalLength>(states[1]);
                break;
                default:
                    throw std::runtime_error("Unknown radial distortion type!");
                break;
            }
        break;
        default:
            throw std::runtime_error("Unknown internal calibration type!");
        break;
    }

    states[0]->initializeFromStructure(*this);
    *states[1] = *states[0];
}



float dot(const std::vector<float, AlignedAllocator<float>> &a, const std::vector<float, AlignedAllocator<float>> &b)
{
    assert(a.size() == b.size());
    __m256 sum1 = _mm256_setzero_ps();
    __m256 sum2 = _mm256_setzero_ps();
    for (unsigned i = 0; i < b.size()/16; i++) {
        __m256 a0 = _mm256_load_ps(&a[i*16 + 0*8]);
        __m256 a1 = _mm256_load_ps(&a[i*16 + 1*8]);
        __m256 b0 = _mm256_load_ps(&b[i*16 + 0*8]);
        __m256 b1 = _mm256_load_ps(&b[i*16 + 1*8]);

        sum1 = _mm256_add_ps(sum1, _mm256_mul_ps(a0, b0));
        sum2 = _mm256_add_ps(sum2, _mm256_mul_ps(a1, b1));
    }
    sum1 = _mm256_add_ps(sum1, sum2);
    __m128 tmp = _mm_add_ps(_mm256_extractf128_ps(sum1, 0), _mm256_extractf128_ps(sum1, 1));
    tmp = _mm_hadd_ps(tmp, tmp);
    tmp = _mm_hadd_ps(tmp, tmp);
    float result;
    _mm_store_ss(&result, tmp);

    for (unsigned i = b.size()/16*16; i < b.size(); i++)
        result += a[i]*b[i];

    return result;
}

float sqrLen(const std::vector<float, AlignedAllocator<float>> &a)
{
    __m256 sum1 = _mm256_setzero_ps();
    __m256 sum2 = _mm256_setzero_ps();
    for (unsigned i = 0; i < a.size()/16; i++) {
        __m256 a0 = _mm256_load_ps(&a[i*16 + 0*8]);
        __m256 a1 = _mm256_load_ps(&a[i*16 + 1*8]);

        sum1 = _mm256_add_ps(sum1, _mm256_mul_ps(a0, a0));
        sum2 = _mm256_add_ps(sum2, _mm256_mul_ps(a1, a1));
    }
    sum1 = _mm256_add_ps(sum1, sum2);
    __m128 tmp = _mm_add_ps(_mm256_extractf128_ps(sum1, 0), _mm256_extractf128_ps(sum1, 1));
    tmp = _mm_hadd_ps(tmp, tmp);
    tmp = _mm_hadd_ps(tmp, tmp);
    float result;
    _mm_store_ss(&result, tmp);

    for (unsigned i = a.size()/16*16; i < a.size(); i++)
        result += a[i]*a[i];

    return result;
}

void madd(std::vector<float, AlignedAllocator<float>> &dst, const std::vector<float, AlignedAllocator<float>> &fac1, float fac2)
{
    assert(dst.size() == fac1.size());

    __m256 fac = _mm256_set1_ps(fac2);

    for (unsigned i = 0; i < fac1.size()/16; i++) {
        __m256 a0 = _mm256_load_ps(&dst[i*16 + 0*8]);
        __m256 a1 = _mm256_load_ps(&dst[i*16 + 1*8]);

        __m256 b0 = _mm256_load_ps(&fac1[i*16 + 0*8]);
        __m256 b1 = _mm256_load_ps(&fac1[i*16 + 1*8]);

        a0 = _mm256_add_ps(a0, _mm256_mul_ps(b0, fac));
        a1 = _mm256_add_ps(a1, _mm256_mul_ps(b1, fac));

        _mm256_store_ps(&dst[i*16 + 0*8], a0);
        _mm256_store_ps(&dst[i*16 + 1*8], a1);
    }

    for (unsigned i = fac1.size()/16*16; i < fac1.size(); i++)
        dst[i] += fac1[i]*fac2;
}

void madd(std::vector<float, AlignedAllocator<float>> &dst, const std::vector<float, AlignedAllocator<float>> &sum1, const std::vector<float, AlignedAllocator<float>> &fac1, float fac2)
{
    assert(dst.size() == fac1.size());

    __m256 fac = _mm256_set1_ps(fac2);

    for (unsigned i = 0; i < fac1.size()/16; i++) {
        __m256 a0 = _mm256_load_ps(&sum1[i*16 + 0*8]);
        __m256 a1 = _mm256_load_ps(&sum1[i*16 + 1*8]);

        __m256 b0 = _mm256_load_ps(&fac1[i*16 + 0*8]);
        __m256 b1 = _mm256_load_ps(&fac1[i*16 + 1*8]);

        a0 = _mm256_add_ps(a0, _mm256_mul_ps(b0, fac));
        a1 = _mm256_add_ps(a1, _mm256_mul_ps(b1, fac));

        _mm256_store_ps(&dst[i*16 + 0*8], a0);
        _mm256_store_ps(&dst[i*16 + 1*8], a1);
    }

    for (unsigned i = fac1.size()/16*16; i < fac1.size(); i++)
        dst[i] = sum1[i] + fac1[i]*fac2;
}


void rmadd(std::vector<float, AlignedAllocator<float>> &dst, const std::vector<float, AlignedAllocator<float>> &sum1, float fac2)
{
    assert(dst.size() == sum1.size());

    __m256 fac = _mm256_set1_ps(fac2);

    for (unsigned i = 0; i < sum1.size()/16; i++) {
        __m256 a0 = _mm256_load_ps(&dst[i*16 + 0*8]);
        __m256 a1 = _mm256_load_ps(&dst[i*16 + 1*8]);

        __m256 b0 = _mm256_load_ps(&sum1[i*16 + 0*8]);
        __m256 b1 = _mm256_load_ps(&sum1[i*16 + 1*8]);

        a0 = _mm256_add_ps(b0, _mm256_mul_ps(a0, fac));
        a1 = _mm256_add_ps(b1, _mm256_mul_ps(a1, fac));

        _mm256_store_ps(&dst[i*16 + 0*8], a0);
        _mm256_store_ps(&dst[i*16 + 1*8], a1);
    }

    for (unsigned i = sum1.size()/16*16; i < sum1.size(); i++)
        dst[i] = sum1[i] + dst[i]*fac2;
}


void mul(std::vector<float, AlignedAllocator<float>> &dst, float fac2)
{
    __m256 fac = _mm256_set1_ps(fac2);

    for (unsigned i = 0; i < dst.size()/16; i++) {
        __m256 a0 = _mm256_load_ps(&dst[i*16 + 0*8]);
        __m256 a1 = _mm256_load_ps(&dst[i*16 + 1*8]);

        a0 = _mm256_mul_ps(a0, fac);
        a1 = _mm256_mul_ps(a1, fac);

        _mm256_store_ps(&dst[i*16 + 0*8], a0);
        _mm256_store_ps(&dst[i*16 + 1*8], a1);
    }

    for (unsigned i = dst.size()/16*16; i < dst.size(); i++)
        dst[i] *= fac2;
}


void sub(std::vector<float, AlignedAllocator<float>> &dst, const std::vector<float, AlignedAllocator<float>> &fac1, const std::vector<float, AlignedAllocator<float>> &fac2)
{
    assert(dst.size() == fac1.size());
    assert(dst.size() == fac2.size());


    for (unsigned i = 0; i < fac1.size()/16; i++) {
        __m256 b0 = _mm256_load_ps(&fac1[i*16 + 0*8]);
        __m256 b1 = _mm256_load_ps(&fac1[i*16 + 1*8]);

        __m256 c0 = _mm256_load_ps(&fac2[i*16 + 0*8]);
        __m256 c1 = _mm256_load_ps(&fac2[i*16 + 1*8]);

        __m256 a0 = _mm256_sub_ps(b0, c0);
        __m256 a1 = _mm256_sub_ps(b1, c1);

        _mm256_store_ps(&dst[i*16 + 0*8], a0);
        _mm256_store_ps(&dst[i*16 + 1*8], a1);
    }

    for (unsigned i = fac1.size()/16*16; i < fac1.size(); i++)
        dst[i] = fac1[i]-fac2[i];
}


void madd(std::vector<float, AlignedAllocator<float>> &dst, const std::vector<float, AlignedAllocator<float>> &fac1, const std::vector<float, AlignedAllocator<float>> &fac2)
{
    assert(dst.size() == fac1.size());
    assert(dst.size() == fac2.size());


    for (unsigned i = 0; i < fac1.size()/16; i++) {
        __m256 a0 = _mm256_load_ps(&dst[i*16 + 0*8]);
        __m256 a1 = _mm256_load_ps(&dst[i*16 + 1*8]);

        __m256 b0 = _mm256_load_ps(&fac1[i*16 + 0*8]);
        __m256 b1 = _mm256_load_ps(&fac1[i*16 + 1*8]);

        __m256 c0 = _mm256_load_ps(&fac2[i*16 + 0*8]);
        __m256 c1 = _mm256_load_ps(&fac2[i*16 + 1*8]);

        a0 = _mm256_add_ps(a0, _mm256_mul_ps(b0, c0));
        a1 = _mm256_add_ps(a1, _mm256_mul_ps(b1, c1));

        _mm256_store_ps(&dst[i*16 + 0*8], a0);
        _mm256_store_ps(&dst[i*16 + 1*8], a1);
    }

    for (unsigned i = fac1.size()/16*16; i < fac1.size(); i++)
        dst[i] += fac1[i]*fac2[i];
}


void mul(std::vector<float, AlignedAllocator<float>> &dst, const std::vector<float, AlignedAllocator<float>> &fac1, const std::vector<float, AlignedAllocator<float>> &fac2)
{
    assert(fac1.size() == fac2.size());
    dst.resize(fac1.size());

    for (unsigned i = 0; i < fac1.size()/16; i++) {
        __m256 a0 = _mm256_load_ps(&fac1[i*16 + 0*8]);
        __m256 a1 = _mm256_load_ps(&fac1[i*16 + 1*8]);

        __m256 b0 = _mm256_load_ps(&fac2[i*16 + 0*8]);
        __m256 b1 = _mm256_load_ps(&fac2[i*16 + 1*8]);

        a0 = _mm256_mul_ps(a0, b0);
        a1 = _mm256_mul_ps(a1, b1);

        _mm256_store_ps(&dst[i*16 + 0*8], a0);
        _mm256_store_ps(&dst[i*16 + 1*8], a1);
    }

    for (unsigned i = fac1.size()/16*16; i < fac1.size(); i++)
        dst[i] = fac1[i]*fac2[i];
}


void rcp(std::vector<float, AlignedAllocator<float>> &dst, const std::vector<float, AlignedAllocator<float>> &src)
{
    dst.resize(src.size());
    for (unsigned i = 0; i < dst.size()/16; i++) {
        __m256 a0 = _mm256_load_ps(&src[i*16 + 0*8]);
        __m256 a1 = _mm256_load_ps(&src[i*16 + 1*8]);

        a0 = _mm256_rcp_ps(a0);
        a1 = _mm256_rcp_ps(a1);

        _mm256_store_ps(&dst[i*16 + 0*8], a0);
        _mm256_store_ps(&dst[i*16 + 1*8], a1);
    }

    for (unsigned i = dst.size()/16*16; i < dst.size(); i++)
        dst[i] = 1.0f / src[i];
}


void rcpThresh(std::vector<float, AlignedAllocator<float>> &dst, const std::vector<float, AlignedAllocator<float>> &src, float thresh)
{
    dst.resize(src.size());
    __m256 vecThresh = _mm256_set1_ps(thresh);

    for (unsigned i = 0; i < dst.size()/16; i++) {
        __m256 a0 = _mm256_load_ps(&src[i*16 + 0*8]);
        __m256 a1 = _mm256_load_ps(&src[i*16 + 1*8]);

        __m256 absA0 = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), a0);
        __m256 absA1 = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), a1);

        __m256 LargeEnough0 = _mm256_cmp_ps(absA0, vecThresh, _CMP_GT_OQ);
        __m256 LargeEnough1 = _mm256_cmp_ps(absA1, vecThresh, _CMP_GT_OQ);
/*
        a0 = _mm256_rcp_ps(a0);
        a1 = _mm256_rcp_ps(a1);
*/
        a0 = _mm256_div_ps(_mm256_set1_ps(1.0f), a0);
        a1 = _mm256_div_ps(_mm256_set1_ps(1.0f), a1);

        a0 = _mm256_and_ps(a0, LargeEnough0);
        a1 = _mm256_and_ps(a1, LargeEnough1);

        _mm256_store_ps(&dst[i*16 + 0*8], a0);
        _mm256_store_ps(&dst[i*16 + 1*8], a1);
    }

    for (unsigned i = dst.size()/16*16; i < dst.size(); i++)
        if (std::abs(src[i]) > thresh)
            dst[i] = 1.0f / src[i];
        else
            dst[i] = 0.0f;
}

void PFBundleAdjustment::iterate(unsigned iterations)
{
    if (m_baStructure.m_structureNeedsUpdate) {

        if (m_parameterConfig.verbatimLevel >= 1)
            std::cout << "Initializing state!" << std::endl;
        m_baStructure.initializeStates(m_states);
        m_currentState = m_states[0].get();
        m_nextState = m_states[1].get();

        m_currentState->copyDataFromStructure(m_baStructure);
        if (m_parameterConfig.verbatimLevel >= 1)
            std::cout << "Initializing state done!" << std::endl;


        if (m_parameterConfig.verbatimLevel >= 1) {
            std::cout << "Calibs: " << m_baStructure.m_numCalibrations << std::endl;
            std::cout << "Cameras: " << m_baStructure.m_numCameras << std::endl;
            std::cout << "Tracks: " << m_baStructure.m_numTracks << std::endl;
            std::cout << "Observations: " << m_baStructure.m_numObservations << std::endl;
        }

        m_currentState->initializeJacobian(m_J, m_JT);
/*
        J.writeLayoutToImage("J.png");
        JT.writeLayoutToImage("JT.png");
*/


        //std::cout << "J = " << J;
        //exit(1);
        m_lambda = m_parameterConfig.LevenbergMarquardt_InitialLambda;

        m_baStructure.m_structureNeedsUpdate = false;
    }
    m_currentState->copyDataFromStructure(m_baStructure);
    m_nextState->copyDataFromStructure(m_baStructure);

    m_currentState->computeResiduals(m_residuals);
    float lastResiduals = sqrLen(m_residuals);
    if (m_parameterConfig.verbatimLevel >= 1) {
        std::cout << "LM_PCGD residual before: " << lastResiduals << std::endl;
    }
    Engine::CPUStopWatch stopwatch;
    unsigned lmIteration;
    for (lmIteration = 0; lmIteration < iterations; lmIteration++) {
        m_currentState->computeJacobi(*m_J, *m_JT);
        m_J->multiplyWithVector(m_residuals, m_JT_times_residuals, 1000);
        m_J->computeDiagonalOfJTJ(m_diag_JTJ_times_lambda);
        mul(m_diag_JTJ_times_lambda, m_lambda);
        m_preconditioner = m_diag_JTJ_times_lambda;
        mul(m_preconditioner, 1.0f+1.0f/m_lambda);
        rcpThresh(m_rcpPreconditioner, m_preconditioner, 1e-30f);
/*
            std::cout << "m_JT_times_residuals = [";
            for (unsigned i = 0; i < m_JT_times_residuals.size(); i++) std::cout << " " << m_JT_times_residuals[i];
            std::cout << "];" << std::endl;

            std::cout << "m_rcpPreconditioner = [";
            for (unsigned i = 0; i < m_rcpPreconditioner.size(); i++) std::cout << " " << m_rcpPreconditioner[i];
            std::cout << "];" << std::endl;
*/
        //mul(m_rcpPreconditioner, 1.0f / std::sqrt(sqrLen(m_rcpPreconditioner)));
/*
        m_rcpPreconditioner = m_preconditioner;

        for (unsigned i = 0; i < m_rcpPreconditioner.size(); i++)
            m_rcpPreconditioner[i] = 1.0f;
            //m_rcpPreconditioner[i] = 1.0f / (m_preconditioner[i] >= 0.0f?std::max(m_preconditioner[i], 1e-10f):std::min(m_preconditioner[i], -1e-10f));
*/
        m_x.resize(m_JT_times_residuals.size());
        memset(&m_x[0], 0, m_x.size()*4);

        m_r = m_JT_times_residuals;

        mul(m_z, m_rcpPreconditioner, m_r);
        m_p = m_z;
        float oldRtimesZ = dot(m_r, m_z);
        const float RtimesZThreshold = oldRtimesZ * m_parameterConfig.PCGD_RtimesZThresholdFactor;
        //std::cout << oldRtimesZ << std::endl;
        for (unsigned pcgdIteration = 0; pcgdIteration < m_parameterConfig.PCGD_MaxIterations; pcgdIteration++) {

            if (m_parameterConfig.verbatimLevel >= 2)
                std::cout << "iteration:" << pcgdIteration << "  " << oldRtimesZ << std::endl;
/*
            std::cout << "lambda = " << m_lambda <<";" << std::endl;
            std::cout << "p = [";
            for (unsigned i = 0; i < m_p.size(); i++) std::cout << " " << m_p[i];
            std::cout << "];" << std::endl;

            std::cout << "J = " << m_J << std::endl;
*/

            m_JT->multiplyWithVector(m_p, m_Jp, 1000);
            m_J->multiplyWithVector(m_Jp, m_Ap, 1000);
            madd(m_Ap, m_diag_JTJ_times_lambda, m_p);
/*
            std::cout << "p = [";
            for (unsigned i = 0; i < m_p.size(); i++) std::cout << " " << m_p[i];
            std::cout << "];" << std::endl;

            std::cout << "ap = [";
            for (unsigned i = 0; i < m_Ap.size(); i++) std::cout << " " << m_Ap[i];
            std::cout << "];" << std::endl;

            std::cout << "diagTimesLambda = [";
            for (unsigned i = 0; i < m_diag_JTJ_times_lambda.size(); i++) std::cout << " " << m_diag_JTJ_times_lambda[i];
            std::cout << "];" << std::endl;

            return;
*/
            float alpha = oldRtimesZ / dot(m_p, m_Ap);

            madd(m_x, m_p, alpha);

/*
            if (pcgdIteration % 4 == 0) {
                m_JT->multiplyWithVector(m_x, m_Jp);
                m_J->multiplyWithVector(m_Jp, m_r);
                madd(m_r, m_diag_JTJ_times_lambda, m_x);
                sub(m_r, m_JT_times_residuals, m_r);
            } else */
                madd(m_r, m_Ap, -alpha);
/*
            if (pcgdIteration % 1 == 0) {
                m_JT->multiplyWithVector(m_x, m_Jp);
                m_J->multiplyWithVector(m_Jp, m_r2);
                madd(m_r2, m_diag_JTJ_times_lambda, m_x);
                sub(m_r2, m_JT_times_residuals, m_r2);

                sub(m_r2, m_r2, m_r);
                std::cout << std::sqrt(dot(m_r2, m_r2) / dot(m_r, m_r)) << std::endl;
            }
*/
            mul(m_z, m_rcpPreconditioner, m_r);

            float newRtimesZ = dot(m_r, m_z);
            if (newRtimesZ < RtimesZThreshold) {
                oldRtimesZ = newRtimesZ;
                break;
            }

            rmadd(m_p, m_z, newRtimesZ/oldRtimesZ);
            oldRtimesZ = newRtimesZ;
        }
/*
        if (lmIteration == iterations-1) {
            std::cout << "x = ";
            for (unsigned i = 0; i < m_x.size(); i++) {
                std::cout << " " << m_x[i];
            }
            std::cout << std::endl;
        }
*/
        m_currentState->performUpdateStep(m_x, *m_nextState);
        m_nextState->computeResiduals(m_residuals);
        float newResiduals = sqrLen(m_residuals);
        if (newResiduals < lastResiduals) {
            if (m_parameterConfig.verbatimLevel >= 2)
                std::cout << "Success with residual of " << newResiduals << ", decreasing lambda to " << m_lambda << std::endl;
            m_lambda *= m_parameterConfig.LevenbergMarquardt_LambdaDecreaseFactor;
            m_lambda = fmax(m_lambda, 1e-10f);
            lastResiduals = newResiduals;
            std::swap(m_currentState, m_nextState);
        } else {
            m_lambda *= m_parameterConfig.LevenbergMarquardt_LambdaIncreaseFactor;
            if (m_parameterConfig.verbatimLevel >= 2)
                std::cout << "Misstep with residual of " << newResiduals << ", increasing lambda to " << m_lambda << std::endl;
            m_currentState->computeResiduals(m_residuals);
        }
        if (m_lambda > m_parameterConfig.LevenbergMarquardt_MaxLambdaForConvergence) {
            lmIteration++;
            break;
        }
    }
    if (m_parameterConfig.verbatimLevel >= 1) {
        std::cout << "LM_PCGD residual after: " << lastResiduals << std::endl;
        std::cout << "LM_PCGD iteration: " << stopwatch.getNanoseconds() / lmIteration * 1e-6f << "ms" << std::endl;
    }
    m_currentState->copyDataToStructure(m_baStructure);
}

}

}
