/*
    Structure from Motion with Deferred Feature Matching and Subset Bundle Adjustment
    Copyright (C) 2015 Andreas Ley <andy-ley@arcor.de>

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

#include "BlockSparseMatrix.h"
#include <iostream>
#include <stdexcept>
#include <string.h>

#include "RasterImage.h"

#include <immintrin.h>

#include <assert.h>

namespace LinAlg {


namespace BlockSparseMatrix {

/// Sets the matrix size (in elements), derives the number of blocks needed
/// to cover it in each direction and marks every block as unused.
void LayoutGenerator::resize(unsigned w, unsigned h)
{
    m_width = w;
    m_numBlocksX = (m_width + BLOCK_SIZE-1)/BLOCK_SIZE;     // round up: a partial block still counts

    m_height = h;
    m_numBlocksY = (m_height + BLOCK_SIZE-1)/BLOCK_SIZE;    // round up: a partial block still counts

    m_blocksUsed.resize(m_numBlocksX * m_numBlocksY);

    // resize() keeps the flags of already existing entries, so wipe them all.
    clear();
}

/// Marks every block of the current block grid as unused.
void LayoutGenerator::clear()
{
    for (unsigned block = 0; block != m_numBlocksX * m_numBlocksY; ++block) {
        m_blocksUsed[block] = false;
    }
}


/// Creates an empty layout: zero dimensions, zero blocks, MODE_ARBITRARY.
Layout::Layout()
{
    m_width = 0;
    m_height = 0;
    m_numBlocksX = 0;
    m_numBlocksY = 0;

    // The block count was previously left unset here. It is only assigned by
    // the create*() functions, so a default constructed layout would
    // otherwise carry an indeterminate block count.
    m_numBlocks = 0;

    m_mode = MODE_ARBITRARY;
}

/// Looks up the storage index of the block at block coordinates (bx, by).
///
/// @param bx  Block column.
/// @param by  Block row.
/// @return The block's index, or (unsigned)-1 if the layout holds no block at
///         that position (including block rows outside of the layout).
unsigned Layout::getBlockIndex(unsigned bx, unsigned by) const
{
    // A block row outside of the layout cannot contain any block. Report it
    // as absent instead of indexing past the end of m_rows.
    if (by >= m_rows.size())
        return (unsigned)-1;

    // Linear scan over the blocks present in this block row.
    const std::vector<BlockEntry> &rowIndices = m_rows[by];
    for (unsigned i = 0; i < rowIndices.size(); i++) {
        if (rowIndices[i].offset == bx)
            return rowIndices[i].blockIndex;
    }

    return (unsigned)-1;
}


/// Translates element coordinates (x, y) into a handle consisting of the
/// owning block's index and the element's position inside that block.
///
/// @throws std::runtime_error if the layout has no block covering (x, y).
ElementHandle Layout::getElementHandle(unsigned x, unsigned y) const
{
    ElementHandle result;

    const unsigned owningBlock = getBlockIndex(x/4, y/4);
    if (owningBlock == (unsigned)-1)
        throw std::runtime_error("Block not present in matrix!");

    result.blockIndex = owningBlock;
    // Elements inside a block are stored column major.
    result.intraBlockIndex = (x % 4)*4 + (y % 4);

    return result;
}

/// Debug helper: reverse lookup of a block's coordinates from its index.
///
/// @param blockIndex  Index of the block to search for.
/// @param bx          Receives the block column.
/// @param by          Receives the block row.
///
/// If no block has the given index, this asserts in debug builds. In builds
/// where assert() is compiled out, both outputs are set to (unsigned)-1 so
/// that callers never read indeterminate values.
void Layout::debugFindBlockCoord(unsigned blockIndex, unsigned &bx, unsigned &by) const
{
    for (unsigned i = 0; i < m_rows.size(); i++) {
        for (unsigned j = 0; j < m_rows[i].size(); j++) {
            if (m_rows[i][j].blockIndex == blockIndex) {
                by = i;
                bx = m_rows[i][j].offset;
                return;
            }
        }
    }
    assert(false && "Block not found!");

    // Only reached when assert() is disabled.
    bx = (unsigned)-1;
    by = (unsigned)-1;
}



/// Builds this layout from the blocks marked in a LayoutGenerator.
///
/// Blocks are numbered consecutively in row major order of their block
/// coordinates. Every block is registered both in its block row (with the
/// block column as offset) and in its block column (with the block row as
/// offset).
///
/// @param generator  Source of the matrix dimensions and of the set of used blocks.
/// @param symmetric  Selects MODE_SYMMETRIC instead of MODE_ARBITRARY. It only
///                   changes the mode flag; the set of blocks is taken from
///                   the generator as is.
void Layout::createFromGenerator(const LayoutGenerator &generator, bool symmetric)
{
    m_width = generator.getWidth();
    m_height = generator.getHeight();
    m_numBlocksX = (m_width + BLOCK_SIZE-1)/BLOCK_SIZE;
    m_numBlocksY = (m_height + BLOCK_SIZE-1)/BLOCK_SIZE;

    m_mode = symmetric?MODE_SYMMETRIC:MODE_ARBITRARY;

    // resize() alone would keep the entries of a previously created layout,
    // and the new blocks would then be appended to them. Start from scratch.
    m_cols.clear();
    m_rows.clear();
    m_cols.resize(m_numBlocksX);
    m_rows.resize(m_numBlocksY);

    unsigned nextBlockIndex = 0;

    // First pass: count the blocks per row/column so that the second pass
    // does not have to reallocate.
    std::vector<unsigned> rowSize;
    std::vector<unsigned> colSize;

    rowSize.resize(m_numBlocksY);
    colSize.resize(m_numBlocksX);

    for (unsigned y = 0; y < m_numBlocksY; y++) {
        for (unsigned x = 0; x < m_numBlocksX; x++) {
            if (generator.isBlockSet(x, y)) {
                rowSize[y]++;
                colSize[x]++;
            }
        }
    }

    for (unsigned y = 0; y < m_numBlocksY; y++)
        m_rows[y].reserve(rowSize[y]);

    for (unsigned x = 0; x < m_numBlocksX; x++)
        m_cols[x].reserve(colSize[x]);

    // Second pass: assign block indices. Due to the iteration order the
    // entries of each row and of each column end up sorted by offset.
    for (unsigned y = 0; y < m_numBlocksY; y++) {
        std::vector<BlockEntry> &rowIndices = m_rows[y];
        for (unsigned x = 0; x < m_numBlocksX; x++) {
            if (generator.isBlockSet(x, y)) {
                rowIndices.push_back(BlockEntry(x, nextBlockIndex));
                m_cols[x].push_back(BlockEntry(y, nextBlockIndex));

                nextBlockIndex++;
            }
        }
    }

    m_numBlocks = nextBlockIndex;
}

/// Builds the layout of the product src * src^T together with a recipe that
/// lists, for every block of the result, the pairs of source blocks whose
/// products have to be summed up.
///
/// The result is square (src height x src height) and flagged
/// MODE_SYMMETRIC. Only blocks on or below the diagonal are created.
///
/// @param src     Layout of the matrix that gets multiplied with its own
///                transpose. The merge below relies on the entries of each
///                of its rows being sorted by ascending offset.
/// @param recipe  Receives one job per created block; previous jobs are discarded.
void Layout::createMultiplicationMatrixWithTransposed(const Layout &src, MultiplicationWithTransposeRecipe &recipe)
{
    m_width = src.getHeight();
    m_height = src.getHeight();
    m_numBlocksX = (m_width + BLOCK_SIZE-1)/BLOCK_SIZE;
    m_numBlocksY = (m_height + BLOCK_SIZE-1)/BLOCK_SIZE;

    m_mode = MODE_SYMMETRIC;

    // resize() alone would keep the entries of a previously created layout,
    // and the new blocks would then be appended to them. Start from scratch.
    m_cols.clear();
    m_rows.clear();
    m_cols.resize(m_numBlocksX);
    m_rows.resize(m_numBlocksY);

    unsigned nextBlockIndex = 0;
    recipe.jobs.clear();

    for (unsigned y = 0; y < m_numBlocksY; y++) {
        std::vector<BlockEntry> &rowIndices = m_rows[y];
        for (unsigned x = 0; x <= y; x++) {
            std::vector<BlockEntry> &colIndices = m_cols[x];

            bool blockNonZero = false;
            MultiplicationWithTransposeRecipe::BlockJob job;
            {
                const std::vector<BlockEntry> &leftRow = src.m_rows[y];
                const std::vector<BlockEntry> &rightCol = src.m_rows[x]; // row cause it's transposed

                // Walk both rows in lockstep and collect the block columns
                // that are present in both of them.
                unsigned rightIndex = 0;
                for (unsigned k = 0; k < leftRow.size(); k++) {
                    while ((rightIndex < rightCol.size()) && (rightCol[rightIndex].offset < leftRow[k].offset))
                        rightIndex++;

                    if (rightIndex == rightCol.size())
                        break;

                    if (rightCol[rightIndex].offset == leftRow[k].offset) {
                        blockNonZero = true;

                        job.sourcePairs.push_back(std::make_pair(leftRow[k].blockIndex, rightCol[rightIndex].blockIndex));
                    }
                }
            }

            // Only blocks with at least one contributing pair are created.
            if (blockNonZero) {
                rowIndices.push_back(BlockEntry(x, nextBlockIndex));
                colIndices.push_back(BlockEntry(y, nextBlockIndex));
                job.targetBlockIndex = nextBlockIndex;

                recipe.jobs.push_back(job);

                nextBlockIndex++;
            }
        }
    }
    m_numBlocks = nextBlockIndex;
}

/// Builds the layout of the lower triangular Cholesky factor of a symmetric
/// matrix, and the recipe describing how to compute each of its blocks.
///
/// A block strictly below the diagonal is created when block rows y and x of
/// the source share a block column that is not right of x. Diagonal blocks
/// are always created.
/// NOTE(review): the test looks at the source rows only, not at blocks that
/// were already added to the factor, so it may not cover all fill-in of an
/// exact factorization. Confirm whether that is intended.
///
/// @param src     Layout of the matrix to factorize; must be MODE_SYMMETRIC.
///                The recipe is sized with the block row count but filled per
///                block column, i.e. a square layout is assumed.
/// @param recipe  Receives one column job set per block column.
/// @throws std::runtime_error if src is not flagged as symmetric.
void Layout::createCholeskyFactorization(const Layout &src, CholeskyFactorizationRecipe &recipe)
{
    // This used to read "!src.m_mode == MODE_SYMMETRIC". The negation binds
    // tighter than the comparison, so the mode was turned into a bool before
    // being compared and the check depended on the enum's numeric values.
    if (src.m_mode != MODE_SYMMETRIC)
        throw std::runtime_error("Source matrix must be symmetric!");

    m_width = src.getWidth();
    m_height = src.getHeight();
    m_numBlocksX = (m_width + BLOCK_SIZE-1)/BLOCK_SIZE;
    m_numBlocksY = (m_height + BLOCK_SIZE-1)/BLOCK_SIZE;

    m_mode = MODE_LOWER_TRIANGULAR;

    // resize() alone would keep the entries of a previously created layout,
    // and the new blocks would then be appended to them. Start from scratch.
    m_cols.clear();
    m_rows.clear();
    m_cols.resize(m_numBlocksX);
    m_rows.resize(m_numBlocksY);

    unsigned nextBlockIndex = 0;

    for (unsigned y = 0; y < m_numBlocksY; y++) {
        std::vector<BlockEntry> &rowIndices = m_rows[y];
        for (unsigned x = 0; x < y; x++) {
            std::vector<BlockEntry> &colIndices = m_cols[x];

            const std::vector<BlockEntry> &leftRow = src.m_rows[y];
            const std::vector<BlockEntry> &topRow = src.m_rows[x];
            unsigned topIndex = 0;

            bool blockNonZero = false;

            // Search for a block column (not right of x) that is present in
            // both source rows. The first hit is sufficient.
            for (unsigned k = 0; k < leftRow.size(); k++) {
                if (leftRow[k].offset > x)
                    break;

                while ((topIndex < topRow.size()) && (topRow[topIndex].offset < leftRow[k].offset))
                    topIndex++;

                if (topIndex == topRow.size())
                    break;

                if (topRow[topIndex].offset == leftRow[k].offset) {
                    blockNonZero = true;
                    break;
                }
            }

            if (blockNonZero) {
                rowIndices.push_back(BlockEntry(x, nextBlockIndex));
                colIndices.push_back(BlockEntry(y, nextBlockIndex));

                nextBlockIndex++;
            }
        }

        // The diagonal block always exists. It is the last entry of its row
        // and, since rows are processed top down, the first of its column.
        rowIndices.push_back(BlockEntry(y, nextBlockIndex));
        m_cols[y].push_back(BlockEntry(y, nextBlockIndex));

        nextBlockIndex++;
    }
    m_numBlocks = nextBlockIndex;

    recipe.columns.resize(m_numBlocksY);
    for (unsigned bx = 0; bx < m_numBlocksX; bx++) {
        const std::vector<BlockEntry> &col = m_cols[bx];
        CholeskyFactorizationRecipe::Column &recipeColumn = recipe.columns[bx];

        // One row job per block below the diagonal (col[0] is the diagonal block).
        recipeColumn.rowJobs.resize(col.size()-1);
        for (unsigned i = 0; i < recipeColumn.rowJobs.size(); i++) {
            CholeskyFactorizationRecipe::RowElementJob &rowJob = recipeColumn.rowJobs[i];

            unsigned blockY = col[1+i].offset;
            const std::vector<BlockEntry> &row = m_rows[blockY];

            rowJob.diagonalHeadBlockIndex = getBlockIndex(bx, bx);
            assert(rowJob.diagonalHeadBlockIndex != (unsigned)-1);
            // May be (unsigned)-1 if the source has no block at this position.
            rowJob.sourceBlockIndex = src.getBlockIndex(bx, blockY);
            rowJob.targetBlockIndex = getBlockIndex(bx, blockY);
            assert(rowJob.targetBlockIndex != (unsigned)-1);

            // Pair every block left of bx in row blockY with the block of the
            // same block column in row bx, provided the latter exists.
            rowJob.sumBlocks.clear();
            for (unsigned j = 0; j < row.size(); j++) {
                if (row[j].offset >= bx)
                    break;

                unsigned topIndex = getBlockIndex(row[j].offset, bx);
                if (topIndex != (unsigned)-1)
                    rowJob.sumBlocks.push_back(CholeskyFactorizationRecipe::RowElementJob::Pair(row[j].blockIndex, topIndex));
            }
        }

        CholeskyFactorizationRecipe::DiagonalElementJob &diagonalJob = recipeColumn.diagonalJob;
        diagonalJob.targetBlockIndex = getBlockIndex(bx, bx);
        assert(diagonalJob.targetBlockIndex != (unsigned)-1);
        // May be (unsigned)-1 if the source has no diagonal block here.
        diagonalJob.sourceBlockIndex = src.getBlockIndex(bx, bx);

        // All blocks of row bx except the trailing diagonal block.
        const std::vector<BlockEntry> &row = m_rows[bx];
        diagonalJob.rowBlocks.resize(row.size()-1);
        for (unsigned i = 0; i < diagonalJob.rowBlocks.size(); i++) {
            diagonalJob.rowBlocks[i] = row[i].blockIndex;
        }
    }
}

/// Fills the recipe with the indices of all diagonal blocks that exist in
/// this layout. Previous contents of the recipe are discarded.
void Layout::createMultiplyDiagonalRecipe(MultiplyDiagonalRecipe &recipe)
{
    recipe.blocks.clear();

    for (unsigned diag = 0; diag != m_numBlocksX; ++diag) {
        const unsigned diagBlock = getBlockIndex(diag, diag);
        if (diagBlock == (unsigned)-1)
            continue;   // no block stored at this diagonal position

        recipe.blocks.push_back(diagBlock);
    }
}



/// Renders the block structure into an image with one pixel per block.
/// All bytes of the image are first set to 0xFF, then every pixel that
/// corresponds to an existing block is set to 0xFF000000.
/// NOTE(review): assumes 4 bytes per pixel in RasterImage -- confirm.
void Layout::drawLayout(RasterImage &image) const
{
    image.resize(m_numBlocksX, m_numBlocksY);
    memset(image.getData(), 0xFF, m_numBlocksX * m_numBlocksY * 4);

    for (unsigned by = 0; by < m_numBlocksY; by++) {
        const std::vector<BlockEntry> &entries = m_rows[by];
        for (unsigned e = 0; e < entries.size(); e++) {
            const unsigned bx = entries[e].offset;
            image.getData()[by*m_numBlocksX+bx] = 0xFF000000;
        }
    }
}




/// Nothing to do here; the matrix only becomes usable through setup().
BlockSparseMatrix::BlockSparseMatrix()
{
}

/// Nothing to release explicitly.
BlockSparseMatrix::~BlockSparseMatrix()
{
}

/// Adopts the given layout, allocates one block of storage per layout block
/// and sets all block values to zero.
void BlockSparseMatrix::setup(const Layout &layout)
{
    m_layout = layout;
    m_blocks.resize(m_layout.getNumBlocks());

    // A layout without blocks leaves the container empty, in which case
    // &m_blocks[0] must not be evaluated.
    if (m_blocks.size() > 0)
        memset(&m_blocks[0], 0x00, m_blocks.size() * 4*4*sizeof(float));
}



void BlockSparseMatrix::performMultiplicationWithTranspose(const BlockSparseMatrix &src, const MultiplicationWithTransposeRecipe &recipe)
{
    for (unsigned i = 0; i < src.m_blocks.size(); i++)
        for (unsigned j = 0; j < 16; j++)
            assert(std::isfinite(src.m_blocks[i].values[j]));


    for (unsigned i = 0; i < recipe.jobs.size(); i++) {
        const MultiplicationWithTransposeRecipe::BlockJob &job = recipe.jobs[i];

        __m128 sum1 = _mm_setzero_ps();
        __m128 sum2 = _mm_setzero_ps();
        __m128 sum3 = _mm_setzero_ps();
        __m128 sum4 = _mm_setzero_ps();
        for (unsigned j = 0; j < job.sourcePairs.size(); j++) {

            const Block &leftBlock = src.m_blocks[job.sourcePairs[j].first];
            const Block &rightBlock = src.m_blocks[job.sourcePairs[j].second];

            for (unsigned y = 0; y < 4; y++) {
                __m128 col = _mm_load_ps(&leftBlock.values[y*4]);
                sum1 = _mm_add_ps(sum1, _mm_mul_ps(col, _mm_set1_ps(rightBlock.values[y*4+0]))); // right side is transposed
                sum2 = _mm_add_ps(sum2, _mm_mul_ps(col, _mm_set1_ps(rightBlock.values[y*4+1]))); // right side is transposed
                sum3 = _mm_add_ps(sum3, _mm_mul_ps(col, _mm_set1_ps(rightBlock.values[y*4+2]))); // right side is transposed
                sum4 = _mm_add_ps(sum4, _mm_mul_ps(col, _mm_set1_ps(rightBlock.values[y*4+3]))); // right side is transposed
            }
        }
        Block &target = m_blocks[job.targetBlockIndex];
        _mm_store_ps(&target.values[0*4], sum1);
        _mm_store_ps(&target.values[1*4], sum2);
        _mm_store_ps(&target.values[2*4], sum3);
        _mm_store_ps(&target.values[3*4], sum4);
    }

    for (unsigned i = 0; i < m_blocks.size(); i++)
        for (unsigned j = 0; j < 16; j++)
            assert(std::isfinite(m_blocks[i].values[j]));
}

// Computes the blocks of this matrix from the blocks of src by executing the
// given Cholesky recipe column by column: first the diagonal block of a
// column, then all blocks listed in the column's row jobs.
//
// Source blocks whose index is (unsigned)-1 are treated as all-zero blocks.
// Throws std::runtime_error("Matrix not positive definite") if a value under
// a square root turns out negative. A pivot whose magnitude is <= 1e-25 does
// not raise an error; the affected entries are set to zero instead.
void BlockSparseMatrix::performCholeskyFactorization(const BlockSparseMatrix &src, const CholeskyFactorizationRecipe &recipe)
{
    // Stand-in for source blocks that do not exist.
    Block zeroBlock;
    memset(zeroBlock.values, 0x00, 4*4*sizeof(float));

    // Debug only: the input must not contain NaN or infinity.
    for (unsigned i = 0; i < src.m_blocks.size(); i++)
        for (unsigned j = 0; j < 16; j++)
            assert(std::isfinite(src.m_blocks[i].values[j]));


    for (unsigned colIndex = 0; colIndex < recipe.columns.size(); colIndex++) {
        const CholeskyFactorizationRecipe::Column &column = recipe.columns[colIndex];

        // Diagonal block: scalar factorization of the 4x4 block. Only the
        // entries with x <= y are written.
        {
            const CholeskyFactorizationRecipe::DiagonalElementJob &diagJob = column.diagonalJob;
            Block &targetBlock = m_blocks[diagJob.targetBlockIndex];

            const Block &sourceBlock = (diagJob.sourceBlockIndex != (unsigned)-1)?src.m_blocks[diagJob.sourceBlockIndex]:zeroBlock;

            for (unsigned y = 0; y < 4; y++) {
                for (unsigned x = 0; x <= y; x++) {
                    // Products of entries y and x of each column k, summed
                    // over all blocks listed in diagJob.rowBlocks ...
                    float sum = 0.0f;
                    for (unsigned i = 0; i < diagJob.rowBlocks.size(); i++) {
                        const Block &leftBlock = m_blocks[diagJob.rowBlocks[i]];
                        for (unsigned k = 0; k < 4; k++) {
                            sum += leftBlock.values[y+k*4] * leftBlock.values[x+k*4];
                        }
                    }
                    // ... plus the columns k < x of the target block itself.
                    for (unsigned k = 0; k < x; k++) {
                        sum += targetBlock.values[y+k*4] * targetBlock.values[x+k*4];
                    }
                    if (x < y) {
                        // Off-diagonal entry: divide by the pivot of column x
                        // unless that pivot is (almost) zero.
                        if (std::abs(targetBlock.values[x+x*4]) <= 1e-25f) {
                            //std::cout << __LINE__ << " Diag element: " << targetBlock.values[x+x*4] << std::endl;
/*
                            unsigned __bx, __by;
                            m_layout.debugFindBlockCoord(diagJob.targetBlockIndex, __bx, __by);

                            std::cout << "   Block coords: " << __bx << ";"<<__by<<"  x="<<x<<std::endl;
                            std::cout << "Block: ";
                            for (unsigned k = 0; k < 16; k++)
                                std::cout << "  " << targetBlock.values[k];
                            std::cout << std::endl;
                            std::cout << "Height: " << m_layout.getHeight() << std::endl;
*/
                          //  throw std::runtime_error("Diagonal element too small");
                            targetBlock.values[y+x*4] = 0.0f;
                        } else
                            targetBlock.values[y+x*4] = (sourceBlock.values[y+x*4] - sum) / targetBlock.values[x+x*4];
                    } else {
                        // Diagonal entry: square root of the remainder.
                        float f = sourceBlock.values[y+x*4] - sum;
                        if (f < 0.0f)
                            throw std::runtime_error("Matrix not positive definite");
                        targetBlock.values[y+x*4] = std::sqrt(f);
                    }
                }
            }
        }
        // Remaining blocks of this column, processed one 4-float group of
        // the target block per iteration of x.
        for (unsigned rowJobIndex = 0; rowJobIndex < column.rowJobs.size(); rowJobIndex++) {
            const CholeskyFactorizationRecipe::RowElementJob &rowJob = column.rowJobs[rowJobIndex];

            const Block &sourceBlock = (rowJob.sourceBlockIndex!=(unsigned)-1)?src.m_blocks[rowJob.sourceBlockIndex]:zeroBlock;
            const Block &diagHeadBlock = m_blocks[rowJob.diagonalHeadBlockIndex];
            Block &targetBlock = m_blocks[rowJob.targetBlockIndex];

            for (unsigned x = 0; x < 4; x++) {
                // Start from the source values and subtract the contribution
                // of every block pair listed in rowJob.sumBlocks.
                __m128 sum = _mm_load_ps(&sourceBlock.values[x*4]);
                for (unsigned i = 0; i < rowJob.sumBlocks.size(); i++) {
                    const Block &leftBlock = m_blocks[rowJob.sumBlocks[i].leftBlockIndex];
                    const Block &topBlock = m_blocks[rowJob.sumBlocks[i].topBlockIndex];
                    for (unsigned k = 0; k < 4; k++) {
                        __m128 a = _mm_load_ps(&leftBlock.values[k*4]);
                        sum = _mm_sub_ps(sum, _mm_mul_ps(a, _mm_set1_ps(topBlock.values[k*4+x])));
                    }
                }
                // Same subtraction for the groups k < x of the target block,
                // which were already written, paired with the diagonal block.
                {
                    const Block &leftBlock = targetBlock;
                    const Block &topBlock = diagHeadBlock;
                    for (unsigned k = 0; k < x; k++) {
                        __m128 a = _mm_load_ps(&leftBlock.values[k*4]);
                        sum = _mm_sub_ps(sum, _mm_mul_ps(a, _mm_set1_ps(topBlock.values[k*4+x])));
                    }
                }
                // Divide by the pivot of the diagonal block. An (almost) zero
                // pivot is reported on stdout and yields zeros.
                if (std::abs(diagHeadBlock.values[x*4+x]) <= 1e-25f) {
                    std::cout << __LINE__ << " Diag element: " << diagHeadBlock.values[x*4+x] << std::endl;
                    /*
                    unsigned __bx, __by;
                    m_layout.debugFindBlockCoord(rowJob.diagonalHeadBlockIndex, __bx, __by);
                    std::cout << "   Block coords: " << __bx << ";"<<__by<<"  x="<<x<<std::endl;
                    std::cout << "Block: ";
                    for (unsigned k = 0; k < 16; k++)
                        std::cout << "  " << diagHeadBlock.values[k];
                    std::cout << std::endl;
                    std::cout << "Height: " << m_layout.getHeight() << std::endl;
                    */
//                    throw std::runtime_error("Diagonal element too small");
                    sum = _mm_setzero_ps();
                } else
                    sum = _mm_div_ps(sum, _mm_set1_ps(diagHeadBlock.values[x*4+x]));
                _mm_store_ps(&targetBlock.values[x*4], sum);
            }
        }
    }

    // Debug only: the result must not contain NaN or infinity either.
    for (unsigned i = 0; i < m_blocks.size(); i++)
        for (unsigned j = 0; j < 16; j++)
            assert(std::isfinite(m_blocks[i].values[j]));
}

void BlockSparseMatrix::solveForwardSubstitution(const std::vector<float> &src, std::vector<float> &dst) const
{
    assert(src.size() == m_layout.getWidth());
    for (unsigned i = 0; i < m_blocks.size(); i++)
        for (unsigned j = 0; j < 16; j++)
            assert(std::isfinite(m_blocks[i].values[j]));

    if (m_layout.getMode() != Layout::MODE_LOWER_TRIANGULAR)
        throw std::runtime_error("Matrix not lower triangular!");
    dst.resize(src.size());
    for (unsigned by = 0; by < m_layout.getNumBlocksY(); by++) {
        unsigned maxY = std::min<unsigned>(by*4+4, dst.size())-by*4;

        const std::vector<Layout::BlockEntry> &row = m_layout.getRow(by);
        __m128 sum = _mm_setzero_ps();
        for (unsigned bx = 0; (bx+1) < row.size(); bx++) {
            const Block &block = m_blocks[row[bx].blockIndex];
            unsigned offset = row[bx].offset;

            for (unsigned k = 0; k < 4; k++) {
                sum = _mm_add_ps(sum, _mm_mul_ps(_mm_load_ps(&block.values[k*4]), _mm_set1_ps(dst[offset*4+k])));
            }
        }
        const Block &block = m_blocks[row[row.size()-1].blockIndex];

        float scalarSum[4] __attribute__ ((aligned (16)));
        _mm_store_ps(scalarSum, sum);

        for (unsigned y = 0; y < maxY; y++) {
            float f = scalarSum[y];
            for (unsigned k = 0; k < y; k++) {
                f += block.values[k*4+y] * dst[by*4+k];
            }
            if (std::abs(block.values[y*4+y]) <= 1e-25f) {
                std::cout << __LINE__ << " Diag element: " << block.values[y*4+y] << std::endl;
                unsigned __bx, __by;
                m_layout.debugFindBlockCoord(row[row.size()-1].blockIndex, __bx, __by);
                std::cout << "   Block coords: " << __bx << ";"<<__by<<"  y="<<y<<std::endl;
                std::cout << "Block: ";
                for (unsigned k = 0; k < 16; k++)
                    std::cout << "  " << block.values[k];
                std::cout << std::endl;
                std::cout << "Height: " << m_layout.getHeight() << std::endl;

                // throw std::runtime_error("Diagonal element too small");
                dst[by*4+y] = 0.0f;
            } else
                dst[by*4+y] = (src[by*4+y] - f) / block.values[y*4+y];
        }
    }
}

/// Solves L^T * dst = src by backward substitution, where L is this matrix.
///
/// The first entry of every block column is used as the diagonal block. A
/// pivot whose magnitude is <= 1e-25 is reported on stdout and the
/// corresponding result element is set to zero.
///
/// @param src  Right hand side; must have as many elements as the matrix is wide.
/// @param dst  Receives the solution; resized to src.size().
/// @throws std::runtime_error if the layout is not MODE_LOWER_TRIANGULAR.
void BlockSparseMatrix::solveBackwardSubstitution(const std::vector<float> &src, std::vector<float> &dst) const
{
    // Same precondition as in solveForwardSubstitution(). Without it, the
    // unsigned bounds computations below underflow for a too short vector.
    assert(src.size() == m_layout.getWidth());

    // Debug only: the matrix must not contain NaN or infinity.
    for (unsigned i = 0; i < m_blocks.size(); i++)
        for (unsigned j = 0; j < 16; j++)
            assert(std::isfinite(m_blocks[i].values[j]));

    if (m_layout.getMode() != Layout::MODE_LOWER_TRIANGULAR)
        throw std::runtime_error("Matrix not lower triangular!");


    dst.resize(src.size());
    // Block columns are processed from last to first.
    for (unsigned bx_ = 0; bx_ < m_layout.getNumBlocksX(); bx_++) {
        unsigned bx = m_layout.getNumBlocksX()-1-bx_;
        const std::vector<Layout::BlockEntry> &col = m_layout.getCol(bx);

        // Contribution of all blocks below the diagonal, using the part of
        // the solution that is already known.
        __m128 sum = _mm_setzero_ps();
        for (unsigned by = 1; by < col.size(); by++) {

            const Block &block = m_blocks[col[by].blockIndex];
            // The blocks are applied transposed, hence the explicit transpose.
            __m128 rows[4];
            rows[0] = _mm_load_ps(&block.values[0*4]);
            rows[1] = _mm_load_ps(&block.values[1*4]);
            rows[2] = _mm_load_ps(&block.values[2*4]);
            rows[3] = _mm_load_ps(&block.values[3*4]);
            _MM_TRANSPOSE4_PS(rows[0], rows[1], rows[2], rows[3]);

            unsigned offset = col[by].offset;
            // The last block row may be partial; do not read past dst.
            unsigned maxY = std::min<unsigned>(offset*4+4, dst.size())-offset*4;

            for (unsigned k = 0; k < maxY; k++) {
                sum = _mm_add_ps(sum, _mm_mul_ps(rows[k], _mm_set1_ps(dst[offset*4+k])));
            }
        }

        const Block &block = m_blocks[col[0].blockIndex];

        float scalarSum[4] __attribute__ ((aligned (16)));
        _mm_store_ps(scalarSum, sum);

        // Number of valid elements in this block column (the last one may be partial).
        unsigned maxX = std::min<unsigned>(bx*4+4, dst.size())-bx*4;
        unsigned minX_ = 4-maxX;

        // Scalar substitution inside the diagonal block; x runs from maxX-1 down to 0.
        for (unsigned x_ = minX_; x_ < 4; x_++) {
            unsigned x = 3-x_;
            float f = scalarSum[x];
            // k runs over the elements below x, which are already solved.
            for (unsigned k_ = minX_; k_ < x_; k_++) {
                unsigned k = 3-k_;
                f += block.values[x*4+k] * dst[bx*4+k];
            }
            // "<=" matches the pivot checks in the forward solver and in the
            // factorization; this one used "<" before.
            if (std::abs(block.values[x*4+x]) <= 1e-25f) {
                std::cout << __LINE__ << " Diag element: " << block.values[x*4+x] << std::endl;
                //throw std::runtime_error("Diagonal element too small");
                dst[bx*4+x] = 0.0f;
            } else
                dst[bx*4+x] = (src[bx*4+x] - f) / block.values[x*4+x];
        }
    }
}


/// Computes dst = M * src, where M is this matrix.
///
/// @param src  Input vector; must have as many elements as the matrix is wide.
/// @param dst  Receives the product; resized to the matrix height.
/// @throws std::runtime_error if the layout is not MODE_ARBITRARY.
void BlockSparseMatrix::multiplyWithVector(const std::vector<float> &src, std::vector<float> &dst) const
{
    assert(src.size() == m_layout.getWidth());
    if (m_layout.getMode() != Layout::MODE_ARBITRARY)
        throw std::runtime_error("Matrix not arbitrary, multiplication only implemented for arbitrary!");

    // Debug only: the matrix must not contain NaN or infinity.
    for (unsigned b = 0; b < m_blocks.size(); b++)
        for (unsigned e = 0; e < 16; e++)
            assert(std::isfinite(m_blocks[b].values[e]));

    dst.resize(m_layout.getHeight());
    for (unsigned by = 0; by < m_layout.getNumBlocksY(); by++) {
        const std::vector<Layout::BlockEntry> &row = m_layout.getRow(by);

        // Accumulates the four result elements of this block row.
        __m128 rowSum = _mm_setzero_ps();
        for (unsigned e = 0; e < row.size(); e++) {
            const Block &block = m_blocks[row[e].blockIndex];
            const unsigned offset = row[e].offset;

            // The last block column may be partial; do not read past src.
            const unsigned validColumns = std::min<unsigned>(offset*4+4, src.size())-offset*4;
            for (unsigned k = 0; k < validColumns; k++) {
                const __m128 blockColumn = _mm_load_ps(&block.values[k*4]);
                rowSum = _mm_add_ps(rowSum, _mm_mul_ps(blockColumn, _mm_set1_ps(src[offset*4+k])));
            }
        }

        float lanes[4] __attribute__ ((aligned (16)));
        _mm_store_ps(lanes, rowSum);

        // The last block row may be partial; do not write past dst.
        const unsigned validRows = std::min<unsigned>(by*4+4, dst.size())-by*4;
        for (unsigned y = 0; y < validRows; y++)
            dst[by*4+y] = lanes[y];
    }
}

void BlockSparseMatrix::multiplyDiagonal(const MultiplyDiagonalRecipe &recipe, float factor)
{
    for (unsigned i = 0; i < recipe.blocks.size(); i++) {
        Block &block = m_blocks[recipe.blocks[i]];
        for (unsigned k = 0; k < 4; k++) {
            block.values[k*4+k] *= factor;
        }
    }
}

void BlockSparseMatrix::copyFrom(const BlockSparseMatrix &src)
{
    memcpy(&m_blocks[0], &src.m_blocks[0], m_blocks.size() * sizeof(Block));
}


void BlockSparseMatrix::printMatlab() const
{
    std::vector<float> values;
    unsigned w = m_layout.getNumBlocksX()*4;
    unsigned h = m_layout.getNumBlocksY()*4;
    values.resize(w*h);

    for (unsigned by = 0; by < m_layout.getNumBlocksY(); by++) {
        for (unsigned y = 0; y < 4; y++) {
            unsigned ty = by*4+y;
            unsigned tx = 0;
            for (unsigned bx = 0; bx < m_layout.getNumBlocksX(); bx++) {
                unsigned blockIndex = m_layout.getBlockIndex(bx, by);
                if (blockIndex == (unsigned)-1) {
                    values[ty*w+tx]= 0.0f;
                    tx++;
                    values[ty*w+tx]= 0.0f;
                    tx++;
                    values[ty*w+tx]= 0.0f;
                    tx++;
                    values[ty*w+tx]= 0.0f;
                    tx++;
                } else {
                    const Block &block = m_blocks[blockIndex];
                    values[ty*w+tx]= block.values[y+0*4];
                    tx++;
                    values[ty*w+tx]= block.values[y+1*4];
                    tx++;
                    values[ty*w+tx]= block.values[y+2*4];
                    tx++;
                    values[ty*w+tx]= block.values[y+3*4];
                    tx++;
                }
            }
        }
    }
    if (m_layout.getMode() == Layout::MODE_SYMMETRIC) {
        for (unsigned y = 0; y < m_layout.getHeight(); y++) {
            for (unsigned x = y+1; x < m_layout.getWidth(); x++) {
                values[x+y*w] = values[y+x*w];
            }
        }
    }


    std::cout << " [ ";
    for (unsigned y = 0; y < m_layout.getHeight(); y++) {
        for (unsigned x = 0; x < m_layout.getWidth(); x++) {
            std::cout << values[x+y*w] << " ";
        }
        if (y+1 < m_layout.getHeight())
            std::cout << " ; " << std::endl;
    }
    std::cout << " ] " << std::endl;
}



}

}
