Bérenger Bramas

[ARM][SVE] change vector size at runtime (dynamically)

Berenger — Mon, 01 Nov 2021 10:57:52 +0000

Yes it is possible to change the size of the vector of ARM SVE (like A64FX) using the following code:

#define PR_SVE_SET_VL 50 #define PR_SVE_GET_VL 51 #include #include int main(int argc, char** argv){ if(getenv(“VECBITS”)){ int svebitstoset=atoi(getenv(“VECBITS”)); std::cout << "svebitstoset " << svebitstoset << std::endl; int info = prctl(PR_SVE_SET_VL, svebitstoset/8); std::cout << "info " << info << std::endl; } std::cout << "Vec size = " << svcntb()*8 << "bits" << std::endl; [/cpp]

[C++][SIMD] Fast Exponential Computation on SIMD Architectures – Implementation

Berenger — Fri, 24 Jun 2016 08:28:59 +0000

Here is my implementation of the paper:
“Fast Exponential Computation on SIMD Architectures”
https://www.researchgate.net/publication/272178514_Fast_Exponential_Computation_on_SIMD_Architectures

This code has been taken from Inastemp ( https://gitlab.mpcdf.mpg.de/bbramas/inastemp ) which is under MIT licence.

In the paper we need to have the coefficients from the Remez polynomials to interpolate the function 1 + xk(i) – 2.^xk(i) between [0;1).
I follow the algorithm given by wikipedia (sorry the French page is better https://fr.wikipedia.org/wiki/Algorithme_de_Remez :)

Here is my scilab code to do so (for n in [2;10]):

// From https://fr.wikipedia.org/wiki/Algorithme_de_Remez
a = 0
b = 1 - %eps

format('v',21)
dispall=%f

for n = 2:10
    
    if dispall
        printf("N = %d\n",n)
    end
    //////////////////////////////////////////////:
    xk = zeros(n+2,1)
    
    for k = 1:(n+2)
        xk(k) = ((a+b)/2) + ((a-b)/2) * cos(((k-1)*%pi)/(n+1))
    end
    
    if dispall
        disp(xk)
    end
    
    //////////////////////////////////////////////:
    M = zeros(n+2, n+2)
    
    for i = 1:(n+1)
        M(:,i) = xk.^(i-1)
    end
    for i = 1:(n+2)
        M(i, n+2) = (-1).^i
    end
    
    if dispall
        disp(M)
    end
    
    //////////////////////////////////////////////:
    func = zeros(n+2, 1)
    for i = 1:(n+2)
        //func(i) = exp(xk(i))
        func(i) = 1 + xk(i) - 2.^xk(i)
    end
    
    if dispall
        disp(func)
    end
    
    //////////////////////////////////////////////:
    [x0,kerA]=linsolve(M,func)
    
    //////////////////////////////////////////////:
    if dispall
        disp(x0)
    end
    
    printf("constexpr static std::array GetCoefficient%d() {\n", n+1, n+1);
    printf("    return {{\n");
    for i = 1:(n+1)
        sep=","
        if i == n+1 then
            sep=""
        end
        printf("             %.20e%s\n", -x0(i), sep);
    end
    printf("           }};\n");
    printf("}\n\n");

end

It gives me the coefficients for the different accuracies:

//-->exec('/home/bbramas/Projects/inastemp/CMakeModules/remez.sce',-1)
constexpr static std::array GetCoefficient3() {
    return {{
             -2.47142816509723700288e-03,
             3.48943001636461247461e-01,
             -3.44000145306266491563e-01
           }};
}

constexpr static std::array GetCoefficient4() {
    return {{
             1.06906116358144185133e-04,
             3.03543677780836240743e-01,
             -2.24339532327269441936e-01,
             -7.92041454535668681958e-02
           }};
}

constexpr static std::array GetCoefficient5() {
    return {{
             -3.70138142771437266806e-06,
             3.07033820309224325662e-01,
             -2.41638288055762540107e-01,
             -5.16904731562965388814e-02,
             -1.36976563343097993558e-02
           }};
}

constexpr static std::array GetCoefficient6() {
    return {{
             1.06823753710239477000e-07,
             3.06845249656632845792e-01,
             -2.40139721982230797126e-01,
             -5.58662282412822480682e-02,
             -8.94283890931273951763e-03,
             -1.89646052380707734290e-03
           }};
}

constexpr static std::array GetCoefficient7() {
    return {{
             -2.64303273610414963822e-09,
             3.06853075372807815313e-01,
             -2.40230549677691723742e-01,
             -5.54802224547989303316e-02,
             -9.68497459444197204836e-03,
             -1.23843111224273085859e-03,
             -2.18892247566917477666e-04
           }};
}

constexpr static std::array GetCoefficient8() {
    return {{
             5.72265234348656066133e-11,
             3.06852812183173784266e-01,
             -2.40226356058427820139e-01,
             -5.55053022725605083032e-02,
             -9.61350625581030605871e-03,
             -1.34302437845634000529e-03,
             -1.42962470418959216190e-04,
             -2.16607474999407558923e-05
           }};
}

constexpr static std::array GetCoefficient9() {
    return {{
             -1.10150186041739869460e-12,
             3.06852819617161765020e-01,
             -2.40226511645233870018e-01,
             -5.55040609720754696266e-02,
             -9.61837182960864275905e-03,
             -1.33266405715993271723e-03,
             -1.55186852765468104613e-04,
             -1.41484352491262699514e-05,
             -1.87582286605066256753e-06
           }};
}

constexpr static std::array GetCoefficient10() {
    return {{
             1.79040320992239805871e-11,
             3.06852815055756844576e-01,
             -2.40226385506041861806e-01,
             -5.55053584940081654042e-02,
             -9.61174262279892825667e-03,
             -1.35164210003994454852e-03,
             -1.23291147980286128769e-04,
             -4.53940620364305641833e-05,
             1.46363500589519947862e-05,
             -3.63750326480946818984e-06
           }};
}

constexpr static std::array GetCoefficient11() {
    return {{
             7.32388148129676088418e-13,
             3.06852819216552274995e-01,
             -2.40226499275945526435e-01,
             -5.55042073859920090384e-02,
             -9.61749102796678571881e-03,
             -1.33571753728242812072e-03,
             -1.48718480159015542822e-04,
             -2.26598047213222231406e-05,
             4.91492761180322572151e-06,
             -3.00875847392884227107e-06,
             5.68126156224525271282e-07
           }};
}

I add some constants

    static const long int S64 = (1L << 52);
    static const long int S32 = (1L << 23);
public:
    constexpr static double CoeffLog2E(){
        // const double Euler = 2.71828182845904523536028747135266249775724709369995;
        // return std::log2(Euler());
        return 1.442695040888963407359924681001892137426645954153;
    }

    constexpr static double CoeffA64(){
        return double(S64);
    }

    constexpr static double CoeffB64(){
        return double(S64*1023);
    }

    constexpr static double CoeffA32(){
        return double(S32);
    }

    constexpr static double CoeffB32(){
        return double(S32*127);
    }

And here is an example of implementation for AVX:

static const __m256d COEFF_LOG2E = _mm256_set1_pd(double(InaFastExp::CoeffLog2E()));
        static const __m256d COEFF_A = _mm256_set1_pd(double(InaFastExp::CoeffA64()));
        static const __m256d COEFF_B = _mm256_set1_pd(double(InaFastExp::CoeffB64()));
        static const __m256d COEFF_P5_X = _mm256_set1_pd(double(InaFastExp::GetCoefficient9()[8]));
        static const __m256d COEFF_P5_Y = _mm256_set1_pd(double(InaFastExp::GetCoefficient9()[7]));
        static const __m256d COEFF_P5_Z = _mm256_set1_pd(double(InaFastExp::GetCoefficient9()[6]));
        static const __m256d COEFF_P5_A = _mm256_set1_pd(double(InaFastExp::GetCoefficient9()[5]));
        static const __m256d COEFF_P5_B = _mm256_set1_pd(double(InaFastExp::GetCoefficient9()[4]));
        static const __m256d COEFF_P5_C = _mm256_set1_pd(double(InaFastExp::GetCoefficient9()[3]));
        static const __m256d COEFF_P5_D = _mm256_set1_pd(double(InaFastExp::GetCoefficient9()[2]));
        static const __m256d COEFF_P5_E = _mm256_set1_pd(double(InaFastExp::GetCoefficient9()[1]));
        static const __m256d COEFF_P5_F = _mm256_set1_pd(double(InaFastExp::GetCoefficient9()[0]));

        __m256d x = val * COEFF_LOG2E;

        const __m256d fractional_part = x - Floor(x);

        __m256d factor = COEFF_P5_X ;
        factor = (factor * fractional_part + COEFF_P5_Y);
        factor = (factor * fractional_part + COEFF_P5_Z);
        factor = (factor * fractional_part + COEFF_P5_A);
        factor = (factor * fractional_part + COEFF_P5_B);
        factor = (factor * fractional_part + COEFF_P5_C);
        factor = (factor * fractional_part + COEFF_P5_D);
        factor = (factor * fractional_part + COEFF_P5_E);
        factor = (factor * fractional_part + COEFF_P5_F);

        x -= factor;

        x = (COEFF_A * x + COEFF_B);

        __m128d valupper = _mm256_extractf128_pd(x, 1);
        _mm256_zeroupper();
        __m128d vallower = _mm256_castpd256_pd128(x);

        alignas(64) long long int allvalint[VecLength] = {_mm_cvtsd_si64(vallower),
                                         _mm_cvtsd_si64(_mm_shuffle_pd(vallower, vallower, 1)),
                                         _mm_cvtsd_si64(valupper),
                                         _mm_cvtsd_si64(_mm_shuffle_pd(valupper, valupper, 1))};

        return _mm256_castsi256_pd(_mm256_load_si256((const __m256i*)allvalint));

[C/C++][CUDA] cuSparse Sparse matrix examples (CSR, BCSR) SpMV and conversions

Berenger — Tue, 07 Jul 2015 11:42:53 +0000

I left on this page an old a deprecated code (at the bottom) and a new version at the top.

Apologize I do not have time to clean and comment it, but I hope it might help if someone is searching for an example.

New code (tested with CUDA 11.6)


#include &amp;lt;cuda_runtime.h&amp;gt;
#include &amp;lt;cusparse.h&amp;gt;&amp;lt;/cusparse.h&amp;gt;&amp;lt;/cuda_runtime.h&amp;gt;

#include &amp;lt;cassert&amp;gt;&amp;lt;/cassert&amp;gt;
 #include &amp;lt;algorithm&amp;gt;
#include &amp;lt;iostream&amp;gt;
#include &amp;lt;fstream&amp;gt;
#include &amp;lt;sstream&amp;gt;
#include &amp;lt;string&amp;gt;
#include &amp;lt;vector&amp;gt;
#include &amp;lt;tuple&amp;gt;&amp;lt;/tuple&amp;gt;&amp;lt;/vector&amp;gt;&amp;lt;/string&amp;gt;&amp;lt;/sstream&amp;gt;&amp;lt;/fstream&amp;gt;&amp;lt;/iostream&amp;gt;

#include "SpTimer.hpp"

#define CUDA_ASSERT(X)\
{\
    cudaError_t ___resCuda = (X);\
    if ( cudaSuccess != ___resCuda ){\
        printf("Error: fails, %s (%s line %d)\nbCols", cudaGetErrorString(___resCuda), __FILE__, __LINE__ );\
        exit(1);\
    }\
}

#define CUSPARSE_ASSERT(X)\
{\
    cusparseStatus_t ___resCuda = (X);\
    if ( CUSPARSE_STATUS_SUCCESS != ___resCuda ){\
        printf("Error: fails, %s (%s line %d)\nbCols", cusparseGetErrorString(___resCuda), __FILE__, __LINE__ );\
        exit(1);\
    }\
}

std::vector&amp;lt;std::string&amp;gt; SplitString(const std::string&amp;amp;amp; inString, const char inDelim = ' ') {
    std::vector&amp;lt;std::string&amp;gt; substrings;
    size_t start;
    size_t end = 0;
    while ((start = inString.find_first_not_of(inDelim, end)) != std::string::npos) {
        end = inString.find(inDelim, start);
        substrings.push_back(inString.substr(start, end - start));
    }
    return substrings;
}&amp;lt;/std::string&amp;gt;&amp;lt;/std::string&amp;gt;

std::tuple&amp;lt;int, int,="" std::vector&amp;lt;std::tuple&amp;lt;int,="" double=""&amp;gt;&amp;amp;gt;&amp;amp;gt; ReadMMFile(const std::string&amp;amp;amp; inFilename){
    std::ifstream file(inFilename);
    if (!file.is_open()) {
        std::cerr &amp;amp;lt;&amp;amp;lt; "Error: Could not open file " &amp;amp;lt;&amp;amp;lt; inFilename &amp;amp;lt;&amp;amp;lt; std::endl;
        return {};
    }

    // %%MatrixMarket matrix coordinate real general
    {
        std::string header;
        std::getline(file, header);
        const auto words = SplitString(header);
        if(words.size() != 5 || words[3] != "real"){
            std::cerr &amp;amp;lt;&amp;amp;lt; "Error: Incompatible matrix: " &amp;amp;lt;&amp;amp;lt; header &amp;amp;lt;&amp;amp;lt; std::endl;
            return {};
        }
    }
    int nbRows, nbCols, nnz;
    char c;
    {
        std::string line;
        std::getline(file, line);
        while(line.size() &amp;amp;amp;&amp;amp;amp; line[0] == '%'){
            std::getline(file, line);        
        }
        if(!file.good() || line.size() == 0){
            std::cerr &amp;amp;lt;&amp;amp;lt; "Error: in header" &amp;amp;lt;&amp;amp;lt; std::endl;
            return {};
        }
        
        std::istringstream iss(line);
        iss &amp;amp;gt;&amp;amp;gt; nbRows &amp;amp;gt;&amp;amp;gt; nbCols &amp;amp;gt;&amp;amp;gt; nnz;
    }&amp;lt;/int,&amp;gt;

    std::vector&amp;lt;std::tuple&amp;lt;int, int,="" double=""&amp;gt;&amp;amp;gt; values;
    values.reserve(nnz);&amp;lt;/std::tuple&amp;lt;int,&amp;gt;

    int i, j;
    double val;
    while (file &amp;amp;gt;&amp;amp;gt; i &amp;amp;gt;&amp;amp;gt; j &amp;amp;gt;&amp;amp;gt; val) {
        values.emplace_back(i - 1, j - 1, val);
        nnz -= 1;
    }

    if(nnz != 0){
        std::cerr &amp;amp;lt;&amp;amp;lt; "Error: Cannot read the expected number of NNZ" &amp;amp;lt;&amp;amp;lt; std::endl;
        return {};
    }


std::sort(values.begin(), values.end(),
[](const auto& v1, const auto& v2){
return std::get<0>(v1) < std::get<0>(v2)
|| (std::get<0>(v1) == std::get<0>(v2) && std::get<1>(v1) < std::get<1>(v2));
});

    return {nbRows, nbCols, values};
}

std::tuple&amp;lt;int, int,="" std::vector&amp;lt;int=""&amp;gt;, std::vector&amp;lt;int&amp;gt;, std::vector&amp;lt;double&amp;gt;&amp;amp;gt;
MMToCsr(const int nbRows, const int nbCols, const std::vector&amp;lt;std::tuple&amp;lt;int, int,="" double=""&amp;gt;&amp;amp;gt;&amp;amp;amp; values)
{
    std::vector&amp;lt;int&amp;gt; row_ptr(nbRows + 1, 0);
    std::vector&amp;lt;int&amp;gt; col_ind;
    std::vector&amp;lt;double&amp;gt; val;
    col_ind.reserve(values.size());
    val.reserve(values.size());&amp;lt;/double&amp;gt;&amp;lt;/int&amp;gt;&amp;lt;/int&amp;gt;&amp;lt;/std::tuple&amp;lt;int,&amp;gt;&amp;lt;/double&amp;gt;&amp;lt;/int&amp;gt;&amp;lt;/int,&amp;gt;

    for (const auto &amp;amp;amp;[i, j, v] : values) {
        row_ptr[i + 1]++;
        col_ind.emplace_back(j);
        val.emplace_back(v);
    }

    for (int idxRow = 0; idxRow &amp;amp;lt; nbRows; idxRow++) {
        row_ptr[idxRow + 1] += row_ptr[idxRow];
    }

    return {nbRows, nbCols, row_ptr, col_ind, val};
}

std::vector&amp;lt;double&amp;gt; cuSparseCsr(const int nbRows, const int nbCols,
                 const std::vector&amp;lt;int&amp;gt;&amp;amp;amp; csrOffset,
                 const std::vector&amp;lt;int&amp;gt;&amp;amp;amp; csrColIdx,
                 const std::vector&amp;lt;double&amp;gt;&amp;amp;amp; values,
                 const std::vector&amp;lt;double&amp;gt;&amp;amp;amp; vecX,
                 const std::vector&amp;lt;double&amp;gt;&amp;amp;amp; vecY,
                 const int NbLoops = 10){
    const cudaDataType RealType = CUDA_R_64F;
    const int nnz = values.size();
    const double alpha = 1;
    const double beta  = 0;&amp;lt;/double&amp;gt;&amp;lt;/double&amp;gt;&amp;lt;/double&amp;gt;&amp;lt;/int&amp;gt;&amp;lt;/int&amp;gt;&amp;lt;/double&amp;gt;

    assert(vecX.size() == nbCols);
    assert(vecY.size() == nbRows);
    assert(csrColIdx.size() == nnz);
    assert(csrOffset.size() == nbRows+1);

    cusparseHandle_t cusparseHandle = NULL;
    CUSPARSE_ASSERT( cusparseCreate(&amp;amp;amp;cusparseHandle) )

    int* cuCsrOffsets;
    CUDA_ASSERT( cudaMalloc(&amp;amp;amp;cuCsrOffsets, (nbRows + 1) * sizeof(int)) )
    CUDA_ASSERT( cudaMemcpy(cuCsrOffsets, csrOffset.data(),
                           (nbRows + 1) * sizeof(int),
                           cudaMemcpyHostToDevice) )
    int* cuCsrColIdx;
    CUDA_ASSERT( cudaMalloc(&amp;amp;amp;cuCsrColIdx, nnz * sizeof(int))        )
    CUDA_ASSERT( cudaMemcpy(cuCsrColIdx, csrColIdx.data(), nnz * sizeof(int),
                           cudaMemcpyHostToDevice) )
    double* cuValues;
    CUDA_ASSERT( cudaMalloc(&amp;amp;amp;cuValues,  nnz * sizeof(double))      )
    CUDA_ASSERT( cudaMemcpy(cuValues, values.data(), nnz * sizeof(float),
                           cudaMemcpyHostToDevice) )

    cusparseSpMatDescr_t cuMatrixDescr;
    CUSPARSE_ASSERT( cusparseCreateCsr(&amp;amp;amp;cuMatrixDescr, nbRows, nbCols, nnz,
                                      cuCsrOffsets, cuCsrColIdx, cuValues,
                                      CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                      CUSPARSE_INDEX_BASE_ZERO, RealType) )

    double* cuVecX;
    CUDA_ASSERT( cudaMalloc(&amp;amp;amp;cuVecX, nbCols * sizeof(double)) )
    CUDA_ASSERT( cudaMemcpy(cuVecX, vecX.data(), nbCols * sizeof(double),
                           cudaMemcpyHostToDevice) )
    cusparseDnVecDescr_t cuVecDescX;
    CUSPARSE_ASSERT( cusparseCreateDnVec(&amp;amp;amp;cuVecDescX, nbCols, cuVecX, RealType) )

    double* cuVecY;
    CUDA_ASSERT( cudaMalloc(&amp;amp;amp;cuVecY, nbRows * sizeof(double)) )
    CUDA_ASSERT( cudaMemcpy(cuVecY, vecY.data(), nbRows * sizeof(double),
                           cudaMemcpyHostToDevice) )
    cusparseDnVecDescr_t cuVecDescrY;
    CUSPARSE_ASSERT( cusparseCreateDnVec(&amp;amp;amp;cuVecDescrY, nbRows, cuVecY, RealType) )

    void*  cuBuffer    = NULL;
    size_t bufferSize = 0;
    CUSPARSE_ASSERT( cusparseSpMV_bufferSize(
                                 cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 &amp;amp;amp;alpha, cuMatrixDescr, cuVecDescX, &amp;amp;amp;beta, cuVecDescrY, RealType,
                                 CUSPARSE_SPMV_ALG_DEFAULT, &amp;amp;amp;bufferSize) )
    CUDA_ASSERT( cudaMalloc(&amp;amp;amp;cuBuffer, bufferSize) )

    SpTimer timer;
    for(int idxLoop = 0 ; idxLoop &amp;amp;lt; NbLoops ; ++idxLoop){
    	CUSPARSE_ASSERT( cusparseSpMV(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 &amp;amp;amp;alpha, cuMatrixDescr, cuVecDescX, &amp;amp;amp;beta, cuVecDescrY, RealType,
                                 CUSPARSE_SPMV_ALG_DEFAULT, cuBuffer) )
    }
    timer.stop();
    std::cout &amp;amp;lt;&amp;amp;lt; "CuSparse took: " &amp;amp;lt;&amp;amp;lt; timer.getElapsed() &amp;amp;lt;&amp;amp;lt; "s ("
              &amp;amp;lt;&amp;amp;lt; (NbLoops*nnz*2)/timer.getElapsed()/1E9 &amp;amp;lt;&amp;amp;lt; "GFlop/s) for "
              &amp;amp;lt;&amp;amp;lt; NbLoops &amp;amp;lt;&amp;amp;lt; " iterations" &amp;amp;lt;&amp;amp;lt; std::endl;
    
    CUSPARSE_ASSERT( cusparseDestroySpMat(cuMatrixDescr) )
    CUSPARSE_ASSERT( cusparseDestroyDnVec(cuVecDescX) )
    CUSPARSE_ASSERT( cusparseDestroyDnVec(cuVecDescrY) )
    CUSPARSE_ASSERT( cusparseDestroy(cusparseHandle) )
    
    
    std::vector&amp;lt;double&amp;gt; resY(nbRows);
    CUDA_ASSERT( cudaMemcpy(resY.data(), cuVecY, nbRows * sizeof(double),
                           cudaMemcpyDeviceToHost) )&amp;lt;/double&amp;gt;

    CUDA_ASSERT( cudaFree(cuBuffer) )
    CUDA_ASSERT( cudaFree(cuCsrOffsets) )
    CUDA_ASSERT( cudaFree(cuCsrColIdx) )
    CUDA_ASSERT( cudaFree(cuValues) )
    CUDA_ASSERT( cudaFree(cuVecX) )
    CUDA_ASSERT( cudaFree(cuVecY) )

    return resY;
}

std::vector&amp;lt;double&amp;gt; SpMvCsr(const int nbRows, const int nbCols,
                 const std::vector&amp;lt;int&amp;gt;&amp;amp;amp; csrOffset,
                 const std::vector&amp;lt;int&amp;gt;&amp;amp;amp; csrColIdx,
                 const std::vector&amp;lt;double&amp;gt;&amp;amp;amp; values,
                 const std::vector&amp;lt;double&amp;gt;&amp;amp;amp; vecX,
                 const std::vector&amp;lt;double&amp;gt;&amp;amp;amp; vecY){
    assert(vecX.size() == nbCols);
    assert(vecY.size() == nbRows);
    assert(csrColIdx.size() == nnz);
    assert(csrOffset.size() == nbRows+1);&amp;lt;/double&amp;gt;&amp;lt;/double&amp;gt;&amp;lt;/double&amp;gt;&amp;lt;/int&amp;gt;&amp;lt;/int&amp;gt;&amp;lt;/double&amp;gt;

    const double alpha = 1;
    const double beta  = 0;

    std::vector&amp;lt;double&amp;gt; resY(nbRows);&amp;lt;/double&amp;gt;

    for(int idxRow = 0 ; idxRow &amp;amp;lt; nbRows ; ++idxRow){
    	double res = 0;
    	for(int idxVal = csrOffset[idxRow] ; idxVal &amp;amp;lt; csrOffset[idxRow+1] ; ++idxVal){
    		res += vecX[csrColIdx[idxVal]] * values[idxVal];
    	}
    	resY[idxRow] = res * alpha + vecY[idxRow] * beta;
    }
    
    return resY;
}

auto MaxRelDiff(const std::vector&amp;lt;double&amp;gt;&amp;amp;amp; vecGood,
	         const std::vector&amp;lt;double&amp;gt;&amp;amp;amp; vecTest){
    if(vecGood.size() != vecTest.size()){
        return std::numeric_limits&amp;lt;double&amp;gt;::infinity();
    }&amp;lt;/double&amp;gt;&amp;lt;/double&amp;gt;&amp;lt;/double&amp;gt;

    double diff = 0;

	for(int idxVal = 0 ; idxVal &amp;amp;lt; vecGood.size() ; ++idxVal){
	    diff = std::max(diff, (vecGood[idxVal] == 0 ? vecTest[idxVal] : std::abs((vecGood[idxVal]-vecTest[idxVal])/vecGood[idxVal])));
	}
	
    return diff;
}


int main(int argc, char *argv[]){

    if (argc != 2) {
        std::cerr &amp;amp;lt;&amp;amp;lt; "Usage: " &amp;amp;lt;&amp;amp;lt; argv[0] &amp;amp;lt;&amp;amp;lt; " matrix.mtx" &amp;amp;lt;&amp;amp;lt; std::endl;
        return 1;
    }

    auto [nbRows, nbCols, values] = ReadMMFile(argv[1]);
    std::cout &amp;amp;lt;&amp;amp;lt; "Matrix dimensions: " &amp;amp;lt;&amp;amp;lt; nbRows &amp;amp;lt;&amp;amp;lt; " x " &amp;amp;lt;&amp;amp;lt; nbCols &amp;amp;lt;&amp;amp;lt; std::endl;
    std::cout &amp;amp;lt;&amp;amp;lt; "Number of non-zero elements: " &amp;amp;lt;&amp;amp;lt; values.size() &amp;amp;lt;&amp;amp;lt; std::endl;

    auto [m_csr, n_csr, row_ptr, col_ind, val] = MMToCsr(nbRows, nbCols, values);

    std::vector&amp;lt;double&amp;gt; vecX(n_csr, 1);
    std::vector&amp;lt;double&amp;gt; vecY(n_csr, 0);
    auto resCuSparse = cuSparseCsr(m_csr, n_csr, row_ptr, col_ind, val,
                                   vecX, vecY);&amp;lt;/double&amp;gt;&amp;lt;/double&amp;gt;

    auto resCsr = SpMvCsr(m_csr, n_csr, row_ptr, col_ind, val,
                                   vecX, vecY);

    std::cout &amp;amp;lt;&amp;amp;lt; "Error cuSparse vs. CSR: " &amp;amp;lt;&amp;amp;lt; MaxRelDiff(resCsr,resCuSparse) &amp;amp;lt;&amp;amp;lt; std::endl;
    
    return 0;
}


Old code (but with conversions)
//////////////////////////////////////////////////////////////////////////
// In this file we compute Gemv sparse matrix with cuda for two kernels:
// CSR and BCSR
// We use a CPU COO version to test the accuracy of the code.
//////////////////////////////////////////////////////////////////////////

//////////////////////////////////////////////////////////////////////////
// COO part
//////////////////////////////////////////////////////////////////////////

#include &amp;amp;lt;cuda.h&amp;amp;gt;
#include &amp;amp;lt;cusparse_v2.h&amp;amp;gt;
#include &amp;amp;lt;cublas_v2.h&amp;amp;gt;

#include &amp;amp;lt;cstring&amp;amp;gt; // memset
#include &amp;amp;lt;cassert&amp;amp;gt;
#include &amp;amp;lt;cstdio&amp;amp;gt;

#define Min(x,y) ((x)&amp;amp;lt;(y)?(x):(y))
#define Max(x,y) ((x)&amp;amp;gt;(y)?(x):(y))
#define Abs(x) ((x)&amp;amp;gt;(0)?(x):-(x))

////////////////////// CUDA ERROR /////////////////////////////////////////

static void CudaCheckCore(cudaError_t code, const char *file, int line) {
if (code != cudaSuccess) {
fprintf(stderr,"Cuda Error %d : %s %s %d\n", code, cudaGetErrorString(code), file, line);
exit(code);
}
}

#define CudaCheck( test ) { CudaCheckCore((test), __FILE__, __LINE__); }
#define CudaCheckAfterCall() { CudaCheckCore((cudaGetLastError()), __FILE__, __LINE__); }

////////////////////// CUDA SPARSE ERROR ///////////////////////////////////

static const char * cusparseGetErrorString(cusparseStatus_t error)
{
// Read more at: http://docs.nvidia.com/cuda/cusparse/index.html#ixzz3f79JxRar
switch (error)
{
case CUSPARSE_STATUS_SUCCESS:
return "The operation completed successfully.";
case CUSPARSE_STATUS_NOT_INITIALIZED:
return "The cuSPARSE library was not initialized. This is usually caused by the lack of a prior call, an error in the CUDA Runtime API called by the cuSPARSE routine, or an error in the hardware setup.\n" \
"To correct: call cusparseCreate() prior to the function call; and check that the hardware, an appropriate version of the driver, and the cuSPARSE library are correctly installed.";

case CUSPARSE_STATUS_ALLOC_FAILED:
return "Resource allocation failed inside the cuSPARSE library. This is usually caused by a cudaMalloc() failure.\n"\
"To correct: prior to the function call, deallocate previously allocated memory as much as possible.";

case CUSPARSE_STATUS_INVALID_VALUE:
return "An unsupported value or parameter was passed to the function (a negative vector size, for example).\n"\
"To correct: ensure that all the parameters being passed have valid values.";

case CUSPARSE_STATUS_ARCH_MISMATCH:
return "The function requires a feature absent from the device architecture; usually caused by the lack of support for atomic operations or double precision.\n"\
"To correct: compile and run the application on a device with appropriate compute capability, which is 1.1 for 32-bit atomic operations and 1.3 for double precision.";

case CUSPARSE_STATUS_MAPPING_ERROR:
return "An access to GPU memory space failed, which is usually caused by a failure to bind a texture.\n"\
"To correct: prior to the function call, unbind any previously bound textures.";

case CUSPARSE_STATUS_EXECUTION_FAILED:
return "The GPU program failed to execute. This is often caused by a launch failure of the kernel on the GPU, which can be caused by multiple reasons.\n"\
"To correct: check that the hardware, an appropriate version of the driver, and the cuSPARSE library are correctly installed.";

case CUSPARSE_STATUS_INTERNAL_ERROR:
return "An internal cuSPARSE operation failed. This error is usually caused by a cudaMemcpyAsync() failure.\n"\
"To correct: check that the hardware, an appropriate version of the driver, and the cuSPARSE library are correctly installed. Also, check that the memory passed as a parameter to the routine is not being deallocated prior to the routine’s completion.";

case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
return "The matrix type is not supported by this function. This is usually caused by passing an invalid matrix descriptor to the function.\n"\
"To correct: check that the fields in cusparseMatDescr_t descrA were set correctly.";
}

return "&amp;amp;lt;unknown&amp;amp;gt;";
}
static void CudaSparseCheckCore(cusparseStatus_t code, const char *file, int line) {
if (code != CUSPARSE_STATUS_SUCCESS) {
fprintf(stderr,"Cuda Error %d : %s %s %d\n", code, cusparseGetErrorString(code), file, line);
exit(code);
}
}

#define CudaSparseCheck( test ) { CudaSparseCheckCore((test), __FILE__, __LINE__); }

////////////// Alloc and copy ////////////////////////////////////////////////

template &amp;amp;lt;class ObjectType&amp;amp;gt;
ObjectType* allocAndCopy(const ObjectType src[], const int size){
ObjectType* dest = NULL;
CudaCheck( cudaMalloc(&amp;amp;amp;dest,size*sizeof(ObjectType)) );
CudaCheck( cudaMemcpy(dest, src, size*sizeof(ObjectType), cudaMemcpyHostToDevice ) );
return dest;
}

template &amp;amp;lt;class ObjectType&amp;amp;gt;
ObjectType* alloc(const int size){
ObjectType* dest = NULL;
CudaCheck( cudaMalloc(&amp;amp;amp;dest,size*sizeof(ObjectType)) );
return dest;
}

template &amp;amp;lt;class ObjectType&amp;amp;gt;
ObjectType* allocAndCopyPart(const ObjectType src[], const int size, const int allocSize){
ObjectType* dest = NULL;
assert(size &amp;amp;lt;= allocSize);
CudaCheck( cudaMalloc(&amp;amp;amp;dest,allocSize*sizeof(ObjectType)) );
CudaCheck( cudaMemcpy(dest, src, size*sizeof(ObjectType), cudaMemcpyHostToDevice ) );
CudaCheck( cudaMemset(&amp;amp;amp;dest[size],0,(allocSize-size)*sizeof(ObjectType)) );
return dest;
}
//////////////////////////////////////////////////////////////////////////
// COO part
//////////////////////////////////////////////////////////////////////////

#include &amp;amp;lt;algorithm&amp;amp;gt;

struct Ijv{
int i, j;
double v;
};

bool IjvComp(const Ijv&amp;amp;amp; v1, const Ijv&amp;amp;amp; v2){
return v1.i &amp;amp;lt; v2.i || (v1.i == v2.i &amp;amp;amp;&amp;amp;amp; v1.j &amp;amp;lt; v2.j);
}

struct COOArrays{
int m;
int nnz;
double *val;/*values(NNZ)*/
int *rowind;/*i(NNZ)*/
int *colind;/*j(NNZ)*/

COOArrays(){
val = NULL;
rowind = NULL;
colind = NULL;
}

~COOArrays(){
delete[] val;
delete[] rowind;
delete[] colind;
}

void sortToRowMajor(){

Ijv* ijvs = new Ijv[nnz];
for(int idxCopy = 0 ; idxCopy &amp;amp;lt; nnz ; ++idxCopy){
ijvs[idxCopy].i = rowind[idxCopy];
ijvs[idxCopy].j = colind[idxCopy];
ijvs[idxCopy].v = val[idxCopy];
}

std::sort(ijvs, ijvs+nnz, IjvComp);

for(int idxCopy = 0 ; idxCopy &amp;amp;lt; nnz ; ++idxCopy){
rowind[idxCopy] = ijvs[idxCopy].i;
colind[idxCopy] = ijvs[idxCopy].j;
val[idxCopy] = ijvs[idxCopy].v;
}

delete[] ijvs;
}
};

void compute_COO(COOArrays&amp;amp;amp; coo, double *x , double *y ){
for(int idxVal = 0 ; idxVal &amp;amp;lt; coo.nnz ; ++idxVal){
y[coo.rowind[idxVal]] += x[coo.colind[idxVal]] * coo.val[idxVal];
}
}

//////////////////////////////////////////////////////////////////////////
// COO part
//////////////////////////////////////////////////////////////////////////

struct CRSArrays{
int m;  //&amp;amp;lt; the dim of the matrix
int nnz;//&amp;amp;lt; the number of nnz (== ia[m])
double *cu_csrValA;  //&amp;amp;lt; the values (of size NNZ)
int *cu_csrRowPtrA;//&amp;amp;lt; the usual rowptr (of size m+1)
int *cu_csrColIndA;//&amp;amp;lt; the colidx of each NNZ (of size nnz)

cudaStream_t streamId;
cusparseHandle_t cusparseHandle;

CRSArrays(){
cu_csrValA = NULL;
cu_csrRowPtrA = NULL;
cu_csrColIndA = NULL;

// Create sparse handle (needed to call sparse functions
streamId = 0;
cusparseHandle = 0;
CudaSparseCheck(cusparseCreate(&amp;amp;amp;cusparseHandle));
CudaSparseCheck(cusparseSetStream(cusparseHandle, streamId));
}

~CRSArrays(){
CudaCheck(cudaFree(cu_csrValA));
CudaCheck(cudaFree(cu_csrRowPtrA));
CudaCheck(cudaFree(cu_csrColIndA));

// Destroy sparse handle
CudaSparseCheck(cusparseDestroy(cusparseHandle));
}
};

void COO_to_CRS(COOArrays&amp;amp;amp; coo, CRSArrays* crs){
// We need COO to be sorted by row (and column)
coo.sortToRowMajor();

crs-&amp;amp;gt;m = coo.m;
crs-&amp;amp;gt;nnz = coo.nnz;

// Convert COO to CSR (it is just for the rows idx)
crs-&amp;amp;gt;cu_csrRowPtrA = alloc&amp;amp;lt;int&amp;amp;gt;(coo.m+1);
{
int* cu_cooRowIndA = allocAndCopy(coo.rowind, coo.nnz);
CudaSparseCheck(cusparseXcoo2csr(crs-&amp;amp;gt;cusparseHandle, cu_cooRowIndA,
coo.nnz, coo.m, crs-&amp;amp;gt;cu_csrRowPtrA, CUSPARSE_INDEX_BASE_ZERO));
CudaCheck(cudaFree(cu_cooRowIndA));
}
// Copy cols idx and values that are unchanged
crs-&amp;amp;gt;cu_csrValA = allocAndCopy(coo.val, coo.nnz);
crs-&amp;amp;gt;cu_csrColIndA = allocAndCopy(coo.colind, coo.nnz);
}

double compute_CRS( CRSArrays&amp;amp;amp; crs, double *x , double *y){
// For blas 2 gemv y = alpha.x.A + Beta.y
const double alpha = 1.0;
const double beta = 0.0;
// Copy input
double* cu_x = allocAndCopy(x, crs.m);
double* cu_y = allocAndCopy(y, crs.m);
// Init matrix properties
cusparseMatDescr_t descr = 0;
CudaSparseCheck(cusparseCreateMatDescr(&amp;amp;amp;descr));
cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO);
// Compute gemv
float gemvComputeTume = 0;
{
cudaEvent_t startTime, stopTime;
cudaEventCreate(&amp;amp;amp;startTime);
cudaEventCreate(&amp;amp;amp;stopTime);
cudaEventRecord(startTime, crs.streamId);

CudaSparseCheck(cusparseDcsrmv(crs.cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
crs.m, crs.m, crs.nnz, &amp;amp;amp;alpha,
descr, crs.cu_csrValA, crs.cu_csrRowPtrA,
crs.cu_csrColIndA, cu_x, &amp;amp;amp;beta, cu_y));

cudaEventRecord(stopTime, crs.streamId);
cudaEventSynchronize(stopTime);
cudaEventElapsedTime(&amp;amp;amp;gemvComputeTume, startTime, stopTime);
gemvComputeTume /=1000.0;
}
// Get back result
CudaCheck( cudaMemcpy(y, cu_y, crs.m*sizeof(double), cudaMemcpyDeviceToHost ) );
// Dealloc vectors
CudaCheck(cudaFree(cu_x));
CudaCheck(cudaFree(cu_y));

return gemvComputeTume;
}

//////////////////////////////////////////////////////////////////////////
// BCSR part
//////////////////////////////////////////////////////////////////////////

struct BCRSArrays{
int m;
int nnz;
int nbBlocks;
int nbBlockRow;
int blockSize;

int* cu_bsrRowPtrC;
int* cu_bsrColIndC;
double* cu_bsrValC;

cudaStream_t streamId;
cusparseHandle_t cusparseHandle;

BCRSArrays(){
cu_bsrRowPtrC = NULL;
cu_bsrColIndC = NULL;
cu_bsrValC = NULL;

// Create sparse handle (needed to call sparse functions
streamId = 0;
cusparseHandle = 0;
CudaSparseCheck(cusparseCreate(&amp;amp;amp;cusparseHandle));
CudaSparseCheck(cusparseSetStream(cusparseHandle, streamId));
}

~BCRSArrays(){
CudaCheck(cudaFree(cu_bsrRowPtrC));
CudaCheck(cudaFree(cu_bsrColIndC));
CudaCheck(cudaFree(cu_bsrValC));

// Destroy sparse handle
CudaSparseCheck(cusparseDestroy(cusparseHandle));
}
};

void CRS_to_BCRS(CRSArrays&amp;amp;amp; csr, BCRSArrays* bcrs, const int blockSize){
bcrs-&amp;amp;gt;m = csr.m;
bcrs-&amp;amp;gt;nnz = csr.nnz;
bcrs-&amp;amp;gt;blockSize = blockSize;

bcrs-&amp;amp;gt;nbBlockRow = (csr.m + blockSize-1)/blockSize;

cudaMalloc((void**)&amp;amp;amp;bcrs-&amp;amp;gt;cu_bsrRowPtrC, sizeof(int) *(bcrs-&amp;amp;gt;nbBlockRow+1));

cusparseMatDescr_t descr = 0;
CudaSparseCheck(cusparseCreateMatDescr(&amp;amp;amp;descr));
cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO);

int nbNnzBlocks;
cusparseXcsr2bsrNnz(bcrs-&amp;amp;gt;cusparseHandle, CUSPARSE_DIRECTION_COLUMN, csr.m, csr.m, descr, csr.cu_csrRowPtrA, csr.cu_csrColIndA,
blockSize, descr, bcrs-&amp;amp;gt;cu_bsrRowPtrC, &amp;amp;amp;nbNnzBlocks);
{
int firstBlockIdx, lastBlockIdx;
cudaMemcpy(&amp;amp;amp;lastBlockIdx, bcrs-&amp;amp;gt;cu_bsrRowPtrC+bcrs-&amp;amp;gt;nbBlockRow, sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(&amp;amp;amp;firstBlockIdx, bcrs-&amp;amp;gt;cu_bsrRowPtrC, sizeof(int), cudaMemcpyDeviceToHost);
assert(firstBlockIdx == 0); // we are in base 0
assert(nbNnzBlocks == lastBlockIdx - firstBlockIdx);
}
bcrs-&amp;amp;gt;nbBlocks = nbNnzBlocks;

CudaCheck(cudaMalloc((void**)&amp;amp;amp;bcrs-&amp;amp;gt;cu_bsrColIndC, sizeof(int)*nbNnzBlocks));
CudaCheck(cudaMalloc((void**)&amp;amp;amp;bcrs-&amp;amp;gt;cu_bsrValC, sizeof(double)*(blockSize*blockSize)*nbNnzBlocks));
cusparseDcsr2bsr(bcrs-&amp;amp;gt;cusparseHandle, CUSPARSE_DIRECTION_COLUMN,
csr.m, csr.m, descr, csr.cu_csrValA, csr.cu_csrRowPtrA, csr.cu_csrColIndA, blockSize, descr, bcrs-&amp;amp;gt;cu_bsrValC, bcrs-&amp;amp;gt;cu_bsrRowPtrC, bcrs-&amp;amp;gt;cu_bsrColIndC);
}

double compute_BSR(BCRSArrays&amp;amp;amp; bcsr, double *x , double *y){
// For blas 2 gemv y = alpha.x.A + Beta.y
const double alpha = 1.0;
const double beta = 0.0;
// Copy input
const int sizeMultipleBlockSize = ((bcsr.m+bcsr.blockSize-1)/bcsr.blockSize)*bcsr.blockSize;
double* cu_x = allocAndCopyPart(x, bcsr.m, sizeMultipleBlockSize);
double* cu_y = allocAndCopyPart(y, bcsr.m, sizeMultipleBlockSize);
// Init matrix properties
cusparseMatDescr_t descr = 0;
CudaSparseCheck(cusparseCreateMatDescr(&amp;amp;amp;descr));
cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO);
// Compute gemv
float gemvComputeTume = 0;
{
cudaEvent_t startTime, stopTime;
cudaEventCreate(&amp;amp;amp;startTime);
cudaEventCreate(&amp;amp;amp;stopTime);
cudaEventRecord(startTime, bcsr.streamId);

cusparseDbsrmv(bcsr.cusparseHandle, CUSPARSE_DIRECTION_COLUMN, CUSPARSE_OPERATION_NON_TRANSPOSE,
bcsr.nbBlockRow, bcsr.m, bcsr.nbBlocks, &amp;amp;amp;alpha, descr,
bcsr.cu_bsrValC, bcsr.cu_bsrRowPtrC, bcsr.cu_bsrColIndC, bcsr.blockSize,
cu_x, &amp;amp;amp;beta, cu_y);

cudaEventRecord(stopTime, bcsr.streamId);
cudaEventSynchronize(stopTime);
cudaEventElapsedTime(&amp;amp;amp;gemvComputeTume, startTime, stopTime);
gemvComputeTume /=1000.0;
}
// Get back result
CudaCheck( cudaMemcpy(y, cu_y, bcsr.m*sizeof(double), cudaMemcpyDeviceToHost ) );
// Dealloc vectors
CudaCheck(cudaFree(cu_x));
CudaCheck(cudaFree(cu_y));

return gemvComputeTume;
}

Example of how to use it:
/** Simply return the maximum relative diff */
double ChechAccuracy(const double y1[], const double y2[], const int size){
double maxDiff = 0;
for(int idx = 0 ; idx &amp;amp;lt; size ; ++idx){
if(y1[idx] != 0.0){
maxDiff = Max(maxDiff, Abs((y1[idx]-y2[idx])/y1[idx]));
}
}
return maxDiff;
}

/** Compute with COO and call cuda CSR and BCSR */
void computeWithAll(COOArrays&amp;amp;amp; coo, double times[3], double errors[3], const int loop = 1, const int blockSize = 8){
CRSArrays crs;
COO_to_CRS(coo, &amp;amp;amp;crs);

BCRSArrays bcsr;
CRS_to_BCRS(crs, &amp;amp;amp;bcsr, blockSize);
const int dimMulitpleBlock = ((coo.m+blockSize-1)/blockSize)*blockSize;

double* x = new double[dimMulitpleBlock];
for(int idx = 0 ; idx &amp;amp;lt; dimMulitpleBlock ; ++idx){
x[idx] = 1.0;
}
double* y = new double[dimMulitpleBlock];
double* ycoo = new double[dimMulitpleBlock];

{
memset(ycoo, 0, sizeof(double)*coo.m);
for(int idxLoop = 0 ; idxLoop &amp;amp;lt; loop ; ++idxLoop){
compute_COO(coo, x, ycoo);
}
times[0] = 0;
errors[0] = 0;
}
{
memset(y, 0, sizeof(double)*coo.m);
times[1] = 0;
for(int idxLoop = 0 ; idxLoop &amp;amp;lt; loop ; ++idxLoop){
times[1] += compute_CRS(crs, x, y);
}
errors[1] = ChechAccuracy(ycoo, y, coo.m);
}
{
memset(y, 0, sizeof(double)*coo.m);
times[2] = 0;
for(int idxLoop = 0 ; idxLoop &amp;amp;lt; loop ; ++idxLoop){
times[2] += compute_BSR(bcsr, x, y);
}
errors[2] = ChechAccuracy(ycoo, y, coo.m);
}
delete[] x;
delete[] y;
delete[] ycoo;
}

To fill a COO struct:
const int dim = 300;
COOArrays coo;
coo.m = dim;
coo.nnz = dim*dim;
flops[0] = coo.nnz*2;
coo.val = new double[coo.nnz];
coo.rowind = new int[coo.nnz];
coo.colind = new int[coo.nnz];

for(int idxRow = 0 ; idxRow &amp;amp;lt; dim ; ++idxRow){
for(int idxCol = 0 ; idxCol &amp;amp;lt; dim ; ++idxCol){
const int idx = (idxRow)*dim + idxCol;
coo.val[idx] = 1.0;
coo.rowind[idx] = idxRow;
coo.colind[idx] = idxCol;
}
}

computeWithAll(coo, alltimes[0], errors[0]);

To compile:
nvcc --compiler-options "-m64" -O3 main.cu -o main.exe  --linker-options "-lcusparse"

[C/C++] Sparse matrix MKL examples (C00, CSR, DIA, BCSR) gemv and conversions

Berenger — Tue, 07 Jul 2015 11:38:45 +0000

Here is a code sample of using the MKL to perform SpMV (gemv), I put it in different functions but the code is not clean (mix of C and C++).
However it is easy to understand, there are the conversions and the call to gemv (as the references to intel website).
I put it on the web because after a few research I was not able to find a simple example of mkl on the internet, I hope this one will be useful to you.

//////////////////////////////////////////////////////////////////////////
// In this file we compute gemv sparse matrix using different mkl kernels,
// We first convert from COO to CSR and then we use the CSR to convert
// to DIA, and BCSR (Aligned)
//
// MKL matrix storage are presented at:
// https://software.intel.com/fr-fr/node/522243#F854349C-7075-4A36-A63C-EE7E2E869262
//
//////////////////////////////////////////////////////////////////////////

#include 

#include  // memset
#include  // for assert
#include    // just for timer
#include  // for random generation
#include   // for printf

// Utils macro
#define Min(x,y) ((x)<(y)?(x):(y))
#define Max(x,y) ((x)>(y)?(x):(y))
#define Abs(x) ((x)>(0)?(x):-(x))

//////////////////////////////////////////////////////////////////////////
// COO part
//////////////////////////////////////////////////////////////////////////
struct COOArrays{
    MKL_INT m;      //< the dimension of the matrix
    MKL_INT nnz;    //< the number of nnz inside the matrix
    double *val;    //< the values (size = nnz)
    MKL_INT *rowind;//< the row indexes (size = nnz)
    MKL_INT *colind;//< the col indexes (size = nnz)

    /** simply set ptr to null */
    COOArrays(){
        val = NULL;
        rowind = NULL;
        colind = NULL;
    }

    /** delete ptr */
    ~COOArrays(){
        delete[] val;
        delete[] rowind;
        delete[] colind;
    }
};

/** see https://software.intel.com/fr-fr/node/520817#38F0A87C-7884-4A96-B83E-CEE88290580F */
void compute_COO(COOArrays& coo, double *x , double *y){
    char transa = 'N';
    // void mkl_cspblas_dcoogemv(char *transa, MKL_INT *m, double *val, MKL_INT *rowind,  MKL_INT *colind, MKL_INT *nnz, double *x,  double *y);
    mkl_cspblas_dcoogemv(&transa, &coo.m, coo.val, coo.rowind, coo.colind, &coo.nnz, x, y);
}

//////////////////////////////////////////////////////////////////////////
// CRS part
//////////////////////////////////////////////////////////////////////////

struct CRSArrays{
    MKL_INT m;  //< the dim of the matrix
    MKL_INT nnz;//< the number of nnz (== ia[m])
    double *a;  //< the values (of size NNZ)
    MKL_INT *ia;//< the usual rowptr (of size m+1)
    MKL_INT *ja;//< the colidx of each NNZ (of size nnz)

    CRSArrays(){
        a = NULL;
        ia = NULL;
        ja= NULL;
    }

    ~CRSArrays(){
        delete[] a;
        delete[] ia;
        delete[] ja;
    }
};

/** See https://software.intel.com/fr-fr/node/520849#449CA855-CE5B-4061-B003-70D078CA5E05 */
void COO_to_CRS(COOArrays& coo, CRSArrays* crs){
    MKL_INT job[6] = {1,//if job(1)=1, the matrix in the coordinate format is converted to the CRS format.
                     0,//If job(2)=0, zero-based indexing for the matrix in CRS format is used;
                     0,//If job(3)=0, zero-based indexing for the matrix in coordinate format is used;
                     0,
                     coo.nnz,//job(5)=nnz - sets number of the non-zero elements of the matrix A if job(1)=1.
                     0 //If job(6)=0, all arrays acsr, ja, ia are filled in for the output storage.
                     };
    // Init crs
    crs->m = coo.m;
    crs->nnz = coo.nnz;
    crs->a = new double[crs->nnz];
    crs->ia = new MKL_INT[crs->m+1];
    crs->ja = new MKL_INT[crs->nnz];
    MKL_INT nnz = coo.nnz;
    MKL_INT info;
    mkl_dcsrcoo(job , &coo.m,
                crs->a , crs->ja , crs->ia , &nnz ,
                coo.val, coo.rowind , coo.colind , &info );
}

/** See https://software.intel.com/fr-fr/node/520815#D840F0E5-E41A-4E91-94D2-FEB320F93E91 */
void compute_CRS( CRSArrays& crs, double *x , double *y){
    char transa = 'N';
    // void mkl_cspblas_dcsrgemv (const char *transa , const MKL_INT *m , const double *a , const MKL_INT *ia , const MKL_INT *ja , const double *x , double *y );
    mkl_cspblas_dcsrgemv(&transa, &crs.m , crs.a , crs.ia , crs.ja , x , y);
}

//////////////////////////////////////////////////////////////////////////
// DIA part
//////////////////////////////////////////////////////////////////////////

struct DIAArrays{
    MKL_INT m;      //< the dimensio of the matrix
    MKL_INT nnz;    //< the number of nnz inside the matrix
    double *val;    //< the NNZ values - may include zeros (of size lval*ndiag)*/
    MKL_INT *idiag; //< distance from the diagonal (of size ndiag)
    MKL_INT lval;   //< leading where the diagonals are stored >= m,  which is the declared leading dimension in the calling (sub)programs
    MKL_INT ndiag;  //< number of diagonals that have at least one nnz

    DIAArrays(){
        val = NULL;
        idiag = NULL;
    }

    ~DIAArrays(){
        delete[] val;
        delete[] idiag;
    }
};

/** https://software.intel.com/fr-fr/node/520852#00B3CA58-E0E4-4ED9-B42B-BC6338AB461D */
void CRS_to_DIA(CRSArrays& crs, DIAArrays* dia){
    MKL_INT job[6] = {0,//If job(1)=0, the matrix in the CRS format is converted to the diagonal format;
                     0,//If job(2)=0, zero-based indexing for the matrix in CRS format is used;
                     1,//if job(3)=1, one-based indexing for the matrix in the diagonal format is used.
                     0,
                     0,
                     10//If job(6)=10, diagonals are selected internally, and acsr_rem, ja_rem, ia_rem are not filled in for the output storage.
                     };
    dia->m = crs.m;
    dia->nnz = crs.nnz;
    dia->lval = dia->m;
    dia->ndiag = 0;
    // We need to count the number of diagonals with NNZ
    {
        unsigned* usedDiag = new unsigned[crs.m*2-1];
        memset(usedDiag, 0, sizeof(unsigned)*(crs.m*2-1));

        for(int idxRow = 0 ; idxRow < crs.m ; ++idxRow){
            for(int idxVal = crs.ia[idxRow] ; idxVal < crs.ia[idxRow+1] ; ++idxVal){
                const int idxCol = crs.ja[idxVal];
                const int diag = crs.m-idxRow+idxCol-1;
                assert(0 <= diag && diag < crs.m*2-1);
                if(usedDiag[diag] == 0){
                    usedDiag[diag] = 1;
                    dia->ndiag += 1;
                }
            }
        }

        delete[] usedDiag;
    }
    // Allocate the working arrays
    dia->val = new double[dia->ndiag*dia->lval];
    dia->idiag = new MKL_INT[dia->ndiag];

    // void mkl_dcsrdia (const MKL_INT *job , const MKL_INT *n , double *acsr , MKL_INT *ja ,
    // MKL_INT *ia , double *adia , const MKL_INT *ndiag , MKL_INT *distance , MKL_INT *idiag ,
    // double *acsr_rem , MKL_INT *ja_rem , MKL_INT *ia_rem , MKL_INT *info );

    MKL_INT info;
    mkl_dcsrdia(job , &crs.m ,crs.a , crs.ja, crs.ia ,
                dia->val , &dia->lval , dia->idiag , &dia->ndiag ,
                NULL , NULL , NULL , &info );
}

/** https://software.intel.com/fr-fr/node/520806#FCB5B469-8AA1-4CFB-88BE-E2F22E9E2AF0 */
void compute_DIA_1idx(DIAArrays& dia , double *x , double *y){
    char transa = 'N';
    // void mkl_ddiagemv (const char *transa , const MKL_INT *m , const double *val , const MKL_INT *lval , const MKL_INT *idiag , const MKL_INT *ndiag , const double *x , double *y );
    mkl_ddiagemv(&transa, &dia.m , dia.val , &dia.lval , dia.idiag , &dia.ndiag , x , y);
}

//////////////////////////////////////////////////////////////////////////
// BCRS part
//////////////////////////////////////////////////////////////////////////

struct BCRSArrays{
    MKL_INT m;
    MKL_INT nnz;
    MKL_INT nbBlocks;
    MKL_INT nbBlockRows;
    MKL_INT lb;/*size of blocks*/
    MKL_INT ldabsr;/*leading >= lb*lb*/
    double *a;/*values(m*lb*lb)*/
    MKL_INT *ia;/*i(m+1)*/
    MKL_INT *ja;/*j(m+1)*/
    MKL_INT allocatedBlocks;

    BCRSArrays(){
        a = NULL;
        ia = NULL;
        ja = NULL;
    }

    ~BCRSArrays(){
        delete[] a;
        delete[] ia;
        delete[] ja;
    }
};

/** https://software.intel.com/fr-fr/node/520850#3A22B45C-4604-4444-B6FE-205A5CD4E667 */
void CRS_to_BCRS(CRSArrays& crs, BCRSArrays* bcrs, const int blockSize){
    MKL_INT job[6] = {0,//If job(1)=0, the matrix in the CSR format is converted to the BSR format;
                     0,//If job(2)=0, zero-based indexing for the matrix in CSR format is used;
                     0,//If job(3)=0, zero-based indexing for the matrix in the BSR format is used;
                     0,
                     0,
                     1 //If job(6)>0, all output arrays absr, jab, and iab are filled in for the output storage.
                     };
    bcrs->m = crs.m;
    bcrs->nnz = crs.nnz;
    bcrs->nbBlocks = 0;
    // We need to count the number of blocks (aligned!)
    {
        const MKL_INT maxBlockPerRow = (bcrs->m+blockSize-1)/blockSize;
        unsigned* usedBlocks = new unsigned[maxBlockPerRow];

        for(int idxRow = 0 ; idxRow < crs.m ; ++idxRow){
            if(idxRow%blockSize == 0){
                memset(usedBlocks, 0, sizeof(unsigned)*maxBlockPerRow);
            }
            for(int idxVal = crs.ia[idxRow] ; idxVal < crs.ia[idxRow+1] ; ++idxVal){
                const int idxCol = crs.ja[idxVal];
                if(usedBlocks[idxCol/blockSize] == 0){
                    usedBlocks[idxCol/blockSize] = 1;
                    bcrs->nbBlocks += 1;
                }
            }
        }

        delete[] usedBlocks;
    }
    bcrs->nbBlockRows = (bcrs->m+blockSize-1)/blockSize;
    bcrs->lb = blockSize;
    bcrs->ldabsr = bcrs->lb*bcrs->lb;
    bcrs->a = new double[bcrs->nbBlocks*bcrs->lb*bcrs->lb];
    bcrs->ia = new MKL_INT[bcrs->nbBlockRows+1];
    bcrs->ja = new MKL_INT[bcrs->nbBlocks];

    // void mkl_dcsrbsr (const MKL_INT *job , const MKL_INT *m , const MKL_INT *mblk ,
    // const MKL_INT *ldabsr , double *acsr , MKL_INT *ja , MKL_INT *ia , double *absr ,
    // MKL_INT *jab , MKL_INT *iab , MKL_INT *info );
    MKL_INT info;
    mkl_dcsrbsr(job , &crs.m ,
                 &bcrs->lb , &bcrs->ldabsr,
                 crs.a , crs.ja , crs.ia ,
                 bcrs->a, bcrs->ja , bcrs->ia, &info );
    assert(bcrs->ia[bcrs->nbBlockRows] == bcrs->nbBlocks);
}

/** https://software.intel.com/fr-fr/node/520816#366F2854-A2C0-4661-8CE7-F478F8E6B613 */
void compute_BSR(BCRSArrays& bcsr, double *x , double *y ){
    char transa = 'N';
    // void mkl_cspblas_dbsrgemv (const char *transa , const MKL_INT *m , const MKL_INT *lb , const double *a , const MKL_INT *ia , const MKL_INT *ja , const double *x , double *y );
    mkl_cspblas_dbsrgemv(&transa, &bcsr.nbBlockRows , &bcsr.lb , bcsr.a , bcsr.ia , bcsr.ja , x , y);
}

For example we can compare these function (if we have a COO struct that has been filled)

//////////////////////////////////////////////////////////////////////////
// Utils part
//////////////////////////////////////////////////////////////////////////

/** Simply return the max relative diff */
double ChechAccuracy(const double y1[], const double y2[], const int size){
    double maxDiff = 0;
    for(int idx = 0 ; idx < size ; ++idx){
        if(y1[idx] != 0.0){
            maxDiff = Max(maxDiff, Abs((y1[idx]-y2[idx])/y1[idx]));
        }
    }
    return maxDiff;
}

/** From a Coo this function convert in all format and calls the different kernels,
 * it returns the error, the elapsed time */
void computeWithAll(COOArrays& coo, double times[4], double errors[4], const int loop = 1, const int blockSize = 8){
    CRSArrays crs;
    COO_to_CRS(coo, &crs);

    DIAArrays dia;
    CRS_to_DIA(crs, &dia);

    BCRSArrays bcsr;
    CRS_to_BCRS(crs, &bcsr, blockSize);
    const int dimMulitpleBlock = ((coo.m+blockSize-1)/blockSize)*blockSize;

    double* x = new double[dimMulitpleBlock];
    for(int idx = 0 ; idx < dimMulitpleBlock ; ++idx){
        x[idx] = 1.0;
    }
    double* y = new double[dimMulitpleBlock];
    double* ycoo = new double[dimMulitpleBlock];

    {
        memset(ycoo, 0, sizeof(double)*coo.m);
        const double timeStart = omp_get_wtime();
        for(int idxLoop = 0 ; idxLoop < loop ; ++idxLoop){
            compute_COO(coo, x, ycoo);
        }
        const double timeEnd = omp_get_wtime();
        times[0] = timeEnd-timeStart;
        errors[0] = 0;
    }
    {
        memset(y, 0, sizeof(double)*coo.m);
        const double timeStart = omp_get_wtime();
        for(int idxLoop = 0 ; idxLoop < loop ; ++idxLoop){
            compute_CRS(crs, x, y);
        }
        const double timeEnd = omp_get_wtime();
        times[1] = timeEnd-timeStart;
        errors[1] = ChechAccuracy(ycoo, y, coo.m);
    }
    {
        memset(y, 0, sizeof(double)*coo.m);
        const double timeStart = omp_get_wtime();
        for(int idxLoop = 0 ; idxLoop < loop ; ++idxLoop){
            compute_DIA_1idx(dia, x, y);
        }
        const double timeEnd = omp_get_wtime();
        times[2] = timeEnd-timeStart;
        errors[2] = ChechAccuracy(ycoo, y, coo.m);
    }
    {
        memset(y, 0, sizeof(double)*coo.m);
        const double timeStart = omp_get_wtime();
        for(int idxLoop = 0 ; idxLoop < loop ; ++idxLoop){
            compute_BSR(bcsr, x, y);
        }
        const double timeEnd = omp_get_wtime();
        times[3] = timeEnd-timeStart;
        errors[3] = ChechAccuracy(ycoo, y, coo.m);
    }
    delete[] x;
    delete[] y;
    delete[] ycoo;
}

To fill a COO struct:

        const int dim = 3000;
        COOArrays coo;
        coo.m = dim;
        coo.nnz = dim*dim;
        flops[0] = coo.nnz*2;
        coo.val = new double[coo.nnz];
        coo.rowind = new int[coo.nnz];
        coo.colind = new int[coo.nnz];

        for(int idxRow = 0 ; idxRow < dim ; ++idxRow){
            for(int idxCol = 0 ; idxCol < dim ; ++idxCol){
                const int idx = (idxRow)*dim + idxCol;
                coo.val[idx] = 1.0;
                coo.rowind[idx] = idxRow;
                coo.colind[idx] = idxCol;
            }
        }

        computeWithAll(coo, alltimes[0], errors[0]);

To compile you can use something like:

g++ -m64 -I${MKLROOT}/include -O3 main.cpp -o main.exe -lgomp -fopenmp  -Wl,--no-as-needed -L${MKLROOT}/lib/intel64 -lmkl_intel_lp64 -lmkl_core -lmkl_sequential -lpthread -lm