[C/C++][CUDA] cuSparse Sparse matrix examples (CSR, BCSR) SpMV and conversions

I left on this page an old a deprecated code (at the bottom) and a new version at the top.

Apologize I do not have time to clean and comment it, but I hope it might help if someone is searching for an example.

New code (tested with CUDA 11.6)


#include <cuda_runtime.h>
#include <cusparse.h></cusparse.h></cuda_runtime.h>

#include <cassert></cassert>
 #include <algorithm>
#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <vector>
#include <tuple></tuple></vector></string></sstream></fstream></iostream>

#include "SpTimer.hpp"

#define CUDA_ASSERT(X)\
{\
    cudaError_t ___resCuda = (X);\
    if ( cudaSuccess != ___resCuda ){\
        printf("Error: fails, %s (%s line %d)\nbCols", cudaGetErrorString(___resCuda), __FILE__, __LINE__ );\
        exit(1);\
    }\
}

#define CUSPARSE_ASSERT(X)\
{\
    cusparseStatus_t ___resCuda = (X);\
    if ( CUSPARSE_STATUS_SUCCESS != ___resCuda ){\
        printf("Error: fails, %s (%s line %d)\nbCols", cusparseGetErrorString(___resCuda), __FILE__, __LINE__ );\
        exit(1);\
    }\
}

std::vector<std::string> SplitString(const std::string& inString, const char inDelim = ' ') {
    std::vector<std::string> substrings;
    size_t start;
    size_t end = 0;
    while ((start = inString.find_first_not_of(inDelim, end)) != std::string::npos) {
        end = inString.find(inDelim, start);
        substrings.push_back(inString.substr(start, end - start));
    }
    return substrings;
}</std::string></std::string>

std::tuple<int, int,="" std::vector<std::tuple<int,="" double="">>> ReadMMFile(const std::string& inFilename){
    std::ifstream file(inFilename);
    if (!file.is_open()) {
        std::cerr << "Error: Could not open file " << inFilename << std::endl;
        return {};
    }

    // %%MatrixMarket matrix coordinate real general
    {
        std::string header;
        std::getline(file, header);
        const auto words = SplitString(header);
        if(words.size() != 5 || words[3] != "real"){
            std::cerr << "Error: Incompatible matrix: " << header << std::endl;
            return {};
        }
    }
    int nbRows, nbCols, nnz;
    char c;
    {
        std::string line;
        std::getline(file, line);
        while(line.size() && line[0] == '%'){
            std::getline(file, line);        
        }
        if(!file.good() || line.size() == 0){
            std::cerr << "Error: in header" << std::endl;
            return {};
        }
        
        std::istringstream iss(line);
        iss >> nbRows >> nbCols >> nnz;
    }</int,>

    std::vector<std::tuple<int, int,="" double="">> values;
    values.reserve(nnz);</std::tuple<int,>

    int i, j;
    double val;
    while (file >> i >> j >> val) {
        values.emplace_back(i - 1, j - 1, val);
        nnz -= 1;
    }

    if(nnz != 0){
        std::cerr << "Error: Cannot read the expected number of NNZ" << std::endl;
        return {};
    }


std::sort(values.begin(), values.end(),
[](const auto& v1, const auto& v2){
return std::get<0>(v1) < std::get<0>(v2)
|| (std::get<0>(v1) == std::get<0>(v2) && std::get<1>(v1) < std::get<1>(v2));
});

    return {nbRows, nbCols, values};
}

std::tuple&amp;amp;lt;int, int,="" std::vector&amp;amp;lt;int=""&amp;amp;gt;, std::vector&amp;amp;lt;int&amp;amp;gt;, std::vector&amp;amp;lt;double&amp;amp;gt;&amp;amp;amp;gt;
MMToCsr(const int nbRows, const int nbCols, const std::vector&amp;amp;lt;std::tuple&amp;amp;lt;int, int,="" double=""&amp;amp;gt;&amp;amp;amp;gt;&amp;amp;amp;amp; values)
{
    std::vector&amp;amp;lt;int&amp;amp;gt; row_ptr(nbRows + 1, 0);
    std::vector&amp;amp;lt;int&amp;amp;gt; col_ind;
    std::vector&amp;amp;lt;double&amp;amp;gt; val;
    col_ind.reserve(values.size());
    val.reserve(values.size());&amp;amp;lt;/double&amp;amp;gt;&amp;amp;lt;/int&amp;amp;gt;&amp;amp;lt;/int&amp;amp;gt;&amp;amp;lt;/std::tuple&amp;amp;lt;int,&amp;amp;gt;&amp;amp;lt;/double&amp;amp;gt;&amp;amp;lt;/int&amp;amp;gt;&amp;amp;lt;/int,&amp;amp;gt;

    for (const auto &amp;amp;amp;amp;[i, j, v] : values) {
        row_ptr[i + 1]++;
        col_ind.emplace_back(j);
        val.emplace_back(v);
    }

    for (int idxRow = 0; idxRow &amp;amp;amp;lt; nbRows; idxRow++) {
        row_ptr[idxRow + 1] += row_ptr[idxRow];
    }

    return {nbRows, nbCols, row_ptr, col_ind, val};
}

std::vector&amp;amp;lt;double&amp;amp;gt; cuSparseCsr(const int nbRows, const int nbCols,
                 const std::vector&amp;amp;lt;int&amp;amp;gt;&amp;amp;amp;amp; csrOffset,
                 const std::vector&amp;amp;lt;int&amp;amp;gt;&amp;amp;amp;amp; csrColIdx,
                 const std::vector&amp;amp;lt;double&amp;amp;gt;&amp;amp;amp;amp; values,
                 const std::vector&amp;amp;lt;double&amp;amp;gt;&amp;amp;amp;amp; vecX,
                 const std::vector&amp;amp;lt;double&amp;amp;gt;&amp;amp;amp;amp; vecY,
                 const int NbLoops = 10){
    const cudaDataType RealType = CUDA_R_64F;
    const int nnz = values.size();
    const double alpha = 1;
    const double beta  = 0;&amp;amp;lt;/double&amp;amp;gt;&amp;amp;lt;/double&amp;amp;gt;&amp;amp;lt;/double&amp;amp;gt;&amp;amp;lt;/int&amp;amp;gt;&amp;amp;lt;/int&amp;amp;gt;&amp;amp;lt;/double&amp;amp;gt;

    assert(vecX.size() == nbCols);
    assert(vecY.size() == nbRows);
    assert(csrColIdx.size() == nnz);
    assert(csrOffset.size() == nbRows+1);

    cusparseHandle_t cusparseHandle = NULL;
    CUSPARSE_ASSERT( cusparseCreate(&amp;amp;amp;amp;cusparseHandle) )

    int* cuCsrOffsets;
    CUDA_ASSERT( cudaMalloc(&amp;amp;amp;amp;cuCsrOffsets, (nbRows + 1) * sizeof(int)) )
    CUDA_ASSERT( cudaMemcpy(cuCsrOffsets, csrOffset.data(),
                           (nbRows + 1) * sizeof(int),
                           cudaMemcpyHostToDevice) )
    int* cuCsrColIdx;
    CUDA_ASSERT( cudaMalloc(&amp;amp;amp;amp;cuCsrColIdx, nnz * sizeof(int))        )
    CUDA_ASSERT( cudaMemcpy(cuCsrColIdx, csrColIdx.data(), nnz * sizeof(int),
                           cudaMemcpyHostToDevice) )
    double* cuValues;
    CUDA_ASSERT( cudaMalloc(&amp;amp;amp;amp;cuValues,  nnz * sizeof(double))      )
    CUDA_ASSERT( cudaMemcpy(cuValues, values.data(), nnz * sizeof(float),
                           cudaMemcpyHostToDevice) )

    cusparseSpMatDescr_t cuMatrixDescr;
    CUSPARSE_ASSERT( cusparseCreateCsr(&amp;amp;amp;amp;cuMatrixDescr, nbRows, nbCols, nnz,
                                      cuCsrOffsets, cuCsrColIdx, cuValues,
                                      CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                      CUSPARSE_INDEX_BASE_ZERO, RealType) )

    double* cuVecX;
    CUDA_ASSERT( cudaMalloc(&amp;amp;amp;amp;cuVecX, nbCols * sizeof(double)) )
    CUDA_ASSERT( cudaMemcpy(cuVecX, vecX.data(), nbCols * sizeof(double),
                           cudaMemcpyHostToDevice) )
    cusparseDnVecDescr_t cuVecDescX;
    CUSPARSE_ASSERT( cusparseCreateDnVec(&amp;amp;amp;amp;cuVecDescX, nbCols, cuVecX, RealType) )

    double* cuVecY;
    CUDA_ASSERT( cudaMalloc(&amp;amp;amp;amp;cuVecY, nbRows * sizeof(double)) )
    CUDA_ASSERT( cudaMemcpy(cuVecY, vecY.data(), nbRows * sizeof(double),
                           cudaMemcpyHostToDevice) )
    cusparseDnVecDescr_t cuVecDescrY;
    CUSPARSE_ASSERT( cusparseCreateDnVec(&amp;amp;amp;amp;cuVecDescrY, nbRows, cuVecY, RealType) )

    void*  cuBuffer    = NULL;
    size_t bufferSize = 0;
    CUSPARSE_ASSERT( cusparseSpMV_bufferSize(
                                 cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 &amp;amp;amp;amp;alpha, cuMatrixDescr, cuVecDescX, &amp;amp;amp;amp;beta, cuVecDescrY, RealType,
                                 CUSPARSE_SPMV_ALG_DEFAULT, &amp;amp;amp;amp;bufferSize) )
    CUDA_ASSERT( cudaMalloc(&amp;amp;amp;amp;cuBuffer, bufferSize) )

    SpTimer timer;
    for(int idxLoop = 0 ; idxLoop &amp;amp;amp;lt; NbLoops ; ++idxLoop){
    	CUSPARSE_ASSERT( cusparseSpMV(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 &amp;amp;amp;amp;alpha, cuMatrixDescr, cuVecDescX, &amp;amp;amp;amp;beta, cuVecDescrY, RealType,
                                 CUSPARSE_SPMV_ALG_DEFAULT, cuBuffer) )
    }
    timer.stop();
    std::cout &amp;amp;amp;lt;&amp;amp;amp;lt; "CuSparse took: " &amp;amp;amp;lt;&amp;amp;amp;lt; timer.getElapsed() &amp;amp;amp;lt;&amp;amp;amp;lt; "s ("
              &amp;amp;amp;lt;&amp;amp;amp;lt; (NbLoops*nnz*2)/timer.getElapsed()/1E9 &amp;amp;amp;lt;&amp;amp;amp;lt; "GFlop/s) for "
              &amp;amp;amp;lt;&amp;amp;amp;lt; NbLoops &amp;amp;amp;lt;&amp;amp;amp;lt; " iterations" &amp;amp;amp;lt;&amp;amp;amp;lt; std::endl;
    
    CUSPARSE_ASSERT( cusparseDestroySpMat(cuMatrixDescr) )
    CUSPARSE_ASSERT( cusparseDestroyDnVec(cuVecDescX) )
    CUSPARSE_ASSERT( cusparseDestroyDnVec(cuVecDescrY) )
    CUSPARSE_ASSERT( cusparseDestroy(cusparseHandle) )
    
    
    std::vector&amp;amp;lt;double&amp;amp;gt; resY(nbRows);
    CUDA_ASSERT( cudaMemcpy(resY.data(), cuVecY, nbRows * sizeof(double),
                           cudaMemcpyDeviceToHost) )&amp;amp;lt;/double&amp;amp;gt;

    CUDA_ASSERT( cudaFree(cuBuffer) )
    CUDA_ASSERT( cudaFree(cuCsrOffsets) )
    CUDA_ASSERT( cudaFree(cuCsrColIdx) )
    CUDA_ASSERT( cudaFree(cuValues) )
    CUDA_ASSERT( cudaFree(cuVecX) )
    CUDA_ASSERT( cudaFree(cuVecY) )

    return resY;
}

std::vector&amp;amp;lt;double&amp;amp;gt; SpMvCsr(const int nbRows, const int nbCols,
                 const std::vector&amp;amp;lt;int&amp;amp;gt;&amp;amp;amp;amp; csrOffset,
                 const std::vector&amp;amp;lt;int&amp;amp;gt;&amp;amp;amp;amp; csrColIdx,
                 const std::vector&amp;amp;lt;double&amp;amp;gt;&amp;amp;amp;amp; values,
                 const std::vector&amp;amp;lt;double&amp;amp;gt;&amp;amp;amp;amp; vecX,
                 const std::vector&amp;amp;lt;double&amp;amp;gt;&amp;amp;amp;amp; vecY){
    assert(vecX.size() == nbCols);
    assert(vecY.size() == nbRows);
    assert(csrColIdx.size() == nnz);
    assert(csrOffset.size() == nbRows+1);&amp;amp;lt;/double&amp;amp;gt;&amp;amp;lt;/double&amp;amp;gt;&amp;amp;lt;/double&amp;amp;gt;&amp;amp;lt;/int&amp;amp;gt;&amp;amp;lt;/int&amp;amp;gt;&amp;amp;lt;/double&amp;amp;gt;

    const double alpha = 1;
    const double beta  = 0;

    std::vector&amp;amp;lt;double&amp;amp;gt; resY(nbRows);&amp;amp;lt;/double&amp;amp;gt;

    for(int idxRow = 0 ; idxRow &amp;amp;amp;lt; nbRows ; ++idxRow){
    	double res = 0;
    	for(int idxVal = csrOffset[idxRow] ; idxVal &amp;amp;amp;lt; csrOffset[idxRow+1] ; ++idxVal){
    		res += vecX[csrColIdx[idxVal]] * values[idxVal];
    	}
    	resY[idxRow] = res * alpha + vecY[idxRow] * beta;
    }
    
    return resY;
}

auto MaxRelDiff(const std::vector&amp;amp;lt;double&amp;amp;gt;&amp;amp;amp;amp; vecGood,
	         const std::vector&amp;amp;lt;double&amp;amp;gt;&amp;amp;amp;amp; vecTest){
    if(vecGood.size() != vecTest.size()){
        return std::numeric_limits&amp;amp;lt;double&amp;amp;gt;::infinity();
    }&amp;amp;lt;/double&amp;amp;gt;&amp;amp;lt;/double&amp;amp;gt;&amp;amp;lt;/double&amp;amp;gt;

    double diff = 0;

	for(int idxVal = 0 ; idxVal &amp;amp;amp;lt; vecGood.size() ; ++idxVal){
	    diff = std::max(diff, (vecGood[idxVal] == 0 ? vecTest[idxVal] : std::abs((vecGood[idxVal]-vecTest[idxVal])/vecGood[idxVal])));
	}
	
    return diff;
}


int main(int argc, char *argv[]){

    if (argc != 2) {
        std::cerr &amp;amp;amp;lt;&amp;amp;amp;lt; "Usage: " &amp;amp;amp;lt;&amp;amp;amp;lt; argv[0] &amp;amp;amp;lt;&amp;amp;amp;lt; " matrix.mtx" &amp;amp;amp;lt;&amp;amp;amp;lt; std::endl;
        return 1;
    }

    auto [nbRows, nbCols, values] = ReadMMFile(argv[1]);
    std::cout &amp;amp;amp;lt;&amp;amp;amp;lt; "Matrix dimensions: " &amp;amp;amp;lt;&amp;amp;amp;lt; nbRows &amp;amp;amp;lt;&amp;amp;amp;lt; " x " &amp;amp;amp;lt;&amp;amp;amp;lt; nbCols &amp;amp;amp;lt;&amp;amp;amp;lt; std::endl;
    std::cout &amp;amp;amp;lt;&amp;amp;amp;lt; "Number of non-zero elements: " &amp;amp;amp;lt;&amp;amp;amp;lt; values.size() &amp;amp;amp;lt;&amp;amp;amp;lt; std::endl;

    auto [m_csr, n_csr, row_ptr, col_ind, val] = MMToCsr(nbRows, nbCols, values);

    std::vector&amp;amp;lt;double&amp;amp;gt; vecX(n_csr, 1);
    std::vector&amp;amp;lt;double&amp;amp;gt; vecY(n_csr, 0);
    auto resCuSparse = cuSparseCsr(m_csr, n_csr, row_ptr, col_ind, val,
                                   vecX, vecY);&amp;amp;lt;/double&amp;amp;gt;&amp;amp;lt;/double&amp;amp;gt;

    auto resCsr = SpMvCsr(m_csr, n_csr, row_ptr, col_ind, val,
                                   vecX, vecY);

    std::cout &amp;amp;amp;lt;&amp;amp;amp;lt; "Error cuSparse vs. CSR: " &amp;amp;amp;lt;&amp;amp;amp;lt; MaxRelDiff(resCsr,resCuSparse) &amp;amp;amp;lt;&amp;amp;amp;lt; std::endl;
    
    return 0;
}

Old code (but with conversions)

//////////////////////////////////////////////////////////////////////////
// In this file we compute Gemv sparse matrix with cuda for two kernels:
// CSR and BCSR
// We use a CPU COO version to test the accuracy of the code.
//////////////////////////////////////////////////////////////////////////

//////////////////////////////////////////////////////////////////////////
// COO part
//////////////////////////////////////////////////////////////////////////

#include &amp;amp;amp;lt;cuda.h&amp;amp;amp;gt;
#include &amp;amp;amp;lt;cusparse_v2.h&amp;amp;amp;gt;
#include &amp;amp;amp;lt;cublas_v2.h&amp;amp;amp;gt;

#include &amp;amp;amp;lt;cstring&amp;amp;amp;gt; // memset
#include &amp;amp;amp;lt;cassert&amp;amp;amp;gt;
#include &amp;amp;amp;lt;cstdio&amp;amp;amp;gt;

#define Min(x,y) ((x)&amp;amp;amp;lt;(y)?(x):(y))
#define Max(x,y) ((x)&amp;amp;amp;gt;(y)?(x):(y))
#define Abs(x) ((x)&amp;amp;amp;gt;(0)?(x):-(x))

////////////////////// CUDA ERROR /////////////////////////////////////////

static void CudaCheckCore(cudaError_t code, const char *file, int line) {
if (code != cudaSuccess) {
fprintf(stderr,"Cuda Error %d : %s %s %d\n", code, cudaGetErrorString(code), file, line);
exit(code);
}
}

#define CudaCheck( test ) { CudaCheckCore((test), __FILE__, __LINE__); }
#define CudaCheckAfterCall() { CudaCheckCore((cudaGetLastError()), __FILE__, __LINE__); }

////////////////////// CUDA SPARSE ERROR ///////////////////////////////////

static const char * cusparseGetErrorString(cusparseStatus_t error)
{
// Read more at: http://docs.nvidia.com/cuda/cusparse/index.html#ixzz3f79JxRar
switch (error)
{
case CUSPARSE_STATUS_SUCCESS:
return "The operation completed successfully.";
case CUSPARSE_STATUS_NOT_INITIALIZED:
return "The cuSPARSE library was not initialized. This is usually caused by the lack of a prior call, an error in the CUDA Runtime API called by the cuSPARSE routine, or an error in the hardware setup.\n" \
"To correct: call cusparseCreate() prior to the function call; and check that the hardware, an appropriate version of the driver, and the cuSPARSE library are correctly installed.";

case CUSPARSE_STATUS_ALLOC_FAILED:
return "Resource allocation failed inside the cuSPARSE library. This is usually caused by a cudaMalloc() failure.\n"\
"To correct: prior to the function call, deallocate previously allocated memory as much as possible.";

case CUSPARSE_STATUS_INVALID_VALUE:
return "An unsupported value or parameter was passed to the function (a negative vector size, for example).\n"\
"To correct: ensure that all the parameters being passed have valid values.";

case CUSPARSE_STATUS_ARCH_MISMATCH:
return "The function requires a feature absent from the device architecture; usually caused by the lack of support for atomic operations or double precision.\n"\
"To correct: compile and run the application on a device with appropriate compute capability, which is 1.1 for 32-bit atomic operations and 1.3 for double precision.";

case CUSPARSE_STATUS_MAPPING_ERROR:
return "An access to GPU memory space failed, which is usually caused by a failure to bind a texture.\n"\
"To correct: prior to the function call, unbind any previously bound textures.";

case CUSPARSE_STATUS_EXECUTION_FAILED:
return "The GPU program failed to execute. This is often caused by a launch failure of the kernel on the GPU, which can be caused by multiple reasons.\n"\
"To correct: check that the hardware, an appropriate version of the driver, and the cuSPARSE library are correctly installed.";

case CUSPARSE_STATUS_INTERNAL_ERROR:
return "An internal cuSPARSE operation failed. This error is usually caused by a cudaMemcpyAsync() failure.\n"\
"To correct: check that the hardware, an appropriate version of the driver, and the cuSPARSE library are correctly installed. Also, check that the memory passed as a parameter to the routine is not being deallocated prior to the routine’s completion.";

case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
return "The matrix type is not supported by this function. This is usually caused by passing an invalid matrix descriptor to the function.\n"\
"To correct: check that the fields in cusparseMatDescr_t descrA were set correctly.";
}

return "&amp;amp;amp;lt;unknown&amp;amp;amp;gt;";
}
static void CudaSparseCheckCore(cusparseStatus_t code, const char *file, int line) {
if (code != CUSPARSE_STATUS_SUCCESS) {
fprintf(stderr,"Cuda Error %d : %s %s %d\n", code, cusparseGetErrorString(code), file, line);
exit(code);
}
}

#define CudaSparseCheck( test ) { CudaSparseCheckCore((test), __FILE__, __LINE__); }

////////////// Alloc and copy ////////////////////////////////////////////////

template &amp;amp;amp;lt;class ObjectType&amp;amp;amp;gt;
ObjectType* allocAndCopy(const ObjectType src[], const int size){
ObjectType* dest = NULL;
CudaCheck( cudaMalloc(&amp;amp;amp;amp;dest,size*sizeof(ObjectType)) );
CudaCheck( cudaMemcpy(dest, src, size*sizeof(ObjectType), cudaMemcpyHostToDevice ) );
return dest;
}

template &amp;amp;amp;lt;class ObjectType&amp;amp;amp;gt;
ObjectType* alloc(const int size){
ObjectType* dest = NULL;
CudaCheck( cudaMalloc(&amp;amp;amp;amp;dest,size*sizeof(ObjectType)) );
return dest;
}

template &amp;amp;amp;lt;class ObjectType&amp;amp;amp;gt;
ObjectType* allocAndCopyPart(const ObjectType src[], const int size, const int allocSize){
ObjectType* dest = NULL;
assert(size &amp;amp;amp;lt;= allocSize);
CudaCheck( cudaMalloc(&amp;amp;amp;amp;dest,allocSize*sizeof(ObjectType)) );
CudaCheck( cudaMemcpy(dest, src, size*sizeof(ObjectType), cudaMemcpyHostToDevice ) );
CudaCheck( cudaMemset(&amp;amp;amp;amp;dest[size],0,(allocSize-size)*sizeof(ObjectType)) );
return dest;
}
//////////////////////////////////////////////////////////////////////////
// COO part
//////////////////////////////////////////////////////////////////////////

#include &amp;amp;amp;lt;algorithm&amp;amp;amp;gt;

struct Ijv{
int i, j;
double v;
};

bool IjvComp(const Ijv&amp;amp;amp;amp; v1, const Ijv&amp;amp;amp;amp; v2){
return v1.i &amp;amp;amp;lt; v2.i || (v1.i == v2.i &amp;amp;amp;amp;&amp;amp;amp;amp; v1.j &amp;amp;amp;lt; v2.j);
}

struct COOArrays{
int m;
int nnz;
double *val;/*values(NNZ)*/
int *rowind;/*i(NNZ)*/
int *colind;/*j(NNZ)*/

COOArrays(){
val = NULL;
rowind = NULL;
colind = NULL;
}

~COOArrays(){
delete[] val;
delete[] rowind;
delete[] colind;
}

void sortToRowMajor(){

Ijv* ijvs = new Ijv[nnz];
for(int idxCopy = 0 ; idxCopy &amp;amp;amp;lt; nnz ; ++idxCopy){
ijvs[idxCopy].i = rowind[idxCopy];
ijvs[idxCopy].j = colind[idxCopy];
ijvs[idxCopy].v = val[idxCopy];
}

std::sort(ijvs, ijvs+nnz, IjvComp);

for(int idxCopy = 0 ; idxCopy &amp;amp;amp;lt; nnz ; ++idxCopy){
rowind[idxCopy] = ijvs[idxCopy].i;
colind[idxCopy] = ijvs[idxCopy].j;
val[idxCopy] = ijvs[idxCopy].v;
}

delete[] ijvs;
}
};

void compute_COO(COOArrays&amp;amp;amp;amp; coo, double *x , double *y ){
for(int idxVal = 0 ; idxVal &amp;amp;amp;lt; coo.nnz ; ++idxVal){
y[coo.rowind[idxVal]] += x[coo.colind[idxVal]] * coo.val[idxVal];
}
}

//////////////////////////////////////////////////////////////////////////
// COO part
//////////////////////////////////////////////////////////////////////////

struct CRSArrays{
int m;  //&amp;amp;amp;lt; the dim of the matrix
int nnz;//&amp;amp;amp;lt; the number of nnz (== ia[m])
double *cu_csrValA;  //&amp;amp;amp;lt; the values (of size NNZ)
int *cu_csrRowPtrA;//&amp;amp;amp;lt; the usual rowptr (of size m+1)
int *cu_csrColIndA;//&amp;amp;amp;lt; the colidx of each NNZ (of size nnz)

cudaStream_t streamId;
cusparseHandle_t cusparseHandle;

CRSArrays(){
cu_csrValA = NULL;
cu_csrRowPtrA = NULL;
cu_csrColIndA = NULL;

// Create sparse handle (needed to call sparse functions
streamId = 0;
cusparseHandle = 0;
CudaSparseCheck(cusparseCreate(&amp;amp;amp;amp;cusparseHandle));
CudaSparseCheck(cusparseSetStream(cusparseHandle, streamId));
}

~CRSArrays(){
CudaCheck(cudaFree(cu_csrValA));
CudaCheck(cudaFree(cu_csrRowPtrA));
CudaCheck(cudaFree(cu_csrColIndA));

// Destroy sparse handle
CudaSparseCheck(cusparseDestroy(cusparseHandle));
}
};

void COO_to_CRS(COOArrays&amp;amp;amp;amp; coo, CRSArrays* crs){
// We need COO to be sorted by row (and column)
coo.sortToRowMajor();

crs-&amp;amp;amp;gt;m = coo.m;
crs-&amp;amp;amp;gt;nnz = coo.nnz;

// Convert COO to CSR (it is just for the rows idx)
crs-&amp;amp;amp;gt;cu_csrRowPtrA = alloc&amp;amp;amp;lt;int&amp;amp;amp;gt;(coo.m+1);
{
int* cu_cooRowIndA = allocAndCopy(coo.rowind, coo.nnz);
CudaSparseCheck(cusparseXcoo2csr(crs-&amp;amp;amp;gt;cusparseHandle, cu_cooRowIndA,
coo.nnz, coo.m, crs-&amp;amp;amp;gt;cu_csrRowPtrA, CUSPARSE_INDEX_BASE_ZERO));
CudaCheck(cudaFree(cu_cooRowIndA));
}
// Copy cols idx and values that are unchanged
crs-&amp;amp;amp;gt;cu_csrValA = allocAndCopy(coo.val, coo.nnz);
crs-&amp;amp;amp;gt;cu_csrColIndA = allocAndCopy(coo.colind, coo.nnz);
}

double compute_CRS( CRSArrays&amp;amp;amp;amp; crs, double *x , double *y){
// For blas 2 gemv y = alpha.x.A + Beta.y
const double alpha = 1.0;
const double beta = 0.0;
// Copy input
double* cu_x = allocAndCopy(x, crs.m);
double* cu_y = allocAndCopy(y, crs.m);
// Init matrix properties
cusparseMatDescr_t descr = 0;
CudaSparseCheck(cusparseCreateMatDescr(&amp;amp;amp;amp;descr));
cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO);
// Compute gemv
float gemvComputeTume = 0;
{
cudaEvent_t startTime, stopTime;
cudaEventCreate(&amp;amp;amp;amp;startTime);
cudaEventCreate(&amp;amp;amp;amp;stopTime);
cudaEventRecord(startTime, crs.streamId);

CudaSparseCheck(cusparseDcsrmv(crs.cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
crs.m, crs.m, crs.nnz, &amp;amp;amp;amp;alpha,
descr, crs.cu_csrValA, crs.cu_csrRowPtrA,
crs.cu_csrColIndA, cu_x, &amp;amp;amp;amp;beta, cu_y));

cudaEventRecord(stopTime, crs.streamId);
cudaEventSynchronize(stopTime);
cudaEventElapsedTime(&amp;amp;amp;amp;gemvComputeTume, startTime, stopTime);
gemvComputeTume /=1000.0;
}
// Get back result
CudaCheck( cudaMemcpy(y, cu_y, crs.m*sizeof(double), cudaMemcpyDeviceToHost ) );
// Dealloc vectors
CudaCheck(cudaFree(cu_x));
CudaCheck(cudaFree(cu_y));

return gemvComputeTume;
}

//////////////////////////////////////////////////////////////////////////
// BCSR part
//////////////////////////////////////////////////////////////////////////

struct BCRSArrays{
int m;
int nnz;
int nbBlocks;
int nbBlockRow;
int blockSize;

int* cu_bsrRowPtrC;
int* cu_bsrColIndC;
double* cu_bsrValC;

cudaStream_t streamId;
cusparseHandle_t cusparseHandle;

BCRSArrays(){
cu_bsrRowPtrC = NULL;
cu_bsrColIndC = NULL;
cu_bsrValC = NULL;

// Create sparse handle (needed to call sparse functions
streamId = 0;
cusparseHandle = 0;
CudaSparseCheck(cusparseCreate(&amp;amp;amp;amp;cusparseHandle));
CudaSparseCheck(cusparseSetStream(cusparseHandle, streamId));
}

~BCRSArrays(){
CudaCheck(cudaFree(cu_bsrRowPtrC));
CudaCheck(cudaFree(cu_bsrColIndC));
CudaCheck(cudaFree(cu_bsrValC));

// Destroy sparse handle
CudaSparseCheck(cusparseDestroy(cusparseHandle));
}
};

void CRS_to_BCRS(CRSArrays&amp;amp;amp;amp; csr, BCRSArrays* bcrs, const int blockSize){
bcrs-&amp;amp;amp;gt;m = csr.m;
bcrs-&amp;amp;amp;gt;nnz = csr.nnz;
bcrs-&amp;amp;amp;gt;blockSize = blockSize;

bcrs-&amp;amp;amp;gt;nbBlockRow = (csr.m + blockSize-1)/blockSize;

cudaMalloc((void**)&amp;amp;amp;amp;bcrs-&amp;amp;amp;gt;cu_bsrRowPtrC, sizeof(int) *(bcrs-&amp;amp;amp;gt;nbBlockRow+1));

cusparseMatDescr_t descr = 0;
CudaSparseCheck(cusparseCreateMatDescr(&amp;amp;amp;amp;descr));
cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO);

int nbNnzBlocks;
cusparseXcsr2bsrNnz(bcrs-&amp;amp;amp;gt;cusparseHandle, CUSPARSE_DIRECTION_COLUMN, csr.m, csr.m, descr, csr.cu_csrRowPtrA, csr.cu_csrColIndA,
blockSize, descr, bcrs-&amp;amp;amp;gt;cu_bsrRowPtrC, &amp;amp;amp;amp;nbNnzBlocks);
{
int firstBlockIdx, lastBlockIdx;
cudaMemcpy(&amp;amp;amp;amp;lastBlockIdx, bcrs-&amp;amp;amp;gt;cu_bsrRowPtrC+bcrs-&amp;amp;amp;gt;nbBlockRow, sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(&amp;amp;amp;amp;firstBlockIdx, bcrs-&amp;amp;amp;gt;cu_bsrRowPtrC, sizeof(int), cudaMemcpyDeviceToHost);
assert(firstBlockIdx == 0); // we are in base 0
assert(nbNnzBlocks == lastBlockIdx - firstBlockIdx);
}
bcrs-&amp;amp;amp;gt;nbBlocks = nbNnzBlocks;

CudaCheck(cudaMalloc((void**)&amp;amp;amp;amp;bcrs-&amp;amp;amp;gt;cu_bsrColIndC, sizeof(int)*nbNnzBlocks));
CudaCheck(cudaMalloc((void**)&amp;amp;amp;amp;bcrs-&amp;amp;amp;gt;cu_bsrValC, sizeof(double)*(blockSize*blockSize)*nbNnzBlocks));
cusparseDcsr2bsr(bcrs-&amp;amp;amp;gt;cusparseHandle, CUSPARSE_DIRECTION_COLUMN,
csr.m, csr.m, descr, csr.cu_csrValA, csr.cu_csrRowPtrA, csr.cu_csrColIndA, blockSize, descr, bcrs-&amp;amp;amp;gt;cu_bsrValC, bcrs-&amp;amp;amp;gt;cu_bsrRowPtrC, bcrs-&amp;amp;amp;gt;cu_bsrColIndC);
}

double compute_BSR(BCRSArrays&amp;amp;amp;amp; bcsr, double *x , double *y){
// For blas 2 gemv y = alpha.x.A + Beta.y
const double alpha = 1.0;
const double beta = 0.0;
// Copy input
const int sizeMultipleBlockSize = ((bcsr.m+bcsr.blockSize-1)/bcsr.blockSize)*bcsr.blockSize;
double* cu_x = allocAndCopyPart(x, bcsr.m, sizeMultipleBlockSize);
double* cu_y = allocAndCopyPart(y, bcsr.m, sizeMultipleBlockSize);
// Init matrix properties
cusparseMatDescr_t descr = 0;
CudaSparseCheck(cusparseCreateMatDescr(&amp;amp;amp;amp;descr));
cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO);
// Compute gemv
float gemvComputeTume = 0;
{
cudaEvent_t startTime, stopTime;
cudaEventCreate(&amp;amp;amp;amp;startTime);
cudaEventCreate(&amp;amp;amp;amp;stopTime);
cudaEventRecord(startTime, bcsr.streamId);

cusparseDbsrmv(bcsr.cusparseHandle, CUSPARSE_DIRECTION_COLUMN, CUSPARSE_OPERATION_NON_TRANSPOSE,
bcsr.nbBlockRow, bcsr.m, bcsr.nbBlocks, &amp;amp;amp;amp;alpha, descr,
bcsr.cu_bsrValC, bcsr.cu_bsrRowPtrC, bcsr.cu_bsrColIndC, bcsr.blockSize,
cu_x, &amp;amp;amp;amp;beta, cu_y);

cudaEventRecord(stopTime, bcsr.streamId);
cudaEventSynchronize(stopTime);
cudaEventElapsedTime(&amp;amp;amp;amp;gemvComputeTume, startTime, stopTime);
gemvComputeTume /=1000.0;
}
// Get back result
CudaCheck( cudaMemcpy(y, cu_y, bcsr.m*sizeof(double), cudaMemcpyDeviceToHost ) );
// Dealloc vectors
CudaCheck(cudaFree(cu_x));
CudaCheck(cudaFree(cu_y));

return gemvComputeTume;
}

Example of how to use it:

/** Simply return the maximum relative diff */
double ChechAccuracy(const double y1[], const double y2[], const int size){
double maxDiff = 0;
for(int idx = 0 ; idx &amp;amp;amp;lt; size ; ++idx){
if(y1[idx] != 0.0){
maxDiff = Max(maxDiff, Abs((y1[idx]-y2[idx])/y1[idx]));
}
}
return maxDiff;
}

/** Compute with COO and call cuda CSR and BCSR */
void computeWithAll(COOArrays&amp;amp;amp;amp; coo, double times[3], double errors[3], const int loop = 1, const int blockSize = 8){
CRSArrays crs;
COO_to_CRS(coo, &amp;amp;amp;amp;crs);

BCRSArrays bcsr;
CRS_to_BCRS(crs, &amp;amp;amp;amp;bcsr, blockSize);
const int dimMulitpleBlock = ((coo.m+blockSize-1)/blockSize)*blockSize;

double* x = new double[dimMulitpleBlock];
for(int idx = 0 ; idx &amp;amp;amp;lt; dimMulitpleBlock ; ++idx){
x[idx] = 1.0;
}
double* y = new double[dimMulitpleBlock];
double* ycoo = new double[dimMulitpleBlock];

{
memset(ycoo, 0, sizeof(double)*coo.m);
for(int idxLoop = 0 ; idxLoop &amp;amp;amp;lt; loop ; ++idxLoop){
compute_COO(coo, x, ycoo);
}
times[0] = 0;
errors[0] = 0;
}
{
memset(y, 0, sizeof(double)*coo.m);
times[1] = 0;
for(int idxLoop = 0 ; idxLoop &amp;amp;amp;lt; loop ; ++idxLoop){
times[1] += compute_CRS(crs, x, y);
}
errors[1] = ChechAccuracy(ycoo, y, coo.m);
}
{
memset(y, 0, sizeof(double)*coo.m);
times[2] = 0;
for(int idxLoop = 0 ; idxLoop &amp;amp;amp;lt; loop ; ++idxLoop){
times[2] += compute_BSR(bcsr, x, y);
}
errors[2] = ChechAccuracy(ycoo, y, coo.m);
}
delete[] x;
delete[] y;
delete[] ycoo;
}

To fill a COO struct:

const int dim = 300;
COOArrays coo;
coo.m = dim;
coo.nnz = dim*dim;
flops[0] = coo.nnz*2;
coo.val = new double[coo.nnz];
coo.rowind = new int[coo.nnz];
coo.colind = new int[coo.nnz];

for(int idxRow = 0 ; idxRow &amp;amp;amp;lt; dim ; ++idxRow){
for(int idxCol = 0 ; idxCol &amp;amp;amp;lt; dim ; ++idxCol){
const int idx = (idxRow)*dim + idxCol;
coo.val[idx] = 1.0;
coo.rowind[idx] = idxRow;
coo.colind[idx] = idxCol;
}
}

computeWithAll(coo, alltimes[0], errors[0]);

To compile:

nvcc --compiler-options "-m64" -O3 main.cu -o main.exe  --linker-options "-lcusparse"