I left on this page an old a deprecated code (at the bottom) and a new version at the top.
Apologize I do not have time to clean and comment it, but I hope it might help if someone is searching for an example.
New code (tested with CUDA 11.6)
#include &amp;lt;cuda_runtime.h&amp;gt; #include &amp;lt;cusparse.h&amp;gt;&amp;lt;/cusparse.h&amp;gt;&amp;lt;/cuda_runtime.h&amp;gt; #include &amp;lt;cassert&amp;gt;&amp;lt;/cassert&amp;gt; #include &amp;lt;algorithm&amp;gt; #include &amp;lt;iostream&amp;gt; #include &amp;lt;fstream&amp;gt; #include &amp;lt;sstream&amp;gt; #include &amp;lt;string&amp;gt; #include &amp;lt;vector&amp;gt; #include &amp;lt;tuple&amp;gt;&amp;lt;/tuple&amp;gt;&amp;lt;/vector&amp;gt;&amp;lt;/string&amp;gt;&amp;lt;/sstream&amp;gt;&amp;lt;/fstream&amp;gt;&amp;lt;/iostream&amp;gt; #include "SpTimer.hpp" #define CUDA_ASSERT(X)\ {\ cudaError_t ___resCuda = (X);\ if ( cudaSuccess != ___resCuda ){\ printf("Error: fails, %s (%s line %d)\nbCols", cudaGetErrorString(___resCuda), __FILE__, __LINE__ );\ exit(1);\ }\ } #define CUSPARSE_ASSERT(X)\ {\ cusparseStatus_t ___resCuda = (X);\ if ( CUSPARSE_STATUS_SUCCESS != ___resCuda ){\ printf("Error: fails, %s (%s line %d)\nbCols", cusparseGetErrorString(___resCuda), __FILE__, __LINE__ );\ exit(1);\ }\ } std::vector&amp;lt;std::string&amp;gt; SplitString(const std::string&amp;amp;amp; inString, const char inDelim = ' ') { std::vector&amp;lt;std::string&amp;gt; substrings; size_t start; size_t end = 0; while ((start = inString.find_first_not_of(inDelim, end)) != std::string::npos) { end = inString.find(inDelim, start); substrings.push_back(inString.substr(start, end - start)); } return substrings; }&amp;lt;/std::string&amp;gt;&amp;lt;/std::string&amp;gt; std::tuple&amp;lt;int, int,="" std::vector&amp;lt;std::tuple&amp;lt;int,="" double=""&amp;gt;&amp;amp;gt;&amp;amp;gt; ReadMMFile(const std::string&amp;amp;amp; inFilename){ std::ifstream file(inFilename); if (!file.is_open()) { std::cerr &amp;amp;lt;&amp;amp;lt; "Error: Could not open file " &amp;amp;lt;&amp;amp;lt; inFilename &amp;amp;lt;&amp;amp;lt; std::endl; return {}; } // %%MatrixMarket matrix coordinate real general { std::string header; std::getline(file, header); const auto words = SplitString(header); if(words.size() != 5 || words[3] != "real"){ std::cerr &amp;amp;lt;&amp;amp;lt; "Error: Incompatible matrix: " &amp;amp;lt;&amp;amp;lt; header &amp;amp;lt;&amp;amp;lt; std::endl; return {}; } } int nbRows, nbCols, nnz; char c; { std::string line; std::getline(file, line); while(line.size() &amp;amp;amp;&amp;amp;amp; line[0] == '%'){ std::getline(file, line); } if(!file.good() || line.size() == 0){ std::cerr &amp;amp;lt;&amp;amp;lt; "Error: in header" &amp;amp;lt;&amp;amp;lt; std::endl; return {}; } std::istringstream iss(line); iss &amp;amp;gt;&amp;amp;gt; nbRows &amp;amp;gt;&amp;amp;gt; nbCols &amp;amp;gt;&amp;amp;gt; nnz; }&amp;lt;/int,&amp;gt; std::vector&amp;lt;std::tuple&amp;lt;int, int,="" double=""&amp;gt;&amp;amp;gt; values; values.reserve(nnz);&amp;lt;/std::tuple&amp;lt;int,&amp;gt; int i, j; double val; while (file &amp;amp;gt;&amp;amp;gt; i &amp;amp;gt;&amp;amp;gt; j &amp;amp;gt;&amp;amp;gt; val) { values.emplace_back(i - 1, j - 1, val); nnz -= 1; } if(nnz != 0){ std::cerr &amp;amp;lt;&amp;amp;lt; "Error: Cannot read the expected number of NNZ" &amp;amp;lt;&amp;amp;lt; std::endl; return {}; } std::sort(values.begin(), values.end(), [](const auto& v1, const auto& v2){ return std::get<0>(v1) < std::get<0>(v2) || (std::get<0>(v1) == std::get<0>(v2) && std::get<1>(v1) < std::get<1>(v2)); }); return {nbRows, nbCols, values}; } std::tuple&amp;lt;int, int,="" std::vector&amp;lt;int=""&amp;gt;, std::vector&amp;lt;int&amp;gt;, std::vector&amp;lt;double&amp;gt;&amp;amp;gt; MMToCsr(const int nbRows, const int nbCols, const std::vector&amp;lt;std::tuple&amp;lt;int, int,="" double=""&amp;gt;&amp;amp;gt;&amp;amp;amp; values) { std::vector&amp;lt;int&amp;gt; row_ptr(nbRows + 1, 0); std::vector&amp;lt;int&amp;gt; col_ind; std::vector&amp;lt;double&amp;gt; val; col_ind.reserve(values.size()); val.reserve(values.size());&amp;lt;/double&amp;gt;&amp;lt;/int&amp;gt;&amp;lt;/int&amp;gt;&amp;lt;/std::tuple&amp;lt;int,&amp;gt;&amp;lt;/double&amp;gt;&amp;lt;/int&amp;gt;&amp;lt;/int,&amp;gt; for (const auto &amp;amp;amp;[i, j, v] : values) { row_ptr[i + 1]++; col_ind.emplace_back(j); val.emplace_back(v); } for (int idxRow = 0; idxRow &amp;amp;lt; nbRows; idxRow++) { row_ptr[idxRow + 1] += row_ptr[idxRow]; } return {nbRows, nbCols, row_ptr, col_ind, val}; } std::vector&amp;lt;double&amp;gt; cuSparseCsr(const int nbRows, const int nbCols, const std::vector&amp;lt;int&amp;gt;&amp;amp;amp; csrOffset, const std::vector&amp;lt;int&amp;gt;&amp;amp;amp; csrColIdx, const std::vector&amp;lt;double&amp;gt;&amp;amp;amp; values, const std::vector&amp;lt;double&amp;gt;&amp;amp;amp; vecX, const std::vector&amp;lt;double&amp;gt;&amp;amp;amp; vecY, const int NbLoops = 10){ const cudaDataType RealType = CUDA_R_64F; const int nnz = values.size(); const double alpha = 1; const double beta = 0;&amp;lt;/double&amp;gt;&amp;lt;/double&amp;gt;&amp;lt;/double&amp;gt;&amp;lt;/int&amp;gt;&amp;lt;/int&amp;gt;&amp;lt;/double&amp;gt; assert(vecX.size() == nbCols); assert(vecY.size() == nbRows); assert(csrColIdx.size() == nnz); assert(csrOffset.size() == nbRows+1); cusparseHandle_t cusparseHandle = NULL; CUSPARSE_ASSERT( cusparseCreate(&amp;amp;amp;cusparseHandle) ) int* cuCsrOffsets; CUDA_ASSERT( cudaMalloc(&amp;amp;amp;cuCsrOffsets, (nbRows + 1) * sizeof(int)) ) CUDA_ASSERT( cudaMemcpy(cuCsrOffsets, csrOffset.data(), (nbRows + 1) * sizeof(int), cudaMemcpyHostToDevice) ) int* cuCsrColIdx; CUDA_ASSERT( cudaMalloc(&amp;amp;amp;cuCsrColIdx, nnz * sizeof(int)) ) CUDA_ASSERT( cudaMemcpy(cuCsrColIdx, csrColIdx.data(), nnz * sizeof(int), cudaMemcpyHostToDevice) ) double* cuValues; CUDA_ASSERT( cudaMalloc(&amp;amp;amp;cuValues, nnz * sizeof(double)) ) CUDA_ASSERT( cudaMemcpy(cuValues, values.data(), nnz * sizeof(float), cudaMemcpyHostToDevice) ) cusparseSpMatDescr_t cuMatrixDescr; CUSPARSE_ASSERT( cusparseCreateCsr(&amp;amp;amp;cuMatrixDescr, nbRows, nbCols, nnz, cuCsrOffsets, cuCsrColIdx, cuValues, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, RealType) ) double* cuVecX; CUDA_ASSERT( cudaMalloc(&amp;amp;amp;cuVecX, nbCols * sizeof(double)) ) CUDA_ASSERT( cudaMemcpy(cuVecX, vecX.data(), nbCols * sizeof(double), cudaMemcpyHostToDevice) ) cusparseDnVecDescr_t cuVecDescX; CUSPARSE_ASSERT( cusparseCreateDnVec(&amp;amp;amp;cuVecDescX, nbCols, cuVecX, RealType) ) double* cuVecY; CUDA_ASSERT( cudaMalloc(&amp;amp;amp;cuVecY, nbRows * sizeof(double)) ) CUDA_ASSERT( cudaMemcpy(cuVecY, vecY.data(), nbRows * sizeof(double), cudaMemcpyHostToDevice) ) cusparseDnVecDescr_t cuVecDescrY; CUSPARSE_ASSERT( cusparseCreateDnVec(&amp;amp;amp;cuVecDescrY, nbRows, cuVecY, RealType) ) void* cuBuffer = NULL; size_t bufferSize = 0; CUSPARSE_ASSERT( cusparseSpMV_bufferSize( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &amp;amp;amp;alpha, cuMatrixDescr, cuVecDescX, &amp;amp;amp;beta, cuVecDescrY, RealType, CUSPARSE_SPMV_ALG_DEFAULT, &amp;amp;amp;bufferSize) ) CUDA_ASSERT( cudaMalloc(&amp;amp;amp;cuBuffer, bufferSize) ) SpTimer timer; for(int idxLoop = 0 ; idxLoop &amp;amp;lt; NbLoops ; ++idxLoop){ CUSPARSE_ASSERT( cusparseSpMV(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &amp;amp;amp;alpha, cuMatrixDescr, cuVecDescX, &amp;amp;amp;beta, cuVecDescrY, RealType, CUSPARSE_SPMV_ALG_DEFAULT, cuBuffer) ) } timer.stop(); std::cout &amp;amp;lt;&amp;amp;lt; "CuSparse took: " &amp;amp;lt;&amp;amp;lt; timer.getElapsed() &amp;amp;lt;&amp;amp;lt; "s (" &amp;amp;lt;&amp;amp;lt; (NbLoops*nnz*2)/timer.getElapsed()/1E9 &amp;amp;lt;&amp;amp;lt; "GFlop/s) for " &amp;amp;lt;&amp;amp;lt; NbLoops &amp;amp;lt;&amp;amp;lt; " iterations" &amp;amp;lt;&amp;amp;lt; std::endl; CUSPARSE_ASSERT( cusparseDestroySpMat(cuMatrixDescr) ) CUSPARSE_ASSERT( cusparseDestroyDnVec(cuVecDescX) ) CUSPARSE_ASSERT( cusparseDestroyDnVec(cuVecDescrY) ) CUSPARSE_ASSERT( cusparseDestroy(cusparseHandle) ) std::vector&amp;lt;double&amp;gt; resY(nbRows); CUDA_ASSERT( cudaMemcpy(resY.data(), cuVecY, nbRows * sizeof(double), cudaMemcpyDeviceToHost) )&amp;lt;/double&amp;gt; CUDA_ASSERT( cudaFree(cuBuffer) ) CUDA_ASSERT( cudaFree(cuCsrOffsets) ) CUDA_ASSERT( cudaFree(cuCsrColIdx) ) CUDA_ASSERT( cudaFree(cuValues) ) CUDA_ASSERT( cudaFree(cuVecX) ) CUDA_ASSERT( cudaFree(cuVecY) ) return resY; } std::vector&amp;lt;double&amp;gt; SpMvCsr(const int nbRows, const int nbCols, const std::vector&amp;lt;int&amp;gt;&amp;amp;amp; csrOffset, const std::vector&amp;lt;int&amp;gt;&amp;amp;amp; csrColIdx, const std::vector&amp;lt;double&amp;gt;&amp;amp;amp; values, const std::vector&amp;lt;double&amp;gt;&amp;amp;amp; vecX, const std::vector&amp;lt;double&amp;gt;&amp;amp;amp; vecY){ assert(vecX.size() == nbCols); assert(vecY.size() == nbRows); assert(csrColIdx.size() == nnz); assert(csrOffset.size() == nbRows+1);&amp;lt;/double&amp;gt;&amp;lt;/double&amp;gt;&amp;lt;/double&amp;gt;&amp;lt;/int&amp;gt;&amp;lt;/int&amp;gt;&amp;lt;/double&amp;gt; const double alpha = 1; const double beta = 0; std::vector&amp;lt;double&amp;gt; resY(nbRows);&amp;lt;/double&amp;gt; for(int idxRow = 0 ; idxRow &amp;amp;lt; nbRows ; ++idxRow){ double res = 0; for(int idxVal = csrOffset[idxRow] ; idxVal &amp;amp;lt; csrOffset[idxRow+1] ; ++idxVal){ res += vecX[csrColIdx[idxVal]] * values[idxVal]; } resY[idxRow] = res * alpha + vecY[idxRow] * beta; } return resY; } auto MaxRelDiff(const std::vector&amp;lt;double&amp;gt;&amp;amp;amp; vecGood, const std::vector&amp;lt;double&amp;gt;&amp;amp;amp; vecTest){ if(vecGood.size() != vecTest.size()){ return std::numeric_limits&amp;lt;double&amp;gt;::infinity(); }&amp;lt;/double&amp;gt;&amp;lt;/double&amp;gt;&amp;lt;/double&amp;gt; double diff = 0; for(int idxVal = 0 ; idxVal &amp;amp;lt; vecGood.size() ; ++idxVal){ diff = std::max(diff, (vecGood[idxVal] == 0 ? vecTest[idxVal] : std::abs((vecGood[idxVal]-vecTest[idxVal])/vecGood[idxVal]))); } return diff; } int main(int argc, char *argv[]){ if (argc != 2) { std::cerr &amp;amp;lt;&amp;amp;lt; "Usage: " &amp;amp;lt;&amp;amp;lt; argv[0] &amp;amp;lt;&amp;amp;lt; " matrix.mtx" &amp;amp;lt;&amp;amp;lt; std::endl; return 1; } auto [nbRows, nbCols, values] = ReadMMFile(argv[1]); std::cout &amp;amp;lt;&amp;amp;lt; "Matrix dimensions: " &amp;amp;lt;&amp;amp;lt; nbRows &amp;amp;lt;&amp;amp;lt; " x " &amp;amp;lt;&amp;amp;lt; nbCols &amp;amp;lt;&amp;amp;lt; std::endl; std::cout &amp;amp;lt;&amp;amp;lt; "Number of non-zero elements: " &amp;amp;lt;&amp;amp;lt; values.size() &amp;amp;lt;&amp;amp;lt; std::endl; auto [m_csr, n_csr, row_ptr, col_ind, val] = MMToCsr(nbRows, nbCols, values); std::vector&amp;lt;double&amp;gt; vecX(n_csr, 1); std::vector&amp;lt;double&amp;gt; vecY(n_csr, 0); auto resCuSparse = cuSparseCsr(m_csr, n_csr, row_ptr, col_ind, val, vecX, vecY);&amp;lt;/double&amp;gt;&amp;lt;/double&amp;gt; auto resCsr = SpMvCsr(m_csr, n_csr, row_ptr, col_ind, val, vecX, vecY); std::cout &amp;amp;lt;&amp;amp;lt; "Error cuSparse vs. CSR: " &amp;amp;lt;&amp;amp;lt; MaxRelDiff(resCsr,resCuSparse) &amp;amp;lt;&amp;amp;lt; std::endl; return 0; }Old code (but with conversions)
////////////////////////////////////////////////////////////////////////// // In this file we compute Gemv sparse matrix with cuda for two kernels: // CSR and BCSR // We use a CPU COO version to test the accuracy of the code. ////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////// // COO part ////////////////////////////////////////////////////////////////////////// #include &amp;amp;lt;cuda.h&amp;amp;gt; #include &amp;amp;lt;cusparse_v2.h&amp;amp;gt; #include &amp;amp;lt;cublas_v2.h&amp;amp;gt; #include &amp;amp;lt;cstring&amp;amp;gt; // memset #include &amp;amp;lt;cassert&amp;amp;gt; #include &amp;amp;lt;cstdio&amp;amp;gt; #define Min(x,y) ((x)&amp;amp;lt;(y)?(x):(y)) #define Max(x,y) ((x)&amp;amp;gt;(y)?(x):(y)) #define Abs(x) ((x)&amp;amp;gt;(0)?(x):-(x)) ////////////////////// CUDA ERROR ///////////////////////////////////////// static void CudaCheckCore(cudaError_t code, const char *file, int line) { if (code != cudaSuccess) { fprintf(stderr,"Cuda Error %d : %s %s %d\n", code, cudaGetErrorString(code), file, line); exit(code); } } #define CudaCheck( test ) { CudaCheckCore((test), __FILE__, __LINE__); } #define CudaCheckAfterCall() { CudaCheckCore((cudaGetLastError()), __FILE__, __LINE__); } ////////////////////// CUDA SPARSE ERROR /////////////////////////////////// static const char * cusparseGetErrorString(cusparseStatus_t error) { // Read more at: http://docs.nvidia.com/cuda/cusparse/index.html#ixzz3f79JxRar switch (error) { case CUSPARSE_STATUS_SUCCESS: return "The operation completed successfully."; case CUSPARSE_STATUS_NOT_INITIALIZED: return "The cuSPARSE library was not initialized. This is usually caused by the lack of a prior call, an error in the CUDA Runtime API called by the cuSPARSE routine, or an error in the hardware setup.\n" \ "To correct: call cusparseCreate() prior to the function call; and check that the hardware, an appropriate version of the driver, and the cuSPARSE library are correctly installed."; case CUSPARSE_STATUS_ALLOC_FAILED: return "Resource allocation failed inside the cuSPARSE library. This is usually caused by a cudaMalloc() failure.\n"\ "To correct: prior to the function call, deallocate previously allocated memory as much as possible."; case CUSPARSE_STATUS_INVALID_VALUE: return "An unsupported value or parameter was passed to the function (a negative vector size, for example).\n"\ "To correct: ensure that all the parameters being passed have valid values."; case CUSPARSE_STATUS_ARCH_MISMATCH: return "The function requires a feature absent from the device architecture; usually caused by the lack of support for atomic operations or double precision.\n"\ "To correct: compile and run the application on a device with appropriate compute capability, which is 1.1 for 32-bit atomic operations and 1.3 for double precision."; case CUSPARSE_STATUS_MAPPING_ERROR: return "An access to GPU memory space failed, which is usually caused by a failure to bind a texture.\n"\ "To correct: prior to the function call, unbind any previously bound textures."; case CUSPARSE_STATUS_EXECUTION_FAILED: return "The GPU program failed to execute. This is often caused by a launch failure of the kernel on the GPU, which can be caused by multiple reasons.\n"\ "To correct: check that the hardware, an appropriate version of the driver, and the cuSPARSE library are correctly installed."; case CUSPARSE_STATUS_INTERNAL_ERROR: return "An internal cuSPARSE operation failed. This error is usually caused by a cudaMemcpyAsync() failure.\n"\ "To correct: check that the hardware, an appropriate version of the driver, and the cuSPARSE library are correctly installed. Also, check that the memory passed as a parameter to the routine is not being deallocated prior to the routine’s completion."; case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: return "The matrix type is not supported by this function. This is usually caused by passing an invalid matrix descriptor to the function.\n"\ "To correct: check that the fields in cusparseMatDescr_t descrA were set correctly."; } return "&amp;amp;lt;unknown&amp;amp;gt;"; } static void CudaSparseCheckCore(cusparseStatus_t code, const char *file, int line) { if (code != CUSPARSE_STATUS_SUCCESS) { fprintf(stderr,"Cuda Error %d : %s %s %d\n", code, cusparseGetErrorString(code), file, line); exit(code); } } #define CudaSparseCheck( test ) { CudaSparseCheckCore((test), __FILE__, __LINE__); } ////////////// Alloc and copy //////////////////////////////////////////////// template &amp;amp;lt;class ObjectType&amp;amp;gt; ObjectType* allocAndCopy(const ObjectType src[], const int size){ ObjectType* dest = NULL; CudaCheck( cudaMalloc(&amp;amp;amp;dest,size*sizeof(ObjectType)) ); CudaCheck( cudaMemcpy(dest, src, size*sizeof(ObjectType), cudaMemcpyHostToDevice ) ); return dest; } template &amp;amp;lt;class ObjectType&amp;amp;gt; ObjectType* alloc(const int size){ ObjectType* dest = NULL; CudaCheck( cudaMalloc(&amp;amp;amp;dest,size*sizeof(ObjectType)) ); return dest; } template &amp;amp;lt;class ObjectType&amp;amp;gt; ObjectType* allocAndCopyPart(const ObjectType src[], const int size, const int allocSize){ ObjectType* dest = NULL; assert(size &amp;amp;lt;= allocSize); CudaCheck( cudaMalloc(&amp;amp;amp;dest,allocSize*sizeof(ObjectType)) ); CudaCheck( cudaMemcpy(dest, src, size*sizeof(ObjectType), cudaMemcpyHostToDevice ) ); CudaCheck( cudaMemset(&amp;amp;amp;dest[size],0,(allocSize-size)*sizeof(ObjectType)) ); return dest; } ////////////////////////////////////////////////////////////////////////// // COO part ////////////////////////////////////////////////////////////////////////// #include &amp;amp;lt;algorithm&amp;amp;gt; struct Ijv{ int i, j; double v; }; bool IjvComp(const Ijv&amp;amp;amp; v1, const Ijv&amp;amp;amp; v2){ return v1.i &amp;amp;lt; v2.i || (v1.i == v2.i &amp;amp;amp;&amp;amp;amp; v1.j &amp;amp;lt; v2.j); } struct COOArrays{ int m; int nnz; double *val;/*values(NNZ)*/ int *rowind;/*i(NNZ)*/ int *colind;/*j(NNZ)*/ COOArrays(){ val = NULL; rowind = NULL; colind = NULL; } ~COOArrays(){ delete[] val; delete[] rowind; delete[] colind; } void sortToRowMajor(){ Ijv* ijvs = new Ijv[nnz]; for(int idxCopy = 0 ; idxCopy &amp;amp;lt; nnz ; ++idxCopy){ ijvs[idxCopy].i = rowind[idxCopy]; ijvs[idxCopy].j = colind[idxCopy]; ijvs[idxCopy].v = val[idxCopy]; } std::sort(ijvs, ijvs+nnz, IjvComp); for(int idxCopy = 0 ; idxCopy &amp;amp;lt; nnz ; ++idxCopy){ rowind[idxCopy] = ijvs[idxCopy].i; colind[idxCopy] = ijvs[idxCopy].j; val[idxCopy] = ijvs[idxCopy].v; } delete[] ijvs; } }; void compute_COO(COOArrays&amp;amp;amp; coo, double *x , double *y ){ for(int idxVal = 0 ; idxVal &amp;amp;lt; coo.nnz ; ++idxVal){ y[coo.rowind[idxVal]] += x[coo.colind[idxVal]] * coo.val[idxVal]; } } ////////////////////////////////////////////////////////////////////////// // COO part ////////////////////////////////////////////////////////////////////////// struct CRSArrays{ int m; //&amp;amp;lt; the dim of the matrix int nnz;//&amp;amp;lt; the number of nnz (== ia[m]) double *cu_csrValA; //&amp;amp;lt; the values (of size NNZ) int *cu_csrRowPtrA;//&amp;amp;lt; the usual rowptr (of size m+1) int *cu_csrColIndA;//&amp;amp;lt; the colidx of each NNZ (of size nnz) cudaStream_t streamId; cusparseHandle_t cusparseHandle; CRSArrays(){ cu_csrValA = NULL; cu_csrRowPtrA = NULL; cu_csrColIndA = NULL; // Create sparse handle (needed to call sparse functions streamId = 0; cusparseHandle = 0; CudaSparseCheck(cusparseCreate(&amp;amp;amp;cusparseHandle)); CudaSparseCheck(cusparseSetStream(cusparseHandle, streamId)); } ~CRSArrays(){ CudaCheck(cudaFree(cu_csrValA)); CudaCheck(cudaFree(cu_csrRowPtrA)); CudaCheck(cudaFree(cu_csrColIndA)); // Destroy sparse handle CudaSparseCheck(cusparseDestroy(cusparseHandle)); } }; void COO_to_CRS(COOArrays&amp;amp;amp; coo, CRSArrays* crs){ // We need COO to be sorted by row (and column) coo.sortToRowMajor(); crs-&amp;amp;gt;m = coo.m; crs-&amp;amp;gt;nnz = coo.nnz; // Convert COO to CSR (it is just for the rows idx) crs-&amp;amp;gt;cu_csrRowPtrA = alloc&amp;amp;lt;int&amp;amp;gt;(coo.m+1); { int* cu_cooRowIndA = allocAndCopy(coo.rowind, coo.nnz); CudaSparseCheck(cusparseXcoo2csr(crs-&amp;amp;gt;cusparseHandle, cu_cooRowIndA, coo.nnz, coo.m, crs-&amp;amp;gt;cu_csrRowPtrA, CUSPARSE_INDEX_BASE_ZERO)); CudaCheck(cudaFree(cu_cooRowIndA)); } // Copy cols idx and values that are unchanged crs-&amp;amp;gt;cu_csrValA = allocAndCopy(coo.val, coo.nnz); crs-&amp;amp;gt;cu_csrColIndA = allocAndCopy(coo.colind, coo.nnz); } double compute_CRS( CRSArrays&amp;amp;amp; crs, double *x , double *y){ // For blas 2 gemv y = alpha.x.A + Beta.y const double alpha = 1.0; const double beta = 0.0; // Copy input double* cu_x = allocAndCopy(x, crs.m); double* cu_y = allocAndCopy(y, crs.m); // Init matrix properties cusparseMatDescr_t descr = 0; CudaSparseCheck(cusparseCreateMatDescr(&amp;amp;amp;descr)); cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL); cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO); // Compute gemv float gemvComputeTume = 0; { cudaEvent_t startTime, stopTime; cudaEventCreate(&amp;amp;amp;startTime); cudaEventCreate(&amp;amp;amp;stopTime); cudaEventRecord(startTime, crs.streamId); CudaSparseCheck(cusparseDcsrmv(crs.cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, crs.m, crs.m, crs.nnz, &amp;amp;amp;alpha, descr, crs.cu_csrValA, crs.cu_csrRowPtrA, crs.cu_csrColIndA, cu_x, &amp;amp;amp;beta, cu_y)); cudaEventRecord(stopTime, crs.streamId); cudaEventSynchronize(stopTime); cudaEventElapsedTime(&amp;amp;amp;gemvComputeTume, startTime, stopTime); gemvComputeTume /=1000.0; } // Get back result CudaCheck( cudaMemcpy(y, cu_y, crs.m*sizeof(double), cudaMemcpyDeviceToHost ) ); // Dealloc vectors CudaCheck(cudaFree(cu_x)); CudaCheck(cudaFree(cu_y)); return gemvComputeTume; } ////////////////////////////////////////////////////////////////////////// // BCSR part ////////////////////////////////////////////////////////////////////////// struct BCRSArrays{ int m; int nnz; int nbBlocks; int nbBlockRow; int blockSize; int* cu_bsrRowPtrC; int* cu_bsrColIndC; double* cu_bsrValC; cudaStream_t streamId; cusparseHandle_t cusparseHandle; BCRSArrays(){ cu_bsrRowPtrC = NULL; cu_bsrColIndC = NULL; cu_bsrValC = NULL; // Create sparse handle (needed to call sparse functions streamId = 0; cusparseHandle = 0; CudaSparseCheck(cusparseCreate(&amp;amp;amp;cusparseHandle)); CudaSparseCheck(cusparseSetStream(cusparseHandle, streamId)); } ~BCRSArrays(){ CudaCheck(cudaFree(cu_bsrRowPtrC)); CudaCheck(cudaFree(cu_bsrColIndC)); CudaCheck(cudaFree(cu_bsrValC)); // Destroy sparse handle CudaSparseCheck(cusparseDestroy(cusparseHandle)); } }; void CRS_to_BCRS(CRSArrays&amp;amp;amp; csr, BCRSArrays* bcrs, const int blockSize){ bcrs-&amp;amp;gt;m = csr.m; bcrs-&amp;amp;gt;nnz = csr.nnz; bcrs-&amp;amp;gt;blockSize = blockSize; bcrs-&amp;amp;gt;nbBlockRow = (csr.m + blockSize-1)/blockSize; cudaMalloc((void**)&amp;amp;amp;bcrs-&amp;amp;gt;cu_bsrRowPtrC, sizeof(int) *(bcrs-&amp;amp;gt;nbBlockRow+1)); cusparseMatDescr_t descr = 0; CudaSparseCheck(cusparseCreateMatDescr(&amp;amp;amp;descr)); cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL); cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO); int nbNnzBlocks; cusparseXcsr2bsrNnz(bcrs-&amp;amp;gt;cusparseHandle, CUSPARSE_DIRECTION_COLUMN, csr.m, csr.m, descr, csr.cu_csrRowPtrA, csr.cu_csrColIndA, blockSize, descr, bcrs-&amp;amp;gt;cu_bsrRowPtrC, &amp;amp;amp;nbNnzBlocks); { int firstBlockIdx, lastBlockIdx; cudaMemcpy(&amp;amp;amp;lastBlockIdx, bcrs-&amp;amp;gt;cu_bsrRowPtrC+bcrs-&amp;amp;gt;nbBlockRow, sizeof(int), cudaMemcpyDeviceToHost); cudaMemcpy(&amp;amp;amp;firstBlockIdx, bcrs-&amp;amp;gt;cu_bsrRowPtrC, sizeof(int), cudaMemcpyDeviceToHost); assert(firstBlockIdx == 0); // we are in base 0 assert(nbNnzBlocks == lastBlockIdx - firstBlockIdx); } bcrs-&amp;amp;gt;nbBlocks = nbNnzBlocks; CudaCheck(cudaMalloc((void**)&amp;amp;amp;bcrs-&amp;amp;gt;cu_bsrColIndC, sizeof(int)*nbNnzBlocks)); CudaCheck(cudaMalloc((void**)&amp;amp;amp;bcrs-&amp;amp;gt;cu_bsrValC, sizeof(double)*(blockSize*blockSize)*nbNnzBlocks)); cusparseDcsr2bsr(bcrs-&amp;amp;gt;cusparseHandle, CUSPARSE_DIRECTION_COLUMN, csr.m, csr.m, descr, csr.cu_csrValA, csr.cu_csrRowPtrA, csr.cu_csrColIndA, blockSize, descr, bcrs-&amp;amp;gt;cu_bsrValC, bcrs-&amp;amp;gt;cu_bsrRowPtrC, bcrs-&amp;amp;gt;cu_bsrColIndC); } double compute_BSR(BCRSArrays&amp;amp;amp; bcsr, double *x , double *y){ // For blas 2 gemv y = alpha.x.A + Beta.y const double alpha = 1.0; const double beta = 0.0; // Copy input const int sizeMultipleBlockSize = ((bcsr.m+bcsr.blockSize-1)/bcsr.blockSize)*bcsr.blockSize; double* cu_x = allocAndCopyPart(x, bcsr.m, sizeMultipleBlockSize); double* cu_y = allocAndCopyPart(y, bcsr.m, sizeMultipleBlockSize); // Init matrix properties cusparseMatDescr_t descr = 0; CudaSparseCheck(cusparseCreateMatDescr(&amp;amp;amp;descr)); cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL); cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO); // Compute gemv float gemvComputeTume = 0; { cudaEvent_t startTime, stopTime; cudaEventCreate(&amp;amp;amp;startTime); cudaEventCreate(&amp;amp;amp;stopTime); cudaEventRecord(startTime, bcsr.streamId); cusparseDbsrmv(bcsr.cusparseHandle, CUSPARSE_DIRECTION_COLUMN, CUSPARSE_OPERATION_NON_TRANSPOSE, bcsr.nbBlockRow, bcsr.m, bcsr.nbBlocks, &amp;amp;amp;alpha, descr, bcsr.cu_bsrValC, bcsr.cu_bsrRowPtrC, bcsr.cu_bsrColIndC, bcsr.blockSize, cu_x, &amp;amp;amp;beta, cu_y); cudaEventRecord(stopTime, bcsr.streamId); cudaEventSynchronize(stopTime); cudaEventElapsedTime(&amp;amp;amp;gemvComputeTume, startTime, stopTime); gemvComputeTume /=1000.0; } // Get back result CudaCheck( cudaMemcpy(y, cu_y, bcsr.m*sizeof(double), cudaMemcpyDeviceToHost ) ); // Dealloc vectors CudaCheck(cudaFree(cu_x)); CudaCheck(cudaFree(cu_y)); return gemvComputeTume; }Example of how to use it:
/** Simply return the maximum relative diff */ double ChechAccuracy(const double y1[], const double y2[], const int size){ double maxDiff = 0; for(int idx = 0 ; idx &amp;amp;lt; size ; ++idx){ if(y1[idx] != 0.0){ maxDiff = Max(maxDiff, Abs((y1[idx]-y2[idx])/y1[idx])); } } return maxDiff; } /** Compute with COO and call cuda CSR and BCSR */ void computeWithAll(COOArrays&amp;amp;amp; coo, double times[3], double errors[3], const int loop = 1, const int blockSize = 8){ CRSArrays crs; COO_to_CRS(coo, &amp;amp;amp;crs); BCRSArrays bcsr; CRS_to_BCRS(crs, &amp;amp;amp;bcsr, blockSize); const int dimMulitpleBlock = ((coo.m+blockSize-1)/blockSize)*blockSize; double* x = new double[dimMulitpleBlock]; for(int idx = 0 ; idx &amp;amp;lt; dimMulitpleBlock ; ++idx){ x[idx] = 1.0; } double* y = new double[dimMulitpleBlock]; double* ycoo = new double[dimMulitpleBlock]; { memset(ycoo, 0, sizeof(double)*coo.m); for(int idxLoop = 0 ; idxLoop &amp;amp;lt; loop ; ++idxLoop){ compute_COO(coo, x, ycoo); } times[0] = 0; errors[0] = 0; } { memset(y, 0, sizeof(double)*coo.m); times[1] = 0; for(int idxLoop = 0 ; idxLoop &amp;amp;lt; loop ; ++idxLoop){ times[1] += compute_CRS(crs, x, y); } errors[1] = ChechAccuracy(ycoo, y, coo.m); } { memset(y, 0, sizeof(double)*coo.m); times[2] = 0; for(int idxLoop = 0 ; idxLoop &amp;amp;lt; loop ; ++idxLoop){ times[2] += compute_BSR(bcsr, x, y); } errors[2] = ChechAccuracy(ycoo, y, coo.m); } delete[] x; delete[] y; delete[] ycoo; }To fill a COO struct:
const int dim = 300; COOArrays coo; coo.m = dim; coo.nnz = dim*dim; flops[0] = coo.nnz*2; coo.val = new double[coo.nnz]; coo.rowind = new int[coo.nnz]; coo.colind = new int[coo.nnz]; for(int idxRow = 0 ; idxRow &amp;amp;lt; dim ; ++idxRow){ for(int idxCol = 0 ; idxCol &amp;amp;lt; dim ; ++idxCol){ const int idx = (idxRow)*dim + idxCol; coo.val[idx] = 1.0; coo.rowind[idx] = idxRow; coo.colind[idx] = idxCol; } } computeWithAll(coo, alltimes[0], errors[0]);To compile:
nvcc --compiler-options "-m64" -O3 main.cu -o main.exe --linker-options "-lcusparse"