28
Feb
0

C++ – Several loops or one big loop – Memcpy performance, memory cache

One week ago a friend asked a question that seemed easy to answer :

“Is it better if I copy 3 arrays into another ones in one loop or in 3 small loops?” (supposing arrays have the same size).

I thought “I suppose one big loop is better because your compiler will manage the cache, and you will have a small part of your arrays in the fast memory” … I was completely wrong…

The source to test the several ideas

I put many stuffs in it, from the web, etc… I also tried memcpy because I read on many blog that this function is slow… maybe.

#include <iostream>
#include <omp.h>
#include <string.h>

// Openmp just use to get a simple timer
//g++ main.cpp -lgomp -fopenmp -o test

// A version of the memcpy in c
template <class T>
void homeMadeMemcpy(T *__restrict s1, const T *__restrict s2, size_t n){
    for(; 0<n; --n)*s1++ = *s2++;
}

// another version http://www.student.cs.uwaterloo.ca/~cs350/common/os161-src-html/kern_2arch_2mips_2include_2types_8h.html#83242de74310b4aec3fc506bc3644211
typedef unsigned long uintptr_t;
void homeMadeMemcpy2(void *dst, const void *src, size_t len)
 {
         size_t i;

         if ((uintptr_t)dst % sizeof(long) == 0 &&
             (uintptr_t)src % sizeof(long) == 0 &&
             len % sizeof(long) == 0) {
                 long *d = (long *)dst;
                 const long *s = (const long *)src;
                 for (i=0; i<len/sizeof(long); i++) {
                         d[i] = s[i];
                 }
         }
         else {
             char *d = (char*)dst;
             const char *s = (const char *) src;

             for (i=0; i<len; i++) {
                     d[i] = s[i];
             }
         }
}

int main(void){
    const long SizeEnd( 100000000 );
    const long SizeBegin( 100 );
    const long SizeInc(100);

    std::cout <<"S\\T";
    std::cout <<"\t" << "One Big Loop";
    std::cout <<"\t" << "Tree Loops";
    std::cout <<"\t" << "Memcpy";
    std::cout <<"\t" << "HomeMadeMemcpy";
    std::cout <<"\t" << "HomeMadeMemcpy2" << "\n";

    for(long Size = SizeBegin ; Size <= SizeEnd ; Size *= SizeInc){
        // Alloc
        int* a = new int[Size];
        int* b = new int[Size];
        int* c = new int[Size];

        int* a2 = new int[Size];
        int* b2 = new int[Size];
        int* c2 = new int[Size];

        // Naive loop
        const double startBig = omp_get_wtime ();
        for(long idx = 0 ; idx < Size ; ++idx){
                a[idx] = a2[idx];
                b[idx] = b2[idx];
                c[idx] = c2[idx];
        }
        const double endBig = omp_get_wtime ();

        // 3 Loops
        const double startSmall = omp_get_wtime ();
        for(long idx = 0 ; idx < Size ; ++idx){
                a[idx] = a2[idx];
        }
        for(long idx = 0 ; idx < Size ; ++idx){
                b[idx] = b2[idx];
        }
        for(long idx = 0 ; idx < Size ; ++idx){
                c[idx] = c2[idx];
        }
        const double endSmall = omp_get_wtime ();

        // System memcpy
        const double startMem = omp_get_wtime ();
        memcpy(c,c2,sizeof(int)*Size);
        memcpy(b,b2,sizeof(int)*Size);
        memcpy(a,a2,sizeof(int)*Size);
        const double endMem = omp_get_wtime ();

        // Usual memcpy implementation
        const double startHMMem = omp_get_wtime ();
        homeMadeMemcpy(c,c2,Size);
        homeMadeMemcpy(b,b2,Size);
        homeMadeMemcpy(a,a2,Size);
        const double endHMMem = omp_get_wtime ();

        // Another memcpy implementation
        const double startHMMem2 = omp_get_wtime ();
        homeMadeMemcpy2(c,c2,Size);
        homeMadeMemcpy2(b,b2,Size);
        homeMadeMemcpy2(a,a2,Size);
        const double endHMMem2 = omp_get_wtime ();

        // Print
        std::cout << Size;
        std::cout <<"\t" << ((endBig - startBig));
        std::cout <<"\t" << ((endSmall - startSmall));
        std::cout <<"\t" << ((endMem - startMem));
        std::cout <<"\t" << ((endHMMem - startHMMem));
        std::cout <<"\t" << ((endHMMem2 - startHMMem2)) << "\n";

        // Dealloc
        delete [] a ;
        delete [] b ;
        delete [] c ;
        delete [] a2 ;
        delete [] b2 ;
        delete [] c2 ;
    }

    return 0;
}

The result

Without optimization (-O0)

S\T One Big Loop Tree Loops Memcpy HomeMadeMemcpy HomeMadeMemcpy2
100 2,58E-006 3,34E-006 4,43E-006 2,79E-006 1,42E-006
10000 0,000276837 0,000259037 1,25E-005 0,000205585 6,38E-005
1000000 0,0169273 0,00911932 0,00190559 0,00780513 0,00217896
100000000 1,31256 0,920602 0,189516 0,735825 0,222358

With optimization (-O2)

Custom functions are better with optimization, memcpy & big loop do not change…

S\T One Big Loop Tree Loops Memcpy HomeMadeMemcpy HomeMadeMemcpy2
100 1,67E-006 4,98E-005 5,35E-006 1,77E-006 1,04E-006
10000 0,00012131 5,09E-005 1,27E-005 7,59E-005 1,33E-005
1000000 0,0196256 0,00268528 0,00165134 0,00269971 0,000606299
100000000 1,02943 0,257611 0,182456 0,274724 0,0642852

Conclusion

Memcpy is recommended or a custom memcpy function but never one loop!!

Enjoyed reading this post?
Subscribe to the RSS feed and have all new posts delivered straight to you.

Comments are closed.

Celadon theme by the Themes Boutique