C++ – Several loops or one big loop – Memcpy performance, memory cache
One week ago a friend asked a question that seemed easy to answer :
“Is it better if I copy 3 arrays into another ones in one loop or in 3 small loops?” (supposing arrays have the same size).
I thought “I suppose one big loop is better because your compiler will manage the cache, and you will have a small part of your arrays in the fast memory” … I was completely wrong…
The source to test the several ideas
I put many stuffs in it, from the web, etc… I also tried memcpy because I read on many blog that this function is slow… maybe.
#include <iostream>
#include <omp.h>
#include <string.h>
// Openmp just use to get a simple timer
//g++ main.cpp -lgomp -fopenmp -o test
// A version of the memcpy in c
template <class T>
void homeMadeMemcpy(T *__restrict s1, const T *__restrict s2, size_t n){
for(; 0<n; --n)*s1++ = *s2++;
}
// another version http://www.student.cs.uwaterloo.ca/~cs350/common/os161-src-html/kern_2arch_2mips_2include_2types_8h.html#83242de74310b4aec3fc506bc3644211
typedef unsigned long uintptr_t;
void homeMadeMemcpy2(void *dst, const void *src, size_t len)
{
size_t i;
if ((uintptr_t)dst % sizeof(long) == 0 &&
(uintptr_t)src % sizeof(long) == 0 &&
len % sizeof(long) == 0) {
long *d = (long *)dst;
const long *s = (const long *)src;
for (i=0; i<len/sizeof(long); i++) {
d[i] = s[i];
}
}
else {
char *d = (char*)dst;
const char *s = (const char *) src;
for (i=0; i<len; i++) {
d[i] = s[i];
}
}
}
int main(void){
const long SizeEnd( 100000000 );
const long SizeBegin( 100 );
const long SizeInc(100);
std::cout <<"S\\T";
std::cout <<"\t" << "One Big Loop";
std::cout <<"\t" << "Tree Loops";
std::cout <<"\t" << "Memcpy";
std::cout <<"\t" << "HomeMadeMemcpy";
std::cout <<"\t" << "HomeMadeMemcpy2" << "\n";
for(long Size = SizeBegin ; Size <= SizeEnd ; Size *= SizeInc){
// Alloc
int* a = new int[Size];
int* b = new int[Size];
int* c = new int[Size];
int* a2 = new int[Size];
int* b2 = new int[Size];
int* c2 = new int[Size];
// Naive loop
const double startBig = omp_get_wtime ();
for(long idx = 0 ; idx < Size ; ++idx){
a[idx] = a2[idx];
b[idx] = b2[idx];
c[idx] = c2[idx];
}
const double endBig = omp_get_wtime ();
// 3 Loops
const double startSmall = omp_get_wtime ();
for(long idx = 0 ; idx < Size ; ++idx){
a[idx] = a2[idx];
}
for(long idx = 0 ; idx < Size ; ++idx){
b[idx] = b2[idx];
}
for(long idx = 0 ; idx < Size ; ++idx){
c[idx] = c2[idx];
}
const double endSmall = omp_get_wtime ();
// System memcpy
const double startMem = omp_get_wtime ();
memcpy(c,c2,sizeof(int)*Size);
memcpy(b,b2,sizeof(int)*Size);
memcpy(a,a2,sizeof(int)*Size);
const double endMem = omp_get_wtime ();
// Usual memcpy implementation
const double startHMMem = omp_get_wtime ();
homeMadeMemcpy(c,c2,Size);
homeMadeMemcpy(b,b2,Size);
homeMadeMemcpy(a,a2,Size);
const double endHMMem = omp_get_wtime ();
// Another memcpy implementation
const double startHMMem2 = omp_get_wtime ();
homeMadeMemcpy2(c,c2,Size);
homeMadeMemcpy2(b,b2,Size);
homeMadeMemcpy2(a,a2,Size);
const double endHMMem2 = omp_get_wtime ();
// Print
std::cout << Size;
std::cout <<"\t" << ((endBig - startBig));
std::cout <<"\t" << ((endSmall - startSmall));
std::cout <<"\t" << ((endMem - startMem));
std::cout <<"\t" << ((endHMMem - startHMMem));
std::cout <<"\t" << ((endHMMem2 - startHMMem2)) << "\n";
// Dealloc
delete [] a ;
delete [] b ;
delete [] c ;
delete [] a2 ;
delete [] b2 ;
delete [] c2 ;
}
return 0;
}
The result
Without optimization (-O0)
S\T One Big Loop Tree Loops Memcpy HomeMadeMemcpy HomeMadeMemcpy2
100 2,58E-006 3,34E-006 4,43E-006 2,79E-006 1,42E-006
10000 0,000276837 0,000259037 1,25E-005 0,000205585 6,38E-005
1000000 0,0169273 0,00911932 0,00190559 0,00780513 0,00217896
100000000 1,31256 0,920602 0,189516 0,735825 0,222358
With optimization (-O2)
Custom functions are better with optimization, memcpy & big loop do not change…
S\T One Big Loop Tree Loops Memcpy HomeMadeMemcpy HomeMadeMemcpy2
100 1,67E-006 4,98E-005 5,35E-006 1,77E-006 1,04E-006
10000 0,00012131 5,09E-005 1,27E-005 7,59E-005 1,33E-005
1000000 0,0196256 0,00268528 0,00165134 0,00269971 0,000606299
100000000 1,02943 0,257611 0,182456 0,274724 0,0642852
Conclusion
Memcpy is recommended or a custom memcpy function but never one loop!!
Subscribe to the RSS feed and have all new posts delivered straight to you.


