<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
	xmlns:content="http://purl.org/rss/1.0/modules/content/"
	xmlns:wfw="http://wellformedweb.org/CommentAPI/"
	xmlns:dc="http://purl.org/dc/elements/1.1/"
	xmlns:atom="http://www.w3.org/2005/Atom"
	xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
	xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
	>

<channel>
	<title>HPC, AI, C++, Silverlight ... Among Others</title>
	<atom:link href="http://berenger.eu/blog/feed/" rel="self" type="application/rss+xml" />
	<link>http://berenger.eu/blog</link>
	<description>This website is my personal blog and my technical sharing center.</description>
	<lastBuildDate>Wed, 25 Jan 2012 09:25:53 +0000</lastBuildDate>
	<language>en</language>
	<sy:updatePeriod>hourly</sy:updatePeriod>
	<sy:updateFrequency>1</sy:updateFrequency>
	<generator>http://wordpress.org/?v=3.3.1</generator>
		<item>
		<title>Creates facial transformations with octobooth</title>
		<link>http://berenger.eu/blog/2012/01/22/creates-facial-transformations-with-octobooth/</link>
		<comments>http://berenger.eu/blog/2012/01/22/creates-facial-transformations-with-octobooth/#comments</comments>
		<pubDate>Sun, 22 Jan 2012 21:22:57 +0000</pubDate>
		<dc:creator>Berenger</dc:creator>
				<category><![CDATA[Uncategorized]]></category>

		<guid isPermaLink="false">http://berenger.eu/blog/?p=1290</guid>
		<description><![CDATA[A friend has created a website called http://www.octobooth.com/  OctoBooth is a website that creates facial transformations Change race, age or sex, and share photos with your friends OctoBooth is free, and no registration is required Enjoy.]]></description>
			<content:encoded><![CDATA[<p>A friend has created a website called <a href="http://www.octobooth.com/" target="_blank">http://www.octobooth.com/</a></p>
<blockquote><p> OctoBooth is a website that creates facial transformations</p>
<p>Change race, age or sex, and share photos with your friends</p>
<p>OctoBooth is free, and no registration is required</p></blockquote>
<p>Enjoy.</p>
<div style='clear:both'></div>]]></content:encoded>
			<wfw:commentRss>http://berenger.eu/blog/2012/01/22/creates-facial-transformations-with-octobooth/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
		</item>
		<item>
		<title>[C++][Qt] Parallel Quick Sort with QtConcurrent (Shared memory generic quick sort)</title>
		<link>http://berenger.eu/blog/2011/12/02/cqt-parallel-quick-sort-with-qtconcurrent-shared-memory-generic-quick-sort/</link>
		<comments>http://berenger.eu/blog/2011/12/02/cqt-parallel-quick-sort-with-qtconcurrent-shared-memory-generic-quick-sort/#comments</comments>
		<pubDate>Fri, 02 Dec 2011 11:32:21 +0000</pubDate>
		<dc:creator>Berenger</dc:creator>
				<category><![CDATA[C++]]></category>
		<category><![CDATA[Programming]]></category>
		<category><![CDATA[Code]]></category>
		<category><![CDATA[Resource]]></category>

		<guid isPermaLink="false">http://berenger.eu/blog/?p=1262</guid>
		<description><![CDATA[PS : I developed several quick sort (available on this blog), a sequential version, an openmp tasks version, a openmp not inplace version, an mpi version and a Qt concurent version. Here I developed a quick sort based on the great qt feature QtConcurrent. It is mostly similar to the openmp tasks versions. Advices Change [...]]]></description>
			<content:encoded><![CDATA[<p>PS: I developed several quick sorts (available on this blog), <a href="http://berenger.eu/blog/?p=1052">a sequential version</a>, an <a href="http://berenger.eu/blog/?p=1201">openmp tasks version</a>, <a href="http://berenger.eu/blog/?p=1117">an openmp not-in-place version</a>, <a href="http://berenger.eu/blog/?p=1128">an mpi version</a> and <a href="http://berenger.eu/blog/?p=1262">a Qt concurrent version</a>.</p>
<p>Here I developed a quick sort based on the great qt feature QtConcurrent.<br />
It is mostly similar to the <a href="http://berenger.eu/blog/?p=1201">openmp tasks versions</a>.<br />
<span id="more-1262"></span></p>
<h2> Advice </h2>
<p>Change the following lines in the code to perform different tests</p>
<pre class="brush: cpp; title: ; notranslate">
QThreadPool::globalInstance()-&gt;setMaxThreadCount(1);
const long Size = 10000000;
</pre>
<h2> The code </h2>
<pre class="brush: cpp; title: ; notranslate">
#include &lt;QtCore/QCoreApplication&gt;
#include &lt;QTime&gt;
#include &lt;QtConcurrentRun&gt;

#include &lt;iostream&gt;

////////////////////////////////////////////////////////////
// Miscellaneous functions
////////////////////////////////////////////////////////////

/** Swap two values */
template &lt;class NumType&gt;
inline void Swap(NumType&amp; value, NumType&amp; other){
    NumType temp = value;
    value = other;
    other = temp;
}

////////////////////////////////////////////////////////////
// Quick sort
////////////////////////////////////////////////////////////

/* use in the sequential qs */
template &lt;class SortType&gt;
long QsPartition(SortType outputArray[], long left, long right){
    const long part = right;
    Swap(outputArray[part],outputArray[left + (right - left ) / 2]);
    const SortType partValue = outputArray[part];
    --right;

    while(true){
        while(outputArray[left] &lt; partValue){
            ++left;
        }
        while(right &gt;= left &amp;&amp; partValue &lt;= outputArray[right]){
            --right;
        }
        if(right &lt; left) break;

        Swap(outputArray[left],outputArray[right]);
        ++left;
        --right;
    }

    Swap(outputArray[part],outputArray[left]);

    return left;
}

/* a sequential qs */
template &lt;class SortType&gt;
void QsSequential(SortType array[], const long left, const long right){
    if(left &lt; right){
        const long part = QsPartition(array, left, right);
        QsSequential(array,part + 1,right);
        QsSequential(array,left,part - 1);
    }
}

/** A task dispatcher */
template &lt;class SortType&gt;
void QuickSortTask(SortType array[], const long left, const long right, const int deep){
    if(left &lt; right){
        if( deep ){
            const long part = QsPartition(array, left, right);

            QtConcurrent::run(QuickSortTask&lt;SortType&gt;, array, part + 1, right, deep - 1);
            QtConcurrent::run(QuickSortTask&lt;SortType&gt;, array, left, part - 1, deep - 1);
        }
        else {
            const long part = QsPartition(array, left, right);
            QsSequential(array,part + 1,right);
            QsSequential(array,left,part - 1);
        }
    }
}

////////////////////////////////////////////////////////////
// Main
////////////////////////////////////////////////////////////

template &lt;class SortedType&gt;
bool isSorted(SortedType array[], const long size){
    for(int idx = 1; idx &lt; size ; ++idx){
        if(array[idx-1] &gt; array[idx]){
            return false;
        }
    }
    return true;
}

template &lt;class SortedType&gt;
void print(SortedType array[], const int size){
    for(int idx = 0 ;idx &lt; size; ++idx){
        std::cout &lt;&lt; array[idx] &lt;&lt; &quot;\t&quot;;
    }
    std::cout &lt;&lt; &quot;\n&quot;;
}

int main(int argc, char** argv){
    QCoreApplication app(argc, argv);
    // Change to test efficiency
    // QThreadPool::globalInstance()-&gt;setMaxThreadCount(1);

    const long Size = 10000000;//600000000;
    long* const array = new long[Size];

    // Create array
    srand(0);
    for(long idx = 0 ; idx &lt; Size ; ++idx){
        array[idx] = int(Size*(float(rand())/RAND_MAX));
    }

    printf(&quot;Sorting %ld elements\n&quot;, Size);
    // Start sorting
    QTime timer;
    timer.start();
    QtConcurrent::run(QuickSortTask&lt;long&gt;, array, 0, Size - 1, 6);
    QThreadPool::globalInstance()-&gt;waitForDone();
    printf(&quot;Elapsed time %f s\n&quot;, timer.elapsed()/1000.0);

    // Test result
    if(isSorted(array,Size)){
        printf(&quot;Is sorted\n&quot;);
    }
    else{
        printf(&quot;Error array is not sorted!\n&quot;);
        if( Size &lt;= 20) print(array,Size);
        return -1;
    }

    // remove array and quit
    delete [] array;
    return 0;
}
</pre>
<p>Licence : lgpl.</p>
<div style='clear:both'></div>]]></content:encoded>
			<wfw:commentRss>http://berenger.eu/blog/2011/12/02/cqt-parallel-quick-sort-with-qtconcurrent-shared-memory-generic-quick-sort/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
		</item>
		<item>
		<title>[C++][HPC] C++ tips for High Performance Programming</title>
		<link>http://berenger.eu/blog/2011/11/25/chpc-c-tips-for-high-performance-programming/</link>
		<comments>http://berenger.eu/blog/2011/11/25/chpc-c-tips-for-high-performance-programming/#comments</comments>
		<pubDate>Fri, 25 Nov 2011 15:56:02 +0000</pubDate>
		<dc:creator>Berenger</dc:creator>
				<category><![CDATA[Uncategorized]]></category>

		<guid isPermaLink="false">http://berenger.eu/blog/?p=1255</guid>
		<description><![CDATA[A week ago I had to give some tips about C++ in HPC, here is an extract of the document. It has been done quickly, it is an unpretentious document. Of course an entire book about efficient C++ is better. Get the doc here (pdf)]]></description>
			<content:encoded><![CDATA[<p>A week ago I had to give some tips about C++ in HPC,<br />
here is an extract of the document.<br />
<span id="more-1255"></span></p>
<p>It has been done quickly, it is an unpretentious document.<br />
Of course an entire book about efficient C++ is better.</p>
<p><a href="http://berenger.eu/blog/wp-content/uploads/2011/11/CHPP.pdf">Get the doc here (pdf)</a></p>
<div style='clear:both'></div>]]></content:encoded>
			<wfw:commentRss>http://berenger.eu/blog/2011/11/25/chpc-c-tips-for-high-performance-programming/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
		</item>
		<item>
		<title>[Qt][GPU] C++ Application for dynamic transition images (QtConcurrent, QtOpenCL)</title>
		<link>http://berenger.eu/blog/2011/11/25/qtgpu-c-application-for-dynamic-transition-images-qtconcurrent-qtopencl/</link>
		<comments>http://berenger.eu/blog/2011/11/25/qtgpu-c-application-for-dynamic-transition-images-qtconcurrent-qtopencl/#comments</comments>
		<pubDate>Fri, 25 Nov 2011 15:47:56 +0000</pubDate>
		<dc:creator>Berenger</dc:creator>
				<category><![CDATA[C++]]></category>
		<category><![CDATA[Programming]]></category>
		<category><![CDATA[Code]]></category>
		<category><![CDATA[GPU]]></category>
		<category><![CDATA[Resource]]></category>
		<category><![CDATA[Tutorial]]></category>

		<guid isPermaLink="false">http://berenger.eu/blog/?p=1241</guid>
		<description><![CDATA[In this post, I will briefly introduce my first test of the qtopencl tool. What I developed a basic application that goes from one image to another in a given time step (4s but it is customizable). Lets say we have 3 pixels, source (from source image) dest (from destination image) and frame (from the [...]]]></description>
			<content:encoded><![CDATA[<p>In this post, I will briefly introduce my first test of the qtopencl tool.</p>
<p><span id="more-1241"></span></p>
<h2>What</h2>
<p>I developed a basic application that goes from one image to another in a given time step (4s but it is customizable).<br />
Let's say we have 3 pixels: source (from the source image), dest (from the destination image) and frame (from the computed image).</p>
<ul>
<li>At time 0: frame = source</li>
<li>At time STEP: frame = dest</li>
<li>At time STEP/2: frame is between source and dest.</li>
</ul>
<p>More generally,<br />
frame(r,g,b) = (dest(r,g,b) &#8211; source(r,g,b)) * percent + source(r,g,b)</p>
<p>Examples:</p>
<p><a href="http://berenger.eu/blog/wp-content/uploads/2011/11/transfer.jpg"><img src="http://berenger.eu/blog/wp-content/uploads/2011/11/transfer.jpg" alt="Transfer image example" /></a></p>
<h2>The application</h2>
<p><a href="http://berenger.eu/blog/wp-content/uploads/2011/11/transfersoft.jpg"><img src="http://berenger.eu/blog/wp-content/uploads/2011/11/transfersoft.jpg" alt="Transfer Image Interface" /></a></p>
<p>So the application lets you choose the images, choose the type of computation (thread,sequential,gpu) and start stop.</p>
<h2>The efficiency</h2>
<p>Images size : 1024 x 768<br />
Processors : i7 (4 x CPU) 2.8 GHz<br />
Ubuntu : 11.04<br />
Transfer Time : 4s<br />
Sequential FPS : 6.3 f/s<br />
Thread FPS : 25.s f/s<br />
GPU FPS : 145.2 f/s</p>
<h2>Processing Code</h2>
<h3>Sequential</h3>
<pre class="brush: cpp; title: ; notranslate">
void ImageWorker::runSequential(){
    stopFlag = false;

    int totalTime        = 0;
    int nbFrameProcessed = 0;

    // emit source
    emit result(sourceImage, 0);

    QTime timer;
    timer.start();
    // compute until TimeToDest
    while(!stopFlag &amp;&amp; timer.elapsed() &lt; TimeToDest){
        totalTime        += sequentialStep( (timer.elapsed()/float(TimeToDest)) );
        nbFrameProcessed += 1;
        const float fps   = nbFrameProcessed / (totalTime/1000.0);

        emit result(frame, fps);
    }

    // emit dest
    const float fps   = nbFrameProcessed / (totalTime/1000.0);
    emit result(destImage, fps);
}

int ImageWorker::sequentialStep(const float percent){
    QTime timer;
    timer.start();

    for( int idxX = 0; idxX &lt; sourceImage.width() ; ++idxX){
        for( int idxY = 0; idxY &lt; sourceImage.height() ; ++idxY){
            // ARGB quadruplet on the format #AARRGGBB convert into signed
            int sourceValue = static_cast&lt;int&gt;(sourceImage.pixel(idxX, idxY));
            int destValue = static_cast&lt;int&gt;(destImage.pixel(idxX, idxY));

            const int red   = ((((destValue &gt;&gt; 16) &amp; 0xFF) - ((sourceValue &gt;&gt; 16) &amp; 0xFF)) * percent) + ((sourceValue &gt;&gt; 16) &amp; 0xFF);
            const int green = ((((destValue &gt;&gt;  8) &amp; 0xFF) - ((sourceValue &gt;&gt;  8) &amp; 0xFF)) * percent) + ((sourceValue &gt;&gt;  8) &amp; 0xFF);
            const int blue  = ((((destValue &gt;&gt;  0) &amp; 0xFF) - ((sourceValue &gt;&gt;  0) &amp; 0xFF)) * percent) + ((sourceValue &gt;&gt;  0) &amp; 0xFF);

            // set value into intermediate image
            frame.setPixel(idxX, idxY, qRgb(red, green, blue));
        }
    }

    return timer.elapsed();
}
</pre>
<h3>QtConcurrent</h3>
<pre class="brush: cpp; title: ; notranslate">
void ImageWorker::runConcurrent(){
    stopFlag = false;

    QVector&lt; QPair&lt;int, int&gt; &gt; chunkSizes(QThread::idealThreadCount());
    const double aChunk = ceil(double(sourceImage.height())/QThread::idealThreadCount());
    chunkSizes[0] = QPair&lt;int, int&gt;(0, aChunk);
    for( int idxThread = 1 ; idxThread &lt; QThread::idealThreadCount() ; ++idxThread){
        chunkSizes[idxThread].first  = chunkSizes[idxThread-1].second;
        chunkSizes[idxThread].second = (idxThread + 1) * aChunk;
    }
    chunkSizes[QThread::idealThreadCount() - 1].second = sourceImage.height();

    int totalTime        = 0;
    int nbFrameProcessed = 0;

    emit result(sourceImage, 0);

    QTime timer;
    timer.start();
    while(!stopFlag &amp;&amp; timer.elapsed() &lt; TimeToDest){
        totalTime        += concurrentStep( (timer.elapsed()/float(TimeToDest)), chunkSizes );
        nbFrameProcessed += 1;
        const float fps   = nbFrameProcessed / (totalTime/1000.0);

        emit result(frame, fps);
    }

    const float fps   = nbFrameProcessed / (totalTime/1000.0);
    emit result(destImage, fps);
}

void convertImages(const QImage&amp; sourceImage, const QImage&amp; destImage, QImage* const frame,
                   const float percent, const QPair&lt;int, int&gt;&amp; realHeight){
    const int width = sourceImage.width();
    QRgb * const threadBuffer = new QRgb[width];

    static QMutex locker;

    for( int idxY = realHeight.first; idxY &lt; realHeight.second ; ++idxY){
        for( int idxX = 0; idxX &lt; width ; ++idxX){
            // ARGB quadruplet on the format #AARRGGBB
            int sourceValue = static_cast&lt;int&gt;(sourceImage.pixel(idxX, idxY));
            int destValue = static_cast&lt;int&gt;(destImage.pixel(idxX, idxY));

            const int red   = ((((destValue &gt;&gt; 16) &amp; 0xFF) - ((sourceValue &gt;&gt; 16) &amp; 0xFF)) * percent) + ((sourceValue &gt;&gt; 16) &amp; 0xFF);
            const int green = ((((destValue &gt;&gt;  8) &amp; 0xFF) - ((sourceValue &gt;&gt;  8) &amp; 0xFF)) * percent) + ((sourceValue &gt;&gt;  8) &amp; 0xFF);
            const int blue  = ((((destValue &gt;&gt;  0) &amp; 0xFF) - ((sourceValue &gt;&gt;  0) &amp; 0xFF)) * percent) + ((sourceValue &gt;&gt;  0) &amp; 0xFF);

            threadBuffer[idxX] = qRgb(red, green, blue);
        }
        locker.lock();
        for( int idxX = 0; idxX &lt; width ; ++idxX){
            frame-&gt;setPixel(idxX, idxY, threadBuffer[idxX]);
        }
        locker.unlock();
    }

    delete[] threadBuffer;
}

int ImageWorker::concurrentStep(const float percent, const QVector&lt; QPair&lt;int, int&gt; &gt;&amp; chunkSizes){
    QTime timer;
    timer.start();

    for( int idxThread = 0 ; idxThread &lt; QThread::idealThreadCount() ; ++idxThread){
        QtConcurrent::run(convertImages, sourceImage, destImage, &amp;frame, percent, chunkSizes[idxThread]);
    }

    // QThreadPool::globalInstance()-&gt;activeThreadCount()
    QThreadPool::globalInstance()-&gt;waitForDone();

    return timer.elapsed();
}
</pre>
<h3>QtOpencl</h3>
<pre class="brush: cpp; title: ; notranslate">
#include &lt;qclcontext.h&gt;
#include &lt;qclprogram.h&gt;
#include &lt;qclkernel.h&gt;
#include &lt;qclimage.h&gt;

void ImageWorker::runOpencl(){
    stopFlag = false;

    QCLContext context;

    if (!context.create())
        qFatal(&quot;Could not create OpenCL context&quot;);

    if (!context.create(QCLDevice::GPU))
        qFatal(&quot;Could not create OpenCL context&quot;);

    QCLProgram program = context.buildProgramFromSourceFile(QLatin1String(&quot;:/transferimage.cl&quot;));

    QCLImage2D sourceImageBuffer = context.createImage2DCopy(sourceImage, QCLMemoryObject::ReadOnly);
    QCLImage2D destImageBuffer = context.createImage2DCopy(destImage, QCLMemoryObject::ReadOnly);
    QCLImage2D frameBuffer = context.createImage2DDevice(frame.format(), frame.size(), QCLMemoryObject::WriteOnly);

    QCLKernel compute = program.createKernel(&quot;transfer&quot;);
    compute.setGlobalWorkSize(sourceImage.size());
    compute.setLocalWorkSize(8, 8);

    int totalTime        = 0;
    int nbFrameProcessed = 0;

    emit result(sourceImage, 0);

    QTime timerFrame;
    QTime timer;
    timer.start();
    while(!stopFlag &amp;&amp; timer.elapsed() &lt; TimeToDest){
        timerFrame.start();
        compute(sourceImageBuffer, destImageBuffer, frameBuffer, (timer.elapsed()/float(TimeToDest)) );
        frameBuffer.read(&amp;frame);

        totalTime        += timerFrame.elapsed();
        nbFrameProcessed += 1;
        const float fps   = nbFrameProcessed / (totalTime/1000.0);

        emit result(frame, fps);
    }

    const float fps   = nbFrameProcessed / (totalTime/1000.0);
    emit result(destImage, fps);
}

const sampler_t samp = CLK_ADDRESS_CLAMP_TO_EDGE |
                       CLK_FILTER_LINEAR;
__kernel void transfer(__read_only image2d_t sourceImage,
                       __read_only image2d_t destImage,
                       __write_only image2d_t frameImage,
                       float percent)
{
    int2 pos = (int2)(get_global_id(0), get_global_id(1));
    float4 sourceColor = read_imagef(sourceImage, samp, pos);
    float4 destColor = read_imagef(destImage, samp, pos);

    float4 frameColor;
    frameColor.x = ((destColor.x - sourceColor.x) * percent) + sourceColor.x;
    frameColor.y = ((destColor.y - sourceColor.y) * percent) + sourceColor.y;
    frameColor.z = ((destColor.z - sourceColor.z) * percent) + sourceColor.z;
    frameColor.w = sourceColor.w;

    write_imagef(frameImage, pos, clamp(frameColor, 0.0f, 1.0f));
}
</pre>
<h2>Download the code</h2>
<p><a href="http://berenger.eu/blog/wp-content/uploads/2011/11/TransferImageBerenger.zip">The code is here.</a></p>
<h2>References</h2>
<p>http://labs.qt.nokia.com/2010/04/07/using-opencl-with-qt/</p>
<p>http://doc.qt.nokia.com/opencl-snapshot/concurrent.html</p>
<div style='clear:both'></div>]]></content:encoded>
			<wfw:commentRss>http://berenger.eu/blog/2011/11/25/qtgpu-c-application-for-dynamic-transition-images-qtconcurrent-qtopencl/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
		</item>
		<item>
		<title>[GPU] Install Cuda Opencl on Ubuntu</title>
		<link>http://berenger.eu/blog/2011/11/25/gpu-install-cuda-opencl-on-ubuntu/</link>
		<comments>http://berenger.eu/blog/2011/11/25/gpu-install-cuda-opencl-on-ubuntu/#comments</comments>
		<pubDate>Fri, 25 Nov 2011 10:47:11 +0000</pubDate>
		<dc:creator>Berenger</dc:creator>
				<category><![CDATA[Others]]></category>
		<category><![CDATA[Programming]]></category>
		<category><![CDATA[GPU]]></category>
		<category><![CDATA[Resource]]></category>
		<category><![CDATA[Tutorial]]></category>

		<guid isPermaLink="false">http://berenger.eu/blog/?p=1236</guid>
		<description><![CDATA[The steps to install cuda, opencl for nvidia gpu on ubuntu. Keep default directories. Download what you need from NVidia website: http://developer.nvidia.com/cuda-downloads Take: &#8220;CUDA Toolkit for Ubuntu Linux 10.10&#8221; (cudatoolkit_4.0.17_linux_32_ubuntu10.10.run) &#8220;GPU Computing SDK &#8211; complete package including all code samples&#8221; (gpucomputingsdk_4.0.17_linux) And if you want (not recommended): &#8220;Developer Drivers for Linux&#8221; (devdriver_4.0_linux_32_270.41.19.run) Install NVidia Driver [...]]]></description>
			<content:encoded><![CDATA[<p>The steps to install cuda, opencl for nvidia gpu on ubuntu.<br />
<span id="more-1236"></span><br />
Keep default directories.</p>
<p>Download what you need from NVidia website:<br />
<a title="http://developer.nvidia.com/cuda-downloads" href="http://developer.nvidia.com/cuda-downloads" target="_blank">http://developer.nvidia.com/cuda-downloads</a></p>
<p>Take:<br />
&#8220;CUDA Toolkit for Ubuntu Linux 10.10&#8221; (cudatoolkit_4.0.17_linux_32_ubuntu10.10.run)<br />
&#8220;GPU Computing SDK &#8211; complete package including all code samples&#8221; (gpucomputingsdk_4.0.17_linux)<br />
And if you want (not recommended):<br />
&#8220;Developer Drivers for Linux&#8221; (devdriver_4.0_linux_32_270.41.19.run)</p>
<h2>Install NVidia Driver</h2>
<p><span style="color: #ff0000;">I recommend not to install Official NVidia Driver!!</span><br />
(I did the first time and had a lot of problems!)<br />
Use the safest install:</p>
<pre class="brush: bash; title: ; notranslate">
sudo apt-get install linux-headers-generic
sudo apt-get install nvidia-current
sudo nvidia-xconfig
</pre>
<p>If you need to remove bad drivers:</p>
<pre class="brush: bash; title: ; notranslate">
sudo apt-get remove --purge nvidia-current
</pre>
<p>Anyway, if you prefer you can install official nvidia package:</p>
<pre class="brush: bash; title: ; notranslate">
sudo /etc/init.d/gdm stop
sudo sh devdriver_4.0_linux_32_270.41.19.run
reboot
</pre>
<h2>Install SDK</h2>
<p><a href="http://developer.nvidia.com/cuda-downloads" target="_blank">SDK has to be downloaded from the NVidia official website.</a></p>
<p>Run the downloaded package:</p>
<pre class="brush: bash; title: ; notranslate">
sudo sh cudatoolkit_4.0.17_linux_32_ubuntu10.10.run
</pre>
<p>Then change your path and log dirs:</p>
<pre class="brush: bash; title: ; notranslate">
gedit ~/.bashrc
</pre>
<p>And paste:</p>
<pre class="brush: bash; title: ; notranslate">
# opencl
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/lib
export PATH=$PATH:/usr/local/cuda/lib:/usr/local/cuda/bin
</pre>
<p>(note, in case of 32bits you do not need /usr/local/cuda/lib64)</p>
<p>Restart your terminal or type:</p>
<pre class="brush: bash; title: ; notranslate">
source ~/.bashrc
</pre>
<h2>Install Examples</h2>
<p><a href="http://developer.nvidia.com/cuda-downloads" target="_blank">Examples have to be downloaded from the NVidia official website.</a></p>
<pre class="brush: bash; title: ; notranslate">
sudo sh gpucomputingsdk_4.0.17_linux
</pre>
<p>I need to install some dependencies:</p>
<pre class="brush: bash; title: ; notranslate">
sudo apt-get install libxmu-dev libxmu6
sudo apt-get install freeglut3-dev
</pre>
<p>Then compile OpenCl examples (replace user name with yours):</p>
<pre class="brush: bash; title: ; notranslate">
cd ~
sudo chown -R &quot;user name&quot; NVIDIA_GPU_Computing_SDK
sudo chmod -R 777 NVIDIA_GPU_Computing_SDK
cd NVIDIA_GPU_Computing_SDK
cd OpenCL
make
</pre>
<p>Run the examples (for example, run oclNbody):</p>
<pre class="brush: bash; title: ; notranslate">
cd ~/NVIDIA_GPU_Computing_SDK/OpenCL/bin/linux/release
./oclNbody
</pre>
<h2>References</h2>
<p>http://vgerscorner.wordpress.com/2010/10/24/opencl-ubuntu-install-guide/</p>
<p>http://forums.nvidia.com/index.php?showtopic=87692h2</p>
<div style='clear:both'></div>]]></content:encoded>
			<wfw:commentRss>http://berenger.eu/blog/2011/11/25/gpu-install-cuda-opencl-on-ubuntu/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
		</item>
		<item>
		<title>[C++][Omp] OpenMP version _OPENMP directive</title>
		<link>http://berenger.eu/blog/2011/10/17/comp-openmp-version-_openmp-directive/</link>
		<comments>http://berenger.eu/blog/2011/10/17/comp-openmp-version-_openmp-directive/#comments</comments>
		<pubDate>Mon, 17 Oct 2011 08:16:40 +0000</pubDate>
		<dc:creator>Berenger</dc:creator>
				<category><![CDATA[C++]]></category>
		<category><![CDATA[Programming]]></category>
		<category><![CDATA[openmp]]></category>
		<category><![CDATA[Resource]]></category>

		<guid isPermaLink="false">http://berenger.eu/blog/?p=1230</guid>
		<description><![CDATA[Sometime you may want to know what version of openmp you are using at compile time. This is possible using the _OPENMP directive. Based on the specification: http://openmp.org/wp/openmp-specifications/ You can notice that the openmp vers. 3.0 has been released in 2008.05 From this information you can use task or not depending on the version of [...]]]></description>
			<content:encoded><![CDATA[<p>Sometimes you may want to know what version of openmp you are using at compile time.<br />
This is possible using the _OPENMP directive.</p>
<p><span id="more-1230"></span></p>
<p>Based on the specification:<br />
<a href="http://openmp.org/wp/openmp-specifications/" target="_blank">http://openmp.org/wp/openmp-specifications/</a><br />
You can notice that the openmp vers. 3.0 has been released in 2008.05<br />
From this information you can use task or not depending on the version of your openmp.</p>
<pre class="brush: cpp; title: ; notranslate">
#include &lt;cstdio&gt;
#include &lt;omp.h&gt;

int main(){
#if _OPENMP &gt;= 200805
    // I use tasks
    printf(&quot;_OPENMP &gt;= 200805 (vers = %d)\n&quot;,_OPENMP);
#else
    // Tasks do not exist...
    printf(&quot;_OPENMP &lt; 200805 (vers = %d)\n&quot;,_OPENMP);
#endif

    return 0;
}
</pre>
<div style='clear:both'></div>]]></content:encoded>
			<wfw:commentRss>http://berenger.eu/blog/2011/10/17/comp-openmp-version-_openmp-directive/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
		</item>
		<item>
		<title>[C++][Mpi] Bitonic parallel Sort (Bitonic Sorting network in parallel)</title>
		<link>http://berenger.eu/blog/2011/10/14/cmpi-bitonic-parallel-sort-bitonic-sorting-network-in-parallel/</link>
		<comments>http://berenger.eu/blog/2011/10/14/cmpi-bitonic-parallel-sort-bitonic-sorting-network-in-parallel/#comments</comments>
		<pubDate>Fri, 14 Oct 2011 13:23:47 +0000</pubDate>
		<dc:creator>Berenger</dc:creator>
				<category><![CDATA[Uncategorized]]></category>

		<guid isPermaLink="false">http://berenger.eu/blog/?p=1223</guid>
		<description><![CDATA[In this post I put the code of a bitonic sorting in parallel. The method are templatized so you can use it as you like. Be aware that this version need a number of processes power of 2. Reference Library Support for Parallel Sorting in Scientific Computations http://web.mst.edu/~ercal/387/P3/pr-proj-3.pdf http://en.wikipedia.org/wiki/Bitonic_sort http://www.iti.fh-flensburg.de/lang/algorithmen/sortieren/bitonic/oddn.htm http://en.wikipedia.org/wiki/Bisection_method The Code]]></description>
			<content:encoded><![CDATA[<p>In this post I put the code of a bitonic sorting in parallel.<br />
The methods are templatized so you can use them as you like.</p>
<p>Be aware that this version needs the number of processes to be a power of 2.</p>
<p><span id="more-1223"></span></p>
<h2> Reference </h2>
<p><a href="http://www.google.com/url?sa=t&#038;source=web&#038;cd=1&#038;ved=0CBoQFjAA&#038;url=http%3A%2F%2Fwww.tu-chemnitz.de%2Finformatik%2FPI%2Fforschung%2Fpub%2Fdownload%2FDHR_europar07.ps&#038;ei=bzeYTrn2OYjLsgbJ8ayABA&#038;usg=AFQjCNHIGl-01REpZ6PRDlsJsIPcESE-JQ&#038;sig2=o4MrhhQ8a3lwRSa5RQ4njQ" target="_blank">Library Support for Parallel Sorting in Scientific Computations</a><br />
<a href="http://web.mst.edu/~ercal/387/P3/pr-proj-3.pdf" title="A good document" target="_blank">http://web.mst.edu/~ercal/387/P3/pr-proj-3.pdf</a><br />
<a href="http://en.wikipedia.org/wiki/Bitonic_sort" target="_blank">http://en.wikipedia.org/wiki/Bitonic_sort</a><br />
<a href="http://www.iti.fh-flensburg.de/lang/algorithmen/sortieren/bitonic/oddn.htm" target="_blank">http://www.iti.fh-flensburg.de/lang/algorithmen/sortieren/bitonic/oddn.htm</a><br />
<a href="http://en.wikipedia.org/wiki/Bisection_method" target="_blank">http://en.wikipedia.org/wiki/Bisection_method</a></p>
<h2> The Code </h2>
<pre class="brush: cpp; title: ; notranslate">
#include &lt;cstdio&gt;
#include &lt;cstdlib&gt;
#include &lt;cmath&gt;

#include &lt;mpi.h&gt;

////////////////////////////////////////////////////////////////
// Sequential sort first!
////////////////////////////////////////////////////////////////

template &lt;class SortType&gt;
inline void swap(SortType&amp; v1, SortType&amp; v2){
    const SortType tmp = v1;
    v1 = v2;
    v2 = tmp;
}

template &lt;class SortType, class CompareType, class FSize&gt;
int partition(SortType* const array, FSize left, FSize right){
    SortType part = right;
    swap(array[part],array[(right+left) / 2]);
    --right;

    while(true){
        while( CompareType(array[left]) &lt; CompareType(array[part])){
            ++left;
        }
        while(right &gt;= left &amp;&amp; CompareType(array[part]) &lt;= CompareType(array[right])){
            --right;
        }
        if(right &lt; left) break;

        swap(array[left],array[right]);
        ++left;
        --right;
    }

    swap(array[part],array[left]);

    return left;
}

template &lt;class SortType, class CompareType, class FSize&gt;
void qs(SortType* const array, const FSize left, const FSize right){
    if(left &lt; right){
        const FSize part = partition&lt;SortType,CompareType&gt;(array, left, right);
        qs&lt;SortType,CompareType&gt;(array,part + 1,right);
        qs&lt;SortType,CompareType&gt;(array,left,part - 1);
    }
}

template &lt;class SortType, class CompareType, class FSize&gt;
void quick(SortType* const array, const FSize size){
    qs&lt;SortType,CompareType&gt;(array,0,size-1);
}

////////////////////////////////////////////////////////////////
// Bitonic parallel sort !
////////////////////////////////////////////////////////////////

// MPI message tags used by the two exchange functions below.
// FlagMin / FlagMax tag the single pivot values exchanged while the split
// point is searched; FlagMinMess / FlagMaxMess tag the exchange of the array
// slices that follows.
static const int FlagMin = 5;
static const int FlagMax = 6;
static const int FlagMinMess = 4;
static const int FlagMaxMess = 3;

// Exchanges data with the rank otherRank: this rank sends the upper part of
// its array and receives, in the same slots, the data sent back by the other
// rank. The first index to send is searched by bisection; at every step both
// ranks exchange the value found at their current pivot, converted to
// CompareType.
// NOTE(review): the other rank is expected to be in sendMinAndGetMax at the
// same time (the send and receive tags are mirrored) - confirm with the caller.
// NOTE(review): assumes array is sorted in increasing order and that both
// ranks work on the same size - TODO confirm.
template &lt;class SortType, class CompareType, class FSize&gt;
void sendMaxAndGetMin(SortType array[], const FSize size, const int otherRank){
    // Search window; the pivot is its middle, rounded up.
    FSize left  = -1;
    FSize right = size - 1;
    FSize pivot = left + (right - left + 1)/2;
    // Placeholder, overwritten by the first receive.
    CompareType otherValue = -1;
    CompareType tempCompareValue = CompareType(array[pivot]);
    // Send our pivot value and get the pivot value of the other rank.
    MPI_Sendrecv(&amp;tempCompareValue,sizeof(CompareType),MPI_BYTE,otherRank,FlagMin,&amp;otherValue,sizeof(CompareType),MPI_BYTE,otherRank,FlagMax,MPI_COMM_WORLD,MPI_STATUS_IGNORE);

    // Bisection: stop when the window cannot shrink anymore or when both
    // pivot values are equal.
    // NOTE(review): every iteration performs one exchange, so both ranks must
    // leave their loop after the same number of steps - not verifiable here.
    while( pivot != left &amp;&amp; pivot != right  &amp;&amp; array[pivot] != otherValue) {

        if( array[pivot] &lt; otherValue ){
            left = pivot;
        }
        else {
            right = pivot;
        }
        pivot = left + (right - left + 1)/2;
        tempCompareValue = CompareType(array[pivot]);

        MPI_Sendrecv(&amp;tempCompareValue,sizeof(CompareType),MPI_BYTE,otherRank,FlagMin,&amp;otherValue,sizeof(CompareType),MPI_BYTE,otherRank,FlagMax,MPI_COMM_WORLD,MPI_STATUS_IGNORE);
    }

    if( otherValue &lt;= array[pivot] ){
        // The pivot itself is sent: exchange array[pivot..size-1].
        MPI_Sendrecv_replace(&amp;array[pivot], (size - pivot) * sizeof(SortType) , MPI_BYTE,
                               otherRank, FlagMinMess, otherRank, FlagMaxMess,
                               MPI_COMM_WORLD, MPI_STATUS_IGNORE);

    }
    else if( array[pivot] &lt; otherValue){
        // The pivot is kept: exchange array[pivot+1..size-1], unless that
        // slice is empty.
        if(pivot != size - 1){
            MPI_Sendrecv_replace(&amp;array[pivot + 1], (size - pivot - 1) * sizeof(SortType) , MPI_BYTE,
                                   otherRank, FlagMinMess, otherRank, FlagMaxMess,
                                   MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        }
    }

}

// Exchanges data with the rank otherRank: this rank sends the lower part of
// its array and receives, in the same slots, the data sent back by the other
// rank. The last index to send is searched by bisection; at every step both
// ranks exchange the value found at their current pivot, converted to
// CompareType.
// NOTE(review): the other rank is expected to be in sendMaxAndGetMin at the
// same time (the send and receive tags are mirrored) - confirm with the caller.
// NOTE(review): assumes array is sorted in increasing order and that both
// ranks work on the same size - TODO confirm.
template &lt;class SortType, class CompareType, class FSize&gt;
void sendMinAndGetMax(SortType array[], const FSize size, const int otherRank){
    // Search window; the pivot is its middle, rounded down.
    FSize left  = 0;
    FSize right = size ;
    FSize pivot = left + (right - left)/2;
    // Placeholder, overwritten by the first receive.
    CompareType otherValue = -1;
    CompareType tempCompareValue = CompareType(array[pivot]);
    // Send our pivot value and get the pivot value of the other rank.
    MPI_Sendrecv(&amp;tempCompareValue,sizeof(CompareType),MPI_BYTE,otherRank,FlagMax,&amp;otherValue,sizeof(CompareType),MPI_BYTE,otherRank,FlagMin,MPI_COMM_WORLD,MPI_STATUS_IGNORE);

    // Bisection: stop when the window cannot shrink anymore or when both
    // pivot values are equal.
    // NOTE(review): every iteration performs one exchange, so both ranks must
    // leave their loop after the same number of steps - not verifiable here.
    while(  pivot != left  &amp;&amp; array[pivot] != otherValue) {

        if( array[pivot] &lt; otherValue ){
            left = pivot;
        }
        else {
            right = pivot;
        }
        pivot = left + (right - left)/2;
        tempCompareValue = CompareType(array[pivot]);
        MPI_Sendrecv(&amp;tempCompareValue,sizeof(CompareType),MPI_BYTE,otherRank,FlagMax,&amp;otherValue,sizeof(CompareType),MPI_BYTE,otherRank,FlagMin,MPI_COMM_WORLD,MPI_STATUS_IGNORE);
    }

    if( array[pivot] &lt;= otherValue ){
        // The pivot itself is sent: exchange array[0..pivot].
        MPI_Sendrecv_replace(&amp;array[0], (pivot + 1) * sizeof(SortType) , MPI_BYTE,
                               otherRank, FlagMaxMess, otherRank, FlagMinMess,
                               MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    }
    else if( otherValue &lt; array[pivot]){
        // The pivot is kept: exchange array[0..pivot-1], unless that slice
        // is empty.
        if(pivot != 0){
            MPI_Sendrecv_replace(&amp;array[0], (pivot) * sizeof(SortType) , MPI_BYTE,
                                   otherRank, FlagMaxMess, otherRank, FlagMinMess,
                                   MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        }
    }
}

/*
From :

http://web.mst.edu/~ercal/387/P3/pr-proj-3.pdf

Parallel Bitonic Sort Algorithm for processor Pk (for k := 0 . . . P − 1)
d:= log P
// cube dimension
sort(local − datak ) // sequential sort
// Bitonic Sort follows
for i:=1 to d do
    window-id = Most Significant (d-i) bits of Pk
    for j:=(i-1) down to 0 do
        if((window-id is even AND j th bit of Pk = 0)
        OR (window-id is odd AND j th bit of Pk = 1))
            then call CompareLow(j)
        else
            call CompareHigh(j)
        endif
    endfor
endfor
  */
template &lt;class SortType, class CompareType, class FSize&gt;
void bitonic(SortType array[], const FSize size, const int np, const int rank){
    quick&lt;SortType,CompareType&gt;(array, size);

    const int logNp = log2(np);
    for(int bitIdx = 1 ; bitIdx &lt;= logNp ; ++bitIdx){
        // window-id = Most Significant (d-i) bits of Pk
        const int diBit =  (rank &gt;&gt; bitIdx) &amp; 0x1;

        for(int otherBit = bitIdx - 1 ; otherBit &gt;= 0 ; --otherBit){
            // if((window-id is even AND j th bit of Pk = 0)
            // OR (window-id is odd AND j th bit of Pk = 1))

            const int myOtherBit = (rank &gt;&gt; otherBit) &amp; 0x1;
            const int otherRank = rank ^ (1 &lt;&lt; otherBit);

            if( diBit != myOtherBit ){
                sendMinAndGetMax&lt;SortType,CompareType&gt;(array, size, otherRank);
            }
            else{
                sendMaxAndGetMin&lt;SortType,CompareType&gt;(array, size, otherRank);
            }
            // A merge sort is possible since the array is composed
            // by two part already sorted, but we want to do this in space
            quick&lt;SortType,CompareType&gt;(array, size);
        }
    }
}

////////////////////////////////////////////////////////////////
// Utils
////////////////////////////////////////////////////////////////

template &lt;class SortType, class FSize&gt;
bool isSorted(const SortType array[], const FSize size){
    for(int idx = 1 ; idx &lt; size ; ++idx){
        if( array[idx-1] &gt; array[idx]){
            return false;
        }
    }
    return true;
}

void print(const int array[], const int size, const int rank){
    for(int idx = 0 ; idx &lt; size ; ++idx){
        printf(&quot;array[%d][%d] = %d\n&quot;, rank, idx, array[idx]);
    }
}

int main(int argc, char ** argv){
    MPI_Init(&amp;argc, &amp;argv);

    int rank = 0;
    int nprocs = 0;

    MPI_Comm_size(MPI_COMM_WORLD,&amp;nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD,&amp;rank);

    const int Size = 500;
    long long array[Size];
    srand(Size);

    for(int idx = 0 ; idx &lt; Size ; ++idx){
        //array[idx] = nprocs - rank;
        //array[idx] = rank;
        array[idx] = Size * (rand()/float(RAND_MAX));
    }

    bitonic&lt;long long,int&gt;(array, Size, nprocs, rank);
    //print(array, Size, rank);

    int sorted = isSorted(array,Size);
    bool localySorted = false;
    MPI_Reduce( &amp;sorted, &amp;localySorted, 1, MPI_INT, MPI_LAND , 0, MPI_COMM_WORLD );

    int*const allExtrem = new int[nprocs * 2];
    int extrem[2];
    extrem[0] = array[0];
    extrem[1] = array[Size-1];
    MPI_Gather(extrem, 2, MPI_INT, allExtrem, 2, MPI_INT, 0, MPI_COMM_WORLD);

    printf(sorted?&quot;Is sorted\n&quot;:&quot;NO is not sorted\n&quot;);

    if( rank == 0){
        printf(localySorted?&quot;All sorted\n&quot;:&quot;NO all not sorted\n&quot;);

        int extremOk = true;
        for(int idxProc = 1 ; idxProc &lt; nprocs &amp;&amp; extremOk; ++idxProc){
            if( allExtrem[2 * (idxProc - 1) + 1] &gt; allExtrem[2 * idxProc]){
                extremOk = false;
            }
        }

        printf(extremOk?&quot;Extrem ok\n&quot;:&quot;NO extrem error\n&quot;);
    }

    delete[] allExtrem;

    MPI_Finalize();

    return 0;
}
</pre>
<div style='clear:both'></div>]]></content:encoded>
			<wfw:commentRss>http://berenger.eu/blog/2011/10/14/cmpi-bitonic-parallel-sort-bitonic-sorting-network-in-parallel/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
		</item>
		<item>
		<title>[Qt][OpenMP][MPI] mpi/openmp qt creator .pro flag</title>
		<link>http://berenger.eu/blog/2011/10/13/qtopenmpmpi-mpiopenmp-qt-creator-pro-flag/</link>
		<comments>http://berenger.eu/blog/2011/10/13/qtopenmpmpi-mpiopenmp-qt-creator-pro-flag/#comments</comments>
		<pubDate>Thu, 13 Oct 2011 11:38:13 +0000</pubDate>
		<dc:creator>Berenger</dc:creator>
				<category><![CDATA[C++]]></category>
		<category><![CDATA[Others]]></category>
		<category><![CDATA[Programming]]></category>
		<category><![CDATA[Resource]]></category>

		<guid isPermaLink="false">http://berenger.eu/blog/?p=1218</guid>
		<description><![CDATA[Add the right options to your .pro to compile with openmp or mpi under Qt creator.]]></description>
			<content:encoded><![CDATA[<p>Add the right options to your .pro to compile with openmp or mpi under Qt creator.</p>
<p><span id="more-1218"></span></p>
<pre class="brush: php; title: ; notranslate">
# Build settings to compile with Open MPI and OpenMP from Qt Creator.
# Left disabled: using the MPI wrappers as compilers.
# QMAKE_CC = mpicc
# QMAKE_CXX = mpic++
# The MPI flags are appended (+=) so that the flags already set by the mkspec
# or earlier in the .pro file are kept.
QMAKE_CFLAGS += $$system(mpicc --showme:compile)
QMAKE_CXXFLAGS += $$system(mpic++ --showme:compile)
QMAKE_LFLAGS += $$system(mpic++ --showme:link)
# Paths of a local Open MPI 1.5.1 install, adapt them to your system.
INCLUDEPATH += &quot;/opt/openmpi-1.5.1/include&quot;
LIBS += -L/opt/openmpi-1.5.1/lib -lmpi_cxx -lmpi -ldl -Wl,--export-dynamic -lnsl -lutil -lm -ldl

# OpenMP (and BLAS)
LIBS += -lgomp -fopenmp -lblas
QMAKE_CXXFLAGS += -fopenmp
</pre>
<div style='clear:both'></div>]]></content:encoded>
			<wfw:commentRss>http://berenger.eu/blog/2011/10/13/qtopenmpmpi-mpiopenmp-qt-creator-pro-flag/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
		</item>
		<item>
		<title>[C++][OpenMP] Custom barrier, a barrier for a group of threads</title>
		<link>http://berenger.eu/blog/2011/10/12/copenmp-custom-barrier-a-barrier-for-a-group-of-threads/</link>
		<comments>http://berenger.eu/blog/2011/10/12/copenmp-custom-barrier-a-barrier-for-a-group-of-threads/#comments</comments>
		<pubDate>Wed, 12 Oct 2011 09:01:44 +0000</pubDate>
		<dc:creator>Berenger</dc:creator>
				<category><![CDATA[C++]]></category>
		<category><![CDATA[Programming]]></category>
		<category><![CDATA[Code]]></category>
		<category><![CDATA[openmp]]></category>

		<guid isPermaLink="false">http://berenger.eu/blog/?p=1208</guid>
		<description><![CDATA[Openmp give a barrier for all threads. Here is a class to perform a barrier with only a group of threads. The code is from : The Art of Multiprocessor Programming]]></description>
			<content:encoded><![CDATA[<p>OpenMP only provides a barrier for all threads.<br />
Here is a class to perform a barrier with only a group of threads.</p>
<p><span id="more-1208"></span></p>
<p>The code is from :<br />
<a href="http://www.amazon.com/Art-Multiprocessor-Programming-Maurice-Herlihy/dp/0123705916/ref=sr_1_1?ie=UTF8&#038;qid=1318410070&#038;sr=8-1" title="The Art of Multiprocessor Programming" target="_blank">The Art of Multiprocessor Programming</a></p>
<pre class="brush: cpp; title: ; notranslate">
#ifndef FOMPBARRIER_HPP
#define FOMPBARRIER_HPP

#include &lt;omp.h&gt;
#include &lt;climits&gt;

/** This class is a custom omp barrier.
  * Because openmp gives only a global barrier we need
  * to be able to perform a barrier operation between a group
  * of threads only.
  * The barrier can be reused: every thread remembers the value of sense
  * when it arrives, and the last thread to arrive flips sense to release
  * the others.
  */

class FOmpBarrier {
private:
    int nbThreads;          //&lt; The number of threads for this barrier
    int currentNbThread;    //&lt; The current number of threads waiting
    bool sense;             //&lt; Flipped each time the barrier is completed
    omp_lock_t mutex;       //&lt; Protects currentNbThread and nbThreads

    /** Copy is forbidden (private and not usable) */
    FOmpBarrier(FOmpBarrier&amp;){}
    FOmpBarrier&amp; operator=(FOmpBarrier&amp;){return *this;}

public:
    /** Constructor with the number of threads */
    FOmpBarrier(const int inNbThreads = INT_MAX)
        : nbThreads(inNbThreads), currentNbThread(0), sense(false) {
        omp_init_lock( &amp;mutex );
    }

    /** Destructor, release the omp lock */
    ~FOmpBarrier(){
        omp_destroy_lock( &amp;mutex );
    }

    /** Perform a barrier: returns once nbThreads threads have called wait */
    void wait(){
        // Has to be read before the arrival is registered
        const bool mySense = sense;

        // The counter is incremented, compared and reset while the lock is
        // held: the reset is then ordered before the release of the other
        // threads and published by the unlock.
        omp_set_lock( &amp;mutex );
        const bool isLastThread = ((++currentNbThread) == nbThreads);
        if(isLastThread){
            currentNbThread = 0;
        }
        omp_unset_lock( &amp;mutex );

        if(isLastThread) {
            // Release the waiting threads
            sense = !sense;
            #pragma omp flush(sense)
        }
        else {
            // Busy wait until the last thread flips sense, the volatile
            // access forces a real read at every iteration
            volatile const bool* const ptSense = &amp;sense;
            while( (*ptSense) == mySense){
            }
        }
    }

    /** Change the number of threads */
    void setNbThreads(const int inNbThread){
        omp_set_lock( &amp;mutex );
        nbThreads = inNbThread;
        omp_unset_lock( &amp;mutex );
    }
};

#endif // FOMPBARRIER_HPP
</pre>
<div style='clear:both'></div>]]></content:encoded>
			<wfw:commentRss>http://berenger.eu/blog/2011/10/12/copenmp-custom-barrier-a-barrier-for-a-group-of-threads/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
		</item>
		<item>
		<title>[C++][OpenMP] A shared memory quick sort openmp tasks (example, source code)</title>
		<link>http://berenger.eu/blog/2011/10/06/c-openmp-a-shared-memory-quick-sort-with-openmp-tasks-example-source-code/</link>
		<comments>http://berenger.eu/blog/2011/10/06/c-openmp-a-shared-memory-quick-sort-with-openmp-tasks-example-source-code/#comments</comments>
		<pubDate>Thu, 06 Oct 2011 09:42:54 +0000</pubDate>
		<dc:creator>Berenger</dc:creator>
				<category><![CDATA[C++]]></category>
		<category><![CDATA[Programming]]></category>
		<category><![CDATA[Code]]></category>
		<category><![CDATA[openmp]]></category>
		<category><![CDATA[Tutorial]]></category>

		<guid isPermaLink="false">http://berenger.eu/blog/?p=1201</guid>
		<description><![CDATA[After a first shoot of quick sort on shared memory (to be able to create an mpi version, but that is not inplace) I created a real shared memory version. Be aware that these versions need the OpenMP task! PS : I developed several quick sort (available on this blog), a sequential version, an openmp [...]]]></description>
			<content:encoded><![CDATA[<p>After a first <a href="http://berenger.eu/blog/?p=1117">shoot of quick sort on shared memory</a> (to be able to create an mpi version, but that is not inplace)<br />
I created a real shared memory version.<br />
Be aware that these versions need the OpenMP task!</p>
<p>PS : I developed several quick sorts (available on this blog): <a href="http://berenger.eu/blog/?p=1052">a sequential version</a>, an <a href="http://berenger.eu/blog/?p=1201">OpenMP tasks version</a>, <a href="http://berenger.eu/blog/?p=1117">an OpenMP not-in-place version</a>, <a href="http://berenger.eu/blog/?p=1128">an MPI version</a> and <a href="http://berenger.eu/blog/?p=1262">a Qt Concurrent version</a>.</p>
<p><span id="more-1201"></span></p>
<p><a title="Sorts" href="http://berenger.eu/blog/?p=1052" target="_blank">A sequential version is also available here.</a></p>
<h2>The Code</h2>
<pre class="brush: cpp; title: ; notranslate">
#include &lt;cstdio&gt;
#include &lt;omp.h&gt;
#include &lt;cmath&gt;
#include &lt;ctime&gt;
#include &lt;cstdlib&gt;
#include &lt;cstring&gt;

/* Result
g++ --version
g++ (GCC) 4.6.0
  ====== Quick Sort =====
Sorting 600000000 elements
1 threads...
Elapsed time 83.777031 s
Is sorted
2 threads...
Elapsed time 43.194530 s
Is sorted
4 threads...
Elapsed time 23.494222 s
Is sorted
8 threads...
Elapsed time 18.924786 s
Is sorted
*/
////////////////////////////////////////////////////////////
// Miscellaneous functions
////////////////////////////////////////////////////////////

/** Exchange two values through a temporary copy */
template &lt;class NumType&gt;
inline void Swap(NumType&amp; value, NumType&amp; other){
    NumType saved = other;
    other = value;
    value = saved;
}

////////////////////////////////////////////////////////////
// Quick sort
////////////////////////////////////////////////////////////

/* use in the sequential qs */
template &lt;class SortType&gt;
long QsPartition(SortType outputArray[], long left, long right){
    const long part = right;
    Swap(outputArray[part],outputArray[left + (right - left ) / 2]);
    const SortType partValue = outputArray[part];
    --right;

    while(true){
        while(outputArray[left] &lt; partValue){
            ++left;
        }
        while(right &gt;= left &amp;&amp; partValue &lt;= outputArray[right]){
            --right;
        }
        if(right &lt; left) break;

        Swap(outputArray[left],outputArray[right]);
        ++left;
        --right;
    }

    Swap(outputArray[part],outputArray[left]);

    return left;
}

/* a sequential qs */
template &lt;class SortType&gt;
void QsSequential(SortType array[], const long left, const long right){
    if(left &lt; right){
        const long part = QsPartition(array, left, right);
        QsSequential(array,part + 1,right);
        QsSequential(array,left,part - 1);
    }
}

/** A task dispatcher */
template &lt;class SortType&gt;
void QuickSortOmpTask(SortType array[], const long left, const long right, const int deep){
    if(left &lt; right){
        if( deep ){
            const long part = QsPartition(array, left, right);
            #pragma omp task
            QuickSortOmpTask(array,part + 1,right, deep - 1);
            #pragma omp task
            QuickSortOmpTask(array,left,part - 1, deep - 1);
        }
        else {
            const long part = QsPartition(array, left, right);
            QsSequential(array,part + 1,right);
            QsSequential(array,left,part - 1);
        }
    }
}

/** The openmp quick sort entry point: opens a parallel region in which a
  * single thread starts the task dispatcher on the whole array */
template &lt;class SortType&gt;
void QuickSortOmp(SortType array[], const long size){
    const long lastIdx = size - 1;
    // Number of recursion levels that create tasks
    const int taskLevels = 15;

    #pragma omp parallel
    {
        #pragma omp single nowait
        {
            QuickSortOmpTask(array, 0, lastIdx, taskLevels);
        }
    }
}

////////////////////////////////////////////////////////////
// Main
////////////////////////////////////////////////////////////

/** Returns true if the size first elements of array are in non-decreasing
  * order (an empty or one element array is sorted) */
bool isSorted(long long array[], const long size){
    // The index is a long like size, an int index cannot reach every
    // position of an array that has more elements than an int can hold
    for(long idx = 1; idx &lt; size ; ++idx){
        if(array[idx-1] &gt; array[idx]){
            return false;
        }
    }
    return true;
}

void print(long long array[], const int size){
    for(int idx = 0 ;idx &lt; size; ++idx){
        printf(&quot;%lld\t&quot;,array[idx]);
    }
    printf(&quot;\n&quot;);
}

int main(int, char**){
    const long Size = 600000000;
    long long* const array = new long long[Size];

    printf(&quot;Sorting %ld elements\n&quot;, Size);

    for(int idxThread = 1 ; idxThread &lt;= 8 ; idxThread *= 2){
        printf(&quot;%d threads...\n&quot;, idxThread);

        omp_set_num_threads(idxThread);

        srand(0);
        for(long idx = 0 ; idx &lt; Size ; ++idx){
            array[idx] = int(Size*(float(rand())/RAND_MAX));
        }

        const double startTime = omp_get_wtime();
        QuickSortOmp(array, Size);
        printf(&quot;Elapsed time %lf s\n&quot;, omp_get_wtime() - startTime);

        if(isSorted(array,Size)){
            printf(&quot;Is sorted\n&quot;);
        }
        else{
            printf(&quot;Error array is not sorted!\n&quot;);
            if( Size &lt;= 20) print(array,Size);
            return -1;
        }

    }

    delete [] array;

    return 0;
}
</pre>
<div style='clear:both'></div>]]></content:encoded>
			<wfw:commentRss>http://berenger.eu/blog/2011/10/06/c-openmp-a-shared-memory-quick-sort-with-openmp-tasks-example-source-code/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
		</item>
	</channel>
</rss>

