Here is my implementation of the paper:
“Fast Exponential Computation on SIMD Architectures”
https://www.researchgate.net/publication/272178514_Fast_Exponential_Computation_on_SIMD_Architectures
This code has been taken from Inastemp ( https://gitlab.mpcdf.mpg.de/bbramas/inastemp ) which is under MIT licence.
In the paper we need to have the coefficients from the Remez polynomials to interpolate the function 1 + xk(i) – 2.^xk(i) between [0;1).
I follow the algorithm given by wikipedia (sorry the French page is better https://fr.wikipedia.org/wiki/Algorithme_de_Remez :)
Here is my scilab code to do so (for n in [2;10]):
// From https://fr.wikipedia.org/wiki/Algorithme_de_Remez a = 0 b = 1 - %eps format('v',21) dispall=%f for n = 2:10 if dispall printf("N = %d\n",n) end //////////////////////////////////////////////: xk = zeros(n+2,1) for k = 1:(n+2) xk(k) = ((a+b)/2) + ((a-b)/2) * cos(((k-1)*%pi)/(n+1)) end if dispall disp(xk) end //////////////////////////////////////////////: M = zeros(n+2, n+2) for i = 1:(n+1) M(:,i) = xk.^(i-1) end for i = 1:(n+2) M(i, n+2) = (-1).^i end if dispall disp(M) end //////////////////////////////////////////////: func = zeros(n+2, 1) for i = 1:(n+2) //func(i) = exp(xk(i)) func(i) = 1 + xk(i) - 2.^xk(i) end if dispall disp(func) end //////////////////////////////////////////////: [x0,kerA]=linsolve(M,func) //////////////////////////////////////////////: if dispall disp(x0) end printf("constexpr static std::array<double,%d> GetCoefficient%d() {\n", n+1, n+1); printf(" return {{\n"); for i = 1:(n+1) sep="," if i == n+1 then sep="" end printf(" %.20e%s\n", -x0(i), sep); end printf(" }};\n"); printf("}\n\n"); end
It gives me the coefficients for the different accuracies:
//-->exec('/home/bbramas/Projects/inastemp/CMakeModules/remez.sce',-1) constexpr static std::array<double,3> GetCoefficient3() { return {{ -2.47142816509723700288e-03, 3.48943001636461247461e-01, -3.44000145306266491563e-01 }}; } constexpr static std::array<double,4> GetCoefficient4() { return {{ 1.06906116358144185133e-04, 3.03543677780836240743e-01, -2.24339532327269441936e-01, -7.92041454535668681958e-02 }}; } constexpr static std::array<double,5> GetCoefficient5() { return {{ -3.70138142771437266806e-06, 3.07033820309224325662e-01, -2.41638288055762540107e-01, -5.16904731562965388814e-02, -1.36976563343097993558e-02 }}; } constexpr static std::array<double,6> GetCoefficient6() { return {{ 1.06823753710239477000e-07, 3.06845249656632845792e-01, -2.40139721982230797126e-01, -5.58662282412822480682e-02, -8.94283890931273951763e-03, -1.89646052380707734290e-03 }}; } constexpr static std::array<double,7> GetCoefficient7() { return {{ -2.64303273610414963822e-09, 3.06853075372807815313e-01, -2.40230549677691723742e-01, -5.54802224547989303316e-02, -9.68497459444197204836e-03, -1.23843111224273085859e-03, -2.18892247566917477666e-04 }}; } constexpr static std::array<double,8> GetCoefficient8() { return {{ 5.72265234348656066133e-11, 3.06852812183173784266e-01, -2.40226356058427820139e-01, -5.55053022725605083032e-02, -9.61350625581030605871e-03, -1.34302437845634000529e-03, -1.42962470418959216190e-04, -2.16607474999407558923e-05 }}; } constexpr static std::array<double,9> GetCoefficient9() { return {{ -1.10150186041739869460e-12, 3.06852819617161765020e-01, -2.40226511645233870018e-01, -5.55040609720754696266e-02, -9.61837182960864275905e-03, -1.33266405715993271723e-03, -1.55186852765468104613e-04, -1.41484352491262699514e-05, -1.87582286605066256753e-06 }}; } constexpr static std::array<double,10> GetCoefficient10() { return {{ 1.79040320992239805871e-11, 3.06852815055756844576e-01, -2.40226385506041861806e-01, -5.55053584940081654042e-02, -9.61174262279892825667e-03, -1.35164210003994454852e-03, -1.23291147980286128769e-04, -4.53940620364305641833e-05, 1.46363500589519947862e-05, -3.63750326480946818984e-06 }}; } constexpr static std::array<double,11> GetCoefficient11() { return {{ 7.32388148129676088418e-13, 3.06852819216552274995e-01, -2.40226499275945526435e-01, -5.55042073859920090384e-02, -9.61749102796678571881e-03, -1.33571753728242812072e-03, -1.48718480159015542822e-04, -2.26598047213222231406e-05, 4.91492761180322572151e-06, -3.00875847392884227107e-06, 5.68126156224525271282e-07 }}; }
I add some constants
static const long int S64 = (1L << 52); static const long int S32 = (1L << 23); public: constexpr static double CoeffLog2E(){ // const double Euler = 2.71828182845904523536028747135266249775724709369995; // return std::log2(Euler()); return 1.442695040888963407359924681001892137426645954153; } constexpr static double CoeffA64(){ return double(S64); } constexpr static double CoeffB64(){ return double(S64*1023); } constexpr static double CoeffA32(){ return double(S32); } constexpr static double CoeffB32(){ return double(S32*127); }
And here is an example of implementation for AVX:
static const __m256d COEFF_LOG2E = _mm256_set1_pd(double(InaFastExp::CoeffLog2E())); static const __m256d COEFF_A = _mm256_set1_pd(double(InaFastExp::CoeffA64())); static const __m256d COEFF_B = _mm256_set1_pd(double(InaFastExp::CoeffB64())); static const __m256d COEFF_P5_X = _mm256_set1_pd(double(InaFastExp::GetCoefficient9()[8])); static const __m256d COEFF_P5_Y = _mm256_set1_pd(double(InaFastExp::GetCoefficient9()[7])); static const __m256d COEFF_P5_Z = _mm256_set1_pd(double(InaFastExp::GetCoefficient9()[6])); static const __m256d COEFF_P5_A = _mm256_set1_pd(double(InaFastExp::GetCoefficient9()[5])); static const __m256d COEFF_P5_B = _mm256_set1_pd(double(InaFastExp::GetCoefficient9()[4])); static const __m256d COEFF_P5_C = _mm256_set1_pd(double(InaFastExp::GetCoefficient9()[3])); static const __m256d COEFF_P5_D = _mm256_set1_pd(double(InaFastExp::GetCoefficient9()[2])); static const __m256d COEFF_P5_E = _mm256_set1_pd(double(InaFastExp::GetCoefficient9()[1])); static const __m256d COEFF_P5_F = _mm256_set1_pd(double(InaFastExp::GetCoefficient9()[0])); __m256d x = val * COEFF_LOG2E; const __m256d fractional_part = x - Floor(x); __m256d factor = COEFF_P5_X ; factor = (factor * fractional_part + COEFF_P5_Y); factor = (factor * fractional_part + COEFF_P5_Z); factor = (factor * fractional_part + COEFF_P5_A); factor = (factor * fractional_part + COEFF_P5_B); factor = (factor * fractional_part + COEFF_P5_C); factor = (factor * fractional_part + COEFF_P5_D); factor = (factor * fractional_part + COEFF_P5_E); factor = (factor * fractional_part + COEFF_P5_F); x -= factor; x = (COEFF_A * x + COEFF_B); __m128d valupper = _mm256_extractf128_pd(x, 1); _mm256_zeroupper(); __m128d vallower = _mm256_castpd256_pd128(x); alignas(64) long long int allvalint[VecLength] = {_mm_cvtsd_si64(vallower), _mm_cvtsd_si64(_mm_shuffle_pd(vallower, vallower, 1)), _mm_cvtsd_si64(valupper), _mm_cvtsd_si64(_mm_shuffle_pd(valupper, valupper, 1))}; return _mm256_castsi256_pd(_mm256_load_si256((const __m256i*)allvalint));