niklas-heer · niklas-heer · Oct 24, 2022 · Oct 23, 2022 · Oct 23, 2022 · Oct 23, 2022
diff --git a/Earthfile b/Earthfile
@@ -32,6 +32,7 @@ collect-data:
   BUILD +clj
   BUILD +clj-bb
   BUILD +cpp
+  BUILD +cpp-avx2
   BUILD +crystal
   BUILD +cs
   BUILD +d
@@ -41,6 +42,7 @@ collect-data:
   BUILD +java
   BUILD +julia
   BUILD +julia-compiled
+  BUILD +julia-ux4
   BUILD +nodejs
   BUILD +lua
   BUILD +luajit
@@ -114,6 +116,14 @@ cpp:
   RUN --no-cache g++ leibniz.cpp -o leibniz -O3 -s -static -flto -march=native -mtune=native -fomit-frame-pointer -fno-signed-zeros -fno-trapping-math -fassociative-math
   DO +BENCH --name="cpp" --lang="C++ (g++)" --version="g++ --version" --cmd="./leibniz"
 
+cpp-avx2:
+  FROM +alpine
+  RUN apk add --no-cache gcc build-base
+  # Build the hand-vectorised variant from its own source file, not the scalar leibniz.cpp.
+  COPY ./src/leibniz_avx2.cpp ./
+  RUN --no-cache g++ leibniz_avx2.cpp -o leibniz_avx2 -O3 -s -static -flto -march=native -mtune=native -fomit-frame-pointer -fno-signed-zeros -fno-trapping-math -fassociative-math
+  DO +BENCH --name="cpp-avx2" --lang="C++ (avx2)" --version="g++ --version" --cmd="./leibniz_avx2"
+
 crystal:
   FROM crystallang/crystal:1.6-alpine
   RUN apk add --no-cache hyperfine
@@ -221,6 +231,16 @@ julia-compiled:
   RUN julia -e 'using Pkg; Pkg.add(["StaticCompiler", "StaticTools"]); using StaticCompiler, StaticTools; include("./leibniz_compiled.jl"); compile_executable(mainjl, (), "./")'
   DO +BENCH --name="julia-compiled" --lang="Julia (AOT compiled)" --version="julia --version" --cmd="./mainjl"
 
+julia-ux4:
+  # We have to use a special image since there is no Julia package on alpine 🤷‍♂️
+  FROM julia:1.8.2-alpine3.16
+  RUN apk add --no-cache hyperfine
+  COPY +build/scmeta ./
+  COPY ./src/rounds.txt ./
+  COPY ./src/leibniz_ux4.jl ./
+  # NOTE(review): --name must differ from the plain julia target; presumably BENCH keys its output on it — confirm.
+  DO +BENCH --name="julia-ux4" --lang="Julia (ux4)" --version="julia --version" --cmd="julia leibniz_ux4.jl"
+
 nodejs:
   FROM +alpine
   RUN apk add --no-cache nodejs-current

diff --git a/src/leibniz_avx2.cpp b/src/leibniz_avx2.cpp
@@ -0,0 +1,75 @@
+#include <cstdio>
+
+#include <immintrin.h>
+
+// Approximates pi with the Leibniz series 4 * (1 - 1/3 + 1/5 - 1/7 + ...).
+//
+// Term i (for i >= 2) contributes s / (2*i - 1), where s is -1 for even i and
+// +1 for odd i. The number of rounds is read from rounds.txt and the terms
+// i = 2 .. rounds + 1 are summed: four at a time with 256-bit vector
+// intrinsics, then any remaining 0-3 terms in a scalar wind-down loop.
+int main()
+{
+    unsigned rounds = 0;
+    std::FILE* infile = std::fopen("rounds.txt", "r");
+    if (infile == nullptr)
+    {
+        std::perror("rounds.txt");
+        return 1;
+    }
+    const int parsed = std::fscanf(infile, "%u", &rounds);
+    std::fclose(infile);
+    if (parsed != 1)
+    {
+        std::fprintf(stderr, "rounds.txt: expected an unsigned integer\n");
+        return 1;
+    }
+
+    // 64-bit bounds so that rounds + 2 cannot wrap around.
+    const unsigned long long unroll  = 4;
+    const unsigned long long first   = 2;
+    const unsigned long long end     = rounds + 2ull;
+    // The vector loop covers exactly [first, vec_end): only whole groups of
+    // four terms, so it never overlaps the wind-down loop or runs past end.
+    const unsigned long long vec_end = first + (end - first) / unroll * unroll;
+
+    // _mm256_set_pd lists lanes from highest to lowest; each lane below pairs
+    // index i with its sign (even i -> -1, odd i -> +1).
+    const __m256d sign = _mm256_set_pd(-1.0, 1.0, -1.0, 1.0);
+    const __m256d step = _mm256_set1_pd(4.0);
+    const __m256d two  = _mm256_set1_pd(2.0);
+    const __m256d mone = _mm256_set1_pd(-1.0);
+    __m256d ivec       = _mm256_set_pd(2.0, 3.0, 4.0, 5.0);
+    __m256d pivec      = _mm256_setzero_pd();
+
+    for (unsigned long long i = first; i < vec_end; i += unroll)
+    {
+        // den = 2 * i - 1 for the four indices currently held in ivec
+        const __m256d den = _mm256_add_pd(_mm256_mul_pd(two, ivec), mone);
+
+        // accumulate four partial sums, one per lane
+        pivec = _mm256_add_pd(pivec, _mm256_div_pd(sign, den));
+
+        // advance every lane to the next group of four indices
+        ivec = _mm256_add_pd(ivec, step);
+    }
+
+    // gather the partial sums
+    double lanes[4];
+    _mm256_storeu_pd(lanes, pivec);
+    double pi = 1.0;
+    pi += lanes[0] + lanes[1] + lanes[2] + lanes[3];
+
+    // Wind-down loop for the leftover terms. vec_end is always even, so the
+    // first flip below yields the -1 sign that an even index needs.
+    double x = 1.0;
+    for (unsigned long long i = vec_end; i < end; ++i)
+    {
+        x = -x;
+        pi += x / (2.0 * static_cast<double>(i) - 1.0);
+    }
+
+    pi *= 4;
+    std::printf("%.16f\n", pi); // print 16 decimal digits of pi
+    return 0;
+}
diff --git a/src/leibniz_ux4.jl b/src/leibniz_ux4.jl
@@ -0,0 +1,27 @@
+# Approximate pi with the Leibniz series 4 * (1 - 1/3 + 1/5 - 1/7 + ...),
+# summing terms i = 2 .. rounds + 2 where term i is s / (2i - 1) with s = -1
+# for even i and +1 for odd i. Terms are processed four at a time in a
+# manually unrolled loop; the remaining 0-3 terms go through a scalar loop.
+function f(rounds)
+    pi = 1.0
+    x  = -1.0
+    r2 = rounds + 2
+    # First index NOT handled by the unrolled loop: only whole groups of four
+    # are unrolled, so no term is counted twice or evaluated past r2.
+    vend = 2 + 4 * div(r2 - 1, 4)
+    @simd for i in 2:4:(vend - 4)
+        pi += x / (2.0 * i - 1.0) -
+              x / (2.0 * i + 1.0) +
+              x / (2.0 * i + 3.0) -
+              x / (2.0 * i + 5.0)
+    end
+    # vend is even, so the leftover terms start with the sign held in x (-1).
+    for i in vend:r2
+        pi += x / (2.0 * i - 1.0)
+        x = -x
+    end
+    return pi * 4
+end
+# Entry point: read the round count from rounds.txt and print the estimate.
+n_rounds = parse(Int64, readchomp("rounds.txt"))
+print(f(n_rounds))