Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

AVX2 variation of c++ #74

Merged
merged 7 commits into from
Oct 24, 2022
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions Earthfile
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ collect-data:
BUILD +clj
BUILD +clj-bb
BUILD +cpp
BUILD +cpp-avx2
BUILD +crystal
BUILD +cs
BUILD +d
Expand All @@ -41,6 +42,7 @@ collect-data:
BUILD +java
BUILD +julia
BUILD +julia-compiled
BUILD +julia_ux4
BUILD +nodejs
BUILD +lua
BUILD +luajit
Expand Down Expand Up @@ -114,6 +116,14 @@ cpp:
RUN --no-cache g++ leibniz.cpp -o leibniz -O3 -s -static -flto -march=native -mtune=native -fomit-frame-pointer -fno-signed-zeros -fno-trapping-math -fassociative-math
DO +BENCH --name="cpp" --lang="C++ (g++)" --version="g++ --version" --cmd="./leibniz"

cpp-avx2:
    # Hand-vectorised (AVX2 intrinsics) C++ variant; uses the same g++ flags as the
    # scalar `cpp` target so the two results are directly comparable.
    FROM +alpine
    RUN apk add --no-cache gcc build-base

    COPY ./src/leibniz_avx2.cpp ./
    # Compile the source that was copied above (leibniz.cpp is not present in this target).
    RUN --no-cache g++ leibniz_avx2.cpp -o leibniz_avx2 -O3 -s -static -flto -march=native -mtune=native -fomit-frame-pointer -fno-signed-zeros -fno-trapping-math -fassociative-math
    DO +BENCH --name="cpp-avx2" --lang="C++ (avx2)" --version="g++ --version" --cmd="./leibniz_avx2"


crystal:
FROM crystallang/crystal:1.6-alpine
RUN apk add --no-cache hyperfine
Expand Down Expand Up @@ -221,6 +231,16 @@ julia-compiled:
RUN julia -e 'using Pkg; Pkg.add(["StaticCompiler", "StaticTools"]); using StaticCompiler, StaticTools; include("./leibniz_compiled.jl"); compile_executable(mainjl, (), "./")'
DO +BENCH --name="julia-compiled" --lang="Julia (AOT compiled)" --version="julia --version" --cmd="./mainjl"

julia-ux4:
    # Julia variant with the series loop manually unrolled by four.
    # NOTE(review): this target is spelled with a hyphen; any `BUILD` line that refers
    # to it (e.g. in collect-data) must use `+julia-ux4`, not `+julia_ux4`.
    # We have to use a special image since there is no Julia package on alpine.
    FROM julia:1.8.2-alpine3.16
    RUN apk add --no-cache hyperfine
    COPY +build/scmeta ./

    COPY ./src/rounds.txt ./
    COPY ./src/leibniz_ux4.jl ./
    # Use a benchmark name unique to this target, as the other targets do, instead of
    # reusing "julia" from the plain Julia target.
    DO +BENCH --name="julia-ux4" --lang="Julia (ux4)" --version="julia --version" --cmd="julia leibniz_ux4.jl"


nodejs:
FROM +alpine
RUN apk add --no-cache nodejs-current
Expand Down
56 changes: 56 additions & 0 deletions leibniz_avx2.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#include <cstdio>

#include <immintrin.h>


// Approximates pi with the Leibniz series
//     pi / 4 = 1 - 1/3 + 1/5 - 1/7 + ...
// Term i (i >= 1) is (-1)^(i+1) / (2*i - 1). The leading term (i == 1) seeds the
// accumulator; indices [2, rounds + 2) are summed four at a time with AVX
// double-precision intrinsics, and the fewer-than-four leftovers are summed
// by a scalar loop.
//
// Reads the unsigned term count from "rounds.txt" in the working directory and
// prints the estimate with 16 decimal digits. Returns 1 if the file cannot be
// opened or does not start with an unsigned integer.
int main()
{
    unsigned rounds = 0;
    std::FILE* infile = std::fopen("rounds.txt", "r");
    if (infile == nullptr)
    {
        std::perror("rounds.txt");
        return 1;
    }
    const int parsed = std::fscanf(infile, "%u", &rounds);
    std::fclose(infile);
    if (parsed != 1)
    {
        std::fprintf(stderr, "rounds.txt: expected an unsigned integer\n");
        return 1;
    }

    // 64-bit indices so that `rounds + 2` cannot wrap around.
    const unsigned long long unroll = 4;
    const unsigned long long first = 2;
    const unsigned long long stop = static_cast<unsigned long long>(rounds) + 2;
    // The vector loop covers [first, vec_end); measure whole groups of four from
    // `first`, not from zero, so the scalar tail below starts exactly where the
    // vector loop stopped and no term is skipped or counted twice.
    const unsigned long long vec_end = first + ((stop - first) / unroll) * unroll;

    // Lane k handles index i + k. The first index of every group is even, so the
    // per-lane signs are fixed at -, +, -, + and never need flipping.
    const __m256d sign = _mm256_setr_pd(-1.0, 1.0, -1.0, 1.0);
    const __m256d two = _mm256_set1_pd(2.0);
    const __m256d mone = _mm256_set1_pd(-1.0);
    const __m256d inc = _mm256_set1_pd(4.0);
    __m256d ivec = _mm256_setr_pd(2.0, 3.0, 4.0, 5.0);
    __m256d pivec = _mm256_setzero_pd();

    for (unsigned long long i = first; i < vec_end; i += unroll)
    {
        // den = 2 * i - 1 for each lane
        const __m256d den = _mm256_add_pd(_mm256_mul_pd(two, ivec), mone);

        // accumulate the four partial sums
        pivec = _mm256_add_pd(pivec, _mm256_div_pd(sign, den));

        // advance every lane to the next group of four indices
        ivec = _mm256_add_pd(ivec, inc);
    }

    // Gather the partial sums through a store rather than a pointer cast.
    double lanes[4];
    _mm256_storeu_pd(lanes, pivec);
    double pi = 1.0 + (lanes[0] + lanes[1] + lanes[2] + lanes[3]);

    // Wind-down loop for the remaining (at most three) terms. vec_end is even,
    // and even indices carry a negative sign.
    double x = -1.0;
    for (unsigned long long i = vec_end; i < stop; ++i)
    {
        pi += x / (2.0 * static_cast<double>(i) - 1.0);
        x = -x;
    }

    pi *= 4;
    std::printf("%.16f\n", pi); // print 16 decimal digits of pi
    return 0;
}
20 changes: 20 additions & 0 deletions src/leibniz_ux4.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
"""
    f(rounds)

Approximate π with the Leibniz series `π/4 = 1 - 1/3 + 1/5 - 1/7 + …`.

Term `i` (for `i ≥ 1`) is `(-1)^(i+1) / (2i - 1)`. The leading term seeds the
accumulator and indices `2:(rounds + 2)` are added to it. Complete groups of four
consecutive indices are summed in a manually unrolled `@simd` loop; the remaining
(at most three) terms are summed by a scalar loop.

Returns the estimate as a `Float64`. If `rounds + 2 < 2`, no terms are added and the
result is `4.0`.
"""
function f(rounds)
    stop = rounds + 2                  # last term index, inclusive
    nblocks = div(stop - 1, 4)         # whole groups of four among indices 2:stop
    tailstart = 2 + 4 * nblocks        # first index not covered by the unrolled loop
    acc = 1.0
    # Every group starts on an even index, so its signs are always -, +, -, +.
    @simd for i in 2:4:(tailstart - 4)
        acc += 1.0 / (2.0 * i + 1.0) -
               1.0 / (2.0 * i - 1.0) +
               1.0 / (2.0 * i + 5.0) -
               1.0 / (2.0 * i + 3.0)
    end
    # `tailstart` is even, so the leftover terms start with a negative sign.
    s = -1.0
    for i in tailstart:stop
        acc += s / (2.0 * i - 1.0)
        s = -s
    end
    return acc * 4
end

# Load the round count from rounds.txt (trailing newline stripped), then print the
# resulting estimate without a trailing newline.
rounds = open("rounds.txt") do io
    parse(Int64, chomp(read(io, String)))
end
print(f(rounds))