From 1174345943e1e4e3c4d3c5b7f35c981eeea8f23d Mon Sep 17 00:00:00 2001 From: Stepan Nassyr <s.nassyr@fz-juelich.de> Date: Thu, 26 Sep 2019 14:20:07 +0200 Subject: [PATCH] More ARM prfm variants, use bench_common --- src/add_functions.hpp | 30 ++++++++++++++++ src/scale_functions.hpp | 26 ++++++++++++++ src/stream_bench.cpp | 4 +++ src/triad_functions.hpp | 77 ++++++++++++++++++++++++++++++++++++++++ tests/src/add_test.cpp | 2 +- tests/src/copy_test.cpp | 3 +- tests/src/scale_test.cpp | 3 +- tests/src/triad_test.cpp | 3 +- 8 files changed, 144 insertions(+), 4 deletions(-) diff --git a/src/add_functions.hpp b/src/add_functions.hpp index ef171be..7dc39d6 100644 --- a/src/add_functions.hpp +++ b/src/add_functions.hpp @@ -58,6 +58,36 @@ void add_asm_neon_ld4(double_type* to, double_type* from1, double_type* from2, s ); } +template<typename double_type> +void add_asm_neon_ld4_prfm(double_type* to, double_type* from1, double_type* from2, std::size_t offset, std::size_t N) +{ + static_assert(sizeof(double_type) == sizeof(double), "double_type_type has to be the same size as double_type"); + double_type* ptr_to = to+offset; + double_type* ptr_from1 = from1+offset; + double_type* ptr_from2 = from2+offset; + asm volatile( + "1:\n\t" + "ld4 {v1.2d,v2.2d,v3.2d,v4.2d},[%[from_ptr1]],#64\n\t" + "ld4 {v5.2d,v6.2d,v7.2d,v8.2d},[%[from_ptr2]],#64\n\t" + "fadd v1.2d,v1.2d,v5.2d\n\t" + "prfm pldl1strm,[%[from_ptr1],#64]\n\t" + "fadd v2.2d,v2.2d,v6.2d\n\t" + "prfm pldl1strm,[%[from_ptr2],#64]\n\t" + "fadd v3.2d,v3.2d,v7.2d\n\t" + "fadd v4.2d,v4.2d,v8.2d\n\t" + "st4 {v1.2d,v2.2d,v3.2d,v4.2d},[%[to_ptr]],#64\n\t" + "subs %[count], %[count], #8\n\t" + "b.ne 1b\n\t" + : [count] "+r"(N), + [from_ptr1] "+r"(ptr_from1), + [from_ptr2] "+r"(ptr_from2), + [to_ptr] "+r"(ptr_to) + : + : "memory","cc","v1","v2","v3","v4","v5","v6","v7","v8" + + ); +} + template<typename double_type> void add_asm_neon_ld1(double_type* to, double_type* from1, double_type* from2, std::size_t offset, std::size_t N) { 
diff --git a/src/scale_functions.hpp b/src/scale_functions.hpp index 288ce95..5497cc0 100644 --- a/src/scale_functions.hpp +++ b/src/scale_functions.hpp @@ -50,6 +50,32 @@ void scale_asm_neon_ld4(double_type* to, double_type* from, std::size_t offset, ); } +template<typename double_type> +void scale_asm_neon_ld4_prfm(double_type* to, double_type* from, std::size_t offset, std::size_t N, double_type factor) +{ + static_assert(sizeof(double_type) == sizeof(double), "double_type_type has to be the same size as double_type"); + double_type* ptr_to = to+offset; + double_type* ptr_from = from+offset; + asm volatile( + "ld1r {v0.2d},[%[scalar_ptr]]\n\t" + "1:\n\t" + "ld4 {v1.2d,v2.2d,v3.2d,v4.2d},[%[from_ptr]],#64\n\t" + "fmul v1.2d,v1.2d,v0.2d\n\t" + "prfm pldl1strm,[%[from_ptr],#64]\n\t" + "fmul v2.2d,v2.2d,v0.2d\n\t" + "fmul v3.2d,v3.2d,v0.2d\n\t" + "fmul v4.2d,v4.2d,v0.2d\n\t" + "st4 {v1.2d,v2.2d,v3.2d,v4.2d},[%[to_ptr]],#64\n\t" + "subs %[count], %[count], #8\n\t" + "b.ne 1b\n\t" + : [count] "+r"(N), + [from_ptr] "+r"(ptr_from), + [to_ptr] "+r"(ptr_to) + : [scalar_ptr] "r"(&factor) + : "memory","cc","v0","v1","v2","v3","v4" + ); +} + template<typename double_type> void scale_asm_neon_ld1(double_type* to, double_type* from, std::size_t offset, std::size_t N, double_type factor) { diff --git a/src/stream_bench.cpp b/src/stream_bench.cpp index 76c1e30..3d001d1 100644 --- a/src/stream_bench.cpp +++ b/src/stream_bench.cpp @@ -397,6 +397,7 @@ int main(int argc, const char *argv[]) bm.bench("asm_normal_d", scale_asm_normal_d<double>, init_scale, iter_count, destination.data(), source1.data(), offset, slice_size, factor); bm.bench("asm_neon_ld1", scale_asm_neon_ld1<double>, init_scale, iter_count, destination.data(), source1.data(), offset, slice_size, factor); bm.bench("asm_neon_ld4", scale_asm_neon_ld4<double>, init_scale, iter_count, destination.data(), source1.data(), offset, slice_size, factor); + bm.bench("asm_neon_ld4_prfm", scale_asm_neon_ld4_prfm<double>, 
init_scale, iter_count, destination.data(), source1.data(), offset, slice_size, factor); #elif defined(__amd64__) || defined(__amd64)\ || defined(__x86_64__) || defined(__x86_64)\ || defined(_M_X64) || defined(_M_AMD64) @@ -446,6 +447,7 @@ int main(int argc, const char *argv[]) bm.bench("asm_recommended", add_asm_recommended<double>, init_add, iter_count, destination.data(), source1.data(), source2.data(), offset, slice_size); bm.bench("asm_neon_ld1", add_asm_neon_ld1<double>, init_add, iter_count, destination.data(), source1.data(), source2.data(), offset, slice_size); bm.bench("asm_neon_ld4", add_asm_neon_ld4<double>, init_add, iter_count, destination.data(), source1.data(), source2.data(), offset, slice_size); + bm.bench("asm_neon_ld4_prfm", add_asm_neon_ld4_prfm<double>, init_add, iter_count, destination.data(), source1.data(), source2.data(), offset, slice_size); #elif defined(__amd64__) || defined(__amd64)\ || defined(__x86_64__) || defined(__x86_64)\ || defined(_M_X64) || defined(_M_AMD64) @@ -496,8 +498,10 @@ int main(int argc, const char *argv[]) #if defined(__aarch64__) bm.bench("asm_normal_d", triad_asm_normal_d<double>, init_triad, iter_count, destination.data(), source1.data(), source2.data(), offset, slice_size, factor); bm.bench("asm_recommended", triad_asm_recommended<double>, init_triad, iter_count, destination.data(), source1.data(), source2.data(), offset, slice_size, factor); + bm.bench("asm_recommended_prfm",triad_asm_recommended_prfm<double>,init_triad, iter_count, destination.data(), source1.data(), source2.data(), offset, slice_size, factor); bm.bench("asm_neon_ld1", triad_asm_neon_ld1<double>, init_triad, iter_count, destination.data(), source1.data(), source2.data(), offset, slice_size, factor); bm.bench("asm_neon_ld4", triad_asm_neon_ld4<double>, init_triad, iter_count, destination.data(), source1.data(), source2.data(), offset, slice_size, factor); + bm.bench("asm_neon_ld4_prfm", triad_asm_neon_ld4_prfm<double>, init_triad, iter_count, 
destination.data(), source1.data(), source2.data(), offset, slice_size, factor); #elif defined(__amd64__) || defined(__amd64)\ || defined(__x86_64__) || defined(__x86_64)\ || defined(_M_X64) || defined(_M_AMD64) diff --git a/src/triad_functions.hpp b/src/triad_functions.hpp index 993a83f..266144c 100644 --- a/src/triad_functions.hpp +++ b/src/triad_functions.hpp @@ -58,6 +58,36 @@ void triad_asm_neon_ld4(double_type* to, double_type* from1, double_type* from2, ); } +template<typename double_type> +void triad_asm_neon_ld4_prfm(double_type* to, double_type* from1, double_type* from2, std::size_t offset, std::size_t N, double_type factor) +{ + static_assert(sizeof(double_type) == sizeof(double), "double_type_type has to be the same size as double_type"); + double_type* ptr_to = to+offset; + double_type* ptr_from1 = from1+offset; + double_type* ptr_from2 = from2+offset; + asm volatile( + "1:\n\t" + "ld4 {v1.2d,v2.2d,v3.2d,v4.2d},[%[from_ptr1]],#64\n\t" + "ld4 {v5.2d,v6.2d,v7.2d,v8.2d},[%[from_ptr2]],#64\n\t" + "fmla v1.2d,v5.2d,%[scalar].2d[0]\n\t" + "prfm pldl1strm,[%[from_ptr1],#64]\n\t" + "fmla v2.2d,v6.2d,%[scalar].2d[0]\n\t" + "prfm pldl1strm,[%[from_ptr2],#64]\n\t" + "fmla v3.2d,v7.2d,%[scalar].2d[0]\n\t" + "fmla v4.2d,v8.2d,%[scalar].2d[0]\n\t" + "st4 {v1.2d,v2.2d,v3.2d,v4.2d},[%[to_ptr]],#64\n\t" + "subs %[count], %[count], #8\n\t" + "b.ne 1b\n\t" + : [count] "+r"(N), + [from_ptr1] "+r"(ptr_from1), + [from_ptr2] "+r"(ptr_from2), + [to_ptr] "+r"(ptr_to) + : [scalar] "w"(factor) + : "memory","cc","v1","v2","v3","v4","v5","v6","v7","v8" + + ); +} + template<typename double_type> void triad_asm_neon_ld1(double_type* to, double_type* from1, double_type* from2, std::size_t offset, std::size_t N, double_type factor) { @@ -151,6 +181,53 @@ void triad_asm_recommended(double_type* to, double_type* from1, double_type* fro : "memory","cc","d0","d1" ); } + +// recommended plus prefetch +template<typename double_type> +void triad_asm_recommended_prfm(double_type* to, 
double_type* from1, double_type* from2, std::size_t offset, std::size_t N, double_type factor) +{ + static_assert(sizeof(double_type) == sizeof(double), "double_type_type has to be the same size as double_type"); + double_type* ptr_to = to+offset; + double_type* ptr_from1 = from1+offset; + double_type* ptr_from2 = from2+offset; + asm volatile( + "ldr d4,[%[scalar_ptr]]\n\t" + "1:\n\t" + "ldp d0,d1,[%[from1_ptr],0]\n\t" + "ldp d2,d3,[%[from2_ptr],0]\n\t" + "fmadd d0,d2,d4,d0\n\t" + "prfm pldl1strm,[%[from1_ptr],#64]\n\t" + "fmadd d1,d3,d4,d1\n\t" + "stp d0,d1,[%[to_ptr],0]\n\t" + "ldp d0,d1,[%[from1_ptr],#16]\n\t" + "ldp d2,d3,[%[from2_ptr],#16]\n\t" + "fmadd d0,d2,d4,d0\n\t" + "prfm pldl1strm,[%[from2_ptr],#64]\n\t" + "fmadd d1,d3,d4,d1\n\t" + "stp d0,d1,[%[to_ptr],#16]\n\t" + "ldp d0,d1,[%[from1_ptr],#32]\n\t" + "ldp d2,d3,[%[from2_ptr],#32]\n\t" + "fmadd d0,d2,d4,d0\n\t" + "fmadd d1,d3,d4,d1\n\t" + "stp d0,d1,[%[to_ptr],#32]\n\t" + "ldp d0,d1,[%[from1_ptr],#48]\n\t" + "ldp d2,d3,[%[from2_ptr],#48]\n\t" + "fmadd d0,d2,d4,d0\n\t" + "fmadd d1,d3,d4,d1\n\t" + "stp d0,d1,[%[to_ptr],#48]\n\t" + "add %[from1_ptr],%[from1_ptr],#64\n\t" + "add %[from2_ptr],%[from2_ptr],#64\n\t" + "add %[to_ptr],%[to_ptr],#64\n\t" + "subs %[count], %[count], #8\n\t" + "b.ne 1b\n\t" + : [count] "+r"(N), + [from1_ptr] "+r"(ptr_from1), + [from2_ptr] "+r"(ptr_from2), + [to_ptr] "+r"(ptr_to) + : [scalar_ptr] "r"(&factor) + : "memory","cc","d0","d1","d2","d3","d4" + ); +} #elif defined(__amd64__) || defined(__amd64)\ || defined(__x86_64__) || defined(__x86_64)\ || defined(_M_X64) || defined(_M_AMD64) diff --git a/tests/src/add_test.cpp b/tests/src/add_test.cpp index cac4391..6968df7 100644 --- a/tests/src/add_test.cpp +++ b/tests/src/add_test.cpp @@ -1,8 +1,8 @@ #include <iostream> #include <vector> +#include <bench_common/aligned_allocator.hpp> #include "add_functions.hpp" -#include "aligned_allocator.hpp" #include "result_compare.hpp" diff --git a/tests/src/copy_test.cpp b/tests/src/copy_test.cpp index 
ac2b67d..8e7274e 100644 --- a/tests/src/copy_test.cpp +++ b/tests/src/copy_test.cpp @@ -1,8 +1,9 @@ #include <iostream> #include <vector> +#include <bench_common/aligned_allocator.hpp> + #include "copy_functions.hpp" -#include "aligned_allocator.hpp" #include "result_compare.hpp" diff --git a/tests/src/scale_test.cpp b/tests/src/scale_test.cpp index c7b3978..3eaeb71 100644 --- a/tests/src/scale_test.cpp +++ b/tests/src/scale_test.cpp @@ -1,8 +1,9 @@ #include <iostream> #include <vector> +#include <bench_common/aligned_allocator.hpp> + #include "scale_functions.hpp" -#include "aligned_allocator.hpp" #include "result_compare.hpp" diff --git a/tests/src/triad_test.cpp b/tests/src/triad_test.cpp index 30de188..fd8ff1b 100644 --- a/tests/src/triad_test.cpp +++ b/tests/src/triad_test.cpp @@ -1,8 +1,9 @@ #include <iostream> #include <vector> +#include <bench_common/aligned_allocator.hpp> + #include "triad_functions.hpp" -#include "aligned_allocator.hpp" #include "result_compare.hpp" -- GitLab