diff --git a/src/add_functions.hpp b/src/add_functions.hpp
index ef171be23896c055277dfa093be30f7085d19438..7dc39d64e6f8c75004a9be83e1881d5dbbeb6555 100644
--- a/src/add_functions.hpp
+++ b/src/add_functions.hpp
@@ -58,6 +58,41 @@ void add_asm_neon_ld4(double_type* to, double_type* from1, double_type* from2, s
     );
 }
 
+// Add (to[i] = from1[i] + from2[i]) over N doubles starting at `offset`, using
+// NEON ld4/st4 with prfm pldl1strm prefetch hints on both inputs. The hints
+// target 64 bytes past the already post-incremented source pointers. Each
+// iteration handles 8 doubles and the loop exits only when the counter hits
+// exactly zero, so N must be a non-zero multiple of 8.
+template<typename double_type>
+void add_asm_neon_ld4_prfm(double_type* to, double_type* from1, double_type* from2, std::size_t offset, std::size_t N)
+{
+    static_assert(sizeof(double_type) == sizeof(double), "double_type has to be the same size as double");
+    double_type* ptr_to = to+offset;
+    double_type* ptr_from1 = from1+offset;
+    double_type* ptr_from2 = from2+offset;
+    asm volatile(
+        "1:\n\t"
+        "ld4 {v1.2d,v2.2d,v3.2d,v4.2d},[%[from_ptr1]],#64\n\t"
+        "ld4 {v5.2d,v6.2d,v7.2d,v8.2d},[%[from_ptr2]],#64\n\t"
+        "fadd v1.2d,v1.2d,v5.2d\n\t"
+        "prfm pldl1strm,[%[from_ptr1],#64]\n\t"
+        "fadd v2.2d,v2.2d,v6.2d\n\t"
+        "prfm pldl1strm,[%[from_ptr2],#64]\n\t"
+        "fadd v3.2d,v3.2d,v7.2d\n\t"
+        "fadd v4.2d,v4.2d,v8.2d\n\t"
+        "st4 {v1.2d,v2.2d,v3.2d,v4.2d},[%[to_ptr]],#64\n\t"
+        "subs %[count], %[count], #8\n\t"
+        "b.ne 1b\n\t"
+        : [count] "+r"(N),
+          [from_ptr1] "+r"(ptr_from1),
+          [from_ptr2] "+r"(ptr_from2),
+          [to_ptr] "+r"(ptr_to)
+        :
+        : "memory","cc","v1","v2","v3","v4","v5","v6","v7","v8"
+
+    );
+}
+
 template<typename double_type>
 void add_asm_neon_ld1(double_type* to, double_type* from1, double_type* from2, std::size_t offset, std::size_t N)
 {
diff --git a/src/scale_functions.hpp b/src/scale_functions.hpp
index 288ce95e04bda70f21b2b2c2268e9c25bfea98ff..5497cc0b8fa9f2a472a0404664c4f08a614b77d6 100644
--- a/src/scale_functions.hpp
+++ b/src/scale_functions.hpp
@@ -50,6 +50,37 @@ void scale_asm_neon_ld4(double_type* to, double_type* from, std::size_t offset,
     );
 }
 
+// Scale (to[i] = factor * from[i]) over N doubles starting at `offset`, using
+// NEON ld4/st4 with a prfm pldl1strm prefetch hint on the input. The factor
+// is broadcast once into v0 with ld1r before the loop. Each iteration handles
+// 8 doubles and the loop exits only when the counter hits exactly zero, so N
+// must be a non-zero multiple of 8.
+template<typename double_type>
+void scale_asm_neon_ld4_prfm(double_type* to, double_type* from, std::size_t offset, std::size_t N, double_type factor)
+{
+    static_assert(sizeof(double_type) == sizeof(double), "double_type has to be the same size as double");
+    double_type* ptr_to = to+offset;
+    double_type* ptr_from = from+offset;
+    asm volatile(
+        "ld1r {v0.2d},[%[scalar_ptr]]\n\t"
+        "1:\n\t"
+        "ld4 {v1.2d,v2.2d,v3.2d,v4.2d},[%[from_ptr]],#64\n\t"
+        "fmul v1.2d,v1.2d,v0.2d\n\t"
+        "prfm pldl1strm,[%[from_ptr],#64]\n\t"
+        "fmul v2.2d,v2.2d,v0.2d\n\t"
+        "fmul v3.2d,v3.2d,v0.2d\n\t"
+        "fmul v4.2d,v4.2d,v0.2d\n\t"
+        "st4 {v1.2d,v2.2d,v3.2d,v4.2d},[%[to_ptr]],#64\n\t"
+        "subs %[count], %[count], #8\n\t"
+        "b.ne 1b\n\t"
+        : [count] "+r"(N),
+          [from_ptr] "+r"(ptr_from),
+          [to_ptr] "+r"(ptr_to)
+        : [scalar_ptr] "r"(&factor)
+        : "memory","cc","v0","v1","v2","v3","v4"
+    );
+}
+
 template<typename double_type>
 void scale_asm_neon_ld1(double_type* to, double_type* from, std::size_t offset, std::size_t N, double_type factor)
 {
diff --git a/src/stream_bench.cpp b/src/stream_bench.cpp
index 76c1e3072cffdecee35a8fe6218dc6f556b00744..3d001d16fecdc21f2b786288fa95f26b59c01e38 100644
--- a/src/stream_bench.cpp
+++ b/src/stream_bench.cpp
@@ -397,6 +397,7 @@ int main(int argc, const char *argv[])
     bm.bench("asm_normal_d", scale_asm_normal_d<double>, init_scale, iter_count, destination.data(), source1.data(), offset, slice_size, factor);
     bm.bench("asm_neon_ld1", scale_asm_neon_ld1<double>, init_scale, iter_count, destination.data(), source1.data(), offset, slice_size, factor);
     bm.bench("asm_neon_ld4", scale_asm_neon_ld4<double>, init_scale, iter_count, destination.data(), source1.data(), offset, slice_size, factor);
+    bm.bench("asm_neon_ld4_prfm", scale_asm_neon_ld4_prfm<double>, init_scale, iter_count, destination.data(), source1.data(), offset, slice_size, factor);
 #elif defined(__amd64__) || defined(__amd64)\
     || defined(__x86_64__) || defined(__x86_64)\
     || defined(_M_X64) || defined(_M_AMD64)
@@ -446,6 +447,7 @@ int main(int argc, const char *argv[])
     bm.bench("asm_recommended", add_asm_recommended<double>, init_add, iter_count, destination.data(), source1.data(), source2.data(), offset, slice_size);
     bm.bench("asm_neon_ld1", add_asm_neon_ld1<double>, init_add, iter_count, destination.data(), source1.data(), source2.data(), offset, slice_size);
     bm.bench("asm_neon_ld4", add_asm_neon_ld4<double>, init_add, iter_count, destination.data(), source1.data(), source2.data(), offset, slice_size);
+    bm.bench("asm_neon_ld4_prfm", add_asm_neon_ld4_prfm<double>, init_add, iter_count, destination.data(), source1.data(), source2.data(), offset, slice_size);
 #elif defined(__amd64__) || defined(__amd64)\
     || defined(__x86_64__) || defined(__x86_64)\
     || defined(_M_X64) || defined(_M_AMD64)
@@ -496,8 +498,10 @@ int main(int argc, const char *argv[])
 #if defined(__aarch64__)
     bm.bench("asm_normal_d", triad_asm_normal_d<double>, init_triad, iter_count, destination.data(), source1.data(), source2.data(), offset, slice_size, factor);
     bm.bench("asm_recommended", triad_asm_recommended<double>, init_triad, iter_count, destination.data(), source1.data(), source2.data(), offset, slice_size, factor);
+    bm.bench("asm_recommended_prfm",triad_asm_recommended_prfm<double>,init_triad, iter_count, destination.data(), source1.data(), source2.data(), offset, slice_size, factor);
     bm.bench("asm_neon_ld1", triad_asm_neon_ld1<double>, init_triad, iter_count, destination.data(), source1.data(), source2.data(), offset, slice_size, factor);
     bm.bench("asm_neon_ld4", triad_asm_neon_ld4<double>, init_triad, iter_count, destination.data(), source1.data(), source2.data(), offset, slice_size, factor);
+    bm.bench("asm_neon_ld4_prfm", triad_asm_neon_ld4_prfm<double>, init_triad, iter_count, destination.data(), source1.data(), source2.data(), offset, slice_size, factor);
 #elif defined(__amd64__) || defined(__amd64)\
     || defined(__x86_64__) || defined(__x86_64)\
     || defined(_M_X64) || defined(_M_AMD64)
diff --git a/src/triad_functions.hpp b/src/triad_functions.hpp
index 993a83fa44068d6ec85f51780213efc5b5f93526..266144c95da36e9fd12fa5bec20bc6baa9a8bd4a 100644
--- a/src/triad_functions.hpp
+++ b/src/triad_functions.hpp
@@ -58,6 +58,41 @@ void triad_asm_neon_ld4(double_type* to, double_type* from1, double_type* from2,
     );
 }
 
+// Triad (to[i] = from1[i] + factor * from2[i]) over N doubles starting at
+// `offset`, using NEON ld4/st4 plus prfm pldl1strm prefetch hints on both
+// inputs. fmla accumulates into the from1 registers. Each iteration handles
+// 8 doubles and the loop exits only when the counter hits exactly zero, so N
+// must be a non-zero multiple of 8.
+template<typename double_type>
+void triad_asm_neon_ld4_prfm(double_type* to, double_type* from1, double_type* from2, std::size_t offset, std::size_t N, double_type factor)
+{
+    static_assert(sizeof(double_type) == sizeof(double), "double_type has to be the same size as double");
+    double_type* ptr_to = to+offset;
+    double_type* ptr_from1 = from1+offset;
+    double_type* ptr_from2 = from2+offset;
+    asm volatile(
+        "1:\n\t"
+        "ld4 {v1.2d,v2.2d,v3.2d,v4.2d},[%[from_ptr1]],#64\n\t"
+        "ld4 {v5.2d,v6.2d,v7.2d,v8.2d},[%[from_ptr2]],#64\n\t"
+        "fmla v1.2d,v5.2d,%[scalar].2d[0]\n\t"
+        "prfm pldl1strm,[%[from_ptr1],#64]\n\t"
+        "fmla v2.2d,v6.2d,%[scalar].2d[0]\n\t"
+        "prfm pldl1strm,[%[from_ptr2],#64]\n\t"
+        "fmla v3.2d,v7.2d,%[scalar].2d[0]\n\t"
+        "fmla v4.2d,v8.2d,%[scalar].2d[0]\n\t"
+        "st4 {v1.2d,v2.2d,v3.2d,v4.2d},[%[to_ptr]],#64\n\t"
+        "subs %[count], %[count], #8\n\t"
+        "b.ne 1b\n\t"
+        : [count] "+r"(N),
+          [from_ptr1] "+r"(ptr_from1),
+          [from_ptr2] "+r"(ptr_from2),
+          [to_ptr] "+r"(ptr_to)
+        : [scalar] "w"(factor)
+        : "memory","cc","v1","v2","v3","v4","v5","v6","v7","v8"
+
+    );
+}
+
 template<typename double_type>
 void triad_asm_neon_ld1(double_type* to, double_type* from1, double_type* from2, std::size_t offset, std::size_t N, double_type factor)
 {
@@ -151,6 +186,58 @@ void triad_asm_recommended(double_type* to, double_type* from1, double_type* fro
         : "memory","cc","d0","d1"
     );
 }
+
+// recommended plus prefetch
+// Scalar triad (to[i] = from1[i] + factor * from2[i]) built from ldp/fmadd/stp
+// pairs, unrolled to 8 doubles per iteration, with one prfm pldl1strm hint per
+// input stream. The loop exits only when the counter hits exactly zero, so N
+// must be a non-zero multiple of 8. d0-d4 are all written by the asm and are
+// therefore all listed as clobbers.
+template<typename double_type>
+void triad_asm_recommended_prfm(double_type* to, double_type* from1, double_type* from2, std::size_t offset, std::size_t N, double_type factor)
+{
+    static_assert(sizeof(double_type) == sizeof(double), "double_type has to be the same size as double");
+    double_type* ptr_to = to+offset;
+    double_type* ptr_from1 = from1+offset;
+    double_type* ptr_from2 = from2+offset;
+    asm volatile(
+        "ldr d4,[%[scalar_ptr]]\n\t"
+        "1:\n\t"
+        "ldp d0,d1,[%[from1_ptr],0]\n\t"
+        "ldp d2,d3,[%[from2_ptr],0]\n\t"
+        "fmadd d0,d2,d4,d0\n\t"
+        "prfm pldl1strm,[%[from1_ptr],#64]\n\t"
+        "fmadd d1,d3,d4,d1\n\t"
+        "stp d0,d1,[%[to_ptr],0]\n\t"
+        "ldp d0,d1,[%[from1_ptr],#16]\n\t"
+        "ldp d2,d3,[%[from2_ptr],#16]\n\t"
+        "fmadd d0,d2,d4,d0\n\t"
+        "prfm pldl1strm,[%[from2_ptr],#64]\n\t"
+        "fmadd d1,d3,d4,d1\n\t"
+        "stp d0,d1,[%[to_ptr],#16]\n\t"
+        "ldp d0,d1,[%[from1_ptr],#32]\n\t"
+        "ldp d2,d3,[%[from2_ptr],#32]\n\t"
+        "fmadd d0,d2,d4,d0\n\t"
+        "fmadd d1,d3,d4,d1\n\t"
+        "stp d0,d1,[%[to_ptr],#32]\n\t"
+        "ldp d0,d1,[%[from1_ptr],#48]\n\t"
+        "ldp d2,d3,[%[from2_ptr],#48]\n\t"
+        "fmadd d0,d2,d4,d0\n\t"
+        "fmadd d1,d3,d4,d1\n\t"
+        "stp d0,d1,[%[to_ptr],#48]\n\t"
+        "add %[from1_ptr],%[from1_ptr],#64\n\t"
+        "add %[from2_ptr],%[from2_ptr],#64\n\t"
+        "add %[to_ptr],%[to_ptr],#64\n\t"
+        "subs %[count], %[count], #8\n\t"
+        "b.ne 1b\n\t"
+        : [count] "+r"(N),
+          [from1_ptr] "+r"(ptr_from1),
+          [from2_ptr] "+r"(ptr_from2),
+          [to_ptr] "+r"(ptr_to)
+        : [scalar_ptr] "r"(&factor)
+        : "memory","cc","d0","d1","d2","d3","d4"
+    );
+}
 #elif defined(__amd64__) || defined(__amd64)\
     || defined(__x86_64__) || defined(__x86_64)\
     || defined(_M_X64) || defined(_M_AMD64)
diff --git a/tests/src/add_test.cpp b/tests/src/add_test.cpp
index cac4391c61972a58698b30cd0894b54d74808a37..6968df7b3ccad0219d9be68ea5b144604ff1e4cb 100644
--- a/tests/src/add_test.cpp
+++ b/tests/src/add_test.cpp
@@ -1,8 +1,8 @@
 #include <iostream>
 #include <vector>
 
+#include <bench_common/aligned_allocator.hpp>
 #include "add_functions.hpp"
-#include "aligned_allocator.hpp"
 #include "result_compare.hpp"
 
 
diff --git a/tests/src/copy_test.cpp b/tests/src/copy_test.cpp
index ac2b67dcf88b6128a95bda36ed61fbc624755774..8e7274ee3a828f7cf13ee73a575d7410a2d3b8e3 100644
--- a/tests/src/copy_test.cpp
+++ b/tests/src/copy_test.cpp
@@ -1,8 +1,9 @@
 #include <iostream>
 #include <vector>
 
+#include <bench_common/aligned_allocator.hpp>
+
 #include "copy_functions.hpp"
-#include "aligned_allocator.hpp"
 #include "result_compare.hpp"
 
 
diff --git a/tests/src/scale_test.cpp b/tests/src/scale_test.cpp
index c7b397842abb6676fd5af82c5ea36e35c5e8b057..3eaeb717dc6cdb6bbf486c9937946086f76d265d 100644
--- a/tests/src/scale_test.cpp
+++ b/tests/src/scale_test.cpp
@@ -1,8 +1,9 @@
 #include <iostream>
 #include <vector>
 
+#include <bench_common/aligned_allocator.hpp>
+
 #include "scale_functions.hpp"
-#include "aligned_allocator.hpp"
 #include "result_compare.hpp"
 
 
diff --git a/tests/src/triad_test.cpp b/tests/src/triad_test.cpp
index 30de1883d9421da337f4fb93471fa91867939304..fd8ff1b597c93b34a7531106460655cc33fb6727 100644
--- a/tests/src/triad_test.cpp
+++ b/tests/src/triad_test.cpp
@@ -1,8 +1,9 @@
 #include <iostream>
 #include <vector>
 
+#include <bench_common/aligned_allocator.hpp>
+
 #include "triad_functions.hpp"
-#include "aligned_allocator.hpp"
 #include "result_compare.hpp"
 
 