add_functions.hpp

    #ifndef ADD_FUNCTIONS_HPP
    #define ADD_FUNCTIONS_HPP
    
    #include <cstddef>      // std::size_t
    #include <cstdint>      // std::uintptr_t
    #include <algorithm>    // std::transform
    
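    // Plain C-style loop: to[i] = from1[i] + from2[i] for i in [offset, offset+N).
    // Portable reference implementation for the vectorized variants below.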
    template<typename double_type>
    void add_clike(double_type* to, double_type* from1, double_type* from2, std::size_t offset, std::size_t N)
    {
        static_assert(sizeof(double_type) == sizeof(double), "double_type has to be the same size as double");
        for(std::size_t i = 0; i < N; i++)
        {
            to[i+offset] = from1[i+offset]+from2[i+offset];
        }
    }
    
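    // Same operation expressed with std::transform and a lambda.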
    template<typename double_type>
    void add_cpplike(double_type* to, double_type* from1, double_type* from2, std::size_t offset, std::size_t N)
    {
        static_assert(sizeof(double_type) == sizeof(double), "double_type has to be the same size as double");
        std::transform(from1+offset, from1+offset+N,
                       from2+offset,
                       to+offset,
                       [](double_type v1, double_type v2)
                       {
                           return v1+v2;
                       });
    }
    
    #if defined(__aarch64__)
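    // NEON loop using ld4/st4: each iteration moves 64 bytes (8 doubles) per stream
    // through v1-v4 and v5-v8. ld4 de-interleaves and st4 re-interleaves, which is
    // harmless here because the addition is purely element-wise. N must be a nonzero
    // multiple of 8, since the loop only exits when the counter reaches exactly zero.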
    template<typename double_type>
    void add_asm_neon_ld4(double_type* to, double_type* from1, double_type* from2, std::size_t offset, std::size_t N)
    {
        static_assert(sizeof(double_type) == sizeof(double), "double_type has to be the same size as double");
        double_type* ptr_to = to+offset;
        double_type* ptr_from1 = from1+offset;
        double_type* ptr_from2 = from2+offset;
        asm volatile(
                "1:\n\t"
                "ld4 {v1.2d,v2.2d,v3.2d,v4.2d},[%[from_ptr1]],#64\n\t"
                "ld4 {v5.2d,v6.2d,v7.2d,v8.2d},[%[from_ptr2]],#64\n\t"
                "fadd v1.2d,v1.2d,v5.2d\n\t"
                "fadd v2.2d,v2.2d,v6.2d\n\t"
                "fadd v3.2d,v3.2d,v7.2d\n\t"
                "fadd v4.2d,v4.2d,v8.2d\n\t"
                "st4 {v1.2d,v2.2d,v3.2d,v4.2d},[%[to_ptr]],#64\n\t"
                "subs %[count], %[count], #8\n\t"
                "b.ne 1b\n\t"
                : [count]    "+r"(N),
                  [from_ptr1] "+r"(ptr_from1),
                  [from_ptr2] "+r"(ptr_from2),
                  [to_ptr]   "+r"(ptr_to)
                : 
                : "memory","cc","v1","v2","v3","v4","v5","v6","v7","v8"
    
                );
    }
    
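    // Same ld4/st4 loop with PRFM hints that prefetch the next 64-byte block of
    // each input stream into L1 as streaming data.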
    template<typename double_type>
    void add_asm_neon_ld4_prfm(double_type* to, double_type* from1, double_type* from2, std::size_t offset, std::size_t N)
    {
        static_assert(sizeof(double_type) == sizeof(double), "double_type has to be the same size as double");
        double_type* ptr_to = to+offset;
        double_type* ptr_from1 = from1+offset;
        double_type* ptr_from2 = from2+offset;
        asm volatile(
                "1:\n\t"
                "ld4 {v1.2d,v2.2d,v3.2d,v4.2d},[%[from_ptr1]],#64\n\t"
                "ld4 {v5.2d,v6.2d,v7.2d,v8.2d},[%[from_ptr2]],#64\n\t"
                "fadd v1.2d,v1.2d,v5.2d\n\t"
                "prfm pldl1strm,[%[from_ptr1],#64]\n\t"
                "fadd v2.2d,v2.2d,v6.2d\n\t"
                "prfm pldl1strm,[%[from_ptr2],#64]\n\t"
                "fadd v3.2d,v3.2d,v7.2d\n\t"
                "fadd v4.2d,v4.2d,v8.2d\n\t"
                "st4 {v1.2d,v2.2d,v3.2d,v4.2d},[%[to_ptr]],#64\n\t"
                "subs %[count], %[count], #8\n\t"
                "b.ne 1b\n\t"
                : [count]    "+r"(N),
                  [from_ptr1] "+r"(ptr_from1),
                  [from_ptr2] "+r"(ptr_from2),
                  [to_ptr]   "+r"(ptr_to)
                : 
                : "memory","cc","v1","v2","v3","v4","v5","v6","v7","v8"
    
                );
    }
    
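    // Minimal NEON loop: one 128-bit ld1/fadd/st1 per iteration (2 doubles).
    // N must be a nonzero multiple of 2.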
    template<typename double_type>
    void add_asm_neon_ld1(double_type* to, double_type* from1, double_type* from2, std::size_t offset, std::size_t N)
    {
        static_assert(sizeof(double_type) == sizeof(double), "double_type has to be the same size as double");
        double_type* ptr_to = to+offset;
        double_type* ptr_from1 = from1+offset;
        double_type* ptr_from2 = from2+offset;
        asm volatile(
                "1:\n\t"
                "ld1 {v1.2d},[%[from_ptr1]],#16\n\t"
                "ld1 {v2.2d},[%[from_ptr2]],#16\n\t"
                "fadd v1.2d,v1.2d,v2.2d\n\t"
                "st1 {v1.2d},[%[to_ptr]],#16\n\t"
                "subs %[count], %[count], #2\n\t"
                "b.ne 1b\n\t"
                : [count]    "+r"(N),
                  [from_ptr1] "+r"(ptr_from1),
                  [from_ptr2] "+r"(ptr_from2),
                  [to_ptr]   "+r"(ptr_to)
                : 
                : "memory","cc","v1","v2"
                );
    }
    
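    // Scalar-FP version: ldp/stp pairs on d registers, 2 doubles per iteration.
    // N must be a nonzero multiple of 2.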
    template<typename double_type>
    void add_asm_normal_d(double_type* to, double_type* from1, double_type* from2, std::size_t offset, std::size_t N)
    {
        static_assert(sizeof(double_type) == sizeof(double), "double_type has to be the same size as double");
        double_type* ptr_to = to+offset;
        double_type* ptr_from1 = from1+offset;
        double_type* ptr_from2 = from2+offset;
        asm volatile(
                "1:\n\t"
                "ldp d0,d1,[%[from_ptr1]],#16\n\t"
                "ldp d2,d3,[%[from_ptr2]],#16\n\t"
                "fadd d0,d0,d2\n\t"
                "fadd d1,d1,d3\n\t"
                "stp d0,d1,[%[to_ptr]],#16\n\t"
                "subs %[count], %[count], #2\n\t"
                "b.ne 1b\n\t"
                : [count]    "+r"(N),
                  [from_ptr1] "+r"(ptr_from1),
                  [from_ptr2] "+r"(ptr_from2),
                  [to_ptr]   "+r"(ptr_to)
                :
                : "memory","cc","d0","d1","d2","d3"
                );
    }
    // Loop structure follows the long memory copy recommended in the Cortex-A57
    // Software Optimization Guide: fixed-offset ldp/stp pairs, with a single
    // pointer increment per stream at the end of each iteration. N must be a
    // nonzero multiple of 8.
    template<typename double_type>
    void add_asm_recommended(double_type* to, double_type* from1, double_type* from2, std::size_t offset, std::size_t N)
    {
        static_assert(sizeof(double_type) == sizeof(double), "double_type has to be the same size as double");
        double_type* ptr_to = to+offset;
        double_type* ptr_from1 = from1+offset;
        double_type* ptr_from2 = from2+offset;
        asm volatile(
                "1:\n\t"
                "ldp d0,d1,[%[from1_ptr],0]\n\t"
                "ldp d2,d3,[%[from2_ptr],0]\n\t"
                "fadd d0,d0,d2\n\t"
                "fadd d1,d1,d3\n\t"
                "stp d0,d1,[%[to_ptr],0]\n\t"
                "ldp d0,d1,[%[from1_ptr],#16]\n\t"
                "ldp d2,d3,[%[from2_ptr],#16]\n\t"
                "fadd d0,d0,d2\n\t"
                "fadd d1,d1,d3\n\t"
                "stp d0,d1,[%[to_ptr],#16]\n\t"
                "ldp d0,d1,[%[from1_ptr],#32]\n\t"
                "ldp d2,d3,[%[from2_ptr],#32]\n\t"
                "fadd d0,d0,d2\n\t"
                "fadd d1,d1,d3\n\t"
                "stp d0,d1,[%[to_ptr],#32]\n\t"
                "ldp d0,d1,[%[from1_ptr],#48]\n\t"
                "ldp d2,d3,[%[from2_ptr],#48]\n\t"
                "fadd d0,d0,d2\n\t"
                "fadd d1,d1,d3\n\t"
                "stp d0,d1,[%[to_ptr],#48]\n\t"
                "add %[from1_ptr],%[from1_ptr],#64\n\t"
                "add %[from2_ptr],%[from2_ptr],#64\n\t"
                "add %[to_ptr],%[to_ptr],#64\n\t"
                "subs %[count], %[count], #8\n\t"
                "b.ne 1b\n\t"
                : [count]    "+r"(N),
                  [from1_ptr] "+r"(ptr_from1),
                  [from2_ptr] "+r"(ptr_from2),
                  [to_ptr]   "+r"(ptr_to)
                :
                : "memory","cc","d0","d1"
                );
    }
    #elif defined(__amd64__) || defined(__amd64)\
          || defined(__x86_64__) || defined(__x86_64)\
          || defined(_M_X64) || defined(_M_AMD64)
    
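    // AVX2 loop: one 256-bit vmovapd/vaddpd per iteration (4 doubles). The aligned
    // moves require all three buffers to be 32-byte aligned at 'offset', and N must
    // be a nonzero multiple of 4.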
    template<typename double_type>
    inline void add_x86_asm_avx2(double_type* to, double_type* from1, double_type* from2, std::size_t offset, std::size_t N)
    {
        static_assert(sizeof(double_type) == sizeof(double), "double_type has to be the same size as double");
        double_type* ptr_to = to+offset;
        double_type* ptr_from1 = from1+offset;
        double_type* ptr_from2 = from2+offset;
        asm volatile(
                "1:\n\t"
                "vmovapd (%[from1_ptr]),%%ymm0\n\t"
                "vmovapd (%[from2_ptr]),%%ymm1\n\t"
                "vaddpd %%ymm1,%%ymm0,%%ymm0\n\t"
                "vmovapd %%ymm0,(%[to_ptr])\n\t"
                "add $32,%[from1_ptr]\n\t"
                "add $32,%[from2_ptr]\n\t"
                "add $32,%[to_ptr]\n\t"
                "sub $4,%[count]\n\t"
                "jne 1b\n\t"
                : [count]    "+r"(N),
                  [from1_ptr] "+r"(ptr_from1),
                  [from2_ptr] "+r"(ptr_from2),
                  [to_ptr]   "+r"(ptr_to)
                : 
                : "memory","cc","ymm0","ymm1"
                );
    }
    
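    // AVX2 loop unrolled 4x: 128 bytes (16 doubles) per stream and iteration,
    // using ymm0-ymm7. Same alignment requirement; N must be a nonzero multiple of 16.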
    template<typename double_type>
    inline void add_x86_asm_avx2_4x(double_type* to, double_type* from1, double_type* from2, std::size_t offset, std::size_t N)
    {
        static_assert(sizeof(double_type) == sizeof(double), "double_type has to be the same size as double");
        double_type* ptr_to = to+offset;
        double_type* ptr_from1 = from1+offset;
        double_type* ptr_from2 = from2+offset;
        asm volatile(
                "1:\n\t"
                "vmovapd (%[from1_ptr]),%%ymm0\n\t"
                "vmovapd 32(%[from1_ptr]),%%ymm1\n\t"
                "vmovapd 64(%[from1_ptr]),%%ymm2\n\t"
                "vmovapd 96(%[from1_ptr]),%%ymm3\n\t"
                "vmovapd (%[from2_ptr]),%%ymm4\n\t"
                "vmovapd 32(%[from2_ptr]),%%ymm5\n\t"
                "vmovapd 64(%[from2_ptr]),%%ymm6\n\t"
                "vmovapd 96(%[from2_ptr]),%%ymm7\n\t"
                "vaddpd %%ymm4,%%ymm0,%%ymm0\n\t"
                "vaddpd %%ymm5,%%ymm1,%%ymm1\n\t"
                "vaddpd %%ymm6,%%ymm2,%%ymm2\n\t"
                "vaddpd %%ymm7,%%ymm3,%%ymm3\n\t"
                "vmovapd %%ymm0,(%[to_ptr])\n\t"
                "vmovapd %%ymm1,32(%[to_ptr])\n\t"
                "vmovapd %%ymm2,64(%[to_ptr])\n\t"
                "vmovapd %%ymm3,96(%[to_ptr])\n\t"
                "add $128,%[from1_ptr]\n\t"
                "add $128,%[from2_ptr]\n\t"
                "add $128,%[to_ptr]\n\t"
                "sub $16,%[count]\n\t"
                "jne 1b\n\t"
                : [count]    "+r"(N),
                  [from1_ptr] "+r"(ptr_from1),
                  [from2_ptr] "+r"(ptr_from2),
                  [to_ptr]   "+r"(ptr_to)
                : 
                : "memory","cc","ymm0","ymm1","ymm2","ymm3","ymm4","ymm5","ymm6","ymm7"
                );
    }
    
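    // AVX2 loop unrolled 8x: 256 bytes (32 doubles) per stream and iteration, using
    // all sixteen ymm registers. N must be a nonzero multiple of 32. Stores here are
    // ordinary cached vmovapd, so no store fence is needed after the loop.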
    template<typename double_type>
    inline void add_x86_asm_avx2_8x(double_type* to, double_type* from1, double_type* from2, std::size_t offset, std::size_t N)
    {
        static_assert(sizeof(double_type) == sizeof(double), "double_type has to be the same size as double");
        double_type* ptr_to = to+offset;
        double_type* ptr_from1 = from1+offset;
        double_type* ptr_from2 = from2+offset;
        asm volatile(
                "1:\n\t"
                "vmovapd (%[from1_ptr]),%%ymm0\n\t"
                "vmovapd 32(%[from1_ptr]),%%ymm1\n\t"
                "vmovapd 64(%[from1_ptr]),%%ymm2\n\t"
                "vmovapd 96(%[from1_ptr]),%%ymm3\n\t"
                "vmovapd 128(%[from1_ptr]),%%ymm4\n\t"
                "vmovapd 160(%[from1_ptr]),%%ymm5\n\t"
                "vmovapd 192(%[from1_ptr]),%%ymm6\n\t"
                "vmovapd 224(%[from1_ptr]),%%ymm7\n\t"
                "vmovapd (%[from2_ptr]),%%ymm8\n\t"
                "vmovapd 32(%[from2_ptr]),%%ymm9\n\t"
                "vmovapd 64(%[from2_ptr]),%%ymm10\n\t"
                "vmovapd 96(%[from2_ptr]),%%ymm11\n\t"
                "vmovapd 128(%[from2_ptr]),%%ymm12\n\t"
                "vmovapd 160(%[from2_ptr]),%%ymm13\n\t"
                "vmovapd 192(%[from2_ptr]),%%ymm14\n\t"
                "vmovapd 224(%[from2_ptr]),%%ymm15\n\t"
                "vaddpd %%ymm8,%%ymm0,%%ymm0\n\t"
                "vaddpd %%ymm9,%%ymm1,%%ymm1\n\t"
                "vaddpd %%ymm10,%%ymm2,%%ymm2\n\t"
                "vaddpd %%ymm11,%%ymm3,%%ymm3\n\t"
                "vaddpd %%ymm12,%%ymm4,%%ymm4\n\t"
                "vaddpd %%ymm13,%%ymm5,%%ymm5\n\t"
                "vaddpd %%ymm14,%%ymm6,%%ymm6\n\t"
                "vaddpd %%ymm15,%%ymm7,%%ymm7\n\t"
                "vmovapd %%ymm0,(%[to_ptr])\n\t"
                "vmovapd %%ymm1,32(%[to_ptr])\n\t"
                "vmovapd %%ymm2,64(%[to_ptr])\n\t"
                "vmovapd %%ymm3,96(%[to_ptr])\n\t"
                "vmovapd %%ymm4,128(%[to_ptr])\n\t"
                "vmovapd %%ymm5,160(%[to_ptr])\n\t"
                "vmovapd %%ymm6,192(%[to_ptr])\n\t"
                "vmovapd %%ymm7,224(%[to_ptr])\n\t"
                "add $256,%[from1_ptr]\n\t"
                "add $256,%[from2_ptr]\n\t"
                "add $256,%[to_ptr]\n\t"
                "sub $32,%[count]\n\t"
                "jne 1b\n\t"
                "sfence\n\t"
                : [count]    "+r"(N),
                  [from1_ptr] "+r"(ptr_from1),
                  [from2_ptr] "+r"(ptr_from2),
                  [to_ptr]   "+r"(ptr_to)
                : 
                : "memory","cc","ymm0","ymm1","ymm2","ymm3","ymm4","ymm5","ymm6","ymm7","ymm8","ymm9","ymm10","ymm11","ymm12","ymm13","ymm14","ymm15"
    
                );
    }
    
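    // 4x-unrolled AVX2 loop with non-temporal stores: vmovntpd writes bypass the
    // cache, so the sfence after the loop is required to order them. Useful when
    // the destination will not be read again soon.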
    template<typename double_type>
    inline void add_x86_asm_avx2_4x_nontemporal(double_type* to, double_type* from1, double_type* from2, std::size_t offset, std::size_t N)
    {
        static_assert(sizeof(double_type) == sizeof(double), "double_type has to be the same size as double");
        double_type* ptr_to = to+offset;
        double_type* ptr_from1 = from1+offset;
        double_type* ptr_from2 = from2+offset;
        asm volatile(
                "1:\n\t"
                "vmovapd (%[from1_ptr]),%%ymm0\n\t"
                "vmovapd 32(%[from1_ptr]),%%ymm1\n\t"
                "vmovapd 64(%[from1_ptr]),%%ymm2\n\t"
                "vmovapd 96(%[from1_ptr]),%%ymm3\n\t"
                "vmovapd (%[from2_ptr]),%%ymm4\n\t"
                "vmovapd 32(%[from2_ptr]),%%ymm5\n\t"
                "vmovapd 64(%[from2_ptr]),%%ymm6\n\t"
                "vmovapd 96(%[from2_ptr]),%%ymm7\n\t"
                "vaddpd %%ymm4,%%ymm0,%%ymm0\n\t"
                "vaddpd %%ymm5,%%ymm1,%%ymm1\n\t"
                "vaddpd %%ymm6,%%ymm2,%%ymm2\n\t"
                "vaddpd %%ymm7,%%ymm3,%%ymm3\n\t"
                "vmovntpd %%ymm0,(%[to_ptr])\n\t"
                "vmovntpd %%ymm1,32(%[to_ptr])\n\t"
                "vmovntpd %%ymm2,64(%[to_ptr])\n\t"
                "vmovntpd %%ymm3,96(%[to_ptr])\n\t"
                "add $128,%[from1_ptr]\n\t"
                "add $128,%[from2_ptr]\n\t"
                "add $128,%[to_ptr]\n\t"
                "sub $16,%[count]\n\t"
                "jne 1b\n\t"
                "sfence\n\t"
                : [count]    "+r"(N),
                  [from1_ptr] "+r"(ptr_from1),
                  [from2_ptr] "+r"(ptr_from2),
                  [to_ptr]   "+r"(ptr_to)
                : 
                : "memory","cc","ymm0","ymm1","ymm2","ymm3","ymm4","ymm5","ymm6","ymm7"
                );
    }
    
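    // 8x-unrolled variant of the non-temporal version above.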
    template<typename double_type>
    inline void add_x86_asm_avx2_8x_nontemporal(double_type* to, double_type* from1, double_type* from2, std::size_t offset, std::size_t N)
    {
        static_assert(sizeof(double_type) == sizeof(double), "double_type has to be the same size as double");
        double_type* ptr_to = to+offset;
        double_type* ptr_from1 = from1+offset;
        double_type* ptr_from2 = from2+offset;
        asm volatile(
                "1:\n\t"
                "vmovapd (%[from1_ptr]),%%ymm0\n\t"
                "vmovapd 32(%[from1_ptr]),%%ymm1\n\t"
                "vmovapd 64(%[from1_ptr]),%%ymm2\n\t"
                "vmovapd 96(%[from1_ptr]),%%ymm3\n\t"
                "vmovapd 128(%[from1_ptr]),%%ymm4\n\t"
                "vmovapd 160(%[from1_ptr]),%%ymm5\n\t"
                "vmovapd 192(%[from1_ptr]),%%ymm6\n\t"
                "vmovapd 224(%[from1_ptr]),%%ymm7\n\t"
                "vmovapd (%[from2_ptr]),%%ymm8\n\t"
                "vmovapd 32(%[from2_ptr]),%%ymm9\n\t"
                "vmovapd 64(%[from2_ptr]),%%ymm10\n\t"
                "vmovapd 96(%[from2_ptr]),%%ymm11\n\t"
                "vmovapd 128(%[from2_ptr]),%%ymm12\n\t"
                "vmovapd 160(%[from2_ptr]),%%ymm13\n\t"
                "vmovapd 192(%[from2_ptr]),%%ymm14\n\t"
                "vmovapd 224(%[from2_ptr]),%%ymm15\n\t"
                "vaddpd %%ymm8,%%ymm0,%%ymm0\n\t"
                "vaddpd %%ymm9,%%ymm1,%%ymm1\n\t"
                "vaddpd %%ymm10,%%ymm2,%%ymm2\n\t"
                "vaddpd %%ymm11,%%ymm3,%%ymm3\n\t"
                "vaddpd %%ymm12,%%ymm4,%%ymm4\n\t"
                "vaddpd %%ymm13,%%ymm5,%%ymm5\n\t"
                "vaddpd %%ymm14,%%ymm6,%%ymm6\n\t"
                "vaddpd %%ymm15,%%ymm7,%%ymm7\n\t"
                "vmovntpd %%ymm0,(%[to_ptr])\n\t"
                "vmovntpd %%ymm1,32(%[to_ptr])\n\t"
                "vmovntpd %%ymm2,64(%[to_ptr])\n\t"
                "vmovntpd %%ymm3,96(%[to_ptr])\n\t"
                "vmovntpd %%ymm4,128(%[to_ptr])\n\t"
                "vmovntpd %%ymm5,160(%[to_ptr])\n\t"
                "vmovntpd %%ymm6,192(%[to_ptr])\n\t"
                "vmovntpd %%ymm7,224(%[to_ptr])\n\t"
                "add $256,%[from1_ptr]\n\t"
                "add $256,%[from2_ptr]\n\t"
                "add $256,%[to_ptr]\n\t"
                "sub $32,%[count]\n\t"
                "jne 1b\n\t"
                "sfence\n\t"
                : [count]    "+r"(N),
                  [from1_ptr] "+r"(ptr_from1),
                  [from2_ptr] "+r"(ptr_from2),
                  [to_ptr]   "+r"(ptr_to)
                : 
                : "memory","cc","ymm0","ymm1","ymm2","ymm3","ymm4","ymm5","ymm6","ymm7","ymm8","ymm9","ymm10","ymm11","ymm12","ymm13","ymm14","ymm15"
    
                );
    }
    
    
    #endif
    
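    // A possible convenience wrapper (a sketch, not part of the original kernels):
    // dispatch to a vectorized version when the preconditions inferred from the
    // loops above hold (nonzero N, N a multiple of the unroll factor, 32-byte
    // alignment for the aligned AVX2 moves), otherwise fall back to the scalar loop.
    template<typename double_type>
    inline void add_dispatch(double_type* to, double_type* from1, double_type* from2, std::size_t offset, std::size_t N)
    {
    #if defined(__aarch64__)
        if(N != 0 && N % 8 == 0)
        {
            add_asm_neon_ld4(to, from1, from2, offset, N);
            return;
        }
    #elif defined(__x86_64__)
        // vmovapd faults on unaligned addresses, so check all three ranges.
        const bool aligned =
            ((reinterpret_cast<std::uintptr_t>(to+offset)
            | reinterpret_cast<std::uintptr_t>(from1+offset)
            | reinterpret_cast<std::uintptr_t>(from2+offset)) % 32) == 0;
        if(N != 0 && N % 4 == 0 && aligned)
        {
            add_x86_asm_avx2(to, from1, from2, offset, N);
            return;
        }
    #endif
        add_clike(to, from1, from2, offset, N);
    }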
    #endif /* end of include guard: ADD_FUNCTIONS_HPP */