#ifndef COPY_FUNCTIONS_HPP
#define COPY_FUNCTIONS_HPP

#include <vector>
#include <algorithm>
#include <cstdint>
#include <cstring>

// Copies N elements from `from` to `to` with a plain indexed loop.
//
// to, from : base pointers of the destination and source arrays
// offset   : index of the first element that is copied (same in both arrays)
// N        : number of elements to copy; N == 0 copies nothing
template<typename double_type>
void copy_clike(double_type* to, double_type* from, std::size_t offset, std::size_t N)
{
    static_assert(sizeof(double_type) == sizeof(double), "double_type has to be the same size as double");
    for(std::size_t i = 0; i < N; i++)
    {
        to[i+offset] = from[i+offset];
    }
}

// Copies N elements from from+offset to to+offset with a single std::memcpy.
//
// to, from : base pointers of the destination and source arrays; the copied
//            ranges must not overlap (std::memcpy requirement)
// offset   : index of the first element that is copied (same in both arrays)
// N        : number of elements to copy
template<typename double_type>
void copy_memcpy(double_type* to, double_type* from, std::size_t offset, std::size_t N)
{
    static_assert(sizeof(double_type) == sizeof(double), "double_type has to be the same size as double");
    // <cstring> only guarantees the std:: qualified name
    std::memcpy(to+offset, from+offset, N*sizeof(double_type));
}

// Copies N elements from from+offset to to+offset with std::copy.
//
// to, from : base pointers of the destination and source arrays
// offset   : index of the first element that is copied (same in both arrays)
// N        : number of elements to copy; N == 0 copies nothing
template<typename double_type>
void copy_cpplike(double_type* to, double_type* from, std::size_t offset, std::size_t N)
{
    static_assert(sizeof(double_type) == sizeof(double), "double_type has to be the same size as double");
    std::copy(from+offset, from+offset+N, to+offset);
}

// Copies N elements from from+offset to to+offset with std::copy_n.
//
// to, from : base pointers of the destination and source arrays
// offset   : index of the first element that is copied (same in both arrays)
// N        : number of elements to copy; N == 0 copies nothing
template<typename double_type>
void copy_cpplike_n(double_type* to, double_type* from, std::size_t offset, std::size_t N)
{
    static_assert(sizeof(double_type) == sizeof(double), "double_type has to be the same size as double");
    std::copy_n(from+offset, N, to+offset);
}


#if defined(__aarch64__)
// Copies N elements from from+offset to to+offset with NEON ld4/st4,
// 8 elements (64 bytes) per loop iteration.
//
// ld4 de-interleaves the 64 bytes into v1-v4 and st4 re-interleaves the same
// register list, so the data is stored in its original order.
//
// The assembly loop counts down without an entry check, so it is only run for
// the non-zero multiple-of-8 part of N; the remaining (< 8) elements are
// copied one by one afterwards. N == 0 copies nothing.
template<typename double_type>
void copy_asm_neon_ld4(double_type* to, double_type* from, std::size_t offset, std::size_t N)
{
    static_assert(sizeof(double_type) == sizeof(double), "double_type has to be the same size as double");
    constexpr std::size_t elems_per_iter = 8;   // must match the "subs ..., #8" below
    double_type* ptr_to = to+offset;
    double_type* ptr_from = from+offset;
    const std::size_t tail = N % elems_per_iter;
    std::size_t count = N - tail;
    if(count != 0)
    {
        asm volatile(
            "1:\n\t"
            "ld4 {v1.2d,v2.2d,v3.2d,v4.2d},[%[from_ptr]],#64\n\t"
            "st4 {v1.2d,v2.2d,v3.2d,v4.2d},[%[to_ptr]],#64\n\t"
            "subs %[count], %[count], #8\n\t"
            "b.ne 1b\n\t"
            : [count]    "+r"(count),
              [from_ptr] "+r"(ptr_from),
              [to_ptr]   "+r"(ptr_to)
            : 
            : "memory","cc","v1","v2","v3","v4"
            );
    }
    // the post-indexed accesses left both pointers right behind the bulk part
    for(std::size_t i = 0; i < tail; i++)
    {
        ptr_to[i] = ptr_from[i];
    }
}

// Copies N elements from from+offset to to+offset with NEON ld1/st1,
// one 128-bit register (2 elements, 16 bytes) per loop iteration.
//
// The assembly loop counts down without an entry check, so it is only run for
// the non-zero even part of N; an odd trailing element is copied afterwards.
// N == 0 copies nothing.
template<typename double_type>
void copy_asm_neon_ld1(double_type* to, double_type* from, std::size_t offset, std::size_t N)
{
    static_assert(sizeof(double_type) == sizeof(double), "double_type has to be the same size as double");
    constexpr std::size_t elems_per_iter = 2;   // must match the "subs ..., #2" below
    double_type* ptr_to = to+offset;
    double_type* ptr_from = from+offset;
    const std::size_t tail = N % elems_per_iter;
    std::size_t count = N - tail;
    if(count != 0)
    {
        asm volatile(
            "1:\n\t"
            "ld1 {v1.2d},[%[from_ptr]],#16\n\t"
            "st1 {v1.2d},[%[to_ptr]],#16\n\t"
            "subs %[count], %[count], #2\n\t"
            "b.ne 1b\n\t"
            : [count]    "+r"(count),
              [from_ptr] "+r"(ptr_from),
              [to_ptr]   "+r"(ptr_to)
            : 
            : "memory","cc","v1"
            );
    }
    // the post-indexed accesses left both pointers right behind the bulk part
    for(std::size_t i = 0; i < tail; i++)
    {
        ptr_to[i] = ptr_from[i];
    }
}

// Copies N elements from from+offset to to+offset with post-indexed ldp/stp
// on two 64-bit d registers (2 elements, 16 bytes per loop iteration).
//
// The assembly loop counts down without an entry check, so it is only run for
// the non-zero even part of N; an odd trailing element is copied afterwards.
// N == 0 copies nothing.
template<typename double_type>
void copy_asm_normal_d(double_type* to, double_type* from, std::size_t offset, std::size_t N)
{
    static_assert(sizeof(double_type) == sizeof(double), "double_type has to be the same size as double");
    constexpr std::size_t elems_per_iter = 2;   // must match the "subs ..., #2" below
    double_type* ptr_to = to+offset;
    double_type* ptr_from = from+offset;
    const std::size_t tail = N % elems_per_iter;
    std::size_t count = N - tail;
    if(count != 0)
    {
        asm volatile(
            "1:\n\t"
            "ldp d0,d1,[%[from_ptr]],#16\n\t"
            "stp d0,d1,[%[to_ptr]],#16\n\t"
            "subs %[count], %[count], #2\n\t"
            "b.ne 1b\n\t"
            : [count]    "+r"(count),
              [from_ptr] "+r"(ptr_from),
              [to_ptr]   "+r"(ptr_to)
            :
            : "memory","cc","d0","d1"
            );
    }
    // the post-indexed accesses left both pointers right behind the bulk part
    for(std::size_t i = 0; i < tail; i++)
    {
        ptr_to[i] = ptr_from[i];
    }
}

// recommended long memory copy from the A57 Software Optimization Guide
//
// Copies N elements from from+offset to to+offset with four ldp/stp pairs at
// immediate offsets 0/16/32/48 per loop iteration (8 elements, 64 bytes);
// both pointers are advanced once per iteration.
//
// The assembly loop counts down without an entry check, so it is only run for
// the non-zero multiple-of-8 part of N; the remaining (< 8) elements are
// copied one by one afterwards. N == 0 copies nothing.
template<typename double_type>
void copy_asm_recommended(double_type* to, double_type* from, std::size_t offset, std::size_t N)
{
    static_assert(sizeof(double_type) == sizeof(double), "double_type has to be the same size as double");
    constexpr std::size_t elems_per_iter = 8;   // must match the "subs ..., #8" below
    double_type* ptr_to = to+offset;
    double_type* ptr_from = from+offset;
    const std::size_t tail = N % elems_per_iter;
    std::size_t count = N - tail;
    if(count != 0)
    {
        asm volatile(
            "1:\n\t"
            "ldp d0,d1,[%[from_ptr],0]\n\t"
            "stp d0,d1,[%[to_ptr],0]\n\t"
            "ldp d0,d1,[%[from_ptr],#16]\n\t"
            "stp d0,d1,[%[to_ptr],#16]\n\t"
            "ldp d0,d1,[%[from_ptr],#32]\n\t"
            "stp d0,d1,[%[to_ptr],#32]\n\t"
            "ldp d0,d1,[%[from_ptr],#48]\n\t"
            "stp d0,d1,[%[to_ptr],#48]\n\t"
            "add %[from_ptr],%[from_ptr],#64\n\t"
            "add %[to_ptr],%[to_ptr],#64\n\t"
            "subs %[count], %[count], #8\n\t"
            "b.ne 1b\n\t"
            : [count]    "+r"(count),
              [from_ptr] "+r"(ptr_from),
              [to_ptr]   "+r"(ptr_to)
            :
            : "memory","cc","d0","d1"
            );
    }
    // the explicit adds left both pointers right behind the bulk part
    for(std::size_t i = 0; i < tail; i++)
    {
        ptr_to[i] = ptr_from[i];
    }
}

// Copies N elements from from+offset to to+offset with post-indexed ldp/stp
// on two 128-bit q registers (4 elements, 32 bytes per loop iteration).
//
// The assembly loop counts down without an entry check, so it is only run for
// the non-zero multiple-of-4 part of N; the remaining (< 4) elements are
// copied one by one afterwards. N == 0 copies nothing.
template<typename double_type>
void copy_asm_normal_q(double_type* to, double_type* from, std::size_t offset, std::size_t N)
{
    static_assert(sizeof(double_type) == sizeof(double), "double_type has to be the same size as double");
    constexpr std::size_t elems_per_iter = 4;   // must match the "subs ..., #4" below
    double_type* ptr_to = to+offset;
    double_type* ptr_from = from+offset;
    const std::size_t tail = N % elems_per_iter;
    std::size_t count = N - tail;
    if(count != 0)
    {
        asm volatile(
            "1:\n\t"
            "ldp q0,q1,[%[from_ptr]],#32\n\t"
            "stp q0,q1,[%[to_ptr]],#32\n\t"
            "subs %[count], %[count], #4\n\t"
            "b.ne 1b\n\t"
            : [count]    "+r"(count),
              [from_ptr] "+r"(ptr_from),
              [to_ptr]   "+r"(ptr_to)
            : 
            : "memory","cc","q0","q1"
            );
    }
    // the post-indexed accesses left both pointers right behind the bulk part
    for(std::size_t i = 0; i < tail; i++)
    {
        ptr_to[i] = ptr_from[i];
    }
}

// Copies N elements from from+offset to to+offset with the non-temporal pair
// instructions ldnp/stnp on two d registers (2 elements, 16 bytes per loop
// iteration). ldnp/stnp have no post-indexed form, so the pointers are
// advanced with explicit adds.
//
// The assembly loop counts down without an entry check, so it is only run for
// the non-zero even part of N; an odd trailing element is copied afterwards
// with an ordinary store. N == 0 copies nothing.
template<typename double_type>
void copy_asm_nontemporal(double_type* to, double_type* from, std::size_t offset, std::size_t N)
{
    static_assert(sizeof(double_type) == sizeof(double), "double_type has to be the same size as double");
    constexpr std::size_t elems_per_iter = 2;   // must match the "subs ..., #2" below
    double_type* ptr_to = to+offset;
    double_type* ptr_from = from+offset;
    const std::size_t tail = N % elems_per_iter;
    std::size_t count = N - tail;
    if(count != 0)
    {
        asm volatile(
            "1:\n\t"
            "ldnp d0,d1,[%[from_ptr]]\n\t"
            "stnp d0,d1,[%[to_ptr]]\n\t"
            "add %[from_ptr],%[from_ptr],#16\n\t"
            "add %[to_ptr],%[to_ptr],#16\n\t"
            "subs %[count], %[count], #2\n\t"
            "b.ne 1b\n\t"
            : [count]    "+r"(count),
              [from_ptr] "+r"(ptr_from),
              [to_ptr]   "+r"(ptr_to)
            : 
            // the clobber list has to name exactly the registers written above
            : "memory","cc","d0","d1"
            );
    }
    // the explicit adds left both pointers right behind the bulk part
    for(std::size_t i = 0; i < tail; i++)
    {
        ptr_to[i] = ptr_from[i];
    }
}

// Copies N elements from from+offset to to+offset with NEON ld4/st4
// (8 elements, 64 bytes per loop iteration), after issuing one prefetch hint
// each for the first source (pldl2keep) and destination (pstl2keep) address.
//
// ld4 de-interleaves the 64 bytes into v1-v4 and st4 re-interleaves the same
// register list, so the data is stored in its original order.
//
// The assembly loop counts down without an entry check, so it is only run for
// the non-zero multiple-of-8 part of N; the remaining (< 8) elements are
// copied one by one afterwards. N == 0 copies nothing.
template<typename double_type>
void copy_asm_neon_ld4_prfm(double_type* to, double_type* from, std::size_t offset, std::size_t N)
{
    static_assert(sizeof(double_type) == sizeof(double), "double_type has to be the same size as double");
    constexpr std::size_t elems_per_iter = 8;   // must match the "subs ..., #8" below
    double_type* ptr_to = to+offset;
    double_type* ptr_from = from+offset;
    const std::size_t tail = N % elems_per_iter;
    std::size_t count = N - tail;
    if(count != 0)
    {
        asm volatile(
            "prfm pldl2keep,[%[from_ptr]]\n\t"
            "prfm pstl2keep,[%[to_ptr]]\n\t"
            "1:\n\t"
            "ld4 {v1.2d,v2.2d,v3.2d,v4.2d},[%[from_ptr]],#64\n\t"
            "st4 {v1.2d,v2.2d,v3.2d,v4.2d},[%[to_ptr]],#64\n\t"
            "subs %[count], %[count], #8\n\t"
            "b.ne 1b\n\t"
            : [count]    "+r"(count),
              [from_ptr] "+r"(ptr_from),
              [to_ptr]   "+r"(ptr_to)
            : 
            : "memory","cc","v1","v2","v3","v4"
            );
    }
    // the post-indexed accesses left both pointers right behind the bulk part
    for(std::size_t i = 0; i < tail; i++)
    {
        ptr_to[i] = ptr_from[i];
    }
}

// Copies N elements from from+offset to to+offset with NEON ld1/st1
// (2 elements, 16 bytes per loop iteration), after issuing one prefetch hint
// each for the first source (pldl2keep) and destination (pstl2keep) address.
//
// The assembly loop counts down without an entry check, so it is only run for
// the non-zero even part of N; an odd trailing element is copied afterwards.
// N == 0 copies nothing.
template<typename double_type>
void copy_asm_neon_ld1_prfm(double_type* to, double_type* from, std::size_t offset, std::size_t N)
{
    static_assert(sizeof(double_type) == sizeof(double), "double_type has to be the same size as double");
    constexpr std::size_t elems_per_iter = 2;   // must match the "subs ..., #2" below
    double_type* ptr_to = to+offset;
    double_type* ptr_from = from+offset;
    const std::size_t tail = N % elems_per_iter;
    std::size_t count = N - tail;
    if(count != 0)
    {
        asm volatile(
            "prfm pldl2keep,[%[from_ptr]]\n\t"
            "prfm pstl2keep,[%[to_ptr]]\n\t"
            "1:\n\t"
            "ld1 {v1.2d},[%[from_ptr]],#16\n\t"
            "st1 {v1.2d},[%[to_ptr]],#16\n\t"
            "subs %[count], %[count], #2\n\t"
            "b.ne 1b\n\t"
            : [count]    "+r"(count),
              [from_ptr] "+r"(ptr_from),
              [to_ptr]   "+r"(ptr_to)
            : 
            : "memory","cc","v1"
            );
    }
    // the post-indexed accesses left both pointers right behind the bulk part
    for(std::size_t i = 0; i < tail; i++)
    {
        ptr_to[i] = ptr_from[i];
    }
}

// Copies N elements from from+offset to to+offset with post-indexed ldp/stp
// on two d registers (2 elements, 16 bytes per loop iteration), after issuing
// one prefetch hint each for the first source (pldl2keep) and destination
// (pstl2keep) address.
//
// The assembly loop counts down without an entry check, so it is only run for
// the non-zero even part of N; an odd trailing element is copied afterwards.
// N == 0 copies nothing.
template<typename double_type>
void copy_asm_normal_d_prfm(double_type* to, double_type* from, std::size_t offset, std::size_t N)
{
    static_assert(sizeof(double_type) == sizeof(double), "double_type has to be the same size as double");
    constexpr std::size_t elems_per_iter = 2;   // must match the "subs ..., #2" below
    double_type* ptr_to = to+offset;
    double_type* ptr_from = from+offset;
    const std::size_t tail = N % elems_per_iter;
    std::size_t count = N - tail;
    if(count != 0)
    {
        asm volatile(
            "prfm pldl2keep,[%[from_ptr]]\n\t"
            "prfm pstl2keep,[%[to_ptr]]\n\t"
            "1:\n\t"
            "ldp d0,d1,[%[from_ptr]],#16\n\t"
            "stp d0,d1,[%[to_ptr]],#16\n\t"
            "subs %[count], %[count], #2\n\t"
            "b.ne 1b\n\t"
            : [count]    "+r"(count),
              [from_ptr] "+r"(ptr_from),
              [to_ptr]   "+r"(ptr_to)
            : 
            : "memory","cc","d0","d1"
            );
    }
    // the post-indexed accesses left both pointers right behind the bulk part
    for(std::size_t i = 0; i < tail; i++)
    {
        ptr_to[i] = ptr_from[i];
    }
}

// Copies N elements from from+offset to to+offset with four ldp/stp pairs at
// immediate offsets 0/16/32/48 per loop iteration (8 elements, 64 bytes),
// after issuing one prefetch hint each for the first source (pldl2keep) and
// destination (pstl2keep) address. Both pointers are advanced once per
// iteration.
//
// The assembly loop counts down without an entry check, so it is only run for
// the non-zero multiple-of-8 part of N; the remaining (< 8) elements are
// copied one by one afterwards. N == 0 copies nothing.
template<typename double_type>
void copy_asm_recommended_prfm(double_type* to, double_type* from, std::size_t offset, std::size_t N)
{
    static_assert(sizeof(double_type) == sizeof(double), "double_type has to be the same size as double");
    constexpr std::size_t elems_per_iter = 8;   // must match the "subs ..., #8" below
    double_type* ptr_to = to+offset;
    double_type* ptr_from = from+offset;
    const std::size_t tail = N % elems_per_iter;
    std::size_t count = N - tail;
    if(count != 0)
    {
        asm volatile(
            "prfm pldl2keep,[%[from_ptr]]\n\t"
            "prfm pstl2keep,[%[to_ptr]]\n\t"
            "1:\n\t"
            "ldp d0,d1,[%[from_ptr],0]\n\t"
            "stp d0,d1,[%[to_ptr],0]\n\t"
            "ldp d0,d1,[%[from_ptr],#16]\n\t"
            "stp d0,d1,[%[to_ptr],#16]\n\t"
            "ldp d0,d1,[%[from_ptr],#32]\n\t"
            "stp d0,d1,[%[to_ptr],#32]\n\t"
            "ldp d0,d1,[%[from_ptr],#48]\n\t"
            "stp d0,d1,[%[to_ptr],#48]\n\t"
            "add %[from_ptr],%[from_ptr],#64\n\t"
            "add %[to_ptr],%[to_ptr],#64\n\t"
            "subs %[count], %[count], #8\n\t"
            "b.ne 1b\n\t"
            : [count]    "+r"(count),
              [from_ptr] "+r"(ptr_from),
              [to_ptr]   "+r"(ptr_to)
            :
            : "memory","cc","d0","d1"
            );
    }
    // the explicit adds left both pointers right behind the bulk part
    for(std::size_t i = 0; i < tail; i++)
    {
        ptr_to[i] = ptr_from[i];
    }
}

// Copies N elements from from+offset to to+offset with post-indexed ldp/stp
// on two 128-bit q registers (4 elements, 32 bytes per loop iteration), after
// issuing one prefetch hint each for the first source (pldl2keep) and
// destination (pstl2keep) address.
//
// The assembly loop counts down without an entry check, so it is only run for
// the non-zero multiple-of-4 part of N; the remaining (< 4) elements are
// copied one by one afterwards. N == 0 copies nothing.
template<typename double_type>
void copy_asm_normal_q_prfm(double_type* to, double_type* from, std::size_t offset, std::size_t N)
{
    static_assert(sizeof(double_type) == sizeof(double), "double_type has to be the same size as double");
    constexpr std::size_t elems_per_iter = 4;   // must match the "subs ..., #4" below
    double_type* ptr_to = to+offset;
    double_type* ptr_from = from+offset;
    const std::size_t tail = N % elems_per_iter;
    std::size_t count = N - tail;
    if(count != 0)
    {
        asm volatile(
            "prfm pldl2keep,[%[from_ptr]]\n\t"
            "prfm pstl2keep,[%[to_ptr]]\n\t"
            "1:\n\t"
            "ldp q0,q1,[%[from_ptr]],#32\n\t"
            "stp q0,q1,[%[to_ptr]],#32\n\t"
            "subs %[count], %[count], #4\n\t"
            "b.ne 1b\n\t"
            : [count]    "+r"(count),
              [from_ptr] "+r"(ptr_from),
              [to_ptr]   "+r"(ptr_to)
            : 
            : "memory","cc","q0","q1"
            );
    }
    // the post-indexed accesses left both pointers right behind the bulk part
    for(std::size_t i = 0; i < tail; i++)
    {
        ptr_to[i] = ptr_from[i];
    }
}

// Copies N elements from from+offset to to+offset with the non-temporal pair
// instructions ldnp/stnp on two d registers (2 elements, 16 bytes per loop
// iteration), after issuing one streaming prefetch hint each for the first
// source (pldl2strm) and destination (pstl2strm) address. ldnp/stnp have no
// post-indexed form, so the pointers are advanced with explicit adds.
//
// The assembly loop counts down without an entry check, so it is only run for
// the non-zero even part of N; an odd trailing element is copied afterwards
// with an ordinary store. N == 0 copies nothing.
template<typename double_type>
void copy_asm_nontemporal_prfm(double_type* to, double_type* from, std::size_t offset, std::size_t N)
{
    static_assert(sizeof(double_type) == sizeof(double), "double_type has to be the same size as double");
    constexpr std::size_t elems_per_iter = 2;   // must match the "subs ..., #2" below
    double_type* ptr_to = to+offset;
    double_type* ptr_from = from+offset;
    const std::size_t tail = N % elems_per_iter;
    std::size_t count = N - tail;
    if(count != 0)
    {
        asm volatile(
            "prfm pldl2strm,[%[from_ptr]]\n\t"
            "prfm pstl2strm,[%[to_ptr]]\n\t"
            "1:\n\t"
            "ldnp d0,d1,[%[from_ptr]]\n\t"
            "stnp d0,d1,[%[to_ptr]]\n\t"
            "add %[from_ptr],%[from_ptr],#16\n\t"
            "add %[to_ptr],%[to_ptr],#16\n\t"
            "subs %[count], %[count], #2\n\t"
            "b.ne 1b\n\t"
            : [count]    "+r"(count),
              [from_ptr] "+r"(ptr_from),
              [to_ptr]   "+r"(ptr_to)
            : 
            : "memory","cc","d0","d1"
            );
    }
    // the explicit adds left both pointers right behind the bulk part
    for(std::size_t i = 0; i < tail; i++)
    {
        ptr_to[i] = ptr_from[i];
    }
}
#elif defined(__amd64__) || defined(__amd64)\
      || defined(__x86_64__) || defined(__x86_64)\
      || defined(_M_X64) || defined(_M_AMD64)

// Copies N elements from from+offset to to+offset with one 256-bit vmovapd
// load/store pair per loop iteration (4 elements, 32 bytes).
//
// vmovapd is the aligned move: to+offset and from+offset must be 32-byte
// aligned, otherwise the instruction faults.
//
// The assembly loop counts down without an entry check, so it is only run for
// the non-zero multiple-of-4 part of N; the remaining (< 4) elements are
// copied one by one afterwards. N == 0 copies nothing.
template<typename double_type>
inline void copy_x86_asm_avx2(double_type* to, double_type* from, std::size_t offset, std::size_t N)
{
    static_assert(sizeof(double_type) == sizeof(double), "double_type has to be the same size as double");
    constexpr std::size_t elems_per_iter = 4;   // must match the "sub $4" below
    double_type* ptr_to = to+offset;
    double_type* ptr_from = from+offset;
    const std::size_t tail = N % elems_per_iter;
    std::size_t count = N - tail;
    if(count != 0)
    {
        asm volatile(
            "1:\n\t"
            "vmovapd (%[from_ptr]),%%ymm0\n\t"
            "vmovapd %%ymm0,(%[to_ptr])\n\t"
            "add $32,%[from_ptr]\n\t"
            "add $32,%[to_ptr]\n\t"
            "sub $4,%[count]\n\t"
            "jne 1b\n\t"
            : [count]    "+r"(count),
              [from_ptr] "+r"(ptr_from),
              [to_ptr]   "+r"(ptr_to)
            : 
            : "memory","cc","ymm0"
            );
    }
    // the adds in the loop left both pointers right behind the bulk part
    for(std::size_t i = 0; i < tail; i++)
    {
        ptr_to[i] = ptr_from[i];
    }
}

// Copies N elements from from+offset to to+offset with a 4x unrolled loop of
// 256-bit vmovapd loads followed by vmovapd stores (16 elements, 128 bytes
// per loop iteration).
//
// vmovapd is the aligned move: to+offset and from+offset must be 32-byte
// aligned, otherwise the instruction faults.
//
// The assembly loop counts down without an entry check, so it is only run for
// the non-zero multiple-of-16 part of N; the remaining (< 16) elements are
// copied one by one afterwards. N == 0 copies nothing.
template<typename double_type>
inline void copy_x86_asm_avx2_4x(double_type* to, double_type* from, std::size_t offset, std::size_t N)
{
    static_assert(sizeof(double_type) == sizeof(double), "double_type has to be the same size as double");
    constexpr std::size_t elems_per_iter = 16;  // must match the "sub $16" below
    double_type* ptr_to = to+offset;
    double_type* ptr_from = from+offset;
    const std::size_t tail = N % elems_per_iter;
    std::size_t count = N - tail;
    if(count != 0)
    {
        asm volatile(
            "1:\n\t"
            "vmovapd (%[from_ptr]),%%ymm0\n\t"
            "vmovapd 32(%[from_ptr]),%%ymm1\n\t"
            "vmovapd 64(%[from_ptr]),%%ymm2\n\t"
            "vmovapd 96(%[from_ptr]),%%ymm3\n\t"
            "vmovapd %%ymm0,(%[to_ptr])\n\t"
            "vmovapd %%ymm1,32(%[to_ptr])\n\t"
            "vmovapd %%ymm2,64(%[to_ptr])\n\t"
            "vmovapd %%ymm3,96(%[to_ptr])\n\t"
            "add $128,%[from_ptr]\n\t"
            "add $128,%[to_ptr]\n\t"
            "sub $16,%[count]\n\t"
            "jne 1b\n\t"
            : [count]    "+r"(count),
              [from_ptr] "+r"(ptr_from),
              [to_ptr]   "+r"(ptr_to)
            : 
            : "memory","cc","ymm0","ymm1","ymm2","ymm3"
            );
    }
    // the adds in the loop left both pointers right behind the bulk part
    for(std::size_t i = 0; i < tail; i++)
    {
        ptr_to[i] = ptr_from[i];
    }
}

// Copies N elements from from+offset to to+offset with an 8x unrolled loop of
// 256-bit vmovapd loads followed by vmovapd stores (32 elements, 256 bytes
// per loop iteration).
//
// vmovapd is the aligned move: to+offset and from+offset must be 32-byte
// aligned, otherwise the instruction faults.
//
// No sfence follows the loop: the stores are ordinary (not non-temporal)
// stores, exactly as in the 1x and 4x variants, which do not fence either.
//
// The assembly loop counts down without an entry check, so it is only run for
// the non-zero multiple-of-32 part of N; the remaining (< 32) elements are
// copied one by one afterwards. N == 0 copies nothing.
template<typename double_type>
inline void copy_x86_asm_avx2_8x(double_type* to, double_type* from, std::size_t offset, std::size_t N)
{
    static_assert(sizeof(double_type) == sizeof(double), "double_type has to be the same size as double");
    constexpr std::size_t elems_per_iter = 32;  // must match the "sub $32" below
    double_type* ptr_to = to+offset;
    double_type* ptr_from = from+offset;
    const std::size_t tail = N % elems_per_iter;
    std::size_t count = N - tail;
    if(count != 0)
    {
        asm volatile(
            "1:\n\t"
            "vmovapd (%[from_ptr]),%%ymm0\n\t"
            "vmovapd 32(%[from_ptr]),%%ymm1\n\t"
            "vmovapd 64(%[from_ptr]),%%ymm2\n\t"
            "vmovapd 96(%[from_ptr]),%%ymm3\n\t"
            "vmovapd 128(%[from_ptr]),%%ymm4\n\t"
            "vmovapd 160(%[from_ptr]),%%ymm5\n\t"
            "vmovapd 192(%[from_ptr]),%%ymm6\n\t"
            "vmovapd 224(%[from_ptr]),%%ymm7\n\t"
            "vmovapd %%ymm0,(%[to_ptr])\n\t"
            "vmovapd %%ymm1,32(%[to_ptr])\n\t"
            "vmovapd %%ymm2,64(%[to_ptr])\n\t"
            "vmovapd %%ymm3,96(%[to_ptr])\n\t"
            "vmovapd %%ymm4,128(%[to_ptr])\n\t"
            "vmovapd %%ymm5,160(%[to_ptr])\n\t"
            "vmovapd %%ymm6,192(%[to_ptr])\n\t"
            "vmovapd %%ymm7,224(%[to_ptr])\n\t"
            "add $256,%[from_ptr]\n\t"
            "add $256,%[to_ptr]\n\t"
            "sub $32,%[count]\n\t"
            "jne 1b\n\t"
            : [count]    "+r"(count),
              [from_ptr] "+r"(ptr_from),
              [to_ptr]   "+r"(ptr_to)
            : 
            : "memory","cc","ymm0","ymm1","ymm2","ymm3","ymm4","ymm5","ymm6","ymm7"
            );
    }
    // the adds in the loop left both pointers right behind the bulk part
    for(std::size_t i = 0; i < tail; i++)
    {
        ptr_to[i] = ptr_from[i];
    }
}

// Copies N elements from from+offset to to+offset with a 4x unrolled loop of
// 256-bit vmovapd loads followed by non-temporal vmovntpd stores (16 elements,
// 128 bytes per loop iteration). An sfence after the loop orders the
// non-temporal stores before any later store.
//
// vmovapd and vmovntpd need aligned addresses: to+offset and from+offset must
// be 32-byte aligned, otherwise the instructions fault.
//
// The assembly loop counts down without an entry check, so it is only run for
// the non-zero multiple-of-16 part of N; the remaining (< 16) elements are
// copied one by one afterwards with ordinary stores. N == 0 copies nothing.
template<typename double_type>
inline void copy_x86_asm_avx2_4x_nontemporal(double_type* to, double_type* from, std::size_t offset, std::size_t N)
{
    static_assert(sizeof(double_type) == sizeof(double), "double_type has to be the same size as double");
    constexpr std::size_t elems_per_iter = 16;  // must match the "sub $16" below
    double_type* ptr_to = to+offset;
    double_type* ptr_from = from+offset;
    const std::size_t tail = N % elems_per_iter;
    std::size_t count = N - tail;
    if(count != 0)
    {
        asm volatile(
            "1:\n\t"
            "vmovapd (%[from_ptr]),%%ymm0\n\t"
            "vmovapd 32(%[from_ptr]),%%ymm1\n\t"
            "vmovapd 64(%[from_ptr]),%%ymm2\n\t"
            "vmovapd 96(%[from_ptr]),%%ymm3\n\t"
            "vmovntpd %%ymm0,(%[to_ptr])\n\t"
            "vmovntpd %%ymm1,32(%[to_ptr])\n\t"
            "vmovntpd %%ymm2,64(%[to_ptr])\n\t"
            "vmovntpd %%ymm3,96(%[to_ptr])\n\t"
            "add $128,%[from_ptr]\n\t"
            "add $128,%[to_ptr]\n\t"
            "sub $16,%[count]\n\t"
            "jne 1b\n\t"
            "sfence\n\t"
            : [count]    "+r"(count),
              [from_ptr] "+r"(ptr_from),
              [to_ptr]   "+r"(ptr_to)
            : 
            : "memory","cc","ymm0","ymm1","ymm2","ymm3"
            );
    }
    // the adds in the loop left both pointers right behind the bulk part
    for(std::size_t i = 0; i < tail; i++)
    {
        ptr_to[i] = ptr_from[i];
    }
}

// Copies N elements from from+offset to to+offset with an 8x unrolled loop of
// 256-bit vmovapd loads followed by non-temporal vmovntpd stores (32 elements,
// 256 bytes per loop iteration). An sfence after the loop orders the
// non-temporal stores before any later store.
//
// vmovapd and vmovntpd need aligned addresses: to+offset and from+offset must
// be 32-byte aligned, otherwise the instructions fault.
//
// The assembly loop counts down without an entry check, so it is only run for
// the non-zero multiple-of-32 part of N; the remaining (< 32) elements are
// copied one by one afterwards with ordinary stores. N == 0 copies nothing.
template<typename double_type>
inline void copy_x86_asm_avx2_8x_nontemporal(double_type* to, double_type* from, std::size_t offset, std::size_t N)
{
    static_assert(sizeof(double_type) == sizeof(double), "double_type has to be the same size as double");
    constexpr std::size_t elems_per_iter = 32;  // must match the "sub $32" below
    double_type* ptr_to = to+offset;
    double_type* ptr_from = from+offset;
    const std::size_t tail = N % elems_per_iter;
    std::size_t count = N - tail;
    if(count != 0)
    {
        asm volatile(
            "1:\n\t"
            "vmovapd (%[from_ptr]),%%ymm0\n\t"
            "vmovapd 32(%[from_ptr]),%%ymm1\n\t"
            "vmovapd 64(%[from_ptr]),%%ymm2\n\t"
            "vmovapd 96(%[from_ptr]),%%ymm3\n\t"
            "vmovapd 128(%[from_ptr]),%%ymm4\n\t"
            "vmovapd 160(%[from_ptr]),%%ymm5\n\t"
            "vmovapd 192(%[from_ptr]),%%ymm6\n\t"
            "vmovapd 224(%[from_ptr]),%%ymm7\n\t"
            "vmovntpd %%ymm0,(%[to_ptr])\n\t"
            "vmovntpd %%ymm1,32(%[to_ptr])\n\t"
            "vmovntpd %%ymm2,64(%[to_ptr])\n\t"
            "vmovntpd %%ymm3,96(%[to_ptr])\n\t"
            "vmovntpd %%ymm4,128(%[to_ptr])\n\t"
            "vmovntpd %%ymm5,160(%[to_ptr])\n\t"
            "vmovntpd %%ymm6,192(%[to_ptr])\n\t"
            "vmovntpd %%ymm7,224(%[to_ptr])\n\t"
            "add $256,%[from_ptr]\n\t"
            "add $256,%[to_ptr]\n\t"
            "sub $32,%[count]\n\t"
            "jne 1b\n\t"
            "sfence\n\t"
            : [count]    "+r"(count),
              [from_ptr] "+r"(ptr_from),
              [to_ptr]   "+r"(ptr_to)
            : 
            : "memory","cc","ymm0","ymm1","ymm2","ymm3","ymm4","ymm5","ymm6","ymm7"
            );
    }
    // the adds in the loop left both pointers right behind the bulk part
    for(std::size_t i = 0; i < tail; i++)
    {
        ptr_to[i] = ptr_from[i];
    }
}

#endif


#endif /* end of include guard: COPY_FUNCTIONS_HPP */