divert(`-1')
define(`complex_doublesve', `defn(format(``complex_doublesve.%s'', `$1'))')
define(`complex_doublesve_set', `define(format(``complex_doublesve.%s'', `$1'), `$2')')
dnl
complex_doublesve_set(`vec', `svfloat64x2_t')dnl
complex_doublesve_set(`intvec', `svuint64_t')dnl
complex_doublesve_set(`scalpointer', `double')dnl
dnl
complex_doublesve_set(`load', `svld2_f64')dnl
complex_doublesve_set(`gatherload', `svld1_gather_index')dnl
complex_doublesve_set(`store', `svst2_f64')dnl
complex_doublesve_set(`count', `svcntd() / 2')dnl
complex_doublesve_set(`index', `svindex_u64')dnl
complex_doublesve_set(`true', `svptrue_b64()')dnl
complex_doublesve_set(`while', `svwhilelt_b64')dnl
complex_doublesve_set(`neg', `svneg_f64_z')dnl
complex_doublesve_set(`abs', `svabs_f64_z')dnl
dnl
complex_doublesve_set(`add', `svadd_f64_z')dnl
complex_doublesve_set(`sub', `svsub_f64_z')dnl
complex_doublesve_set(`mul', `svmul_f64_z')dnl
complex_doublesve_set(`div', `svdiv_f64_z')dnl
dnl
dnl Intrinsic-based complex double kernels (predicated on pg).
define(`complex_double_sve_intr_while_lower', `pg = svwhilelt_b64($1, $2);
')
define(`complex_double_sve_intr_load',
`ifdef(`disable_complex_instructions', `
    svfloat64x2_t $3 = svld2_f64(pg, (const double *) &$1[$2 * 2]);
', `
    svfloat64_t $3 = svld1_f64(pg, (const double *) &$1[$2 / 2]);
')')
define(`complex_double_sve_intr_dup',
`ifdef(`disable_complex_instructions', `
    svfloat64x2_t $2 = {svdup_f64(creal($1)), svdup_f64(cimag($1))};
', `
    svfloat64_t $2 = svdupq_f64(creal($1), cimag($1));
')')
define(`complex_double_sve_intr_add',
`ifdef(`disable_complex_instructions', `
    svfloat64_t re = svadd_f64_z(pg, $1.v0, $2.v0);
    svfloat64_t im = svadd_f64_z(pg, $1.v1, $2.v1);
    svfloat64x2_t res = {re, im};
    svst2_f64(pg, (double *) &$3[$4 * 2], res);
', `
    svfloat64_t res = svadd_f64_z(pg, $1, $2);
    svst1_f64(pg, (double *) &$3[$4 / 2], res);
')')
define(`complex_double_sve_intr_sub',
`ifdef(`disable_complex_instructions', `
    svfloat64_t re = svsub_f64_z(pg, $1.v0, $2.v0);
    svfloat64_t im = svsub_f64_z(pg, $1.v1, $2.v1);
    svfloat64x2_t res = {re, im};
    svst2_f64(pg, (double *) &$3[$4 * 2], res);
', `
    svfloat64_t res = svsub_f64_z(pg, $1, $2);
    svst1_f64(pg, (double *) &$3[$4 / 2], res);
')')
define(`complex_double_sve_intr_mul',
`ifdef(`disable_complex_instructions', `
    svfloat64_t retemp = svmul_f64_z(pg, $1.v0, $2.v0);
    svfloat64_t imtemp = svmul_f64_z(pg, $1.v0, $2.v1);
    svfloat64_t re = svmls_f64_z(pg, retemp, $1.v1, $2.v1);
    svfloat64_t im = svmla_f64_z(pg, imtemp, $1.v1, $2.v0);
    svfloat64x2_t res = {re, im};
    svst2_f64(pg, (double *) &$3[$4 * 2], res);
', `
    svfloat64_t zero = svdup_f64(0.0);
    svfloat64_t res_re = svcmla_f64_z(pg, zero, $1, $2, 0);
    svfloat64_t res_im = svcmla_f64_z(pg, zero, $1, $2, 90);
    svfloat64_t res = svadd_f64_z(pg, res_re, res_im);
    svst1_f64(pg, (double *) &$3[$4 / 2], res);
')')
define(`complex_double_sve_intr_div', `
    /* TODO: placeholder only; svdiv_f64_z is an element-wise divide and does
       not implement complex division. */
    svfloat64_t res = svdiv_f64_z(pg, $1, $2);
    svst1_f64(pg, (double *) &$3[$4], res);
')
define(`complex_double_sve_intr_muladd',
`ifdef(`disable_complex_instructions', `
    svfloat64_t retemp = svmul_f64_z(pg, $1.v0, $2.v0);
    svfloat64_t imtemp = svmul_f64_z(pg, $1.v0, $2.v1);
    svfloat64_t re = svmls_f64_z(pg, retemp, $1.v1, $2.v1);
    svfloat64_t im = svmla_f64_z(pg, imtemp, $1.v1, $2.v0);
    re = svadd_f64_z(pg, re, $3.v0);
    im = svadd_f64_z(pg, im, $3.v1);
    svfloat64x2_t res = {re, im};
    svst2_f64(pg, (double *) &$4[$5 * 2], res);
', `
    svfloat64_t zero = svdup_f64(0.0);
    svfloat64_t res_re = svcmla_f64_z(pg, zero, $1, $2, 0);
    svfloat64_t res_im = svcmla_f64_z(pg, $3, $1, $2, 90);
    svfloat64_t res = svadd_f64_z(pg, res_re, res_im);
    svst1_f64(pg, (double *) &$4[$5 / 2], res);
')')
define(`complex_double_sve_intr_addmul',
`ifdef(`disable_complex_instructions', `
    svfloat64_t retemp = svmul_f64_z(pg, $2.v0, $3.v0);
    svfloat64_t imtemp = svmul_f64_z(pg, $2.v0, $3.v1);
    svfloat64_t re = svmls_f64_z(pg, retemp, $2.v1, $3.v1);
    svfloat64_t im = svmla_f64_z(pg, imtemp, $2.v1, $3.v0);
    re = svadd_f64_z(pg, re, $1.v0);
    im = svadd_f64_z(pg, im, $1.v1);
    svfloat64x2_t res = {re, im};
    svst2_f64(pg, (double *) &$4[$5 * 2], res);
', `
    svfloat64_t zero = svdup_f64(0.0);
    svfloat64_t res_re = svcmla_f64_z(pg, zero, $2, $3, 0);
    svfloat64_t res_im = svcmla_f64_z(pg, $1, $2, $3, 90);
    svfloat64_t res = svadd_f64_z(pg, res_re, res_im);
    svst1_f64(pg, (double *) &$4[$5 / 2], res);
')')
define(`complex_double_sve_intr_mulsub',
`ifdef(`disable_complex_instructions', `
    svfloat64_t retemp = svmul_f64_z(pg, $1.v0, $2.v0);
    svfloat64_t imtemp = svmul_f64_z(pg, $1.v0, $2.v1);
    svfloat64_t re = svmls_f64_z(pg, retemp, $1.v1, $2.v1);
    svfloat64_t im = svmla_f64_z(pg, imtemp, $1.v1, $2.v0);
    re = svsub_f64_z(pg, re, $3.v0);
    im = svsub_f64_z(pg, im, $3.v1);
    svfloat64x2_t res = {re, im};
    svst2_f64(pg, (double *) &$4[$5 * 2], res);
', `
    svfloat64_t zero = svdup_f64(0.0);
    $3 = svneg_f64_z(pg, $3);
    svfloat64_t res_re = svcmla_f64_z(pg, zero, $1, $2, 0);
    svfloat64_t res_im = svcmla_f64_z(pg, $3, $1, $2, 90);
    svfloat64_t res = svadd_f64_z(pg, res_re, res_im);
    svst1_f64(pg, (double *) &$4[$5 / 2], res);
')')
define(`complex_double_sve_intr_submul',
`ifdef(`disable_complex_instructions', `
    svfloat64_t retemp = svmul_f64_z(pg, $2.v0, $3.v0);
    svfloat64_t imtemp = svmul_f64_z(pg, $2.v0, $3.v1);
    svfloat64_t re = svmls_f64_z(pg, retemp, $2.v1, $3.v1);
    svfloat64_t im = svmla_f64_z(pg, imtemp, $2.v1, $3.v0);
    re = svsub_f64_z(pg, $1.v0, re);
    im = svsub_f64_z(pg, $1.v1, im);
    svfloat64x2_t res = {re, im};
    svst2_f64(pg, (double *) &$4[$5 * 2], res);
', `
    svfloat64_t zero = svdup_f64(0.0);
    $2 = svneg_f64_z(pg, $2);
    svfloat64_t res_re = svcmla_f64_z(pg, zero, $2, $3, 0);
    svfloat64_t res_im = svcmla_f64_z(pg, $1, $2, $3, 90);
    svfloat64_t res = svadd_f64_z(pg, res_re, res_im);
    svst1_f64(pg, (double *) &$4[$5 / 2], res);
')')
define(`complex_double_sve_intr_inc', ` $1 += svcntd() / 2;')
define(`complex_double_sve_intr_any', ` svptest_any(svptrue_b64(), $1)')
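dnl
dnl Hypothetical usage sketch for the intrinsic macros above (illustration
dnl only, not part of this file): `a', `b', `c', `n', `i', `va' and `vb' are
dnl assumed caller-provided names, and the index scaling expected by the
dnl load/store macros (the `* 2' vs `/ 2' factors) is left to the caller:
dnl
dnl     uint64_t i = 0;
dnl     svbool_t pg;
dnl     complex_double_sve_intr_while_lower(`i', `n')
dnl     while (complex_double_sve_intr_any(`pg')) {
dnl         complex_double_sve_intr_load(`a', `i', `va')
dnl         complex_double_sve_intr_load(`b', `i', `vb')
dnl         complex_double_sve_intr_add(`va', `vb', `c', `i')
dnl         complex_double_sve_intr_inc(`i')
dnl         complex_double_sve_intr_while_lower(`i', `n')
dnl     }
dnl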
dnl
dnl Inline-assembly complex double kernels (string fragments; predicate p0).
define(`complex_double_sve_asm_while_lower', ` "whilelo p0.d, $1, $2 \n\t"')
define(`complex_double_sve_asm_load',
`ifdef(`disable_complex_instructions', `
    "ld2d {z$3.d, z$4.d}, p0/z, [%[$1], $2, lsl #3] \n\t"', `
    "ld1d z$3.d, p0/z, [%[$1], $2, lsl #3] \n\t"')')
define(`complex_double_sve_asm_dup',
`ifdef(`disable_complex_instructions', `
    "ld1rd z$2.d, p0/z, %[$1real] \n\t"
    "ld1rd z$3.d, p0/z, %[$1imag] \n\t"', `
    "ld1rqd z$2.d, p0/z, %[$1] \n\t"')')
define(`complex_double_sve_asm_mul',
`ifdef(`disable_complex_instructions', `
    "fmov z5.d, p0/m, #0.0 \n\t"
    "fmov z6.d, p0/m, #0.0 \n\t"
    "fmla z5.d, p0/m, z$1.d, z$3.d \n\t"
    "fmla z6.d, p0/m, z$1.d, z$4.d \n\t"
    "fmls z5.d, p0/m, z$2.d, z$4.d \n\t"
    "fmla z6.d, p0/m, z$2.d, z$3.d \n\t"
    "lsl x2, $6, #1 \n\t"
    "st2d {z5.d, z6.d}, p0, [%[$5], $6, lsl #3] \n\t"', `
    "fmov z4.d, p0/m, #0.0 \n\t"
    "fcmla z4.d, p0/m, z$1.d, z$2.d, 0 \n\t"
    "fcmla z4.d, p0/m, z$1.d, z$2.d, 90 \n\t"
    "st1d z4.d, p0, [%[$5], $6, lsl #3] \n\t"')')
define(`complex_double_sve_asm_div', ` "// TODO: complex division not implemented \n\t"')
define(`complex_double_sve_asm_add',
`ifdef(`disable_complex_instructions', `
    "fadd z$1.d, p0/m, z$1.d, z$3.d \n\t"
    "fadd z$2.d, p0/m, z$2.d, z$4.d \n\t"
    "lsl x2, $6, #1 \n\t"
    "st2d {z$1.d, z$2.d}, p0, [%[$5], $6, lsl #3] \n\t"', `
    "fadd z$1.d, p0/m, z$1.d, z$2.d \n\t"
    "st1d z$1.d, p0, [%[$5], $6, lsl #3] \n\t"')')
define(`complex_double_sve_asm_sub',
`ifdef(`disable_complex_instructions', `
    "fsub z$1.d, p0/m, z$1.d, z$3.d \n\t"
    "fsub z$2.d, p0/m, z$2.d, z$4.d \n\t"
    "lsl x2, $6, #1 \n\t"
    "st2d {z$1.d, z$2.d}, p0, [%[$5], $6, lsl #3] \n\t"', `
    "fsub z$1.d, p0/m, z$1.d, z$2.d \n\t"
    "st1d z$1.d, p0, [%[$5], $6, lsl #3] \n\t"')')
define(`complex_double_sve_asm_muladd',
`ifdef(`disable_complex_instructions', `
    "fmov z7.d, p0/m, #0.0 \n\t"
    "fmov z8.d, p0/m, #0.0 \n\t"
    "fmla z7.d, p0/m, z$1.d, z$3.d \n\t"
    "fmla z8.d, p0/m, z$1.d, z$4.d \n\t"
    "fmls z7.d, p0/m, z$2.d, z$4.d \n\t"
    "fmla z8.d, p0/m, z$2.d, z$3.d \n\t"
    "fadd z7.d, p0/m, z7.d, z$5.d \n\t"
    "fadd z8.d, p0/m, z8.d, z$6.d \n\t"
    "lsl x2, $8, #1 \n\t"
    "st2d {z7.d, z8.d}, p0, [%[$7], $8, lsl #3] \n\t"', `
    "fcmla z$3.d, p0/m, z$1.d, z$2.d, 0 \n\t"
    "fcmla z$3.d, p0/m, z$1.d, z$2.d, 90 \n\t"
    "st1d z$3.d, p0, [%[$7], $8, lsl #3] \n\t"')')
define(`complex_double_sve_asm_addmul',
`ifdef(`disable_complex_instructions', `
    "fmov z7.d, p0/m, #0.0 \n\t"
    "fmov z8.d, p0/m, #0.0 \n\t"
    "fmla z7.d, p0/m, z$3.d, z$5.d \n\t"
    "fmla z8.d, p0/m, z$3.d, z$6.d \n\t"
    "fmls z7.d, p0/m, z$4.d, z$6.d \n\t"
    "fmla z8.d, p0/m, z$4.d, z$5.d \n\t"
    "fadd z7.d, p0/m, z7.d, z$1.d \n\t"
    "fadd z8.d, p0/m, z8.d, z$2.d \n\t"
    "lsl x2, $8, #1 \n\t"
    "st2d {z7.d, z8.d}, p0, [%[$7], $8, lsl #3] \n\t"', `
    "fcmla z$1.d, p0/m, z$2.d, z$3.d, 0 \n\t"
    "fcmla z$1.d, p0/m, z$2.d, z$3.d, 90 \n\t"
    "st1d z$1.d, p0, [%[$7], $8, lsl #3] \n\t"')')
define(`complex_double_sve_asm_mulsub',
`ifdef(`disable_complex_instructions', `
    "fmov z7.d, p0/m, #0.0 \n\t"
    "fmov z8.d, p0/m, #0.0 \n\t"
    "fmla z7.d, p0/m, z$1.d, z$3.d \n\t"
    "fmla z8.d, p0/m, z$1.d, z$4.d \n\t"
    "fmls z7.d, p0/m, z$2.d, z$4.d \n\t"
    "fmla z8.d, p0/m, z$2.d, z$3.d \n\t"
    "fsub z7.d, p0/m, z7.d, z$5.d \n\t"
    "fsub z8.d, p0/m, z8.d, z$6.d \n\t"
    "lsl x2, $8, #1 \n\t"
    "st2d {z7.d, z8.d}, p0, [%[$7], $8, lsl #3] \n\t"', `
    "fneg z4.d, p0/m, z$3.d \n\t"
    "fcmla z4.d, p0/m, z$1.d, z$2.d, 0 \n\t"
    "fcmla z4.d, p0/m, z$1.d, z$2.d, 90 \n\t"
    "st1d z4.d, p0, [%[$7], $8, lsl #3] \n\t"')')
define(`complex_double_sve_asm_submul',
`ifdef(`disable_complex_instructions', `
    "fmov z7.d, p0/m, #0.0 \n\t"
    "fmov z8.d, p0/m, #0.0 \n\t"
    "fneg z$3.d, p0/m, z$3.d \n\t"
    "fneg z$4.d, p0/m, z$4.d \n\t"
    "fmla z7.d, p0/m, z$3.d, z$5.d \n\t"
    "fmla z8.d, p0/m, z$3.d, z$6.d \n\t"
    "fmls z7.d, p0/m, z$4.d, z$6.d \n\t"
    "fmla z8.d, p0/m, z$4.d, z$5.d \n\t"
    "fadd z7.d, p0/m, z7.d, z$1.d \n\t"
    "fadd z8.d, p0/m, z8.d, z$2.d \n\t"
    "lsl x2, $8, #1 \n\t"
    "st2d {z7.d, z8.d}, p0, [%[$7], $8, lsl #3] \n\t"', `
    "fneg z4.d, p0/m, z$2.d \n\t"
    "fcmla z$1.d, p0/m, z4.d, z$3.d, 0 \n\t"
    "fcmla z$1.d, p0/m, z4.d, z$3.d, 90 \n\t"
    "st1d z$1.d, p0, [%[$7], $8, lsl #3] \n\t"')')
define(`complex_double_sve_asm_inc',
`ifdef(`disable_complex_instructions', `
    "uqincd $1 \n\t"
    "uqincd $1 \n\t"', `
    "uqincd $1 \n\t"')')
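dnl
dnl Hypothetical composition sketch for the assembly-string macros above
dnl (illustration only, not part of this file): the operand names, register
dnl numbering and clobber list are assumptions, and the expansion shown is
dnl the branch taken when disable_complex_instructions is NOT defined:
dnl
dnl     __asm__ volatile(
dnl         complex_double_sve_asm_while_lower(`%[i]', `%[n]')
dnl         complex_double_sve_asm_load(`a', `%[i]', `0', `1')
dnl         complex_double_sve_asm_load(`b', `%[i]', `2', `3')
dnl         complex_double_sve_asm_add(`0', `2', `1', `3', `c', `%[i]')
dnl         :
dnl         : [a] "r" (a), [b] "r" (b), [c] "r" (c), [i] "r" (i), [n] "r" (n)
dnl         : "p0", "z0", "z1", "z2", "z3", "memory");
dnl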
divert`'dnl