dnl complex_double.m4 (Rene Halver)
divert(`-1')
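dnl
dnl Key/value helpers: complex_doublesve_set(`key', `value') stores a value and
dnl complex_doublesve(`key') looks it up, so the generator can query the SVE
dnl types and intrinsic names for the complex double case by name.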
define(`complex_doublesve', `defn(format(``complex_doublesve.%s'', `$1'))')
define(`complex_doublesve_set', `define(format(``complex_doublesve.%s'', `$1'), `$2')')
dnl
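dnl Lookup table entries: vector and scalar types (vec, intvec, scalpointer),
dnl predicated load/store and predicate helpers, and element-wise arithmetic
dnl intrinsics. `count' is the number of complex doubles per SVE vector
dnl (svcntd() doubles hold svcntd() / 2 complex values).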
complex_doublesve_set(`vec', `svfloat64x2_t')dnl
complex_doublesve_set(`intvec', `svuint64_t')dnl
complex_doublesve_set(`scalpointer', `double')dnl
dnl
complex_doublesve_set(`load', `svld2_f64')dnl
complex_doublesve_set(`gatherload', `svld1_gather_index')dnl
complex_doublesve_set(`store', `svst2_f64')dnl
complex_doublesve_set(`count', `svcntd() / 2')dnl
complex_doublesve_set(`index', `svindex_u64')dnl
complex_doublesve_set(`true', `svptrue_b64()')dnl
complex_doublesve_set(`while', `svwhilelt_b64')dnl
complex_doublesve_set(`neg', `svneg_f64_z')dnl
complex_doublesve_set(`abs', `svabs_f64_z')dnl
dnl
complex_doublesve_set(`add', `svadd_f64_z')dnl
complex_doublesve_set(`sub', `svsub_f64_z')dnl
complex_doublesve_set(`mul', `svmul_f64_z')dnl
complex_doublesve_set(`div', `svdiv_f64_z')dnl
dnl
dnl
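dnl
dnl Intrinsic-based building blocks. Each macro expands to C statements that
dnl assume an svbool_t predicate named pg is in scope; pg is set by
dnl complex_double_sve_intr_while_lower. Two data layouts are supported:
dnl  - with `disable_complex_instructions' defined, real and imaginary parts are
dnl    held in separate vectors (svfloat64x2_t, moved with svld2_f64/svst2_f64)
dnl    and complex products are built from FMLA/FMLS (svmla/svmls);
dnl  - otherwise values stay interleaved in a single svfloat64_t and products
dnl    use the FCMLA intrinsic svcmla_f64_z with rotations 0 and 90.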
define(`complex_double_sve_intr_while_lower',
`pg = svwhilelt_b64($1, $2); ')
define(`complex_double_sve_intr_load',
`ifdef(`disable_complex_instructions',
` svfloat64x2_t $3 = svld2_f64(pg, (const double *) &$1[$2 * 2]);
',
` svfloat64_t $3 = svld1_f64(pg, (const double *) &$1[$2 / 2]);
')
')
define(`complex_double_sve_intr_dup',
`ifdef(`disable_complex_instructions',
` svfloat64x2_t $2 = {svdup_f64(creal($1)), svdup_f64(cimag($1))};
',
` svfloat64_t $2 = svdupq_f64( creal($1), cimag($1));
')
')
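dnl complex_double_sve_intr_dup broadcasts a scalar double complex $1 into
dnl vector $2: as two splatted vectors (creal/cimag) in the split layout, or as
dnl a replicated {re, im} quadword via svdupq_f64 in the interleaved layout.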
define(`complex_double_sve_intr_add',
`ifdef(`disable_complex_instructions',
` svfloat64_t re = svadd_f64_z(pg, $1.v0, $2.v0);
svfloat64_t im = svadd_f64_z(pg, $1.v1, $2.v1);
svfloat64x2_t res = {re, im};
svst2_f64(pg, (double *) &$3[$4 * 2], res);
',
` svfloat64_t res = svadd_f64_z(pg, $1, $2);
svst1_f64(pg, (double *) &$3[$4 / 2], res);
')
')
define(`complex_double_sve_intr_sub',
`ifdef(`disable_complex_instructions',
` svfloat64_t re = svsub_f64_z(pg, $1.v0, $2.v0);
svfloat64_t im = svsub_f64_z(pg, $1.v1, $2.v1);
svfloat64x2_t res = {re, im};
svst2_f64(pg, (double *) &$3[$4 * 2], res);
',
` svfloat64_t res = svsub_f64_z(pg, $1, $2);
svst1_f64(pg, (double *) &$3[$4 / 2], res);
')
')
define(`complex_double_sve_intr_mul',
`ifdef(`disable_complex_instructions',
` svfloat64_t retemp = svmul_f64_z(pg, $1.v0, $2.v0);
svfloat64_t imtemp = svmul_f64_z(pg, $1.v0, $2.v1);
svfloat64_t re = svmls_f64_z(pg, retemp, $1.v1, $2.v1);
svfloat64_t im = svmla_f64_z(pg, imtemp, $1.v1, $2.v0);
svfloat64x2_t res = {re, im};
svst2_f64(pg, (double *) &$3[$4 * 2], res);
',
` svfloat64_t zero = svdup_f64(0.0);
svfloat64_t res_re = svcmla_f64_z(pg, zero, $1, $2, 0);
svfloat64_t res_im = svcmla_f64_z(pg, zero, $1, $2, 90);
svfloat64_t res = svadd_f64_z(pg, res_re, res_im);
svst1_f64(pg, (double *) &$3[$4 / 2], res);
')
')
define(`complex_double_sve_intr_div',
`/* TODO: complex division is not implemented; the element-wise svdiv_f64_z
    below is only a placeholder and does not compute a complex quotient. */
  svfloat64_t res = svdiv_f64_z(pg, $1, $2);
  svst1_f64(pg, (double *) &$3[$4], res);
')
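dnl
dnl Fused operations and their argument order:
dnl   muladd: $1 * $2 + $3    addmul: $1 + $2 * $3
dnl   mulsub: $1 * $2 - $3    submul: $1 - $2 * $3
dnl In each case the last two arguments name the destination array and index.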
define(`complex_double_sve_intr_muladd',
`ifdef(`disable_complex_instructions',
` svfloat64_t retemp = svmul_f64_z(pg, $1.v0, $2.v0);
svfloat64_t imtemp = svmul_f64_z(pg, $1.v0, $2.v1);
svfloat64_t re = svmls_f64_z(pg, retemp, $1.v1, $2.v1);
svfloat64_t im = svmla_f64_z(pg, imtemp, $1.v1, $2.v0);
re = svadd_f64_z(pg, re, $3.v0);
im = svadd_f64_z(pg, im, $3.v1);
svfloat64x2_t res = {re, im};
svst2_f64(pg, (double *) &$4[$5 * 2], res);
',
` svfloat64_t zero = svdup_f64(0.0);
svfloat64_t res_re = svcmla_f64_z(pg, zero, $1, $2, 0);
svfloat64_t res_im = svcmla_f64_z(pg, $3, $1, $2, 90);
svfloat64_t res = svadd_f64_z(pg, res_re, res_im);
svst1_f64(pg, (double *) &$4[$5 / 2], res);
')
')
define(`complex_double_sve_intr_addmul',
`ifdef(`disable_complex_instructions',
` svfloat64_t retemp = svmul_f64_z(pg, $2.v0, $3.v0);
svfloat64_t imtemp = svmul_f64_z(pg, $2.v0, $3.v1);
svfloat64_t re = svmls_f64_z(pg, retemp, $2.v1, $3.v1);
svfloat64_t im = svmla_f64_z(pg, imtemp, $2.v1, $3.v0);
re = svadd_f64_z(pg, re, $1.v0);
im = svadd_f64_z(pg, im, $1.v1);
svfloat64x2_t res = {re, im};
svst2_f64(pg, (double *) &$4[$5 * 2], res);
',
` svfloat64_t zero = svdup_f64(0.0);
svfloat64_t res_re = svcmla_f64_z(pg, zero, $2, $3, 0);
svfloat64_t res_im = svcmla_f64_z(pg, $1, $2, $3, 90);
svfloat64_t res = svadd_f64_z(pg, res_re, res_im);
svst1_f64(pg, (double *) &$4[$5 / 2], res);
')
')
define(`complex_double_sve_intr_mulsub',
`ifdef(`disable_complex_instructions',
` svfloat64_t retemp = svmul_f64_z(pg, $1.v0, $2.v0);
svfloat64_t imtemp = svmul_f64_z(pg, $1.v0, $2.v1);
svfloat64_t re = svmls_f64_z(pg, retemp, $1.v1, $2.v1);
svfloat64_t im = svmla_f64_z(pg, imtemp, $1.v1, $2.v0);
re = svsub_f64_z(pg, re, $3.v0);
im = svsub_f64_z(pg, im, $3.v1);
svfloat64x2_t res = {re, im};
svst2_f64(pg, (double *) &$4[$5 * 2], res);
',
` svfloat64_t zero = svdup_f64(0.0);
$3 = svneg_f64_z(pg, $3);
svfloat64_t res_re = svcmla_f64_z(pg, zero, $1, $2, 0);
svfloat64_t res_im = svcmla_f64_z(pg, $3, $1, $2, 90);
svfloat64_t res = svadd_f64_z(pg, res_re, res_im);
svst1_f64(pg, (double *) &$4[$5 / 2], res);
')
')
define(`complex_double_sve_intr_submul',
`ifdef(`disable_complex_instructions',
` svfloat64_t retemp = svmul_f64_z(pg, $2.v0, $3.v0);
svfloat64_t imtemp = svmul_f64_z(pg, $2.v0, $3.v1);
svfloat64_t re = svmls_f64_z(pg, retemp, $2.v1, $3.v1);
svfloat64_t im = svmla_f64_z(pg, imtemp, $2.v1, $3.v0);
re = svsub_f64_z(pg, $1.v0, re);
im = svsub_f64_z(pg, $1.v1, im);
svfloat64x2_t res = {re, im};
svst2_f64(pg, (double *) &$4[$5 * 2], res);
',
` svfloat64_t zero = svdup_f64(0.0);
$2 = svneg_f64_z(pg, $2);
svfloat64_t res_re = svcmla_f64_z(pg, zero, $2, $3, 0);
svfloat64_t res_im = svcmla_f64_z(pg, $1, $2, $3, 90);
svfloat64_t res = svadd_f64_z(pg, res_re, res_im);
svst1_f64(pg, (double *) &$4[$5 / 2], res);
')
')
define(`complex_double_sve_intr_inc',
` $1 += svcntd() / 2;')
define(`complex_double_sve_intr_any',
` svptest_any(svptrue_b64(), $1)')
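dnl
dnl Example (a sketch, not part of the generated output): with complex
dnl instructions enabled, complex_double_sve_intr_add(`x', `y', `dst', `i')
dnl expands to roughly
dnl     svfloat64_t res = svadd_f64_z(pg, x, y);
dnl     svst1_f64(pg, (double *) &dst[i / 2], res);
dnl
dnl Inline-assembly counterparts of the intrinsic macros above. Numeric
dnl arguments are expected to expand to SVE z-register numbers (z$N.d) and the
dnl remaining arguments to named asm operands (%[name]); the predicate p0 is
dnl assumed to have been set by complex_double_sve_asm_while_lower.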
define(`complex_double_sve_asm_while_lower',
` "whilelo p0.d, $1, $2 \n\t"')
define(`complex_double_sve_asm_load',
`ifdef(`disable_complex_instructions',
` "ld2d {z$3.d, z$4.d}, p0/z, [%[$1], $2, lsl #3] \n\t"',
` "ld1d z$3.d, p0/z, [%[$1], $2, lsl #3] \n\t"')')
define(`complex_double_sve_asm_dup',
`ifdef(`disable_complex_instructions',
` "ld1rd z$2.d, p0/z, %[$1real] \n\t"
"ld1rd z$3.d, p0/z, %[$1imag] \n\t"',
` "ld1rqd z$2.d, p0/z, %[$1] \n\t"')')
define(`complex_double_sve_asm_mul',
`ifdef(`disable_complex_instructions',
` "fmov z5.d, p0/m, #0.0 \n\t"
"fmov z6.d, p0/m, #0.0 \n\t"
"fmla z5.d, p0/m, z$1.d, z$3.d \n\t"
"fmla z6.d, p0/m, z$1.d, z$4.d \n\t"
"fmls z5.d, p0/m, z$2.d, z$4.d \n\t"
"fmla z6.d, p0/m, z$2.d, z$3.d \n\t"
"lsl x2, $6, #1 \n\t"
"st2d {z5.d, z6.d}, p0, [%[$5], $6, lsl #3] \n\t"',
` "fmov z4.d, p0/m, #0.0 \n\t"
"fcmla z4.d, p0/m, z$1.d, z$2.d, 0 \n\t"
"fcmla z4.d, p0/m, z$1.d, z$2.d, 90 \n\t"
"st1d z4.d, p0, [%[$5], $6, lsl #3] \n\t"')')
define(`complex_double_sve_asm_div',
` "// TODO: complex division is not implemented \n\t"')
define(`complex_double_sve_asm_add',
`ifdef(`disable_complex_instructions',
` "fadd z$1.d, p0/m, z$1.d, z$3.d \n\t"
"fadd z$2.d, p0/m, z$2.d, z$4.d \n\t"
"lsl x2, $6, #1 \n\t"
"st2d {z$1.d, z$2.d}, p0, [%[$5], $6, lsl #3] \n\t"',
` "fadd z$1.d, p0/m, z$1.d, z$2.d \n\t"
"st1d z$1.d, p0, [%[$5], $6, lsl #3] \n\t"')')
define(`complex_double_sve_asm_sub',
`ifdef(`disable_complex_instructions',
` "fsub z$1.d, p0/m, z$1.d, z$3.d \n\t"
  "fsub z$2.d, p0/m, z$2.d, z$4.d \n\t"
  "lsl x2, $6, #1 \n\t"
  "st2d {z$1.d, z$2.d}, p0, [%[$5], $6, lsl #3] \n\t"',
` "fsub z$1.d, p0/m, z$1.d, z$2.d \n\t"
"st1d z$1.d, p0, [%[$5], $6, lsl #3] \n\t"')')
define(`complex_double_sve_asm_muladd',
`ifdef(`disable_complex_instructions',
` "fmov z7.d, p0/m, #0.0 \n\t"
"fmov z8.d, p0/m, #0.0 \n\t"
"fmla z7.d, p0/m, z$1.d, z$3.d \n\t"
"fmla z8.d, p0/m, z$1.d, z$4.d \n\t"
"fmls z7.d, p0/m, z$2.d, z$4.d \n\t"
"fmla z8.d, p0/m, z$2.d, z$3.d \n\t"
"fadd z7.d, p0/m, z7.d, z$5.d \n\t"
"fadd z8.d, p0/m, z8.d, z$6.d \n\t"
"lsl x2, $8, #1 \n\t"
"st2d {z7.d, z8.d}, p0, [%[$7], $8, lsl #3] \n\t"',
` "fcmla z$3.d, p0/m, z$1.d, z$2.d, 0 \n\t"
"fcmla z$3.d, p0/m, z$1.d, z$2.d, 90 \n\t"
"st1d z$3.d, p0, [%[$7], $8, lsl #3] \n\t"')')
define(`complex_double_sve_asm_addmul',
`ifdef(`disable_complex_instructions',
` "fmov z7.d, p0/m, #0.0 \n\t"
"fmov z8.d, p0/m, #0.0 \n\t"
"fmla z7.d, p0/m, z$3.d, z$5.d \n\t"
"fmla z8.d, p0/m, z$3.d, z$6.d \n\t"
"fmls z7.d, p0/m, z$4.d, z$6.d \n\t"
"fmla z8.d, p0/m, z$4.d, z$5.d \n\t"
"fadd z7.d, p0/m, z7.d, z$1.d \n\t"
"fadd z8.d, p0/m, z8.d, z$2.d \n\t"
"lsl x2, $8, #1 \n\t"
"st2d {z7.d, z8.d}, p0, [%[$7], $8, lsl #3] \n\t"',
` "fcmla z$1.d, p0/m, z$2.d, z$3.d, 0 \n\t"
"fcmla z$1.d, p0/m, z$2.d, z$3.d, 90 \n\t"
"st1d z$1.d, p0, [%[$7], $8, lsl #3] \n\t"')')
define(`complex_double_sve_asm_mulsub',
`ifdef(`disable_complex_instructions',
` "fmov z7.d, p0/m, #0.0 \n\t"
"fmov z8.d, p0/m, #0.0 \n\t"
"fmla z7.d, p0/m, z$1.d, z$3.d \n\t"
"fmla z8.d, p0/m, z$1.d, z$4.d \n\t"
"fmls z7.d, p0/m, z$2.d, z$4.d \n\t"
"fmla z8.d, p0/m, z$2.d, z$3.d \n\t"
"fsub z7.d, p0/m, z7.d, z$5.d \n\t"
"fsub z8.d, p0/m, z8.d, z$6.d \n\t"
"lsl x2, $8, #1 \n\t"
"st2d {z7.d, z8.d}, p0, [%[$7], $8, lsl #3] \n\t"',
` "fneg z4.d, p0/m, z$3.d \n\t"
"fcmla z4.d, p0/m, z$1.d, z$2.d, 0 \n\t"
"fcmla z4.d, p0/m, z$1.d, z$2.d, 90 \n\t"
"st1d z4.d, p0, [%[$7], $8, lsl #3] \n\t"')')
define(`complex_double_sve_asm_submul',
`ifdef(`disable_complex_instructions',
` "fmov z7.d, p0/m, #0.0 \n\t"
  "fmov z8.d, p0/m, #0.0 \n\t"
  "fneg z$3.d, p0/m, z$3.d \n\t"
  "fneg z$4.d, p0/m, z$4.d \n\t"
  "fmla z7.d, p0/m, z$3.d, z$5.d \n\t"
  "fmla z8.d, p0/m, z$3.d, z$6.d \n\t"
  "fmls z7.d, p0/m, z$4.d, z$6.d \n\t"
  "fmla z8.d, p0/m, z$4.d, z$5.d \n\t"
  "fadd z7.d, p0/m, z7.d, z$1.d \n\t"
  "fadd z8.d, p0/m, z8.d, z$2.d \n\t"
  "lsl x2, $8, #1 \n\t"
  "st2d {z7.d, z8.d}, p0, [%[$7], $8, lsl #3] \n\t"',
` "fneg z4.d, p0/m, z$2.d \n\t"
  "fcmla z$1.d, p0/m, z4.d, z$3.d, 0 \n\t"
  "fcmla z$1.d, p0/m, z4.d, z$3.d, 90 \n\t"
  "st1d z$1.d, p0, [%[$7], $8, lsl #3] \n\t"')')
define(`complex_double_sve_asm_inc',
`ifdef(`disable_complex_instructions',
` "uqincd $1 \n\t"
"uqincd $1 \n\t"',
` "uqincd $1 \n\t"')')
divert`'dnl