diff --git a/Makefile b/Makefile
index 48f7cd68ab446cf2bd2102f8b6f72307d4f19ad1..00f04649121b853c53b2bfc52e92c52e88763dee 100644
--- a/Makefile
+++ b/Makefile
@@ -15,7 +15,7 @@ TARGS=$(patsubst examples/%.c,generated/%.x,$(SRC))
 all : $(TARGS)
 $(TARGS) : $(M4_SRC)
 	armclang -march=armv8+sve $(patsubst generated/%.x,generated/%.c,$@) -o $@
-	armie -msve-vector-bits=128 $@
+	armie -msve-vector-bits=256 $@
 
 
 .PHONY : m4 clean test
diff --git a/README.md b/README.md
index 6e2702491b8b107045f6fc4ef42bdea794bb1dc7..385afcbabd8b593d926250ad7d4c1a61c378317b 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,73 @@
-# Sve code gen
+## SVE code generator
 
-Code generator for sve (MB2020 work package 7, task 1)
\ No newline at end of file
+---
+
+### Overview
+This simple code generator generates C code (vectorized with intrinsics or inline assembly) for simple equations.
+
+Four m4 macro functions are provided:
+
+- `svedef(name, datatype, size)`
+This function defines a vector that we want to use in an equation.
+`name` is the name of the array pointer, which we wish to vectorize.
+`datatype` is the datatype of the array elements. It can be one of `float`, `double`, `complex_float` or `complex_double`.
+`size` is the name of the variable holding the size of the vector.
+This function expands into an empty string.
+- `disable_complex`
+If the datatype of the vectors is either `complex_float` or `complex_double`, the generated code will by default use complex arithmetic instructions. This macro disables the complex arithmetic instructions and generates code with ordinary floating-point arithmetic instead.
+This function expands into an empty string.
+- `sveasmfor(equation)`
+`equation` is the equation we want to vectorize. It can have the following format: `output = input1 <+|-|*|/> input2 [<+|-> input3]`, where `input1`, `input2` and `input3` are `name` arguments previously passed to `svedef()`; any operand that was not declared with `svedef()` is treated as a scalar. This function expands to the actual C code, in which the vectorization is done via inline assembly.
+
+- `sveintrfor(equation)`
+Same as `sveasmfor`, but the code is generated with intrinsic functions instead of inline assembly.
+
+---
+
+### Simple demo
+
+Let's take the zaxpy routine as an example: **y = a * x + y**
+
+Create a file named zaxpy with the following content:
+```
+svedef(x, complex_double, n)
+svedef(y, complex_double, n)
+sveasmfor( y = a * x + y )
+```
+
+Invoke the generator with the command `m4 <path-to-sve-code-gen>/sve.m4 zaxpy > zaxpy.c`.
+This will generate the following code in the file zaxpy.c.
+
+```
+/*#####################---sve-generated-code---###################################*/
+
+__asm__ volatile
+(
+    "ldr x0, %[n]                        \n\t"
+    "lsl x0, x0, #1                     \n\t"
+    "mov x1, xzr                         \n\t"
+    "whilelo p0.d, x1, x0                   \n\t"
+"sveloop:                                   \n\t"
+    "ld1rqd z0.d, p0/z, %[a]              \n\t"
+    "ld1d z1.d, p0/z, [%[x], x1, lsl #3]  \n\t"
+    "ld1d z2.d, p0/z, [%[y], x1, lsl #3]  \n\t"
+    "fcmla z2.d, p0/m, z0.d, z1.d, 0     \n\t"
+    "fcmla z2.d, p0/m, z0.d, z1.d, 90    \n\t"
+    "st1d z2.d, p0, [%[y], x1, lsl #3]    \n\t"
+    "uqincd x1                              \n\t"
+    "whilelo p0.d, x1, x0                   \n\t"
+    "b.any sveloop                              "
+: // output operands
+: // input operands
+[a] "m" (a),
+[x] "r" (x),
+[y] "r" (y),
+[y] "r" (y),
+[n]   "m" (n)
+: // register clobber list
+"memory","cc","x0", "z0","z1","z2","z3","z4","z5","z6","z7","z8","p0"
+);
+
+
+/*#####################---sve-generated-code---###################################*/
+```
diff --git a/examples/complex_double_addmul.c b/examples/complex_double_addmul.c
index bec08e9998be0f803fe6828313a84f3ca7b145d3..81a8860fd0fa46f6d480c6e84dc69c5fcaf226f0 100644
--- a/examples/complex_double_addmul.c
+++ b/examples/complex_double_addmul.c
@@ -38,7 +38,8 @@ int main(int argc, char* argv[])
 svedef(x, complex_double, n)
 svedef(z, complex_double, n)
 disable_complex
-sveintrfor( c = x + y * z)
+
+sveasmfor( c = x + y * z)
 
   for (j = 0; j < n; j++)
   printf("%f %f \n", creal(c[j]), cimag(c[j]));
diff --git a/examples/complex_double_muladd.c b/examples/complex_double_muladd.c
index dc5e571e668ccf08f7a6e1d9fce2e0c89ce7eb78..a9e5a0cd2669469a753417100199800b7b3ec867 100644
--- a/examples/complex_double_muladd.c
+++ b/examples/complex_double_muladd.c
@@ -37,8 +37,7 @@ int main(int argc, char* argv[])
 
 svedef(x, complex_double, n)
 svedef(z, complex_double, n)
-disable_complex
-sveintrfor( c = x * y + z)
+sveasmfor( c = x * y + z)
 
   for (j = 0; j < n; j++)
   printf("%f %f \n", creal(c[j]), cimag(c[j]));
diff --git a/examples/complex_double_product.c b/examples/complex_double_product.c
index f1221be77be9d4a10a6873e07d57fc2a06b8fe37..b000348a14cb7b83962a7891429fe6bebf66937c 100644
--- a/examples/complex_double_product.c
+++ b/examples/complex_double_product.c
@@ -32,8 +32,7 @@ int main(int argc, char* argv[])
 
 svedef(x, complex_double, n)
 svedef(y, complex_double, n)
-disable_complex
-sveintrfor( c = x * y)
+sveasmfor( c = x * y)
 
   for (j = 0; j < n; j++)
   printf("%f %f \n", creal(c[j]), cimag(c[j]));
diff --git a/examples/complex_double_scal_product.c b/examples/complex_double_scal_product.c
index 89ef7c2a8faa962865042d789e931254be463de2..94c8c7e9eefd50211303bbf55fa44ad1b869262a 100644
--- a/examples/complex_double_scal_product.c
+++ b/examples/complex_double_scal_product.c
@@ -33,8 +33,7 @@ int main(int argc, char* argv[])
 
 svedef(x, complex_double, n)
 //vedef(y, complex_double, n)
-    disable_complex
-sveintrfor( c = x * y)
+sveasmfor( c = x * y)
 
   for (j = 0; j < n; j++)
   printf("%f %f \n", creal(c[j]), cimag(c[j]));
diff --git a/examples/complex_double_sum.c b/examples/complex_double_sum.c
index f41568ab635aa3c6ac78407dd33603d720cb52d9..53546487da1d9f9e009935db31b14ea5ba175b6f 100644
--- a/examples/complex_double_sum.c
+++ b/examples/complex_double_sum.c
@@ -32,8 +32,7 @@ int main(int argc, char* argv[])
 
 svedef(x, complex_double, n)
 svedef(y, complex_double, n)
-disable_complex
-sveintrfor( c = x + y)
+sveasmfor( c = x + y)
 
   for (j = 0; j < n; j++)
   printf("%f %f \n", creal(c[j]), cimag(c[j]));
diff --git a/examples/complex_float_mulsub.c b/examples/complex_float_mulsub.c
index 8c2aa67091cf7a96ccce35052c3f08f20e7c335f..182cc1455981461d727b5d70d0cdb26ad6368c7e 100644
--- a/examples/complex_float_mulsub.c
+++ b/examples/complex_float_mulsub.c
@@ -37,8 +37,7 @@ int main(int argc, char* argv[])
 
 svedef(x, complex_float, n)
 svedef(z, complex_float, n)
-disable_complex
-sveintrfor( c = y * x - z)
+sveasmfor( c = y * x - z)
 
   for (j = 0; j < n; j++)
   printf("%f %f \n", creal(c[j]), cimag(c[j]));
diff --git a/examples/complex_float_scalar_sub.c b/examples/complex_float_scalar_sub.c
index 8b7196c4e75aeb52ccc41ea918a943a70a89a965..38ee0d4b932e6d1a099660f1e223c8e81eca1266 100644
--- a/examples/complex_float_scalar_sub.c
+++ b/examples/complex_float_scalar_sub.c
@@ -36,8 +36,7 @@ int main(int argc, char* argv[])
   printf("complex float sub");
 
 svedef(y, complex_float, n)
-    disable_complex
-sveintrfor( c = x - y)
+sveasmfor( c = x - y)
 
   for (j = 0; j < n; j++)
   printf("%f %f \n", creal(c[j]), cimag(c[j]));
diff --git a/examples/complex_float_sub.c b/examples/complex_float_sub.c
index 87f9a1d5dc0a90c04c266d21ffbc44c3adacad68..d99e58e0bd294b9fe5032eb2ad4b77c5e479faa2 100644
--- a/examples/complex_float_sub.c
+++ b/examples/complex_float_sub.c
@@ -35,8 +35,7 @@ int main(int argc, char* argv[])
 
 svedef(x, complex_float, n)
 svedef(y, complex_float, n)
-disable_complex
-sveintrfor( c = x - y)
+sveasmfor( c = x - y)
 
   for (j = 0; j < n; j++)
   printf("%f %f \n", creal(c[j]), cimag(c[j]));
diff --git a/examples/complex_float_submul.c b/examples/complex_float_submul.c
index edaaf66ba3bf6744802658345265f0a37799ad6a..7ac714a46f32a4f349ab777351f9edfcc350efe5 100644
--- a/examples/complex_float_submul.c
+++ b/examples/complex_float_submul.c
@@ -37,8 +37,7 @@ int main(int argc, char* argv[])
 
 svedef(x, complex_float, n)
 svedef(z, complex_float, n)
-disable_complex
-sveintrfor( c = y - x * z)
+sveasmfor( c = y - x * z)
 
   for (j = 0; j < n; j++)
   printf("%f %f \n", creal(c[j]), cimag(c[j]));
diff --git a/examples/double_sum.c b/examples/double_sum.c
index 187d95afed5d3e0f9743104bab4c66b69bde28fa..3403225d7ddbac8d696bb65d4058b28342404d18 100644
--- a/examples/double_sum.c
+++ b/examples/double_sum.c
@@ -30,8 +30,7 @@ int main(int argc, char* argv[])
 
 svedef(x, double, n)
 svedef(y, double, n)
-disable_complex
-sveintrfor(c = x + y)
+sveasmfor(c = x + y)
 
   printf("c= %f %f %f %f %f %f\n", c[0], c[1], c[2], c[3], c[4], c[5]);
   //printf("%f %f %f %f %f %f\n", c[0], c[1], c[2], c[3], c[4], c[5]);
diff --git a/examples/float_mul.c b/examples/float_mul.c
index f0e94b260f08523f88a32c681f58c5d693fc777c..b56b37e779c78951d212bafa86f5a2efa14ea06d 100644
--- a/examples/float_mul.c
+++ b/examples/float_mul.c
@@ -29,7 +29,7 @@ int main(int argc, char* argv[])
 svedef(x, float, n)
 svedef(y, float, n)
 disable_complex
-sveintrfor(c = x * y)
+sveasmfor(c = x * y)
 
 printf("float mul");
   printf("c= %f %f %f %f %f %f\n", c[0], c[1], c[2], c[3], c[4], c[5]);
diff --git a/sve.m4 b/sve.m4
index b7ebf3f93e191117bf45efc1cc0fc0b8977e7a09..1c587cfa8befafbfaca00caba6c31e778399cbeb 100644
--- a/sve.m4
+++ b/sve.m4
@@ -33,7 +33,6 @@ divert
 /*#####################---sve-generated-code---###################################*/
 
 uint64_t counter = 0;
-
 svbool_t indir(`datatype')_sve_intr_while_lower( counter, size ifelse(iscomplex, `yes', `* 2'))
 do
 {
@@ -48,7 +47,6 @@ ifdef(`input3',
 `indir(`datatype')_sve_intr_load(input3, counter, vector3)',
 `indir(`datatype')_sve_intr_dup(input3, vector3)')')
 
-
 ifelse(operation, `mul', datatype`_sve_intr_mul(`vector1', `vector2', output, `counter')',
        operation, `add', datatype`_sve_intr_add(`vector1', `vector2', output, `counter')',
        operation, `sub', datatype`_sve_intr_sub(`vector1', `vector2', output, `counter')',
@@ -63,10 +61,6 @@ indir(`datatype')_sve_intr_inc(counter)
 }
 while (indir(`datatype')_sve_intr_any(pg));
 
-
-
-
-
 /*#####################---sve-generated-code---###################################*/
 ')dnl
 dnl
@@ -97,6 +91,7 @@ dnl `define(`complex_equation', indir(input2`sve', `complex'))')
 
 divert
 /*#####################---sve-generated-code---###################################*/
+
 ifdef(`disable_complex_instructions',
 `ifdef(input1`sve',`',
 `ifelse(
@@ -106,8 +101,8 @@ double input1`imag' = cimag(input1);
 ', datatype, `complex_float',
 `float input1`real' = creal(input1);
 float input1`imag' = cimag(input1);
-')')')
-
+')')')dnl
+dnl
 ifdef(`disable_complex_instructions',
 `ifdef(input2`sve',`',
 `ifelse(
@@ -117,8 +112,8 @@ double input2`imag' = cimag(input2);
 ', datatype, `complex_float',
 `float input2`real' = creal(input2);
 float input2`imag' = cimag(input2);
-')')')
-
+')')')dnl
+dnl
 ifdef(`input3',
 `ifdef(`disable_complex_instructions',
 `ifdef(input3`sve',`',
@@ -129,8 +124,7 @@ double input3`imag' = cimag(input3);
 ', datatype, `complex_float',
 `float input3`real' = creal(input3);
 float input3`imag' = cimag(input3);
-')')')')
-
+')')')')dnl
 __asm__ volatile
 (
     "ldr x0, %[size]                        \n\t"
@@ -200,11 +194,14 @@ ifdef(input2`sve',
 `[input2] "m" (input2),')')
 ifdef(`input3',
 `ifdef(input3`sve',
-`[input3] "r" (input3),',
+`[input3] "r" (input3),
+',
 `ifdef(`disable_complex_instructions',
 `[input3`real'] "m" (input3`real'),
-[input3`imag'] "m" (input3`imag'),',
-`[input3] "m" (input3),')')')
+[input3`imag'] "m" (input3`imag'),
+',
+`[input3] "m" (input3),
+')')')dnl
 [output] "r" (output),
 [size]   "m" (size)
 : // register clobber list