diff --git a/test/run_pytest.sh b/test/run_pytest.sh
index 2f9fc13eb68a9e02a171834c11bd0efecac5fb7c..ac39b23e289cffe8cb7f8d5ba526d3d93dc7a128 100644
--- a/test/run_pytest.sh
+++ b/test/run_pytest.sh
@@ -21,11 +21,11 @@ fi
 #python -m pytest  test_process_netCDF_v2.py
 source ../video_prediction_tools/env_setup/modules_train.sh
 #Test for process step2
-#python -m pytest test_data_preprocess_step2.py
+python -m pytest test_data_preprocess_step2.py
 #python -m pytest test_era5_data.py
 #Test for training
 #First remove all the files in the test folder
 #rm /p/project/deepacf/deeprain/video_prediction_shared_folder/models/test/* 
 #python -m pytest test_train_model_era5.py
 #python -m pytest test_visualize_postprocess.py
-python -m pytest test_meta_postprocess.py
+#python -m pytest test_meta_postprocess.py
diff --git a/test/test_data_preprocess_step2.py b/test/test_data_preprocess_step2.py
index 7f7f3bf30917371ccf283bcd6fb9be380f21d5d5..b8a71408c1870631d7e0aef396ba7870b0ece789 100644
--- a/test/test_data_preprocess_step2.py
+++ b/test/test_data_preprocess_step2.py
@@ -10,24 +10,17 @@ import json
 import datetime
 
 input_dir =  "/p/project/deepacf/deeprain/video_prediction_shared_folder/preprocessedData/era5_test"
-output_dir = "/p/project/deepacf/deeprain/video_prediction_shared_folder/preprocessedData/era5_test"
-datasplit_config = "/p/project/deepacf/deeprain/bing/ambs/video_prediction_tools/data_split/cv_test.json"
-hparams_dict_config = "/p/project/deepacf/deeprain/bing/ambs/video_prediction_tools/hparams/era5/convLSTM/model_hparams.json"
 vars_in = ["T2","MSL","gph500"]
 sequences_per_file = 10
-
+sequence_length = 20
 #generate an instance for ERA5Pkl2Tfrecords
 @pytest.fixture(scope="module")
 def era5_dataset_case1():
-    return ERA5Pkl2Tfrecords(input_dir=input_dir,output_dir=output_dir,datasplit_config=datasplit_config,
-                             hparams_dict_config=hparams_dict_config,sequences_per_file=sequences_per_file)
-
-def test_get_datasplit(era5_dataset_case1):
-    assert input_dir == era5_dataset_case1.input_dir
-    d = era5_dataset_case1.get_datasplit()
+    return ERA5Pkl2Tfrecords(input_dir=input_dir, sequence_length=sequence_length,
+                             sequences_per_file=sequences_per_file)
 
 def test_get_months(era5_dataset_case1):
-    assert len(era5_dataset_case1.get_years_months()[1]) == 3
+    assert len(era5_dataset_case1.get_years_months()[1]) == 12
 
 def test_get_metadata(era5_dataset_case1):
     """
@@ -37,14 +30,14 @@ def test_get_metadata(era5_dataset_case1):
     assert era5_dataset_case1.width == 160
     assert era5_dataset_case1.vars_in == ["T2","MSL","gph500"]
 
-def test_parse_hparams(era5_dataset_case1):
-    """
-    Test the updated hparam is properly updated 
-    """
-    print("hparmas:",era5_dataset_case1.hparams)
-    assert era5_dataset_case1.hparams.max_epochs == 20
-    assert era5_dataset_case1.sequence_length == 20
-    assert era5_dataset_case1.hparams.batch_size == 4
+#def test_parse_hparams(era5_dataset_case1):
+#    """
+#    Test the updated hparam is properly updated 
+#    """
+#    print("hparmas:",era5_dataset_case1.hparams)
+#    assert era5_dataset_case1.hparams.max_epochs == 20
+#    assert era5_dataset_case1.sequence_length == 20
+#    assert era5_dataset_case1.hparams.batch_size == 4
 
 def test_save_tf_record(era5_dataset_case1):
     #create a sequence
@@ -69,7 +62,7 @@ def test_read_pkl_and_save_tfrecords(era5_dataset_case1):
     #assert era5_dataset_case1.input_file_year=="/p/project/deepacf/deeprain/video_prediction_shared_folder/preprocessedData/era5-Y2010toY2222M01to12-160x128-2970N1500W-T2_MSL_gph500/pickle/2017"
     #assert the output tfrecords is saved properly
     fname_case1 = "sequence_Y_2017_M_{}_0_to_{}.tfrecords".format(month_test,sequences_per_file-1)
-    if_file_exit = os.path.isfile(os.path.join(output_dir,"tfrecords",fname_case1))
-    print("file check:",os.path.join(output_dir,fname_case1))
-    assert if_file_exit == True
+    if_file_exist = os.path.isfile(os.path.join(input_dir, "tfrecords_seq_len_" + str(sequence_length), fname_case1))
+    print("file check:", os.path.join(input_dir, "tfrecords_seq_len_" + str(sequence_length), fname_case1))
+    assert if_file_exist
 
diff --git a/video_prediction_tools/HPC_scripts/preprocess_data_era5_step2_template.sh b/video_prediction_tools/HPC_scripts/preprocess_data_era5_step2_template.sh
index 85620bfac61a99701fca88ed5db29462e9948962..04ecea9a1cc7f0e76482103cfa89c416cb7e8075 100644
--- a/video_prediction_tools/HPC_scripts/preprocess_data_era5_step2_template.sh
+++ b/video_prediction_tools/HPC_scripts/preprocess_data_era5_step2_template.sh
@@ -33,17 +33,8 @@ if [ -z ${VIRTUAL_ENV} ]; then
 fi
 
 # declare directory-variables which will be modified by config_runscript.py
-source_dir=/p/project/deepacf/deeprain/video_prediction_shared_folder/preprocessedData/
-destination_dir=/p/project/deepacf/deeprain/video_prediction_shared_folder/preprocessedData/
-# further settings
-datasplit_dir=../data_split/cv_test.json
-model=convLSTM
-hparams_dict_config=../hparams/era5/${model}/model_hparams.json
-sequences_per_file=10
+base_dir=/p/project/deepacf/deeprain/video_prediction_shared_folder/preprocessedData/
 sequence_length=20
-
-# run preprocessing (step 2 where Tf-records are generated)
-srun python ../main_scripts/main_preprocess_data_step2.py -input_dir ${source_dir} -output_dir ${destination_dir}
-   -datasplit_config ${datasplit_dir}  -hparams_dict_config ${hparams_dict_config}
-   -sequences_per_file ${sequences_per_file}
-
+sequences_per_file=10
+# run preprocessing (step 2, where TFRecord files are generated)
+srun python ../main_scripts/main_preprocess_data_step2.py -input_dir ${base_dir} -sequence_length ${sequence_length} -sequences_per_file ${sequences_per_file}
diff --git a/video_prediction_tools/data_preprocess/preprocess_data_step2.py b/video_prediction_tools/data_preprocess/preprocess_data_step2.py
index 5bb503d20086d7fc016dab013bc6a42284c4336c..687c2639a22a72c53a1f3dbec30bc7c782947de0 100644
--- a/video_prediction_tools/data_preprocess/preprocess_data_step2.py
+++ b/video_prediction_tools/data_preprocess/preprocess_data_step2.py
@@ -2,12 +2,13 @@
 Class and functions required for preprocessing ERA5 data (preprocessing substep 2)
 """
 __email__ = "b.gong@fz-juelich.de"
-__author__ = "Bing Gong, Scarlet Stadtler, Michael Langguth"
-__date__ = "2020_11_10"
+__author__ = "Bing Gong"
+__date__ = "2020_12_29"
 
 
 # import modules
 import os
+import glob
 import pickle
 import numpy as np
 import json
@@ -18,35 +19,26 @@ from model_modules.video_prediction.datasets import ERA5Dataset
 
 
 class ERA5Pkl2Tfrecords(ERA5Dataset):
-    def __init__(self, input_dir=None, output_dir=None, datasplit_config=None, hparams_dict_config=None, \
-                 sequences_per_file=128, norm="minmax"):
+    def __init__(self, input_dir=None, sequence_length=20, sequences_per_file=128, norm="minmax"):
         """
         This class is used for converting pkl files to tfrecords
         args:
             input_dir            : str, the path to the PreprocessData directory which is parent directory of "Pickle"
                                    and "tfrecords" files directiory.
-            outpout_dir          : str, the one upper  level of the path to save the tfrecords files 
-            datasplit_config     : the path pointing to the datasplit_config json file
-            hparams_dict_config  : the path to the dict that contains hparameters,
+            sequence_length      : int, default 20, the sequence length per sample
             sequences_per_file   : int, how many sequences/samples per tfrecord to be saved
             norm                 : str, normalization methods from Norm_data class ("minmax" or "znorm";
                                    default: "minmax")
         """
         self.input_dir = input_dir
-        # ML: Do not append paths inside the script (invisible even for advanced users)
-        #self.output_dir = os.path.join(output_dir, "tfrecords")
-        self.output_dir = output_dir
+        self.input_dir_pkl = os.path.join(input_dir, "pickle")
+        self.output_dir = os.path.join(input_dir, "tfrecords_seq_len_" + str(sequence_length))
         # if the output_dir is not exist, then create it
         os.makedirs(self.output_dir, exist_ok=True)
         # get metadata,includes the var_in, image height, width etc.
         self.get_metadata()
-        # Get the data split informaiton
+        # set the sequence length
-        self.datasplit_dict_path = datasplit_config
-        self.data_dict = self.get_datasplit()
-        self.hparams_dict_config = hparams_dict_config      
-        self.hparams_dict = self.get_model_hparams_dict()
-        self.hparams = self.parse_hparams()
-        self.sequence_length = self.hparams.sequence_length
+        self.sequence_length = sequence_length
         if norm == "minmax" or norm == "znorm":
             self.norm = norm
         else:
@@ -60,15 +52,20 @@ class ERA5Pkl2Tfrecords(ERA5Dataset):
-        Return : 
-                two elements: each contains 1-dim array with the months set from data_split_config json file
+        Return :
+                three elements: the list of years, the unique months over all years, and the months available per year
         """
-        self.mode_list = []
-        self.years = []
         self.months = []
-        for mode, value in self.d.items():
-            self.mode_list.append(mode)
-            for year, month in value.items():
-                self.years.append(year)
-                self.months.extend(month)
-        return set(self.years), set(self.months)
+        self.years_months = []
+        # the years correspond to the sub-directory names of the pickle folder
+        self.years = [name for name in os.listdir(self.input_dir_pkl) if os.path.isdir(os.path.join(self.input_dir_pkl, name))]
+        # the months are parsed from the pickle files matching 'X_<month>.pkl'
+        patt = "X_*.pkl"
+        for year in self.years:
+            print("search path:", os.path.join(self.input_dir_pkl, year, patt))
+            months_pkl_list = glob.glob(os.path.join(self.input_dir_pkl, year, patt))
+            print("months_pkl_list:", months_pkl_list)
+            months_list = [int(m[-6:-4]) for m in months_pkl_list]  # 'X_MM.pkl' -> MM
+            self.months.extend(months_list)
+            self.years_months.append(months_list)
+        return self.years, list(set(self.months)), self.years_months
 
     def get_stats_file(self):
         """
diff --git a/video_prediction_tools/data_split/bair_action_free/datasplit.json b/video_prediction_tools/data_split/bair_action_free/datasplit.json
new file mode 100644
index 0000000000000000000000000000000000000000..217b285d8e105debbe7841735eb50786762ace19
--- /dev/null
+++ b/video_prediction_tools/data_split/bair_action_free/datasplit.json
@@ -0,0 +1,12 @@
+{
+    "train": {
+        "index1": [0, 100],
+        "index2": [150, 200]
+    },
+    "val": {
+        "index1": [110, 149]
+    },
+    "test": {
+        "index1": [150, 200]
+    }
+}
diff --git a/video_prediction_tools/data_split/bair_action_free/datasplit_template.json b/video_prediction_tools/data_split/bair_action_free/datasplit_template.json
new file mode 100644
index 0000000000000000000000000000000000000000..11407a0439e7bd3d1397d6dfce9cce660786a866
--- /dev/null
+++ b/video_prediction_tools/data_split/bair_action_free/datasplit_template.json
@@ -0,0 +1,19 @@
+# NOTE: This json-file should not be processed and simply serves as an exemplary file to configure the datasplit for the BAIR action-free dataset.
+#       If you would like to generate your own datasplit config file, you may copy this template and modify it to your personal needs.
+#       However, remember to remove any comment lines (starting with #) from your config-file then!
+#
+# Explanation: In the following, the data is split by index; each index maps to a list of two elements, namely the start and end indices into the
+#              raw dataset.
+#              Be aware that this is a pure data file, i.e. do not make use of any Python functions such as np.arange or similar here!
+{
+    "train": {
+        "index1": [0, 100],
+        "index2": [150, 200]
+    },
+    "val": {
+        "index1": [110, 149]
+    },
+    "test": {
+        "index1": [150, 200]
+    }
+}
diff --git a/video_prediction_tools/data_split/era5/datasplit.json b/video_prediction_tools/data_split/era5/datasplit.json
new file mode 100644
index 0000000000000000000000000000000000000000..5dafd53b7143c064beabf67b02c723805c4b52ef
--- /dev/null
+++ b/video_prediction_tools/data_split/era5/datasplit.json
@@ -0,0 +1,11 @@
+{
+    "train": {
+        "2017": [1]
+    },
+    "val": {
+        "2017": [2]
+    },
+    "test": {
+        "2017": [3]
+    }
+}
diff --git a/video_prediction_tools/data_split/datasplit_template.json b/video_prediction_tools/data_split/era5/datasplit_template.json
similarity index 100%
rename from video_prediction_tools/data_split/datasplit_template.json
rename to video_prediction_tools/data_split/era5/datasplit_template.json
diff --git a/video_prediction_tools/data_split/kth/datasplit.json b/video_prediction_tools/data_split/kth/datasplit.json
new file mode 100644
index 0000000000000000000000000000000000000000..217b285d8e105debbe7841735eb50786762ace19
--- /dev/null
+++ b/video_prediction_tools/data_split/kth/datasplit.json
@@ -0,0 +1,12 @@
+{
+    "train": {
+        "index1": [0, 100],
+        "index2": [150, 200]
+    },
+    "val": {
+        "index1": [110, 149]
+    },
+    "test": {
+        "index1": [150, 200]
+    }
+}
diff --git a/video_prediction_tools/data_split/kth/datasplit_template.json b/video_prediction_tools/data_split/kth/datasplit_template.json
new file mode 100644
index 0000000000000000000000000000000000000000..11407a0439e7bd3d1397d6dfce9cce660786a866
--- /dev/null
+++ b/video_prediction_tools/data_split/kth/datasplit_template.json
@@ -0,0 +1,19 @@
+# NOTE: This json-file should not be processed and simply serves as an exemplary file to configure the datasplit for the KTH human action dataset.
+#       If you would like to generate your own datasplit config file, you may copy this template and modify it to your personal needs.
+#       However, remember to remove any comment lines (starting with #) from your config-file then!
+#
+# Explanation: In the following, the data is split by index; each index maps to a list of two elements, namely the start and end indices into the
+#              raw dataset.
+#              Be aware that this is a pure data file, i.e. do not make use of any Python functions such as np.arange or similar here!
+{
+    "train": {
+        "index1": [0, 100],
+        "index2": [150, 200]
+    },
+    "val": {
+        "index1": [110, 149]
+    },
+    "test": {
+        "index1": [150, 200]
+    }
+}
diff --git a/video_prediction_tools/data_split/moving_mnist/datasplit.json b/video_prediction_tools/data_split/moving_mnist/datasplit.json
new file mode 100644
index 0000000000000000000000000000000000000000..217b285d8e105debbe7841735eb50786762ace19
--- /dev/null
+++ b/video_prediction_tools/data_split/moving_mnist/datasplit.json
@@ -0,0 +1,12 @@
+{
+    "train": {
+        "index1": [0, 100],
+        "index2": [150, 200]
+    },
+    "val": {
+        "index1": [110, 149]
+    },
+    "test": {
+        "index1": [150, 200]
+    }
+}
diff --git a/video_prediction_tools/data_split/moving_mnist/datasplit_template.json b/video_prediction_tools/data_split/moving_mnist/datasplit_template.json
new file mode 100644
index 0000000000000000000000000000000000000000..11407a0439e7bd3d1397d6dfce9cce660786a866
--- /dev/null
+++ b/video_prediction_tools/data_split/moving_mnist/datasplit_template.json
@@ -0,0 +1,19 @@
+# NOTE: This json-file should not be processed and simply serves as an exemplary file to configure the datasplit for the moving MNIST dataset.
+#       If you would like to generate your own datasplit config file, you may copy this template and modify it to your personal needs.
+#       However, remember to remove any comment lines (starting with #) from your config-file then!
+#
+# Explanation: In the following, the data is split by index; each index maps to a list of two elements, namely the start and end indices into the
+#              raw dataset.
+#              Be aware that this is a pure data file, i.e. do not make use of any Python functions such as np.arange or similar here!
+{
+    "train": {
+        "index1": [0, 100],
+        "index2": [150, 200]
+    },
+    "val": {
+        "index1": [110, 149]
+    },
+    "test": {
+        "index1": [150, 200]
+    }
+}
diff --git a/video_prediction_tools/main_scripts/main_preprocess_data_step2.py b/video_prediction_tools/main_scripts/main_preprocess_data_step2.py
index 3695b90b30465b6698223bfcf9415149533be6d2..d5a57b8d173c65e5b8c7bda459053bc21eaf07ab 100644
--- a/video_prediction_tools/main_scripts/main_preprocess_data_step2.py
+++ b/video_prediction_tools/main_scripts/main_preprocess_data_step2.py
@@ -12,26 +12,20 @@ from mpi4py import MPI
 from general_utils import get_unique_vars
 from statistics import Calc_data_stat
 from data_preprocess.preprocess_data_step2 import *
+import warnings
 
 
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("-input_dir", type=str)
-    parser.add_argument("-output_dir", type=str)
-    parser.add_argument("-datasplit_config", type=str, \
-                        help="The path to the datasplit_config json file which contains the details of train/val/testing")
-    parser.add_argument("-hparams_dict_config", type=str,\
-                        help="The path to the dict that contains hparameters.", default="")
+    parser.add_argument("-sequence_length", type=int, default=20)
     parser.add_argument("-sequences_per_file", type=int, default=20)
     args = parser.parse_args()
-    ins = ERA5Pkl2Tfrecords(input_dir=args.input_dir, output_dir=args.output_dir,
-                            datasplit_config=args.datasplit_config,
-                            hparams_dict_config=args.hparams_dict_config,
-                            sequences_per_file=args.sequences_per_file)
+    ins = ERA5Pkl2Tfrecords(input_dir=args.input_dir,
+                            sequence_length=args.sequence_length,
+                            sequences_per_file=args.sequences_per_file)
     
-    partition = ins.data_dict
-    partition_data = partition.values()
-    years, months = ins.get_years_months()
+    years, months, years_months = ins.get_years_months()
     input_dir_pkl = os.path.join(args.input_dir, "pickle")
     # ini. MPI
     comm = MPI.COMM_WORLD
@@ -49,31 +43,28 @@ def main():
     
         # loop over whole data set (training, dev and test set) to collect the intermediate statistics
         print("Start collecting statistics from the whole dataset to be processed...")
-        for split in partition.keys():
-            values = partition[split]
-            for year in values.keys():
-                file_dir = os.path.join(stat_dir_prefix, year)
-                for month in values[year]:
-                    if os.path.isfile(os.path.join(file_dir, "stat_" + '{0:02}'.format(month) + ".json")):
-                        # process stat-file:
-                        stat_obj.acc_stat_master(file_dir, int(month))  # process monthly statistic-file
-                    else:
-                        raise ("The stat file does not exist:", os.path.join(file_dir, "stat_" + '{0:02}'.\
-                                                                             format(month) + ".json"))
+
+        for i, year in enumerate(years):
+            file_dir = os.path.join(stat_dir_prefix, year)
+            for month in years_months[i]:
+                if os.path.isfile(os.path.join(file_dir, "stat_" + '{0:02}'.format(month) + ".json")):
+                    # process the monthly statistic-file
+                    stat_obj.acc_stat_master(file_dir, int(month))
+                else:
+                    warnings.warn("The stat file for year {} month {} does not exist".format(year, month))
         # finalize statistics and write to json-file
         stat_obj.finalize_stat_master(vars_uni)
         stat_obj.write_stat_json(input_dir_pkl)
 
         # organize parallelized partioning 
-        
         real_years_months = []
-        for year_months in partition_data:
-            print("I am here year:", year_months)
-            for year in year_months.keys():
-                for month in year_months[year]:
-                    print("I am here month", month)
-                    year_month = "Y_{}_M_{}".format(year, month)
-                    real_years_months.append(year_month)
+        for i in range(len(years)):
+            year = years[i]
+            print("Collecting conversion tasks for year:", year)
+            for month in years_months[i]:
+                print("Collecting conversion tasks for month:", month)
+                year_month = "Y_{}_M_{}".format(year, month)
+                real_years_months.append(year_month)
  
         broadcast_lists = [list(years), real_years_months]
 
@@ -95,14 +86,12 @@ def main():
         real_years_months = message_in[1] 
    
         for year in years:
-            # loop over the years in the datasplit_dict which we want to process,
-            # while months that are not in the datasplit_dict are skipped
             year_rank = "Y_{}_M_{}".format(year, my_rank)
             if year_rank in real_years_months:
-                # Initilial instance
+                # initialize an instance
-                ins2 = ERA5Pkl2Tfrecords(input_dir=args.input_dir, output_dir=args.output_dir,
-                            datasplit_config=args.datasplit_config,
-                            hparams_dict_config=args.hparams_dict_config, sequences_per_file=args.sequences_per_file)
+                ins2 = ERA5Pkl2Tfrecords(input_dir=args.input_dir,
+                                         sequence_length=args.sequence_length,
+                                         sequences_per_file=args.sequences_per_file)
                 # create the tfrecords-files
                 ins2.read_pkl_and_save_tfrecords(year=year, month=my_rank)
                 print("Year {} finished", year)