diff --git a/.gitignore b/.gitignore index 5d7d8d4f0ec66e7d19e91b726d39e1d75141e308..a9ce13f09a5ead9fa3eebcf935537256b2ecfdf8 100644 --- a/.gitignore +++ b/.gitignore @@ -120,3 +120,4 @@ virtual_env*/ **/hickle *.tfrecords **/era5_size_64_64_3_3t_norm + diff --git a/workflow_parallel_frame_prediction/DataExtraction/external_function.py b/workflow_parallel_frame_prediction/DataExtraction/external_function.py new file mode 100644 index 0000000000000000000000000000000000000000..cf59513b9435cf6cb799d648451d3e94d0855a29 --- /dev/null +++ b/workflow_parallel_frame_prediction/DataExtraction/external_function.py @@ -0,0 +1,159 @@ +#from mpi4py import MPI +from os import walk +import os +import sys +import subprocess +import logging +import time +import hashlib + +# ======================= List of functions ====================================== # + +# check the rank and print it + +def logger(file_name, logger_level, program_name): + # Log file starter + + logging.basicConfig(filename=file_name, level=logger_level, + format='%(asctime)s:%(levelname)s:%(message)s') + logging.debug(' === PyStager is started === ') + print(str(program_name) + ' is Running .... ') + + +def config_file(config_file_name): + params = {} + for line in open(config_file_name): + line = line.strip() + read_in_value = line.split("=") + if len(read_in_value) == 2: + params[read_in_value[0].strip()] = read_in_value[1].strip() + + source_dir = str(params["Source_Directory"]) + print(source_dir) + destination_dir = str(params["Destination_Directory"]) + log_dir = str(params["Log_Directory"]) + rsync_status = int(params["Rsync_Status"]) + return source_dir, destination_dir, log_dir, rsync_status + + +def directory_scanner(source_path): + # Take a look inside a directories and make a list of ll the folders, sub directories, number of the files and size + # NOTE : It will neglect if there is a sub-directories inside directories!!! + + dir_detail_list = [] # directories details + sub_dir_list = [] + total_size_source = 0 + total_num_files = 0 + list_directories = [] + + list_directories = os.listdir(source_path) + print(list_directories) + print(int(len(list_directories))) + + for d in list_directories: + print(d) + path = source_path + d + print(path) + if os.path.isdir(path): + sub_dir_list.append(d) + sub_dir_list.sort() + num_files = 0 + # size of the files and subdirectories + size_dir = subprocess.check_output(['du', '-sc', path]) + splitted = size_dir.split() # fist item is the size of the folder + size = (splitted[0]) + num_files = len([f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]) + dir_detail_list.extend([d, size, num_files]) + total_num_files = total_num_files + int(num_files) + total_size_source = total_size_source + int(size) + else: + print(path, 'does not exist') + print("===== Debug here =====") + + total_num_directories = int(len(list_directories)) + total_size_source = float(total_size_source / 1000000) + + message = 'Total size of the source directory is:' + str(total_size_source) + 'Gb.' 
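+    # Assuming the default `du` block size of 1 KiB, the division by 1e6 above turns the
+    # accumulated kilobyte total into an approximate size in GB for the summary printed below.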
+ print(message) + message = "Total number of the files in the source directory is: " + str(total_num_files) + print(message) + message = "Total number of the directories in the source directory is: " + str(total_num_directories) + print(message) + + return dir_detail_list, sub_dir_list, total_size_source, total_num_files, total_num_directories + + +def load_distributor(dir_detail_list, sub_dir_list, total_size_source, total_num_files, total_num_directories, p): + # create a dictionary with p number of keys + # for each directory they add the name to one of the keys + print ("range 1 to p is",list(range(1,p))) + transfer_dict = dict.fromkeys(list(range(1, p))) + print("transfer_dict:",transfer_dict) + # package_counter = 0 possibility to use the counter to fill + counter = 1 + for Directory_counter in range(0, total_num_directories): + + if transfer_dict[counter] is None: # if the value for the key is None add to it + transfer_dict[counter] = sub_dir_list[Directory_counter] + else: # if key has a value join the new value to the old value + transfer_dict[counter] = "{};{}".format(transfer_dict[counter], sub_dir_list[Directory_counter]) + counter = counter + 1 + if counter == p: + counter = 1 + + return transfer_dict + +def sync_file(source_path, destination_dir, job_name, rsync_status): + rsync_msg = ("rsync -r " + source_path + job_name + "/" + " " + destination_dir + "/" + job_name) + # print('Node:', str(my_rank),'will execute :', rsync_str,'\r\n') + # sync the assigned folder + + if rsync_status == 1: + os.system(rsync_msg) + + + +def hash_directory(source_path,job_name,hash_rep_file,input_status): + #sha256_hash = hashlib.sha256() + md5_hash = hashlib.md5() + + ########## Create a hashed file repasitory for direcotry(ies) assigned to node ####### + hash_repo_text = input_status + "_"+job_name +"_hashed.txt" + os.chdir(hash_rep_file) + hashed_text_note=open(hash_repo_text,"w+") + + # job_name is the name of the subdirectory that is going to be processed + directory_to_process = source_path + job_name + # print(directory_to_process) + files_list = [] + for dirpath, dirnames, filenames in os.walk(directory_to_process): + files_list.extend(filenames) + + os.chdir(directory_to_process) # change to the working directory + + for file_to_process in filenames: + + ## ======= this is the sha256 checksum ========= # + #with open(file_to_process,"rb") as f: + # # Read and update hash in chunks of 4K + # for byte_block in iter(lambda: f.read(4096),b""): + # sha256_hash.update(byte_block) + # hashed_file = sha256_hash.hexdigest() + + with open(file_to_process,"rb") as f: + # Read and update hash in chunks of 4K + for byte_block in iter(lambda: f.read(4096),b""): + md5_hash.update(byte_block) + hashed_file = md5_hash.hexdigest() + + hashed_text_note.write(hashed_file) + + return + +def md5(fname): + md5_hash = hashlib.md5() + with open(fname,"rb") as f: + # Read and update hash in chunks of 4K + for byte_block in iter(lambda: f.read(4096),b""): + md5_hash.update(byte_block) + return md5_hash.hexdigest() diff --git a/workflow_parallel_frame_prediction/DataExtraction/helper_single_master.py b/workflow_parallel_frame_prediction/DataExtraction/helper_single_master.py new file mode 100644 index 0000000000000000000000000000000000000000..a26d76395cca0ecb6d49ba53684e72f5d5d7f5b0 --- /dev/null +++ b/workflow_parallel_frame_prediction/DataExtraction/helper_single_master.py @@ -0,0 +1,245 @@ +from mpi4py import MPI +from os import walk +import os +import sys +import subprocess +import logging +import time 
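+# Helper routines shared with main_single_master.py: directory scanning, load distribution
+# across MPI ranks, destination-tree creation, rsync wrapping and MD5-based integrity hashing.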
+import hashlib +import argparse +from os import listdir +from os.path import isfile, join + +# ini. MPI +comm = MPI.COMM_WORLD +my_rank = comm.Get_rank() # rank of the node +p = comm.Get_size() # number of assigned nods +my_rank = comm.Get_rank() # rank of the node + + +# ======================= List of functions ====================================== # +if my_rank == 0: # node is master + + logger = logging.getLogger(__file__) + logger.addHandler(logging.StreamHandler(sys.stdout)) + + +def directory_scanner(source_path,load_level): + # Take a look inside a directories and make a list of ll the folders, sub directories, number of the files and size + # NOTE : It will neglect if there is a sub-directories inside directories!!! + # NOTE : It will discriminate between the load level : sub-directories / Files + + dir_detail_list = [] # directories details + list_items_to_process = [] + total_size_source = 0 + total_num_files = 0 + list_directories = [] + + ## =================== Here will be for the Files ================= ## + + if load_level == 1: + + # Listing all the files in the directory + for dirpath, dirnames, filenames in os.walk(source_path): + list_items_to_process.extend(filenames) + + for f in list_items_to_process : + path = source_path +"/"+ str(f) + statinfo = os.stat(path) + size = statinfo.st_size + total_size_source = total_size_source + int(size) + + total_num_files = len(list_items_to_process) # number of the files in the source + total_num_directories = int(0) # TODO need to unify the concept as the number of items + + ## ===================== Here will be for the directories ========== ## + + if load_level == 0: + list_directories = os.listdir(source_path) + + for d in list_directories: + path = source_path + d + if os.path.isdir(path): + list_items_to_process.append(d) + list_items_to_process.sort() + num_files = 0 + # size of the files and subdirectories + size_dir = subprocess.check_output(['du', '-sc', path]) + splitted = size_dir.split() # fist item is the size of the folder + size = (splitted[0]) + num_files = len([f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]) + dir_detail_list.extend([d, size, num_files]) + total_num_files = total_num_files + int(num_files) + total_size_source = total_size_source + int(size) + + else: + message = path,'does not exist' + logging.error(message) + + + total_num_directories = int(len(list_directories)) + + ## ======================= End of the Directory case =================== ## + total_size_source = float(total_size_source / 1000000) # human readable size source + + logger.info("=== Directory Scanner output ===") + message = 'Total size of the source directory is:' + str(total_size_source) + 'Gb.' 
+ logger.info(message) + message = "Total number of the files in the source directory is: " + str(total_num_files) + logger.info(message) + message = "Total number of the directories in the source directory is: " + str(total_num_directories) + logger.info(message) + + # Unifying the naming of this section for both cases : Sub - Directory or File + # dir_detail_list == > Including the name of the directories, size and number of teh files in each directory / for files is empty + # list_items_to_process === > List of items to process (Sub-Directories / Files) + # total_size_source === > Total size of the items to process + # total_num_files === > for Sub - Directories : sum of all files in different directories / for Files is sum of all + # total_num_directories === > for Files = 0 + + return dir_detail_list, list_items_to_process, total_size_source, total_num_files, total_num_directories + +# Source - Directoy +# Destination Rirectory +# Dir_detail_list +# list_items_to_process +# load level + +def data_structure_builder (source_dir, destination_dir, dir_detail_list, list_items_to_process,load_level): + + + if not os.path.exists(destination_dir): # check if the Destination dir. is existing + os_command = ("mkdir " + destination_dir) + os.system(os_command) + logger.info('destination path is created') + else: + logger.info('The destination path exists') + + + os.chdir(destination_dir) # chnage the directory to the destination + + if load_level == 0: + logging.info('Load Level = 0 : Data Sctructure will be build') + + for dir_name in list_items_to_process: + #print(dir_name) + dir_path = destination_dir + dir_name + + # TODO : os.mkdir() it can be cleaned up to use the OS predifnie functions + if not os.path.exists(dir_path): + #print(dir_name + " will be created ") + os_command = ("mkdir " + dir_name) + os.system(os_command) + logging.info(dir_name + " is created ") + + + if load_level == 1: + logger.info('Load Level = 1 : File will be processed') + + return + + + +def load_distributor(dir_detail_list, list_items_to_process, total_size_source, total_num_files, total_num_directories,load_level, processor_num): + firs_slave_processor_id = 1 + # create a dictionary with p number of keys + # for each directory they add the name to one of the keys + # here we define the first availabe slave node as well + transfer_dict = dict.fromkeys(list(range(firs_slave_processor_id, processor_num))) + print(transfer_dict) + logger.info("The follwoing is in the load Balancer ") + logger.info(transfer_dict) + logger.info(list_items_to_process) + logger.info(total_num_directories) + logger.info(total_num_files) + + # package_counter = 0 possibility to use the counter to fill + counter = firs_slave_processor_id # this is the ID of the first available slave to p! 
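+    # Illustrative sketch of the round-robin fill below for the directory case (assumed
+    # example values, not from an actual run): with processor_num = 4 (slave ranks 1..3)
+    # and items ['a', 'b', 'c', 'd', 'e'], the loop yields
+    #     transfer_dict = {1: 'a;d', 2: 'b;e', 3: 'c'}
+    # Each slave later splits its entry on ';' to recover its job list.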
+ + if load_level == 0: + for Directory_counter in range(0, total_num_directories): + if transfer_dict[counter] is None: # if the value for the key is None add to it + transfer_dict[counter] = list_items_to_process[Directory_counter] + else: # if key has a value join the new value to the old value + transfer_dict[counter] = "{};{}".format(transfer_dict[counter], list_items_to_process[Directory_counter]) + counter = counter + 1 + if counter == processor_num: + counter = firs_slave_processor_id + + if load_level == 1: + for File_counter in range(0, total_num_files): + if transfer_dict[counter] is None: # if the value for the key is None add to it + #print(" M1: New key made for a free processor number {my_rank}".format(my_rank = counter)) + # statemnet if we have more than number of the files processor available + if counter > len(list_items_to_process) + (firs_slave_processor_id - 1 ): + transfer_dict[counter] = None + else: + transfer_dict[counter] = list_items_to_process[File_counter] + + + + else: # if key has a value join the new value to the old value + transfer_dict[counter] = "{};{}".format(transfer_dict[counter], list_items_to_process[File_counter]) + counter = counter + 1 + if counter == processor_num: + counter = firs_slave_processor_id + + logging.info(transfer_dict) + return transfer_dict + +def sync_file(source_path, destination_dir, job_name, rsync_status): + rsync_msg = ("rsync -r " + source_path + job_name + "/" + " " + destination_dir + "/" + job_name) + # print('Node:', str(my_rank),'will execute :', rsync_str,'\r\n') + # sync the assigned folder + + if rsync_status == 1: + os.system(rsync_msg) + + return + + + +def hash_directory(source_path,job_name,hash_rep_file,input_status): + #sha256_hash = hashlib.sha256() + md5_hash = hashlib.md5() + + ########## Create a hashed file repasitory for direcotry(ies) assigned to node ####### + hash_repo_text = input_status + "_"+job_name +"_hashed.txt" + os.chdir(hash_rep_file) + hashed_text_note=open(hash_repo_text,"w+") + + # job_name is the name of the subdirectory that is going to be processed + directory_to_process = source_path + job_name + # print(directory_to_process) + files_list = [] + for dirpath, dirnames, filenames in os.walk(directory_to_process): + files_list.extend(filenames) + + os.chdir(directory_to_process) # change to the working directory + + for file_to_process in filenames: + + ## ======= this is the sha256 checksum ========= # + #with open(file_to_process,"rb") as f: + # # Read and update hash in chunks of 4K + # for byte_block in iter(lambda: f.read(4096),b""): + # sha256_hash.update(byte_block) + # hashed_file = sha256_hash.hexdigest() + + with open(file_to_process,"rb") as f: + # Read and update hash in chunks of 4K + for byte_block in iter(lambda: f.read(4096),b""): + md5_hash.update(byte_block) + hashed_file = md5_hash.hexdigest() + + hashed_text_note.write(hashed_file) + + return + +def md5(fname): + md5_hash = hashlib.md5() + with open(fname,"rb") as f: + # Read and update hash in chunks of 4K + for byte_block in iter(lambda: f.read(4096),b""): + md5_hash.update(byte_block) + return md5_hash.hexdigest() diff --git a/workflow_parallel_frame_prediction/DataExtraction/main_single_master.py b/workflow_parallel_frame_prediction/DataExtraction/main_single_master.py new file mode 100644 index 0000000000000000000000000000000000000000..4894408adeb1a41b106faafb75def912ca5e4ad5 --- /dev/null +++ b/workflow_parallel_frame_prediction/DataExtraction/main_single_master.py @@ -0,0 +1,243 @@ +from mpi4py import MPI +from 
os import walk +import sys +import subprocess +import logging +import time +import shutil +import glob +import argparse +import os + + +from helper_single_master import directory_scanner +from helper_single_master import load_distributor +from helper_single_master import hash_directory +from helper_single_master import data_structure_builder +from helper_single_master import md5 + +from prepare_era5_data import prepare_era5_data_one_file + +# How to Run it! +# mpirun -np 6 python mpi_stager_v2.py +# mpiexec -np 6 python mpi_stager_v2.py + + +def main(): + parser=argparse.ArgumentParser() + parser.add_argument("--job_id",type=int,default=100) + parser.add_argument("--source_dir",type=str,default="//home/a.mozaffari/data_era5/2017/") + parser.add_argument("--destination_dir",type=str,default="/home/a.mozaffari/data_dest/") + parser.add_argument("--log_temp",type=str,default="log_temp") + parser.add_argument("--checksum_status",type=int,default = 0) + parser.add_argument("--rsync_status",type=int,default=0) + parser.add_argument("--load_level",type=int,default=0) + parser.add_argument("--clear_destination",type=int,default=1) + args = parser.parse_args() + # for the local machine test + current_path = os.getcwd() + job_id = args.job_id + source_dir = args.source_dir + destination_dir = args.destination_dir + checksum_status = args.checksum_status + rsync_status = args.rsync_status + clear_destination = args.clear_destination + log_temp = args.log_temp + + + # for the local machine test + current_path = os.path.dirname(os.path.abspath(__file__)) + os.chdir(current_path) + time.sleep(0) + +# ini. MPI + comm = MPI.COMM_WORLD + my_rank = comm.Get_rank() # rank of the node + p = comm.Get_size() # number of assigned nods + firs_slave_processor_id = 1 + + + # ==================================== Master Logging ==================================================== # + # DEBUG: Detailed information, typically of interest only when diagnosing problems. + # INFO: Confirmation that things are working as expected. + # WARNING: An indication that something unexpected happened, or indicative of some problem in the near + # ERROR: Due to a more serious problem, the software has not been able to perform some function. + # CRITICAL: A serious error, indicating that the program itself may be unable to continue running. + # It will copy the logging messages to the stdout, for the case of container version on HPC + + if my_rank == 0: # node is master + + # delete the general logger if exist + logger_path = current_path + '/distribution_job_{job_id}.log'.format(job_id=job_id) + if os.path.isfile(logger_path): + print("Logger Exists -> Logger Deleted") + os.remove(logger_path) + logging.basicConfig(filename='distribution_job_{job_id}.log'.format(job_id=job_id), level=logging.DEBUG, + format='%(asctime)s:%(levelname)s:%(message)s') + logger = logging.getLogger(__file__) + logger.addHandler(logging.StreamHandler(sys.stdout)) + start = time.time() # start of the MPI + +# check the existence of the source path : + if not os.path.exists(source_dir): # check if the source dir. 
is existing + if my_rank == 0: + logger.critical('The source does not exist') + message_out = "Source : {source} is not existing -> Abort".format(source=source_dir) + logger.info('exit status : 1') + sys.exit(1) + +# Check if the destination is existing, if so, it will delete and recreate the destination_dir + if os.path.exists(destination_dir): + if my_rank == 0: + logger.info('The destination exist') + if clear_destination == 1: + shutil.rmtree(destination_dir) + os.mkdir(destination_dir) + logger.critical("Destination : {destination} exist -> Remove and Re-Cereate".format(destination=destination_dir)) + print("Destination : {destination} exist -> Remove and Re-Cereate".format(destination=destination_dir)) + + else: + logger.critical("Destination : {destination} exist -> will not be removed (caution : overwrite)".format(destination=destination_dir)) + print("Destination : {destination} exist -> will not be rmeoved (caution : overwrite)".format(destination=destination_dir)) + + # Create a log folder for slave-nodes to write down their processes + slave_log_path = os.path.join(destination_dir,log_temp) + + if my_rank == 0: + if os.path.exists(slave_log_path) == False: + os.mkdir(slave_log_path) + + if my_rank == 0: # node is master + + # ==================================== Master : Directory scanner {Parent level load level = 0} ================================= # + + logger.info("The source path is : {path}".format(path=source_dir)) + logger.info("The destination path is : {path}".format(path=destination_dir)) + logger.info("==== Directory scanner : start ====") + load_level = 0 + ret_dir_scanner = directory_scanner(source_dir,load_level) + #print(ret_dir_scanner) + + # Unifying the naming of this section for both cases : Sub - Directory or File + # dir_detail_list == > Including the name of the directories, size and number of teh files in each directory / for files is empty + # list_items_to_process === > List of items to process (Sub-Directories / Files) + # total_size_source === > Total size of the items to process + # total_num_files === > for Sub - Directories : sum of all files in different directories / for Files is sum of all + # total_num_directories === > for Files = 0 + + dir_detail_list = ret_dir_scanner[0] + list_items_to_process = ret_dir_scanner[1] + total_size_source = ret_dir_scanner[2] + total_num_files = ret_dir_scanner[3] + total_num_dir = ret_dir_scanner[4] + logger.info("==== Directory scanner : end ====") + + # ================================= Master : Data Structure Builder {Parent level load level = 0} ========================= # + + logger.info("==== Data Structure Builder : start ====") + data_structure_builder(source_dir, destination_dir, dir_detail_list, list_items_to_process,load_level) + logger.info("==== Data Structure Builder : end ====") + # message to inform the slaves that they will recive #Batch of messages including the logger_p + batch_info = list_items_to_process + for slaves in range (1,p): + comm.send(batch_info, dest=slaves) + + for batch_counter in range (0,len(batch_info)): + #relative_source = source_dir + str(batch_info[batch_counter]) +"/" + relative_source = os.path.join(source_dir,str(batch_info[batch_counter])) + print(relative_source) + logger.info("MA{my_rank}: Next to be processed is {task} loacted in {path} ".format(my_rank = my_rank,task=batch_info[batch_counter], path=relative_source)) + load_level = 1 # it will process the files in the relative source + + #________ Directory Scanner ______# + relative_ret_dir_scanner = 
directory_scanner(relative_source,load_level) + relative_dir_detail_list = relative_ret_dir_scanner[0] + relative_list_items_to_process = relative_ret_dir_scanner[1] + relative_total_size_source = relative_ret_dir_scanner[2] + relative_total_num_files = relative_ret_dir_scanner[3] + relative_total_num_dir = relative_ret_dir_scanner[4] + #________ Load Distribution ________# + relative_ret_load_balancer = load_distributor(relative_dir_detail_list, relative_list_items_to_process, relative_total_size_source, relative_total_num_files, relative_total_num_dir,load_level, p) + relative_transfer_dict = relative_ret_load_balancer + logger.info(relative_transfer_dict) + + #________ Communication ________# + + for processor in range(firs_slave_processor_id, p): + broadcast_list = relative_transfer_dict[processor] + comm.send(broadcast_list, dest=processor) + + receive_counter = 0 + total_number_messages = (p-1) * len(batch_info) - 1 + while receive_counter <= total_number_messages: + message_in = comm.recv() + logger.info("MA{my_rank}: S{message_in} ".format(my_rank=my_rank,message_in=message_in)) + receive_counter = receive_counter + 1 + + + # Cleaning up the slaves temprory log file, if it is empty. + if len(os.listdir(slave_log_path) ) == 0: + print("Temprory log file is empty, it is deleted") + os.removedirs(slave_log_path) + + + end = time.time() + termination_message = "MA{my_rank}: Sucssfully terminated with total time : {wall_time}".format(my_rank=my_rank,wall_time= end-start) + logger.info(termination_message) + sys.exit(0) + + else: # Processor is slave + + # ============================================= Slave : Send / Receive ============================================ # + # recive the #Batch process that will be recived + batch_info = comm.recv(source = 0) + #print("S{my_rank} will receive {todo_message} batch of task to process".format(my_rank=my_rank, todo_message=len(batch_info))) + batch_counter = 0 + + # here will be a loop around all the #batchs + + while batch_counter <= len(batch_info) -1: + message_in = comm.recv(source = 0) + relative_source_directory = os.path.join(source_dir,str(batch_info[batch_counter])) + relative_destination_directory = os.path.join(destination_dir,str(batch_info[batch_counter])) + + if message_in is None: # in case more than number of the dir. processor is assigned ! 
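+            # A slave receives None for this batch when more worker ranks were launched than
+            # there are files in the current sub-directory; it skips processing and simply
+            # reports back before moving on to the next batch message.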
+ slave_out_message = "{my_rank} is idle".format(my_rank=my_rank) + # comm.send(message_out, dest=1) + + else: # if the Slave node has joblist to do + job_list = message_in.split(';') + for job_count in range(0, len(job_list)): + job = job_list[job_count] # job is the name of the directory(ies) assigned to slave_node + #print(job) + if rsync_status == 1: + # prepare the rsync commoand to be excexuted by the worker node + rsync_message = "rsync {relative_source_directory}/{job} {relative_destination_directory}/{job}".format(relative_source_directory=relative_source_directory,job=job, relative_destination_directory=relative_destination_directory) + os.system(rsync_message) + #slave_out_message= " RSYNC process" + else : + ## @Bing here is the job for the slaves + print("S{my_rank} will execute era5 preperation on {job}".format(my_rank=my_rank, job=job)) + prepare_era5_data_one_file(src_file=job,directory_to_process=relative_source_directory, target=job, target_dir=relative_destination_directory) + + + + #if job.endswith(".nc"): + # if os.path.exists(os.path.join(relative_destination_directory, job)): + # print("{job} is has been processed in directory {directory}".format(job=job,directory=relative_destination_directory)) + #else: + # prepare_era5_data_one_file(src_file=job,directory_to_process=relative_source_directory, target=job, target_dir=relative_destination_directory) + # print("File {job} in directory {directory} has been processed in directory".format(job=job,directory=relative_destination_directory)) + # + #slave_out_message = " {in_message} process".format(in_message=my_rank) + # Generate a hash of the output + + message_out = "{my_rank}: is finished the {in_message} .".format(my_rank=my_rank,in_message=batch_info[batch_counter]) + comm.send(message_out, dest=0) + batch_counter = batch_counter + 1 + + MPI.Finalize() + + +if __name__ == "__main__": + main() diff --git a/workflow_parallel_frame_prediction/DataExtraction/mpi_stager_v2.py b/workflow_parallel_frame_prediction/DataExtraction/mpi_stager_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..632c29567590466264a39b60bdcd9ae0cd820b05 --- /dev/null +++ b/workflow_parallel_frame_prediction/DataExtraction/mpi_stager_v2.py @@ -0,0 +1,213 @@ +from mpi4py import MPI +from os import walk +import sys +import subprocess +import logging +import time +from external_function import directory_scanner +from external_function import load_distributor +from external_function import hash_directory +from external_function import md5 +from prepare_era5_data import * +# How to Run it! +# mpirun -np 6 python mpi_stager_v2.py +import os +from pathlib import Path +import argparse + +def main(): + parser=argparse.ArgumentParser() + parser.add_argument("--source_dir",type=str,default="//home/a.mozaffari/data_era5/2017/") + parser.add_argument("--destination_dir",type=str,default="/home/a.mozaffari/data_dest") + parser.add_argument("--checksum_status",type=int,default = 0) + parser.add_argument("--rsync_status",type=int,default=1) + args = parser.parse_args() + # for the local machine test + current_path = os.getcwd() + source_dir = args.source_dir + destination_dir = args.destination_dir + checksum_status = args.checksum_status + rsync_status = args.rsync_status + + + # for the local machine test + current_path = os.getcwd() + # TODO : it will be integerated in the seperated read_in_file + #rot_grid="/mnt/rasdaman/DeepRain/gridneu.dat" + os.chdir(current_path) + time.sleep(0) + # ini. 
MPI + comm = MPI.COMM_WORLD + my_rank = comm.Get_rank() # rank of the node + p = comm.Get_size() # number of assigned nods + + # ============ configuration for data preprocessing =================== # + + # ==================================== Master Logging ==================================================== # + # DEBUG: Detailed information, typically of interest only when diagnosing problems. + # INFO: Confirmation that things are working as expected. + # WARNING: An indication that something unexpected happened, or indicative of some problem in the near + # ERROR: Due to a more serious problem, the software has not been able to perform some function. + # CRITICAL: A serious error, indicating that the program itself may be unable to continue running. + + if my_rank == 0: # node is master + logging.basicConfig(filename='stager.log', level=logging.DEBUG, + format='%(asctime)s:%(levelname)s:%(message)s') + start = time.time() # start of the MPI + logging.debug(' === PyStager is started === ') + print('PyStager is Running .... ') + + # ================================== ALL Nodes: Read-in parameters ====================================== # + + # fileName = "parameters.dat" # input parameters file + # fileObj = open(fileName) + # params = {} + + # for line in fileObj: + # line = line.strip() + # read_in_value = line.split("=") + # if len(read_in_value) == 2: + # params[read_in_value[0].strip()] = read_in_value[1].strip() + + # # input from the user: + # source_dir = str(params["Source_Directory"]) + # destination_dir = str(params["Destination_Directory"]) + # log_dir = str(params["Log_Directory"]) + # rsync_status = int(params["Rsync_Status"]) + # checksum_status = int(params["Checksum_Status"]) + + + + # check the existence of teh folders : + + if not os.path.exists(source_dir): # check if the source dir. is existing + if my_rank == 0: + logging.critical('The source does not exist') + logging.info('exit status : 1') + print('Critical : The source does not exist') + + sys.exit(1) + + if not os.path.exists(destination_dir): # check if the Destination dir. 
is existing + if my_rank == 0: + logging.critical('The Destination does not exist') + logging.info('Create a Destination dir') + if not os.path.exists(destination_dir): os.makedirs(destination_dir) + print('Create a Destination dir') + + + + if my_rank == 0: # node is master + + # ==================================== Master : Directory scanner ================================= # + + print(" # ============== Directory scanner : start ==================# ") + + ret_dir_scanner = directory_scanner(source_dir) + print(ret_dir_scanner) + + dir_detail_list = ret_dir_scanner[0] + sub_dir_list = ret_dir_scanner[1] + total_size_source = ret_dir_scanner[2] + total_num_files = ret_dir_scanner[3] + total_num_dir = ret_dir_scanner[4] + + # =================================== Master : Load Distribution ========================== # + + print(" # ============== Load Distrbution : start ==================# ") + #def load_distributor(dir_detail_list, sub_dir_list, total_size_source, total_num_files, total_num_directories, p): + ret_load_balancer = load_distributor(dir_detail_list, sub_dir_list, total_size_source, total_num_files, total_num_dir, p) + transfer_dict = ret_load_balancer + + + print(ret_load_balancer) + + # ===================================== Master : Send / Receive =============================== # + print(" # ============== Communication : start ==================# ") + + # Send : the list of the directories to the nodes + for nodes in range(1, p): + broadcast_list = transfer_dict[nodes] + comm.send(broadcast_list, dest=nodes) + + # Receive : will wait for a certain time to see if it will receive any critical error from the slaves nodes + idle_counter = p - len(sub_dir_list) + while idle_counter > 1: # non-blocking receive function + message_in = comm.recv() + logging.warning(message_in) + #print('Warning:', message_in) + idle_counter = idle_counter - 1 + + # Receive : Message from slave nodes confirming the sync + message_counter = 1 + while message_counter <= len(sub_dir_list): # non-blocking receive function + message_in = comm.recv() + logging.info(message_in) + message_counter = message_counter + 1 + + # stamp the end of the runtime + end = time.time() + logging.debug(end - start) + logging.info('== PyStager is done ==') + logging.info('exit status : 0') + print('PyStager is finished ') + + + sys.exit(0) + + else: # node is slave + + # ============================================= Slave : Send / Receive ============================================ # + message_in = comm.recv() + + if message_in is None: # in case more than number of the dir. processor is assigned todo Tag it! + message_out = ('Node', str(my_rank), 'is idle') + comm.send(message_out, dest=0) + + else: # if the Slave node has joblist to do + job_list = message_in.split(';') + + for job_count in range(0, len(job_list)): + job = job_list[job_count] # job is the name of the directory(ies) assigned to slave_node + #print(job) + + #grib_2_netcdf(rot_grid,source_dir, destination_dir, job) + + # creat a checksum ( hash) from the source folder. 
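+                # Integrity check performed below when checksum_status == 1: every file of the
+                # assigned job directory is hashed into "source_<job>_hashed.txt", the synced
+                # destination is hashed into "destination_<job>_hashed.txt", and the MD5 sums of
+                # the two text files are compared to verify the transfer.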
+ if checksum_status == 1: + hash_directory(source_dir, job, current_path,"source") + + if rsync_status == 1: + # prepare the rsync commoand to be excexuted by the worker node + rsync_str = ("rsync -r " + source_dir + job + "/" + " " + destination_dir + "/" + job) + #os.system(rsync_str) + + process_era5_in_dir(job, src_dir=source_dir, target_dir=destination_dir) + + if checksum_status == 1: + hash_directory(destination_dir,job,current_path,"destination") + os.chdir(current_path) + source_hash_text = "source" + "_"+ job +"_hashed.txt" + destination_hash_text = "destination" + "_"+ job +"_hashed.txt" + if md5(source_hash_text) == md5(destination_hash_text): + msg_out = 'source: ' + job +' and destination: ' + job +' files are identical' + print(msg_out) + + else: + msg_out = 'integrity of source: ' + job +' and destination: ' + job +' files could not be verified' + print(msg_out) + + else: + rsync_str = "None" + + # Send : the finish of the sync message back to master node + + message_out = ('Node:', str(my_rank), 'finished :', rsync_str, '\r\n') + comm.send(message_out, dest=0) + + MPI.Finalize() + + + +if __name__ == "__main__": + main() diff --git a/workflow_parallel_frame_prediction/DataExtraction/prepare_era5_data.py b/workflow_parallel_frame_prediction/DataExtraction/prepare_era5_data.py new file mode 100644 index 0000000000000000000000000000000000000000..f97bdf8236edb4b4eaac80513d9fb439486705a6 --- /dev/null +++ b/workflow_parallel_frame_prediction/DataExtraction/prepare_era5_data.py @@ -0,0 +1,154 @@ + +import numpy as np +from datetime import datetime +from netCDF4 import Dataset, date2num +from shiftgrid import shiftgrid +import os + +AUTHOR = 'Severin Hussmann (s.hussmann@fz-juelich.de)' +# specify source and target directories + +def source_file_name(year, month, day, hour): + #src_file = '{:04d}/{:02d}/ecmwf_era5_{:02d}{:02d}{:02d}{:02d}.nc'.format(year, month, year % 100, month, day, hour) + src_file = 'ecmwf_era5_{:02d}{:02d}{:02d}{:02d}.nc'.format(year % 100, month, day, hour) + return src_file + + +def prepare_era5_data_one_file(src_file,directory_to_process,target_dir, target="test.nc"): + try: + out_file = target + print(src_file, ' --> ', os.path.join(target_dir,out_file)) + fh = Dataset(os.path.join(directory_to_process, src_file), mode = 'r') + + lons = fh.variables['lon'][:] + lats = fh.variables['lat'][:] + + # load 2 metre temperature + t2 = fh.variables['T2'][0, :, :] + t2_shift, lons_shift = shiftgrid(180., t2, lons, start = False) + + # load mean sea level pressure + msl = fh.variables['MSL'][0, :, :] + msl_shift, lons_shift = shiftgrid(180., msl, lons, start = False) + + # transform geopotential to geopotential at 500hpa + gph = fh.variables['GPH'][0, :, :, :] + a = fh.variables['a'][:] # convert netCDF to numpy arrays + b = fh.variables['b'][:] # otherwise cannot iterate over netcdf4 variable + ps = fh.variables['ps'][0, :, :] + fh.close() + + # obtain dimensions + z_len, y_len, x_len = gph.shape + + # z_len = 137 + # y_len = 601 + # x_len = 1200 + # Function to calculate the Pressure in hPa at point x/y at level k + # p(k,j,i) = a(k) + b(k)*ps(j,i) + def calcP(z, x, y, a=a, b=b, ps=ps): + p = (a[z] + b[z] * ps[x, y]) / 100 + return p + + # pressure3d + p3d = np.fromfunction(calcP, (z_len, y_len, x_len), a=a, b=b, ps=ps, dtype = int) + + # level2d + yindices, xindices = np.indices((y_len, x_len)) + # calculate lowest level index where pressure is below 500 hPa + # beware of Himalaya, where surface pressure may be below 500 hPa + # - that region should actually 
contain missing values; here we cheat a little + l2d = np.argmax((p3d - 500) < 0., axis = 0) + l2d[l2d == 0] = 1 + # next lower level should have pressure above 500 hPa + # pressure levels in Gebhard Guenther's netcdf files are from surface to top of atmosphere + l2dm1 = l2d - 1 + # calculate interpolation measure + levfrac = (p3d[l2dm1[:], yindices, xindices] - 500.) / ( + p3d[l2dm1[:], yindices, xindices] - p3d[l2d[:], yindices, xindices]) + levfrac[levfrac < 0.] = 0. # Himalaya correction + print("l2d: ", np.min(l2d), np.max(l2d)) + print("levfrac: ", np.min(levfrac), np.max(levfrac)) + # gp500below: geopotential height below 500 hPa level (i.e. pressure > 500 hPa) + gp500below = gph[l2dm1[:], yindices, xindices] + gp500above = gph[l2d[:], yindices, xindices] + gp500 = gp500below + levfrac * (gp500above - gp500below) + print("gp500below: ", np.min(gp500below), np.max(gp500below)) + print("gp500above: ", np.min(gp500above), np.max(gp500above)) + + # convert values in array from geopotential to geopotential height + divider = lambda t: t / 9.8 + vfunc = np.vectorize(divider) + gph500 = vfunc(gp500) + + gph500_shift, lons_shift = shiftgrid(180., gph500, lons, start = False) + + os.chdir(target_dir) + test = Dataset(out_file, 'w', format = 'NETCDF4', clobber = True) + + # test.createDimension("channel", 3) + latD = test.createDimension('lat', y_len) + lonD = test.createDimension('lon', x_len) + timeD = test.createDimension('time', None) + # for debugging + levD = test.createDimension('lev', z_len) + + # print(test.dimensions) + t2_new = test.createVariable('T2', float, ('time', 'lat', 'lon'), zlib = True) + t2_new.units = 'K' + msl_new = test.createVariable('MSL', float, ('time', 'lat', 'lon'), zlib = True) + msl_new.units = 'Pa' + gph500_new = test.createVariable('gph500', float, ('time', 'lat', 'lon'), zlib = True) + gph500_new.units = 'm' + lat_new = test.createVariable('lat', float, ('lat',), zlib = True) + lat_new.units = 'degrees_north' + lon_new = test.createVariable('lon', float, ('lon',), zlib = True) + lon_new.units = 'degrees_east' + time_new = test.createVariable('time', 'f8', ('time',), zlib = True) + time_new.units = "hours since 2000-01-01 00:00:00" + time_new.calendar = "gregorian" + p3d_new = test.createVariable('p3d', float, ('lev', 'lat', 'lon'), zlib = True) + + lat_new[:] = lats + lon_new[:] = lons_shift + year, month, day, hour = extract_time_from_file_name(src_file) + dates = np.array([datetime(int(year), int(month), int(day), int(hour), 0, 0)]) + time_new[:] = date2num(dates, units = time_new.units, calendar = time_new.calendar) + + t2_new[:] = t2_shift.reshape(1, y_len, x_len) + msl_new[:] = msl_shift.reshape(1, y_len, x_len) + gph500_new[:] = gph500_shift.reshape(1, y_len, x_len) + p3d_new[:] = p3d + test.source_file = src_file + test.title = 'ECMWF ERA5 data sample for Deep Learning' + test.author = AUTHOR + test.close() + except Exception as exc: + print (exc) + pass + + +def extract_time_from_file_name(src_file): + year = int("20" + src_file[11:13]) + month = int(src_file[13:15]) + day = int(src_file[15:17]) + hour = int(src_file[17:19]) + return year, month, day, hour + +def process_era5_in_dir(job_name,src_dir,target_dir): + print ("job_name",job_name) + directory_to_process = os.path.join(src_dir, job_name) + print("Going to process file in directory {}".format(directory_to_process)) + files = os.listdir(directory_to_process) + os.chdir(directory_to_process) + #create a subdirectory based on months + target_dir2 = os.path.join(target_dir,job_name) + 
print("The processed files are going to be saved to directory {}".format(target_dir2)) + if not os.path.exists(target_dir2): os.mkdir(target_dir2) + for src_file in files: + if src_file.endswith(".nc"): + if os.path.exists(os.path.join(target_dir2, src_file)): + print(src_file," file has been processed in directory ", target_dir2) + else: + print ("==========Processing file {} =============== ".format(src_file)) + prepare_era5_data_one_file(src_file=src_file,directory_to_process=directory_to_process, target=src_file, target_dir=target_dir2) diff --git a/workflow_parallel_frame_prediction/DataExtraction/readme.md b/workflow_parallel_frame_prediction/DataExtraction/readme.md new file mode 100644 index 0000000000000000000000000000000000000000..9e97dae81e2f5aa45e3cc676b2b90a1fa318145b --- /dev/null +++ b/workflow_parallel_frame_prediction/DataExtraction/readme.md @@ -0,0 +1,2 @@ +`source create_env_zam347.sh {MPI}` <br/> +`mpirun -np {number of processors max 13 on zam347} python main_single_master.py` \ No newline at end of file diff --git a/workflow_parallel_frame_prediction/DataExtraction/shiftgrid.py b/workflow_parallel_frame_prediction/DataExtraction/shiftgrid.py new file mode 100644 index 0000000000000000000000000000000000000000..081e0985743de69cc5e75142dafb7004199ea8f2 --- /dev/null +++ b/workflow_parallel_frame_prediction/DataExtraction/shiftgrid.py @@ -0,0 +1,61 @@ +"""shiftgrid function from mpl_toolkitsbasemap v1.1.0 +This function was copied here to avoid the hassle of importing basemap. +""" +import numpy as np +import numpy.ma as ma + + +def shiftgrid(lon0,datain,lonsin,start=True,cyclic=360.0): + """ + Shift global lat/lon grid east or west. + .. tabularcolumns:: |l|L| + ============== ==================================================== + Arguments Description + ============== ==================================================== + lon0 starting longitude for shifted grid + (ending longitude if start=False). lon0 must be on + input grid (within the range of lonsin). + datain original data with longitude the right-most + dimension. + lonsin original longitudes. + ============== ==================================================== + .. tabularcolumns:: |l|L| + ============== ==================================================== + Keywords Description + ============== ==================================================== + start if True, lon0 represents the starting longitude + of the new grid. if False, lon0 is the ending + longitude. Default True. + cyclic width of periodic domain (default 360) + ============== ==================================================== + returns ``dataout,lonsout`` (data and longitudes on shifted grid). 
+ """ + if np.fabs(lonsin[-1]-lonsin[0]-cyclic) > 1.e-4: + # Use all data instead of raise ValueError, 'cyclic point not included' + start_idx = 0 + else: + # If cyclic, remove the duplicate point + start_idx = 1 + if lon0 < lonsin[0] or lon0 > lonsin[-1]: + raise ValueError('lon0 outside of range of lonsin') + i0 = np.argmin(np.fabs(lonsin-lon0)) + i0_shift = len(lonsin)-i0 + if ma.isMA(datain): + dataout = ma.zeros(datain.shape,datain.dtype) + else: + dataout = np.zeros(datain.shape,datain.dtype) + if ma.isMA(lonsin): + lonsout = ma.zeros(lonsin.shape,lonsin.dtype) + else: + lonsout = np.zeros(lonsin.shape,lonsin.dtype) + if start: + lonsout[0:i0_shift] = lonsin[i0:] + else: + lonsout[0:i0_shift] = lonsin[i0:]-cyclic + dataout[...,0:i0_shift] = datain[...,i0:] + if start: + lonsout[i0_shift:] = lonsin[start_idx:i0+start_idx]+cyclic + else: + lonsout[i0_shift:] = lonsin[start_idx:i0+start_idx] + dataout[...,i0_shift:] = datain[...,start_idx:i0+start_idx] + return dataout,lonsout diff --git a/workflow_parallel_frame_prediction/DataExtraction/submitJob.sh b/workflow_parallel_frame_prediction/DataExtraction/submitJob.sh new file mode 100755 index 0000000000000000000000000000000000000000..2e55e377208a965d6f334a17441c573aa983fc8e --- /dev/null +++ b/workflow_parallel_frame_prediction/DataExtraction/submitJob.sh @@ -0,0 +1,22 @@ +#!/bin/bash -x +#SBATCH --account=deepacf +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +##SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=1 +#SBATCH --output=pystager-out.%j +#SBATCH --error=pystager-err.%j +#SBATCH --time=00:20:00 +#SBATCH --partition=devel +#SBATCH --mail-type=ALL +#SBATCH --mail-user=b.gong@fz-juelich.de +##jutil env activate -p deepacf + +module --force purge +module /usr/local/software/jureca/OtherStages +module load Stages/2019a +module load Intel/2019.3.199-GCC-8.3.0 ParaStationMPI/5.2.2-1 +module load mpi4py/3.0.1-Python-3.6.8 + +#srun python mpi_stager_v2.py --source_dir /p/fastdata/slmet/slmet111/met_data/ecmwf/era5/#nc/2017/ --destination_dir /p/scratch/deepacf/bing/extractedData +srun python Extract_data_for_testing.py diff --git a/workflow_parallel_frame_prediction/DataPostprocess/Stager_devel_N_24_evaluation.sh b/workflow_parallel_frame_prediction/DataPostprocess/Stager_devel_N_24_evaluation.sh new file mode 100755 index 0000000000000000000000000000000000000000..1aa0ae9aff1f939186c9499674385fcb4caceab6 --- /dev/null +++ b/workflow_parallel_frame_prediction/DataPostprocess/Stager_devel_N_24_evaluation.sh @@ -0,0 +1,40 @@ +#!/bin/bash +#SBATCH --account=jjsc42 +# budget account where contingent is taken from# TASKS = NODES * GPUS_PER_NODE +#SBATCH --nodes=3 +#SBATCH --ntasks-per-node=4 +#SBATCH --ntasks=12 +# can be omitted if --nodes and --ntasks-per-node +# are given +# SBATCH --cpus-per-task=1 +# for OpenMP/hybrid jobs only +#SBATCH --output=horovod-4ntasks%j.out +# if keyword omitted: Default is slurm-%j.out in +# the submission directory (%j is replaced by +# the job ID). +#SBATCH --error=horovod-4ntasks%j.err +# if keyword omitted: Default is slurm-%j.out in +# the submission directory. 
+#SBATCH --time=20:00:00 +#SBATCH --gres=gpu:4 +#SBATCH --partition=gpus +#SBATCH --mail-user=b.gong@fz-juelich.de +#SBATCH --mail-type=ALL + +#create a folder to save the output + +module --force purge +module --force purge +module load Stages/Devel-2019a +module load GCC/8.3.0 +module load MVAPICH2/2.3.2-GDR +#module /usr/local/software/jureca/OtherStages +module load Stages/2019a +module load GCCcore/.8.3.0 +module load cuDNN/7.5.1.10-CUDA-10.1.105 +module load Horovod/0.16.2-GPU-Python-3.6.8 +module load Keras/2.2.4-GPU-Python-3.6.8 +#module load Intel/2019.3.199-GCC-8.3.0 ParaStationMPI/5.2.2-1-mt +#module load mpi4py/3.0.1-Python-3.6.8 + +srun python3.6 kitti_evaluate_parallel.py diff --git a/workflow_parallel_frame_prediction/DataPostprocess/data_utils.py b/workflow_parallel_frame_prediction/DataPostprocess/data_utils.py new file mode 100755 index 0000000000000000000000000000000000000000..8751fc42e36d74c81c328702db8569013ba51a69 --- /dev/null +++ b/workflow_parallel_frame_prediction/DataPostprocess/data_utils.py @@ -0,0 +1,164 @@ +import hickle as hkl +import numpy as np +from keras import backend as K +from keras.preprocessing.image import Iterator + + +import inspect +print(inspect.getmembers(hkl,predicate=inspect.ismethod)) + +# Data generator that creates sequences for input into PredNet. +class SequenceGenerator(Iterator): + def __init__(self, data_file, source_file, nt, + batch_size=8, shuffle=False, seed=None, + output_mode='error', sequence_start_mode='all', N_seq=None, + data_format=K.image_data_format()): + self.X = hkl.load(data_file) # X will be like (n_images, nb_cols, nb_rows, nb_channels) + self.sources = hkl.load(source_file) # source for each image so when creating sequences can assure that consecutive frames are from same video + self.nt = nt + self.batch_size = batch_size + self.data_format = data_format + assert sequence_start_mode in {'all', 'unique'}, 'sequence_start_mode must be in {all, unique}' + self.sequence_start_mode = sequence_start_mode + assert output_mode in {'error', 'prediction'}, 'output_mode must be in {error, prediction}' + self.output_mode = output_mode + + if self.data_format == 'channels_first': + self.X = np.transpose(self.X, (0, 3, 1, 2)) + self.im_shape = self.X[0].shape + + if self.sequence_start_mode == 'all': # allow for any possible sequence, starting from any frame + #bing + #self.possible_starts = np.array([i for i in range(self.X.shape[0] - self.nt) if self.sources[i] == self.sources[i + self.nt - 1]]) + self.possible_starts = np.array([i for i in range(self.X.shape[0] - self.nt)]) + elif self.sequence_start_mode == 'unique': #create sequences where each unique frame is in at most one sequence + curr_location = 0 + possible_starts = [] + while curr_location < self.X.shape[0] - self.nt + 1: + if self.sources[curr_location] == self.sources[curr_location + self.nt - 1]: + possible_starts.append(curr_location) + curr_location += self.nt + else: + curr_location += 1 + self.possible_starts = possible_starts + + if shuffle: + self.possible_starts = np.random.permutation(self.possible_starts) + if N_seq is not None and len(self.possible_starts) > N_seq: # select a subset of sequences if want to + self.possible_starts = self.possible_starts[:N_seq] + self.N_sequences = len(self.possible_starts) + print("N_sequences", self.N_sequences) + super(SequenceGenerator, self).__init__(len(self.possible_starts), batch_size, shuffle, seed) + + def __getitem__(self, null): + return self.next() + + def next(self): + with self.lock: + current_index = 
(self.batch_index * self.batch_size) % self.n + index_array, current_batch_size = next(self.index_generator), self.batch_size + batch_x = np.zeros((current_batch_size, self.nt) + self.im_shape, np.float32) + for i, idx in enumerate(index_array): + idx = self.possible_starts[idx] + batch_x[i] = self.preprocess(self.X[idx:idx+self.nt]) + if self.output_mode == 'error': # model outputs errors, so y should be zeros + batch_y = np.zeros(current_batch_size, np.float32) + elif self.output_mode == 'prediction': # output actual pixels + batch_y = batch_x + return batch_x, batch_y + + def preprocess(self, X): + ### Normalization after extrema cut off: ### + #cut maxs & mins to mean+3*std & mean-3*std of training set for each parameter + #x_cut = np.zeros(shape=X.shape) + #x_cut = X*1 #pass X by value and not by reference + #x_cut[:,:,:,0][X[:,:,:,0]>311.5]=311.5 #set T2 upper limit + #x_cut[:,:,:,0][X[:,:,:,0]<258.9]=258.9 #set T2 lower limit + #x_cut[:,:,:,1][X[:,:,:,1]>104635.2]=104635.2 #set GP upper limit + #x_cut[:,:,:,1][X[:,:,:,1]<98205.6]=98205.6 #set GP lower limit ###Caution: Drastical cut ### + #x_cut[:,:,:,2][X[:,:,:,2]>6209.5]=6209.5 #set GPH upper limit ###Caution: Unnecessary as it succeeds max GPH ### + #x_cut[:,:,:,2][X[:,:,:,2]<5005.8]=5005.8 #set GPH lower limit + #normalize X based on max and min values(equals upper and lower limits except highCutGPH) + #x_processed = np.zeros(shape=X.shape) + #x_processed[:,:,:,0] = (x_cut[:,:,:,0]-258.9)/(311.5-258.9) + #x_processed[:,:,:,1] = (x_cut[:,:,:,1]-98205.6)/(104635.2-98205.6) + #x_processed[:,:,:,2] = (x_cut[:,:,:,2]-5005.8)/(6007.097417091836-5005.8) #GPH max stays; see above + + ### 'Standard' normalization: (x-min(x))/(max(x)-min(x)) ### + #x_processed = np.zeros(shape=X.shape) + #x_processed[:,:,:,0] = (X[:,:,:,0]-235.2141571044922)/(321.46630859375-235.2141571044922) + #x_processed[:,:,:,1] = (X[:,:,:,1]-93401.125)/(105391.4375-93401.125) + #x_processed[:,:,:,2] = (X[:,:,:,2]-4836.070232780612)/(6007.097417091836-4836.070232780612) + + ### t2only 'Standard' normalization: (x-min(x))/(max(x)-min(x)) ### + #x_processed = np.zeros(shape=X.shape) + #x_processed[:,:,:,0] = (X[:,:,:,0]-235.2141571044922)/(321.46630859375-235.2141571044922) + #x_processed[:,:,:,1] = (X[:,:,:,1]-235.2141571044922)/(321.46630859375-235.2141571044922) + #x_processed[:,:,:,2] = (X[:,:,:,2]-235.2141571044922)/(321.46630859375-235.2141571044922) + + ### t2_2MSL_1 'standard' normalization: + #x_processed = np.zeros(shape=X.shape) + #x_processed[:,:,:,0] = (X[:,:,:,0]-235.2141571044922)/(321.46630859375-235.2141571044922) + #x_processed[:,:,:,1] = (X[:,:,:,1]-235.2141571044922)/(321.46630859375-235.2141571044922) + #x_processed[:,:,:,2] = (X[:,:,:,2]-93401.125)/(105391.4375-93401.125) + + ### t2_1MSL_2 'standard' normalization: + #x_processed = np.zeros(shape=X.shape) + #x_processed[:,:,:,0] = (X[:,:,:,0]-235.2141571044922)/(321.46630859375-235.2141571044922) + #x_processed[:,:,:,1] = (X[:,:,:,1]-93401.125)/(105391.4375-93401.125) + #x_processed[:,:,:,2] = (X[:,:,:,2]-93401.125)/(105391.4375-93401.125) + + ### t2_2gph500_1 'standard' normalization: + #x_processed = np.zeros(shape=X.shape) + #x_processed[:,:,:,0] = (X[:,:,:,0]-235.2141571044922)/(321.46630859375-235.2141571044922) + #x_processed[:,:,:,1] = (X[:,:,:,1]-235.2141571044922)/(321.46630859375-235.2141571044922) + #x_processed[:,:,:,2] = (X[:,:,:,2]-4836.070232780612)/(6007.097417091836-4836.070232780612) + ## t2_1gph500_2 'standard' normalization: + #x_processed = np.zeros(shape=X.shape) + 
#x_processed[:,:,:,0] = (X[:,:,:,0]-235.2141571044922)/(321.46630859375-235.2141571044922) + #x_processed[:,:,:,1] = (X[:,:,:,1]-4836.070232780612)/(6007.097417091836-4836.070232780612) + #x_processed[:,:,:,2] = (X[:,:,:,2]-4836.070232780612)/(6007.097417091836-4836.070232780612) + + ### No standardization for moving Objects test set: Just 0s and 1s + #x_processed = np.zeros(shape=X.shape) + #x_processed = X + + ### t2_1 'standard' normalization (got one dimension less, due to just one channel) + #x_processed = np.zeros(shape=X.shape) + #x_processed[:,:,:,0] = (X[:,:,:,0]-235.2141571044922)/(321.46630859375-235.2141571044922) + #x_processed[:,:,:,1] = X[:,:,:,1] + #x_processed[:,:,:,2] = X[:,:,:,2] + + ### t2_1 'standard' normalization (got one dimension less, due to just one channel) + x_processed = np.zeros(shape=X.shape) + x_processed[:,:,:,0] = (X[:,:,:,0]-235.2141571044922)/(321.46630859375-235.2141571044922) + x_processed[:,:,:,1] = (X[:,:,:,1]-235.2141571044922)/(321.46630859375-235.2141571044922) + x_processed[:,:,:,2] = X[:,:,:,2] + + ### Standardization: (x-mean)/standard_deviation ### + #Doesn't work due to some activation functions + #x_processed = np.zeros(shape=X.shape) + #x_processed[:,:,:,0] = (X[:,:,:,0]-285.1751264870658)/8.770013367617763 + #x_processed[:,:,:,1] = (X[:,:,:,1]-101420.4382666807)/1071.5999818175521 + #x_processed[:,:,:,2] = (X[:,:,:,2]-5607.662795353248)/200.62593105865764 + + ### Standardization+Normalization ### + # standardize:(x-mean)/standard_deviation + #x_preprocessed = np.zeros(shape=X.shape) + #x_preprocessed[:,:,:,0] = (X[:,:,:,0]-285.1751264870658)/8.770013367617763 + #x_preprocessed[:,:,:,1] = (X[:,:,:,1]-101420.4382666807)/1071.5999818175521 + #x_preprocessed[:,:,:,2] = (X[:,:,:,2]-5607.662795353248)/200.62593105865764 + # normalize:(x-min(x))/(max(x)-min(x)) + #x_processed = np.zeros(shape=X.shape) + #x_processed[:,:,:,0] = (x_preprocessed[:,:,:,0]-np.amin(x_preprocessed[:,:,:,0]))/(np.amax(x_preprocessed[:,:,:,0])-np.amin(x_preprocessed[:,:,:,0])) + #x_processed[:,:,:,1] = (x_preprocessed[:,:,:,1]-np.amin(x_preprocessed[:,:,:,1]))/(np.amax(x_preprocessed[:,:,:,1])-np.amin(x_preprocessed[:,:,:,1])) + #x_processed[:,:,:,2] = (x_preprocessed[:,:,:,2]-np.amin(x_preprocessed[:,:,:,2]))/(np.amax(x_preprocessed[:,:,:,2])-np.amin(x_preprocessed[:,:,:,2])) + + return x_processed.astype(np.float32) + #return X.astype(np.float32) / 255 + + def create_all(self): + X_all = np.zeros((self.N_sequences, self.nt) + self.im_shape, np.float32) + for i, idx in enumerate(self.possible_starts): + X_all[i] = self.preprocess(self.X[idx:idx+self.nt]) + return X_all diff --git a/workflow_parallel_frame_prediction/DataPostprocess/keras_utils.py b/workflow_parallel_frame_prediction/DataPostprocess/keras_utils.py new file mode 100755 index 0000000000000000000000000000000000000000..ededcc74fed982654d82cfb610b79224f1e08554 --- /dev/null +++ b/workflow_parallel_frame_prediction/DataPostprocess/keras_utils.py @@ -0,0 +1,58 @@ +import os +import numpy as np + +from keras import backend as K +from keras.legacy.interfaces import generate_legacy_interface, recurrent_args_preprocessor +from keras.models import model_from_json + +legacy_prednet_support = generate_legacy_interface( + allowed_positional_args=['stack_sizes', 'R_stack_sizes', + 'A_filt_sizes', 'Ahat_filt_sizes', 'R_filt_sizes'], + conversions=[('dim_ordering', 'data_format'), + ('consume_less', 'implementation')], + value_conversions={'dim_ordering': {'tf': 'channels_last', + 'th': 'channels_first', + 'default': 
None}, + 'consume_less': {'cpu': 0, + 'mem': 1, + 'gpu': 2}}, + preprocessor=recurrent_args_preprocessor) + +# Convert old Keras (1.2) json models and weights to Keras 2.0 +def convert_model_to_keras2(old_json_file, old_weights_file, new_json_file, new_weights_file): + from prednet import PredNet + # If using tensorflow, it doesn't allow you to load the old weights. + if K.backend() != 'theano': + os.environ['KERAS_BACKEND'] = backend + reload(K) + + f = open(old_json_file, 'r') + json_string = f.read() + f.close() + model = model_from_json(json_string, custom_objects = {'PredNet': PredNet}) + model.load_weights(old_weights_file) + + weights = model.layers[1].get_weights() + if weights[0].shape[0] == model.layers[1].stack_sizes[1]: + for i, w in enumerate(weights): + if w.ndim == 4: + weights[i] = np.transpose(w, (2, 3, 1, 0)) + model.set_weights(weights) + + model.save_weights(new_weights_file) + json_string = model.to_json() + with open(new_json_file, "w") as f: + f.write(json_string) + + +if __name__ == '__main__': + old_dir = './model_data/' + new_dir = './model_data_keras2/' + if not os.path.exists(new_dir): + os.mkdir(new_dir) + for w_tag in ['', '-Lall', '-extrapfinetuned']: + m_tag = '' if w_tag == '-Lall' else w_tag + convert_model_to_keras2(old_dir + 'prednet_kitti_model' + m_tag + '.json', + old_dir + 'prednet_kitti_weights' + w_tag + '.hdf5', + new_dir + 'prednet_kitti_model' + m_tag + '.json', + new_dir + 'prednet_kitti_weights' + w_tag + '.hdf5') diff --git a/workflow_parallel_frame_prediction/DataPostprocess/kitti_evaluate_parallel.py b/workflow_parallel_frame_prediction/DataPostprocess/kitti_evaluate_parallel.py new file mode 100755 index 0000000000000000000000000000000000000000..1c1166eebfc09f947a0bf899b9d751a1ae061e67 --- /dev/null +++ b/workflow_parallel_frame_prediction/DataPostprocess/kitti_evaluate_parallel.py @@ -0,0 +1,159 @@ +''' +Evaluate trained PredNet on KITTI sequences. +Calculates mean-squared error and plots predictions. 
+''' +import math +import os +import numpy as np +#from six.moves import cPickle +import matplotlib +matplotlib.use('Agg') +import matplotlib.pyplot as plt +import matplotlib.gridspec as gridspec +from keras import backend as K +from keras.models import Model, model_from_json +from keras.layers import Input, Dense, Flatten +from prednet import PredNet +from data_utils import SequenceGenerator +from kitti_settings import * +##Just for checking how the shape is after generator.create_all() from Sequence Generator +#import hickle as hkl +import horovod.keras as hvd +import tensorflow as tf + +#Horovod:initialize horovod +hvd.init() +#Horovod: pin GPU to be used for process local rank (one GPU per process) +config = tf.ConfigProto() +config.gpu_options.allow_growth = True +config.gpu_options.visible_device_list = str(hvd.local_rank()) +K.set_session(tf.Session(config=config)) + +n_plot = 10 +batch_size = 5 +nt = 10 + +##Configure the horovod + +#weights_file = os.path.join(WEIGHTS_DIR, 'tensorflow_weights/prednet_kitti_weights.hdf5') +weights_file = os.path.join(WEIGHTS_DIR, 'prednet_kitti_weights.hdf5') +print("weights loaded") +json_file = os.path.join(WEIGHTS_DIR, 'prednet_kitti_model.json') +print("model loaded") +test_file = os.path.join(DATA_DIR, 'X_train.hkl') +print("test file loaded") +test_sources = os.path.join(DATA_DIR, 'sources_test.hkl') +print("test source loaded") + +# Load trained model +f = open(json_file, 'r') +json_string = f.read() +f.close() +train_model = model_from_json(json_string, custom_objects = {'PredNet': PredNet}) +train_model.load_weights(weights_file) +print("Trained model is loaded") + +# Create testing model (to output predictions) +layer_config = train_model.layers[1].get_config() +layer_config['output_mode'] = 'prediction' +data_format = layer_config['data_format'] if 'data_format' in layer_config else layer_config['dim_ordering'] +test_prednet = PredNet(weights=train_model.layers[1].get_weights(), **layer_config) +input_shape = list(train_model.layers[0].batch_input_shape[1:]) +input_shape[0] = nt +inputs = Input(shape=tuple(input_shape)) +predictions = test_prednet(inputs) +test_model = Model(inputs=inputs, outputs=predictions) + +#Bing: change the unique to all,need to ask severin why training data use all , but for testing use unique +test_generator = SequenceGenerator(test_file, test_sources, nt, sequence_start_mode='all', data_format=data_format) +print ("test_generator finished") +X_test = test_generator.create_all() +print(X_test.shape) +#Bing, replace this part with horovod because of the memeory issue +X_test_batch_size = round(X_test.shape[0]/hvd.size()) + +def post_process(rank = 0): + print ("Rank {}".format(rank)) + X_test_batch = X_test[rank * X_test_batch_size:(rank + 1) * X_test_batch_size] + print ("X_test_batch size",X_test_batch.shape) + X_hat = test_model.predict(X_test_batch, batch_size) + + if data_format == 'channels_first': + X_test_batch = np.transpose(X_test_batch, (0, 1, 3, 4, 2)) + X_hat = np.transpose(X_hat, (0, 1, 3, 4, 2)) + + ##Just for checking how the shape is after generator.create_all() from Sequence Generator + #hkl.dump(X_test, os.path.join(RESULTS_SAVE_DIR, 'X_AfterGeneratorStandardized.hkl')) + #hkl.dump(X_hat, os.path.join(RESULTS_SAVE_DIR, 'X_hatStandardized.hkl')) + # + + # Compare MSE of PredNet predictions vs. using last frame. 
Write results to prediction_scores.txt + # Furthermore, calculate Model MSE from the last prediction of the sequence only + # as the model improves after several frames (mse_model_last) + # Typical shape of X_test and X_hat: (263, 10, 128, 160, 3) + # where 263 are the sequences, 10 ist the amount of frames in one sequence, + # 128 & 160 are the image sice and 3 the number of layers. + # For our case only take layer 0 (= T2) into account. + shapeXhat = str(X_hat.shape) #Just have a look at the shapes to be sure we are calculating the right MSE + shapeXtest = str(X_test_batch.shape) + mse_model = np.mean((X_test_batch[:, 1:,:,:,0] - X_hat[:, 1:,:,:,0])**2) # look at all timesteps except the first + mse_model_last = np.mean((X_test_batch[:, 9,:,:,0] - X_hat[:, 9,:,:,0])**2 ) + mse_prev = np.mean((X_test_batch[:, :-1,:,:,0] - X_test_batch[:, 1:,:,:,0])**2) + + # Calculate PSNR + # Function to calculate PSNR + # In the absence of noise, the two images I and K are identical, and thus the MSE is zero. In this case the PSNR is infinite. + # Or here the best value: 100 + def psnr(img1, img2): + mse = np.mean((img1-img2) ** 2) + if mse == 0: return 100 + PIXEL_MAX = 1 + return 20 * math.log10(PIXEL_MAX/math.sqrt(mse)) + + psnr_model = psnr(X_test_batch[:, 1:,:,:,0], X_hat[:, 1:,:,:,0]) + psnr_model_last = psnr(X_test_batch[:, 9,:,:,0], X_hat[:, 9,:,:,0]) + psnr_prev = psnr(X_test_batch[:, :-1,:,:,0], X_test_batch[:, 1:,:,:,0]) + print("Evaluations are completed") + + if not os.path.exists(RESULTS_SAVE_DIR): os.mkdir(RESULTS_SAVE_DIR) + f = open(RESULTS_SAVE_DIR + '/prediction_scores_rank_{}.txt'.format(rank), 'w') + + f.write("X_test_batch_size:{} ; Rank: {}\n".format(X_test_batch_size,hvd.rank())) + f.write("Model MSE: %f\n" % mse_model) + f.write("Model MSE from only last prediction in sequence: %f\n" % mse_model_last) + f.write("Previous Frame MSE: %f\n" % mse_prev) + f.write("Model PSNR: %f\n" % psnr_model) + f.write("Model PSNR from only last prediction in sequence: %f\n" % psnr_model_last) + f.write("Previous frame PSNR: %f\n" % psnr_prev) + f.write("Shape of X_test: " + shapeXtest) + f.write("") + f.write("Shape of X_hat: " + shapeXhat) + f.close() + print("Results are saved to {}\n".format(RESULTS_SAVE_DIR + "/prediction_scores_rank_{}.txt".format(rank))) + + # Plot some predictions + aspect_ratio = float(X_hat.shape[2]) / X_hat.shape[3] + plt.figure(figsize = (nt, 2*aspect_ratio)) + gs = gridspec.GridSpec(2, nt) + gs.update(wspace=0., hspace=0.) + plot_save_dir = os.path.join(RESULTS_SAVE_DIR, 'prediction_plots_rank_{}/'.format(rank)) + if not os.path.exists(plot_save_dir): os.mkdir(plot_save_dir) + plot_idx = np.random.permutation(X_test_batch.shape[0])[:n_plot] + for i in plot_idx: + for t in range(nt): + plt.subplot(gs[t]) + plt.imshow(X_test_batch[i,t,:,:,0], interpolation='none') #the last index sets the channel. 
0 = t2 + #plt.pcolormesh(X_test[i,t,::-1,:,0], shading='bottom', cmap=plt.cm.jet) + plt.tick_params(axis='both', which='both', bottom=False, top=False, left=False, right=False, labelbottom=False, labelleft=False) + if t==0: plt.ylabel('Actual', fontsize=10) + + plt.subplot(gs[t + nt]) + plt.imshow(X_hat[i,t,:,:,0], interpolation='none') + #plt.pcolormesh(X_hat[i,t,::-1,:,0], shading='bottom', cmap=plt.cm.jet) + plt.tick_params(axis='both', which='both', bottom=False, top=False, left=False, right=False, labelbottom=False, labelleft=False) + if t==0: plt.ylabel('Predicted', fontsize=10) + plt.savefig(plot_save_dir + 'plot_' + str(i) + '.png') + plt.clf() + + +post_process(rank = hvd.rank()) \ No newline at end of file diff --git a/workflow_parallel_frame_prediction/DataPostprocess/kitti_settings.py b/workflow_parallel_frame_prediction/DataPostprocess/kitti_settings.py new file mode 100755 index 0000000000000000000000000000000000000000..489989eecaa4a65abb6614d76402a3f153cca850 --- /dev/null +++ b/workflow_parallel_frame_prediction/DataPostprocess/kitti_settings.py @@ -0,0 +1,19 @@ +# Where KITTI data will be saved if you run process_kitti.py +# If you directly download the processed data, change to the path of the data. +## Changed logic: Now this is the path where the processed data lies: X_train,val,test +#DATA_DIR = './kitti_data/' +#data directory for training data 2015 and 2016 +#DATA_DIR = '/p/project/cjjsc42/severin/try3' +#data directory for moving objects: +#DATA_DIR = '/p/home/jusers/hussmann1/jureca/movingObjects/se_nw' +#data directory for featuretesting: +##DATA_DIR = './testTry2' +DATA_DIR = '/p/scratch/cjjsc42/bing/pystager-development/processData/splits' +# Where model weights and config will be saved if you run kitti_train.py +# If you directly download the trained weights, change to appropriate path. +WEIGHTS_DIR = '/p/project/cjjsc42/bing/ml-severin/model_data_keras2' +#WEIGHTS_DIR = '/p/project/cjjsc42/bing/ml-severin/model_data_keras2' + +# Where results (prediction plots and evaluation file) will be saved. 
+RESULTS_SAVE_DIR = '/p/project/cjjsc42/bing/ml-severin/kitti_results' + diff --git a/workflow_parallel_frame_prediction/DataPostprocess/packageInstallation.sh b/workflow_parallel_frame_prediction/DataPostprocess/packageInstallation.sh new file mode 100644 index 0000000000000000000000000000000000000000..1039e5fd19fef89064f0d11ec6898117a0c6908d --- /dev/null +++ b/workflow_parallel_frame_prediction/DataPostprocess/packageInstallation.sh @@ -0,0 +1,9 @@ +module --force purge +module use /usr/local/software/jureca/OtherStages +module load Stages/2019a +module load Intel/2019.3.199-GCC-8.3.0 ParaStationMPI/5.2.2-1-mt +module load mpi4py/3.0.1-Python-3.6.8 +pip3 install --user netCDF4 +pip3 install --user numpy + + diff --git a/workflow_parallel_frame_prediction/DataPostprocess/parameters_kitti_evaluate.dat b/workflow_parallel_frame_prediction/DataPostprocess/parameters_kitti_evaluate.dat new file mode 100755 index 0000000000000000000000000000000000000000..ac64b7046c7aa93f69d622d2b055e4fa45d53f75 --- /dev/null +++ b/workflow_parallel_frame_prediction/DataPostprocess/parameters_kitti_evaluate.dat @@ -0,0 +1,14 @@ +import os + +# ============ input parameters =================== # +# 0:deactivate 1: active + +Please fill in the following parameter list for PyStager +Source_Directory = /p/fastdata/slmet/slmet111/met_data/ecmwf/era5/nc/2017/ +Destination_Directory = /p/scratch/cjjsc42/bing/pystager-development/tryData/ +Log_Directory = /p/project/cjjsc42/bing/pystager-development/log +Rsync_Status = 1 +Checksum_Status = 0 + + + diff --git a/workflow_parallel_frame_prediction/DataPostprocess/prednet.py b/workflow_parallel_frame_prediction/DataPostprocess/prednet.py new file mode 100755 index 0000000000000000000000000000000000000000..b5a0208ae137666c9bc284b21d6affe04d721053 --- /dev/null +++ b/workflow_parallel_frame_prediction/DataPostprocess/prednet.py @@ -0,0 +1,311 @@ +import numpy as np + +from keras import backend as K +from keras import activations +from keras.layers import Recurrent +from keras.layers import Conv2D, UpSampling2D, MaxPooling2D +from keras.engine import InputSpec +from keras_utils import legacy_prednet_support + +class PredNet(Recurrent): + '''PredNet architecture - Lotter 2016. + Stacked convolutional LSTM inspired by predictive coding principles. + + # Arguments + stack_sizes: number of channels in targets (A) and predictions (Ahat) in each layer of the architecture. + Length is the number of layers in the architecture. + First element is the number of channels in the input. + Ex. (3, 16, 32) would correspond to a 3-layer architecture that takes in RGB images and has 16 and 32 + channels in the second and third layers, respectively. + R_stack_sizes: number of channels in the representation (R) modules. + Length must equal length of stack_sizes, but the number of channels per layer can be different. + A_filt_sizes: filter sizes for the target (A) modules. + Has length of len(stack_sizes) - 1. + Ex. (3, 3) would mean that targets for layers 2 and 3 are computed by a 3x3 convolution of the errors (E) + from the layer below (followed by max-pooling). + Ahat_filt_sizes: filter sizes for the prediction (Ahat) modules. + Has length equal to length of stack_sizes. + Ex. (3, 3, 3) would mean that the predictions for each layer are computed by a 3x3 convolution of the + representation (R) modules at each layer. + R_filt_sizes: filter sizes for the representation (R) modules. + Has length equal to length of stack_sizes. + Corresponds to the filter sizes for all convolutions in the LSTM.
+ pixel_max: the maximum pixel value. + Used to clip the pixel-layer prediction. + error_activation: activation function for the error (E) units. + A_activation: activation function for the target (A) and prediction (A_hat) units. + LSTM_activation: activation function for the cell and hidden states of the LSTM. + LSTM_inner_activation: activation function for the gates in the LSTM. + output_mode: either 'error', 'prediction', 'all' or layer specification (ex. R2, see below). + Controls what is outputted by the PredNet. + If 'error', the mean response of the error (E) units of each layer will be outputted. + That is, the output shape will be (batch_size, nb_layers). + If 'prediction', the frame prediction will be outputted. + If 'all', the output will be the frame prediction concatenated with the mean layer errors. + The frame prediction is flattened before concatenation. + Nomenclature of 'all' is kept for backwards compatibility, but should not be confused with returning all of the layers of the model + For returning the features of a particular layer, output_mode should be of the form unit_type + layer_number. + For instance, to return the features of the LSTM "representational" units in the lowest layer, output_mode should be specificied as 'R0'. + The possible unit types are 'R', 'Ahat', 'A', and 'E' corresponding to the 'representation', 'prediction', 'target', and 'error' units respectively. + extrap_start_time: time step for which model will start extrapolating. + Starting at this time step, the prediction from the previous time step will be treated as the "actual" + data_format: 'channels_first' or 'channels_last'. + It defaults to the `image_data_format` value found in your + Keras config file at `~/.keras/keras.json`. + + # References + - [Deep predictive coding networks for video prediction and unsupervised learning](https://arxiv.org/abs/1605.08104) + - [Long short-term memory](http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf) + - [Convolutional LSTM network: a machine learning approach for precipitation nowcasting](http://arxiv.org/abs/1506.04214) + - [Predictive coding in the visual cortex: a functional interpretation of some extra-classical receptive-field effects](http://www.nature.com/neuro/journal/v2/n1/pdf/nn0199_79.pdf) + ''' + @legacy_prednet_support + def __init__(self, stack_sizes, R_stack_sizes, + A_filt_sizes, Ahat_filt_sizes, R_filt_sizes, + pixel_max=1., error_activation='relu', A_activation='relu', + LSTM_activation='tanh', LSTM_inner_activation='hard_sigmoid', + output_mode='error', extrap_start_time=None, + data_format=K.image_data_format(), **kwargs): + self.stack_sizes = stack_sizes + self.nb_layers = len(stack_sizes) + assert len(R_stack_sizes) == self.nb_layers, 'len(R_stack_sizes) must equal len(stack_sizes)' + self.R_stack_sizes = R_stack_sizes + assert len(A_filt_sizes) == (self.nb_layers - 1), 'len(A_filt_sizes) must equal len(stack_sizes) - 1' + self.A_filt_sizes = A_filt_sizes + assert len(Ahat_filt_sizes) == self.nb_layers, 'len(Ahat_filt_sizes) must equal len(stack_sizes)' + self.Ahat_filt_sizes = Ahat_filt_sizes + assert len(R_filt_sizes) == (self.nb_layers), 'len(R_filt_sizes) must equal len(stack_sizes)' + self.R_filt_sizes = R_filt_sizes + + self.pixel_max = pixel_max + self.error_activation = activations.get(error_activation) + self.A_activation = activations.get(A_activation) + self.LSTM_activation = activations.get(LSTM_activation) + self.LSTM_inner_activation = activations.get(LSTM_inner_activation) + + default_output_modes = 
['prediction', 'error', 'all'] + layer_output_modes = [layer + str(n) for n in range(self.nb_layers) for layer in ['R', 'E', 'A', 'Ahat']] + assert output_mode in default_output_modes + layer_output_modes, 'Invalid output_mode: ' + str(output_mode) + self.output_mode = output_mode + if self.output_mode in layer_output_modes: + self.output_layer_type = self.output_mode[:-1] + self.output_layer_num = int(self.output_mode[-1]) + else: + self.output_layer_type = None + self.output_layer_num = None + self.extrap_start_time = extrap_start_time + + assert data_format in {'channels_last', 'channels_first'}, 'data_format must be in {channels_last, channels_first}' + self.data_format = data_format + self.channel_axis = -3 if data_format == 'channels_first' else -1 + self.row_axis = -2 if data_format == 'channels_first' else -3 + self.column_axis = -1 if data_format == 'channels_first' else -2 + super(PredNet, self).__init__(**kwargs) + self.input_spec = [InputSpec(ndim=5)] + + def compute_output_shape(self, input_shape): + if self.output_mode == 'prediction': + out_shape = input_shape[2:] + elif self.output_mode == 'error': + out_shape = (self.nb_layers,) + elif self.output_mode == 'all': + out_shape = (np.prod(input_shape[2:]) + self.nb_layers,) + else: + stack_str = 'R_stack_sizes' if self.output_layer_type == 'R' else 'stack_sizes' + stack_mult = 2 if self.output_layer_type == 'E' else 1 + out_stack_size = stack_mult * getattr(self, stack_str)[self.output_layer_num] + out_nb_row = input_shape[self.row_axis] / 2**self.output_layer_num + out_nb_col = input_shape[self.column_axis] / 2**self.output_layer_num + if self.data_format == 'channels_first': + out_shape = (out_stack_size, out_nb_row, out_nb_col) + else: + out_shape = (out_nb_row, out_nb_col, out_stack_size) + + if self.return_sequences: + return (input_shape[0], input_shape[1]) + out_shape + else: + return (input_shape[0],) + out_shape + + def get_initial_state(self, x): + input_shape = self.input_spec[0].shape + init_nb_row = input_shape[self.row_axis] + init_nb_col = input_shape[self.column_axis] + + base_initial_state = K.zeros_like(x) # (samples, timesteps) + image_shape + non_channel_axis = -1 if self.data_format == 'channels_first' else -2 + for _ in range(2): + base_initial_state = K.sum(base_initial_state, axis=non_channel_axis) + base_initial_state = K.sum(base_initial_state, axis=1) # (samples, nb_channels) + + initial_states = [] + states_to_pass = ['r', 'c', 'e'] + nlayers_to_pass = {u: self.nb_layers for u in states_to_pass} + if self.extrap_start_time is not None: + states_to_pass.append('ahat') # pass prediction in states so can use as actual for t+1 when extrapolating + nlayers_to_pass['ahat'] = 1 + for u in states_to_pass: + for l in range(nlayers_to_pass[u]): + ds_factor = 2 ** l + nb_row = init_nb_row // ds_factor + nb_col = init_nb_col // ds_factor + if u in ['r', 'c']: + stack_size = self.R_stack_sizes[l] + elif u == 'e': + stack_size = 2 * self.stack_sizes[l] + elif u == 'ahat': + stack_size = self.stack_sizes[l] + output_size = stack_size * nb_row * nb_col # flattened size + + reducer = K.zeros((input_shape[self.channel_axis], output_size)) # (nb_channels, output_size) + initial_state = K.dot(base_initial_state, reducer) # (samples, output_size) + if self.data_format == 'channels_first': + output_shp = (-1, stack_size, nb_row, nb_col) + else: + output_shp = (-1, nb_row, nb_col, stack_size) + initial_state = K.reshape(initial_state, output_shp) + initial_states += [initial_state] + + if K._BACKEND == 'theano': + from 
theano import tensor as T + # There is a known issue in the Theano scan op when dealing with inputs whose shape is 1 along a dimension. + # In our case, this is a problem when training on grayscale images, and the below line fixes it. + initial_states = [T.unbroadcast(init_state, 0, 1) for init_state in initial_states] + + if self.extrap_start_time is not None: + initial_states += [K.variable(0, int if K.backend() != 'tensorflow' else 'int32')] # the last state will correspond to the current timestep + return initial_states + + def build(self, input_shape): + self.input_spec = [InputSpec(shape=input_shape)] + self.conv_layers = {c: [] for c in ['i', 'f', 'c', 'o', 'a', 'ahat']} + + for l in range(self.nb_layers): + for c in ['i', 'f', 'c', 'o']: + act = self.LSTM_activation if c == 'c' else self.LSTM_inner_activation + self.conv_layers[c].append(Conv2D(self.R_stack_sizes[l], self.R_filt_sizes[l], padding='same', activation=act, data_format=self.data_format)) + + act = 'relu' if l == 0 else self.A_activation + self.conv_layers['ahat'].append(Conv2D(self.stack_sizes[l], self.Ahat_filt_sizes[l], padding='same', activation=act, data_format=self.data_format)) + + if l < self.nb_layers - 1: + self.conv_layers['a'].append(Conv2D(self.stack_sizes[l+1], self.A_filt_sizes[l], padding='same', activation=self.A_activation, data_format=self.data_format)) + + self.upsample = UpSampling2D(data_format=self.data_format) + self.pool = MaxPooling2D(data_format=self.data_format) + + self.trainable_weights = [] + nb_row, nb_col = (input_shape[-2], input_shape[-1]) if self.data_format == 'channels_first' else (input_shape[-3], input_shape[-2]) + for c in sorted(self.conv_layers.keys()): + for l in range(len(self.conv_layers[c])): + ds_factor = 2 ** l + if c == 'ahat': + nb_channels = self.R_stack_sizes[l] + elif c == 'a': + nb_channels = 2 * self.stack_sizes[l] + else: + nb_channels = self.stack_sizes[l] * 2 + self.R_stack_sizes[l] + if l < self.nb_layers - 1: + nb_channels += self.R_stack_sizes[l+1] + in_shape = (input_shape[0], nb_channels, nb_row // ds_factor, nb_col // ds_factor) + if self.data_format == 'channels_last': in_shape = (in_shape[0], in_shape[2], in_shape[3], in_shape[1]) + with K.name_scope('layer_' + c + '_' + str(l)): + self.conv_layers[c][l].build(in_shape) + self.trainable_weights += self.conv_layers[c][l].trainable_weights + + self.states = [None] * self.nb_layers*3 + + if self.extrap_start_time is not None: + self.t_extrap = K.variable(self.extrap_start_time, int if K.backend() != 'tensorflow' else 'int32') + self.states += [None] * 2 # [previous frame prediction, timestep] + + def step(self, a, states): + r_tm1 = states[:self.nb_layers] + c_tm1 = states[self.nb_layers:2*self.nb_layers] + e_tm1 = states[2*self.nb_layers:3*self.nb_layers] + + if self.extrap_start_time is not None: + t = states[-1] + a = K.switch(t >= self.t_extrap, states[-2], a) # if past self.extrap_start_time, the previous prediction will be treated as the actual + + c = [] + r = [] + e = [] + + # Update R units starting from the top + for l in reversed(range(self.nb_layers)): + inputs = [r_tm1[l], e_tm1[l]] + if l < self.nb_layers - 1: + inputs.append(r_up) + + inputs = K.concatenate(inputs, axis=self.channel_axis) + i = self.conv_layers['i'][l].call(inputs) + f = self.conv_layers['f'][l].call(inputs) + o = self.conv_layers['o'][l].call(inputs) + _c = f * c_tm1[l] + i * self.conv_layers['c'][l].call(inputs) + _r = o * self.LSTM_activation(_c) + c.insert(0, _c) + r.insert(0, _r) + + if l > 0: + r_up = 
self.upsample.call(_r) + + # Update feedforward path starting from the bottom + for l in range(self.nb_layers): + ahat = self.conv_layers['ahat'][l].call(r[l]) + if l == 0: + ahat = K.minimum(ahat, self.pixel_max) + frame_prediction = ahat + + # compute errors + e_up = self.error_activation(ahat - a) + e_down = self.error_activation(a - ahat) + + e.append(K.concatenate((e_up, e_down), axis=self.channel_axis)) + + if self.output_layer_num == l: + if self.output_layer_type == 'A': + output = a + elif self.output_layer_type == 'Ahat': + output = ahat + elif self.output_layer_type == 'R': + output = r[l] + elif self.output_layer_type == 'E': + output = e[l] + + if l < self.nb_layers - 1: + a = self.conv_layers['a'][l].call(e[l]) + a = self.pool.call(a) # target for next layer + + if self.output_layer_type is None: + if self.output_mode == 'prediction': + output = frame_prediction + else: + for l in range(self.nb_layers): + layer_error = K.mean(K.batch_flatten(e[l]), axis=-1, keepdims=True) + all_error = layer_error if l == 0 else K.concatenate((all_error, layer_error), axis=-1) + if self.output_mode == 'error': + output = all_error + else: + output = K.concatenate((K.batch_flatten(frame_prediction), all_error), axis=-1) + + states = r + c + e + if self.extrap_start_time is not None: + states += [frame_prediction, t + 1] + return output, states + + def get_config(self): + config = {'stack_sizes': self.stack_sizes, + 'R_stack_sizes': self.R_stack_sizes, + 'A_filt_sizes': self.A_filt_sizes, + 'Ahat_filt_sizes': self.Ahat_filt_sizes, + 'R_filt_sizes': self.R_filt_sizes, + 'pixel_max': self.pixel_max, + 'error_activation': self.error_activation.__name__, + 'A_activation': self.A_activation.__name__, + 'LSTM_activation': self.LSTM_activation.__name__, + 'LSTM_inner_activation': self.LSTM_inner_activation.__name__, + 'data_format': self.data_format, + 'extrap_start_time': self.extrap_start_time, + 'output_mode': self.output_mode} + base_config = super(PredNet, self).get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/workflow_parallel_frame_prediction/DataPreprocess/Stager_devel_N_24_process_netCDF.sh b/workflow_parallel_frame_prediction/DataPreprocess/Stager_devel_N_24_process_netCDF.sh new file mode 100755 index 0000000000000000000000000000000000000000..983336105659267b5ed29215bd1f5c2ccc04e195 --- /dev/null +++ b/workflow_parallel_frame_prediction/DataPreprocess/Stager_devel_N_24_process_netCDF.sh @@ -0,0 +1,22 @@ +#!/bin/bash -x +#SBATCH --account=deepacf +#SBATCH --nodes=1 +#SBATCH --ntasks=12 +##SBATCH --ntasks-per-node=12 +#SBATCH --cpus-per-task=1 +#SBATCH --output=process_netcdf-out.%j +#SBATCH --error=process_netcdf-err.%j +#SBATCH --time=00:20:00 +#SBATCH --partition=devel +#SBATCH --mail-type=ALL +#SBATCH --mail-user=b.gong@fz-juelich.de + +module --force purge +module use $OTHERSTAGES +module load Stages/2019a +module load Intel/2019.3.199-GCC-8.3.0 ParaStationMPI/5.2.2-1 +module load h5py/2.9.0-Python-3.6.8 +module load mpi4py/3.0.1-Python-3.6.8 + +srun python mpi_stager_v2_process_netCDF.py --source_dir /p/scratch/deepacf/video_prediction_shared_folder/extractedData/\ +--destination_dir /p/scratch/deepacf/bing/processData_size_64_64_3_3t_norm_test2 diff --git a/workflow_parallel_frame_prediction/DataPreprocess/external_function.py b/workflow_parallel_frame_prediction/DataPreprocess/external_function.py new file mode 100755 index 0000000000000000000000000000000000000000..c360c0e0e6289d46224369d2b1a0a8bda223cb1f --- /dev/null +++ 
b/workflow_parallel_frame_prediction/DataPreprocess/external_function.py @@ -0,0 +1,158 @@ +#from mpi4py import MPI +from os import walk +import os +import sys +import subprocess +import logging +import time +import hashlib + +# ======================= List of functions ====================================== # + +# check the rank and print it + +def logger(file_name, logger_level, program_name): + # Log file starter + + logging.basicConfig(filename=file_name, level=logger_level, + format='%(asctime)s:%(levelname)s:%(message)s') + logging.debug(' === PyStager is started === ') + print(str(program_name) + ' is Running .... ') + + +def config_file(config_file_name): + params = {} + for line in open(config_file_name): + line = line.strip() + read_in_value = line.split("=") + if len(read_in_value) == 2: + params[read_in_value[0].strip()] = read_in_value[1].strip() + + source_dir = str(params["Source_Directory"]) + print(source_dir) + destination_dir = str(params["Destination_Directory"]) + log_dir = str(params["Log_Directory"]) + rsync_status = int(params["Rsync_Status"]) + return source_dir, destination_dir, log_dir, rsync_status + + +def directory_scanner(source_path): + # Take a look inside a directories and make a list of ll the folders, sub directories, number of the files and size + # NOTE : It will neglect if there is a sub-directories inside directories!!! + + dir_detail_list = [] # directories details + sub_dir_list = [] + total_size_source = 0 + total_num_files = 0 + list_directories = [] + + list_directories = os.listdir(source_path) + print(list_directories) + print(int(len(list_directories))) + + for d in list_directories: + print(d) + path = source_path + d + print(path) + if os.path.isdir(path): + sub_dir_list.append(d) + sub_dir_list.sort() + num_files = 0 + # size of the files and subdirectories + size_dir = subprocess.check_output(['du', '-sc', path]) + splitted = size_dir.split() # fist item is the size of the folder + size = (splitted[0]) + num_files = len([f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]) + dir_detail_list.extend([d, size, num_files]) + total_num_files = total_num_files + int(num_files) + total_size_source = total_size_source + int(size) + else: + print(path, 'does not exist') + print("===== Debug here =====") + + total_num_directories = int(len(list_directories)) + total_size_source = float(total_size_source / 1000000) + + message = 'Total size of the source directory is:' + str(total_size_source) + 'Gb.' 
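+    # Note (assumption): `du -sc` reports sizes in kilobytes by default, so the division by 1,000,000
+    # above yields an approximate total size in decimal gigabytes, which is what the 'Gb.' label refers to.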
+ print(message) + message = "Total number of the files in the source directory is: " + str(total_num_files) + print(message) + message = "Total number of the directories in the source directory is: " + str(total_num_directories) + print(message) + + return dir_detail_list, sub_dir_list, total_size_source, total_num_files, total_num_directories + + +def load_distributor(dir_detail_list, sub_dir_list, total_size_source, total_num_files, total_num_directories, p): + # create a dictionary with p number of keys + # for each directory they add the name to one of the keys + transfer_dict = dict.fromkeys(list(range(1, p))) + print(transfer_dict) + # package_counter = 0 possibility to use the counter to fill + counter = 1 + for Directory_counter in range(0, total_num_directories): + + if transfer_dict[counter] is None: # if the value for the key is None add to it + transfer_dict[counter] = sub_dir_list[Directory_counter] + else: # if key has a value join the new value to the old value + transfer_dict[counter] = "{};{}".format(transfer_dict[counter], sub_dir_list[Directory_counter]) + counter = counter + 1 + if counter == p: + counter = 1 + + return transfer_dict + +def sync_file(source_path, destination_dir, job_name, rsync_status): + rsync_msg = ("rsync -r " + source_path + job_name + "/" + " " + destination_dir + "/" + job_name) + # print('Node:', str(my_rank),'will execute :', rsync_str,'\r\n') + # sync the assigned folder + + if rsync_status == 1: + os.system(rsync_msg) + + + +def hash_directory(source_path,job_name,hash_rep_file,input_status): + #sha256_hash = hashlib.sha256() + md5_hash = hashlib.md5() + + ########## Create a hashed file repasitory for direcotry(ies) assigned to node ####### + hash_repo_text = input_status + "_"+job_name +"_hashed.txt" + os.chdir(hash_rep_file) + hashed_text_note=open(hash_repo_text,"w+") + + # job_name is the name of the subdirectory that is going to be processed + directory_to_process = source_path + job_name + # print(directory_to_process) + files_list = [] + for dirpath, dirnames, filenames in os.walk(directory_to_process): + files_list.extend(filenames) + + os.chdir(directory_to_process) # change to the working directory + + for file_to_process in filenames: + + ## ======= this is the sha256 checksum ========= # + #with open(file_to_process,"rb") as f: + # # Read and update hash in chunks of 4K + # for byte_block in iter(lambda: f.read(4096),b""): + # sha256_hash.update(byte_block) + # hashed_file = sha256_hash.hexdigest() + + with open(file_to_process,"rb") as f: + # Read and update hash in chunks of 4K + for byte_block in iter(lambda: f.read(4096),b""): + md5_hash.update(byte_block) + hashed_file = md5_hash.hexdigest() + + hashed_text_note.write(hashed_file) + + return + +def md5(fname): + md5_hash = hashlib.md5() + with open(fname,"rb") as f: + # Read and update hash in chunks of 4K + for byte_block in iter(lambda: f.read(4096),b""): + md5_hash.update(byte_block) + return md5_hash.hexdigest() diff --git a/workflow_parallel_frame_prediction/DataPreprocess/mpi_split_data_multi_years.py b/workflow_parallel_frame_prediction/DataPreprocess/mpi_split_data_multi_years.py new file mode 100644 index 0000000000000000000000000000000000000000..f6b760c7ad3c528a745975cda9b1c420aa739d77 --- /dev/null +++ b/workflow_parallel_frame_prediction/DataPreprocess/mpi_split_data_multi_years.py @@ -0,0 +1,44 @@ +from mpi4py import MPI +import argparse +from process_netCDF_v2 import * +from metadata import MetaData +import json + +#add parser arguments +parser = 
argparse.ArgumentParser() +#parser.add_argument("--source_dir", type=str, default="/p/scratch/deepacf/bing/extractedData/") +parser.add_argument("--destination_dir","-dest",dest="destination_dir",type=str, default="/p/scratch/deepacf/bing/processData_size_64_64_3_3t_norm") +parser.add_argument("--varnames","-vars",dest="varnames", nargs = '+') +#parser.add_argument("--partition","-part",dest="partition",type=json.loads) +# help="--partition allows to control the splitting of the processed data in training, test and validation data. Pass a dictionary-like string.") + +args = parser.parse_args() +# ML 2020/06/08: Dirty workaround as long as data-splitting is done with this seperate Python-script +# called from the same parent Shell-/Batch-script as 'mpi_stager_v2_process_netCDF.py' +target_dir = os.path.join(MetaData.get_destdir_jsontmp(),"hickle") +varnames = args.varnames + +#partition = args.partition +#all_keys = partition.keys() +#for key in all_keys: +# print(partition[key]) + +partition = { + "train":{ + "2017":[1] + }, + "val": + {"2017":[2] + }, + "test": + {"2017":[2] + } + } +# ini. MPI +comm = MPI.COMM_WORLD +my_rank = comm.Get_rank() # rank of the node +p = comm.Get_size() # number of assigned nods +if my_rank == 0: # node is master + split_data_multiple_years(target_dir=target_dir,partition=partition,varnames=varnames) +else: + pass diff --git a/workflow_parallel_frame_prediction/DataPreprocess/mpi_stager_v2_process_netCDF.py b/workflow_parallel_frame_prediction/DataPreprocess/mpi_stager_v2_process_netCDF.py new file mode 100755 index 0000000000000000000000000000000000000000..fdc2f65a5092469c021e0c21b0606e7e7d248c5c --- /dev/null +++ b/workflow_parallel_frame_prediction/DataPreprocess/mpi_stager_v2_process_netCDF.py @@ -0,0 +1,248 @@ +from mpi4py import MPI +from os import walk +import sys +import subprocess +import logging +import time +from external_function import directory_scanner +from external_function import load_distributor +from external_function import hash_directory +from external_function import md5 +from process_netCDF_v2 import * +from metadata import MetaData as MetaData +import os +import argparse +import json + +def main(): + + parser = argparse.ArgumentParser() + parser.add_argument("--source_dir", type=str, default="/p/scratch/deepacf/bing/extractedData/") + parser.add_argument("--destination_dir", type=str, default="/p/scratch/deepacf/bing/processData_size_64_64_3_3t_norm") + parser.add_argument("--script_dir","-scr_dir",dest="script_dir",type=str) + parser.add_argument("--years", "-y", dest="years") + parser.add_argument("--checksum_status", type=int, default=0) + parser.add_argument("--rsync_status", type=int, default=1) + parser.add_argument("--vars", nargs="+",default = ["T2","T2","T2"]) #"MSL","gph500" + parser.add_argument("--lat_s", type=int, default=74+32) + parser.add_argument("--lat_e", type=int, default=202-32) + parser.add_argument("--lon_s", type=int, default=550+16+32) + parser.add_argument("--lon_e", type=int, default=710-16-32) + args = parser.parse_args() + + current_path = os.getcwd() + years = args.years + source_dir = os.path.join(args.source_dir,str(years))+"/" + destination_dir = args.destination_dir + scr_dir = args.script_dir + checksum_status = args.checksum_status + rsync_status = args.rsync_status + + vars = args.vars + lat_s = args.lat_s + lat_e = args.lat_e + lon_s = args.lon_s + lon_e = args.lon_e + + slices = {"lat_s": lat_s, + "lat_e": lat_e, + "lon_s": lon_s, + "lon_e": lon_e + } + print("Selected variables",vars) + 
print("Selected Slices",slices) + + os.chdir(current_path) + time.sleep(0) + + + # ini. MPI + comm = MPI.COMM_WORLD + my_rank = comm.Get_rank() # rank of the node + p = comm.Get_size() # number of assigned nods + + # ============ configuration for data preprocessing =================== # + # ==================================== Master Logging ==================================================== # + # DEBUG: Detailed information, typically of interest only when diagnosing problems. + # INFO: Confirmation that things are working as expected. + # WARNING: An indication that something unexpected happened, or indicative of some problem in the near + # ERROR: Due to a more serious problem, the software has not been able to perform some function. + # CRITICAL: A serious error, indicating that the program itself may be unable to continue running. + + if my_rank == 0: # node is master + logging.basicConfig(filename='stager.log', level=logging.DEBUG, + format='%(asctime)s:%(levelname)s:%(message)s') + start = time.time() # start of the MPI + logging.debug(' === PyStager is started === ') + print('PyStager is Running .... ') + + # ================================== ALL Nodes: Read-in parameters ====================================== # + #Bing: using the args to configure the directories + # fileName = "parameters_process_netCDF.dat" # input parameters file + # fileObj = open(fileName) + # params = {} + # + # for line in fileObj: + # line = line.strip() + # read_in_value = line.split("=") + # if len(read_in_value) == 2: + # params[read_in_value[0].strip()] = read_in_value[1].strip() + # + # # input from the user: + # source_dir = str(params["Source_Directory"]) + # destination_dir = str(params["Destination_Directory"]) + # log_dir = str(params["Log_Directory"]) + # rsync_status = int(params["Rsync_Status"]) + # checksum_status = int(params["Checksum_Status"]) + + # check the existence of teh folders : + + if not os.path.exists(source_dir): # check if the source dir. is existing + if my_rank == 0: + logging.critical('The source does not exist') + logging.info('exit status : 1') + print('Critical : The source does not exist') + + sys.exit(1) + + # ML 2020/04/26 + # Expand destination_dir-variable by searching for netCDF-files in source_dir and processing the file from the first list element to obtain all relevant (meta-)data. + if my_rank == 0: + data_files_list = glob.glob(source_dir+"/**/*.nc",recursive=True) + + if not data_files_list: raise ValueError("Could not find any data to be processed in '"+source_dir+"'") + + md = MetaData(suffix_indir=destination_dir,data_filename=data_files_list[0],slices=slices,variables=vars) + # modify Batch scripts if metadata has been retrieved for the first time (md.status = "new") + if (md.status == "new"): + md.write_dirs_to_batch_scripts(scr_dir+"/DataPreprocess_to_tf.sh") + md.write_dirs_to_batch_scripts(scr_dir+"/generate_era5.sh") + md.write_dirs_to_batch_scripts(scr_dir+"/train_era5.sh") + # ML 2020/06/08: Dirty workaround as long as data-splitting is done with a seperate Python-script + # called from the same parent Shell-/Batch-script + # -> work with temproary json-file in working directory + md.write_destdir_jsontmp(os.path.join(md.expdir,md.expname),tmp_dir=current_path) + #else: nothing to do + + destination_dir= os.path.join(md.expdir,md.expname,"hickle",years) + + # ...and create directory if necessary + if not os.path.exists(destination_dir): # check if the Destination dir. 
is existing + logging.critical('The Destination does not exist') + logging.info('Create new destination dir') + os.makedirs(destination_dir,exist_ok=True) + + # ML 2020/04/24 E + + if my_rank == 0: # node is master: + # ==================================== Master : Directory scanner ================================= # + + print(" # ============== Directory scanner : start ==================# ") + + ret_dir_scanner = directory_scanner(source_dir) + print(ret_dir_scanner) + dir_detail_list = ret_dir_scanner[0] + sub_dir_list = ret_dir_scanner[1] + total_size_source = ret_dir_scanner[2] + total_num_files = ret_dir_scanner[3] + total_num_dir = ret_dir_scanner[4] + + # =================================== Master : Load Distribution ========================== # + + print(" # ============== Load Distrbution : start ==================# ") + #def load_distributor(dir_detail_list, sub_dir_list, total_size_source, total_num_files, total_num_directories, p): + ret_load_balancer = load_distributor(dir_detail_list, sub_dir_list, total_size_source, total_num_files, total_num_dir, p) + transfer_dict = ret_load_balancer + + print(ret_load_balancer) + # ===================================== Master : Send / Receive =============================== # + print(" # ============== Communication : start ==================# ") + + # Send : the list of the directories to the nodes + for nodes in range(1, p): + broadcast_list = transfer_dict[nodes] + comm.send(broadcast_list, dest=nodes) + + # Receive : will wait for a certain time to see if it will receive any critical error from the slaves nodes + idle_counter = p - len(sub_dir_list) + while idle_counter > 1: # non-blocking receive function + message_in = comm.recv() + logging.warning(message_in) + #print('Warning:', message_in) + idle_counter = idle_counter - 1 + + # Receive : Message from slave nodes confirming the sync + message_counter = 1 + while message_counter <= len(sub_dir_list): # non-blocking receive function + message_in = comm.recv() + logging.info(message_in) + message_counter = message_counter + 1 + #Bing + # ML 2020/05/19: Splitting now controlled from batch-script + # split_data(target_dir=destination_dir, partition = [0.6, 0.2, 0.2]) + + # stamp the end of the runtime + end = time.time() + logging.debug(end - start) + logging.info('== PyStager is done ==') + logging.info('exit status : 0') + print('PyStager is finished ') + sys.exit(0) + + else: # node is slave + + # ============================================= Slave : Send / Receive ============================================ # + message_in = comm.recv() + + if message_in is None: # in case more than number of the dir. processor is assigned todo Tag it! + message_out = ('Node', str(my_rank), 'is idle') + comm.send(message_out, dest=0) + + else: # if the Slave node has joblist to do + job_list = message_in.split(';') + + for job_count in range(0, len(job_list)): + job = job_list[job_count] # job is the name of the directory(ies) assigned to slave_node + #print(job) + + #grib_2_netcdf(rot_grid,source_dir, destination_dir, job) + + # creat a checksum ( hash) from the source folder. 
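+                # (If enabled, the md5-based hash file written here for the source directory is compared
+                # further below with the one created for the destination directory, and a message is printed
+                # stating whether the two could be verified as identical.)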
+ if checksum_status == 1: + hash_directory(source_dir, job, current_path, "source") + + if rsync_status == 1: + # prepare the rsync commoand to be excexuted by the worker node + #rsync_str = ("rsync -r " + source_dir + job + "/" + " " + destination_dir + "/" + job) + #os.system(rsync_str) + + #process_era5_in_dir(job, src_dir=source_dir, target_dir=destination_dir) + # ML 2020/06/09: workaround to get correct destination_dir obtained by the master node + destination_dir = os.path.join(MetaData.get_destdir_jsontmp(tmp_dir=current_path),"hickle",years) + process_netCDF_in_dir(job_name=job, src_dir=source_dir, target_dir=destination_dir,slices=slices,vars=vars) + + if checksum_status == 1: + hash_directory(destination_dir, job, current_path, "destination") + os.chdir(current_path) + source_hash_text = "source" + "_"+ job +"_hashed.txt" + destination_hash_text = "destination" + "_"+ job +"_hashed.txt" + if md5(source_hash_text) == md5(destination_hash_text): + msg_out = 'source: ' + job +' and destination: ' + job + ' files are identical' + print(msg_out) + + else: + msg_out = 'integrity of source: ' + job +' and destination: ' + job +' files could not be verified' + print(msg_out) + + # Send : the finish of the sync message back to master node + message_out = ('Node:', str(my_rank), 'finished :', "", '\r\n') + comm.send(message_out, dest=0) + + MPI.Finalize() + + +if __name__ == "__main__": + main() + + + diff --git a/workflow_parallel_frame_prediction/DataPreprocess/process_netCDF_v2.py b/workflow_parallel_frame_prediction/DataPreprocess/process_netCDF_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..21bacb746a1c95a576dabd3e548d5f5a00dafdb0 --- /dev/null +++ b/workflow_parallel_frame_prediction/DataPreprocess/process_netCDF_v2.py @@ -0,0 +1,488 @@ +''' +Code for processing staged ERA5 data +''' + +import os +import glob +from netCDF4 import Dataset,num2date +#import requests +#from bs4 import BeautifulSoup +#import urllib.request +import numpy as np +#from imageio import imread +#from scipy.misc import imresize +import hickle as hkl +import json +import pickle + +# Create image datasets. +# Processes images and saves them in train, val, test splits. +def process_data(directory_to_process, target_dir, job_name, slices, vars=("T2","MSL","gph500")): + desired_im_sz = (slices["lat_e"] - slices["lat_s"], slices["lon_e"] - slices["lon_s"]) + # ToDo: Define a convenient function to create a list containing all files. + imageList = list(os.walk(directory_to_process, topdown = False))[-1][-1] + imageList = sorted(imageList) + EU_stack_list = [0] * (len(imageList)) + temporal_list = [0] * (len(imageList)) + nvars = len(vars) + #X = np.zeros((len(splits[split]),) + desired_im_sz + (3,), np.uint8) + #print(X) + #print('shape of X' + str(X.shape)) + + ##### TODO: iterate over split and read every .nc file, cut out array, + ##### overlay arrays for RGB like style. + ##### Save everything after for loop. 
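+    # Overview of the loop below: every netCDF file in directory_to_process is opened, the selected
+    # variables (vars) are cut out over the configured lat/lon slice, stacked along a third (channel)
+    # axis - analogous to the RGB channels of an image - and collected in EU_stack_list, while running
+    # min/max/average statistics per variable are accumulated in a Calc_data_stat object and the time
+    # stamps are kept in temporal_list. With the default setup (three variables and a 64x64 crop) each
+    # stacked frame has shape (64, 64, 3), so the dumped array X has shape (n_timesteps, 64, 64, 3).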
+ # ML 2020/04/06 S + # Some inits + stat_obj = Calc_data_stat(nvars) + # ML 2020/04/06 E + for j, im_file in enumerate(imageList): + #20200408,Bing + try: + im_path = os.path.join(directory_to_process, im_file) + print('Open following dataset: '+im_path) + + + #20200408,Bing + + im = Dataset(im_path, mode = 'r') + times = im.variables['time'] + time = num2date(times[:],units=times.units,calendar=times.calendar) + vars_list = [] + with Dataset(im_path,'r') as data_file: + for i in range(nvars): + var1 = data_file.variables[vars[i]][0,slices["lat_s"]:slices["lat_e"], slices["lon_s"]:slices["lon_e"]] + stat_obj.acc_stat_loc(i,var1) + vars_list.append(var1) + + EU_stack = np.stack(vars_list, axis = 2) + EU_stack_list[j] =list(EU_stack) + + #20200408,bing + temporal_list[j] = list(time) + #print('Does ist work? ') + #print(EU_stack_list[i][:,:,0]==EU_t2) + #print(EU_stack[:,:,1]==EU_msl + except Exception as err: + im_path = os.path.join(directory_to_process, im_file) + #im = Dataset(im_path, mode = 'r') + print("*************ERROR*************", err) + print("Error message {} from file {}".format(err,im_file)) + EU_stack_list[j] = list(EU_stack) # use the previous image as replacement, we can investigate further how to deal with the missing values + continue + + X = np.array(EU_stack_list) + target_file = os.path.join(target_dir, 'X_' + str(job_name) + '.hkl') + hkl.dump(X, target_file) #Not optimal! + print(target_file, "is saved") + # ML 2020/03/31: write json file with statistics + stat_obj.finalize_stat_loc(vars) + stat_obj.write_stat_json(target_dir,file_id=job_name) + # BG 2020/04/08: Also save temporal information to pickle-files + temporal_info = np.array(temporal_list) + temporal_file = os.path.join(target_dir, 'T_' + str(job_name) + '.pkl') + cwd = os.getcwd() + pickle.dump(temporal_info, open( temporal_file, "wb" ) ) + #hkl.dump(temporal_info, temporal_file) + + #hkl.dump(source_list, os.path.join(target_dir, 'sources_' + str(job) + '.hkl')) + + #for category, folder in splits[split]: + # im_dir = os.path.join(DATA_DIR, 'raw/', category, folder, folder[:10], folder, 'image_03/data/') + # files = list(os.walk(im_dir, topdown=False))[-1][-1] + # im_list += [im_dir + f for f in sorted(files)] + # multiply path of respective recording with lengths of its files in order to ensure + # that each entry in X_train.hkl corresponds with an entry of source_list/ sources_train.hkl + # source_list += [category + '-' + folder] * len(files) + + #print( 'Creating ' + split + ' data: ' + str(len(im_list)) + ' images') + #X = np.zeros((len(im_list),) + desired_im_sz + (3,), np.uint8) + # enumerate allows us to loop over something and have an automatic counter + #for i, im_file in enumerate(im_list): + # im = imread(im_file) + # X[i] = process_im(im, desired_im_sz) + + #hkl.dump(X, os.path.join(DATA_DIR, 'X_' + split + '.hkl')) + #hkl.dump(source_list, os.path.join(DATA_DIR, 'sources_' + split + '.hkl')) + +def process_netCDF_in_dir(src_dir,**kwargs): + target_dir = kwargs.get("target_dir") + job_name = kwargs.get("job_name") + directory_to_process = os.path.join(src_dir, job_name) + os.chdir(directory_to_process) + if not os.path.exists(target_dir): os.mkdir(target_dir) + target_file = os.path.join(target_dir, 'X_' + str(job_name) + '.hkl') + if os.path.exists(target_file): + print(target_file," file exists in the directory ", target_dir) + else: + print ("==========Processing files in directory {} =============== ".format(directory_to_process)) + 
process_data(directory_to_process=directory_to_process, **kwargs) + + +def split_data(target_dir, partition= [0.6, 0.2, 0.2]): + split_dir = target_dir + "/splits" + if not os.path.exists(split_dir): os.mkdir(split_dir) + os.chdir(target_dir) + files = glob.glob("*.hkl") + filesList = sorted(files) + #Bing: 20200415 + temporal_files = glob.glob("*.pkl") + temporal_filesList = sorted(temporal_files) + + # determine correct indicesue + train_begin = 0 + train_end = round(partition[0] * len(filesList)) - 1 + val_begin = train_end + 1 + val_end = train_end + round(partition[1] * len(filesList)) + test_begin = val_end + 1 + + + # slightly adapting start and end because starts at the first index given and stops before(!) the last. + train_files = filesList[train_begin:val_begin] + val_files = filesList[val_begin:test_begin] + test_files = filesList[test_begin:] + #bing: 20200415 + train_temporal_files = temporal_filesList[train_begin:val_begin] + val_temporal_files = temporal_filesList[val_begin:test_begin] + test_temporal_files = temporal_filesList[test_begin:] + + + splits = {s: [] for s in ['train', 'test', 'val']} + splits['val'] = val_files + splits['test'] = test_files + splits['train'] = train_files + + + splits_temporal = {s: [] for s in ['train', 'test', 'val']} + splits_temporal["train"] = train_temporal_files + splits_temporal["val"] = val_temporal_files + splits_temporal["test"] = test_temporal_files + + for split in splits: + X = [] + X_temporal = [] + files = splits[split] + temporal_files = splits_temporal[split] + for file, temporal_file in zip(files, temporal_files): + data_file = os.path.join(target_dir,file) + temporal_file = os.path.join(target_dir,temporal_file) + #load data with hkl file + data = hkl.load(data_file) + temporal_data = pickle.load(open(temporal_file,"rb")) + X_temporal = X_temporal + list(temporal_data) + X = X + list(data) + X = np.array(X) + X_temporal = np.array(X_temporal) + print ("X_temporal",X_temporal) + #save training, val and test data into splits directoyr + hkl.dump(X, os.path.join(split_dir, 'X_' + split + '.hkl')) + hkl.dump(files, os.path.join(split_dir,'sources_' + split + '.hkl')) + pickle.dump(X_temporal,open(os.path.join(split_dir,"T_"+split + ".pkl"),"wb")) + print ("PICKLE FILE FOR SPLITS SAVED") + +# ML 2020/05/15 S +def get_unique_vars(varnames): + vars_uni, varsind = np.unique(varnames,return_index = True) + nvars_uni = len(vars_uni) + + return(vars_uni, varsind, nvars_uni) + +class Calc_data_stat: + """Class for computing statistics and saving them to a json-files.""" + + def __init__(self,nvars): + """ + Initializes the instance for later use, i.e. initializes attributes with expected shape + """ + self.stat_dict = {} + self.varmin = np.full((nvars,1),np.nan) # avoid rank one-arrays + self.varmax = np.full((nvars,1),np.nan) + self.varavg = np.zeros((nvars,1)) # second dimension acts as placeholder for averaging on master node collecting json-files from slave nodes + self.nfiles = [0] # number of processed files + self.mode = "" # mode to distinguish between processing on slave and master nodes (sanity check) + self.jsfiles = [""] # list of processed json-files (master-mode only!) + + def acc_stat_loc(self,ivar,data): + """ + Performs accumulation of all statistics while looping through all data files (i.e. 
updates the statistics) on slave nodes + """ + if not self.mode: + self.mode = "loc" + elif self.mode == "master": + raise ValueError("Cannot switch to loc-mode during runtime...") + else: + pass + + self.varmin[ivar] = np.fmin(self.varmin[ivar],np.amin(data)) + self.varmax[ivar] = np.fmax(self.varmax[ivar],np.amax(data)) + self.varavg[ivar,0] += np.average(data) # note that we sum the average -> readjustment required in the final step + if (ivar == 0): self.nfiles[0] += 1 + + def finalize_stat_loc(self,varnames): + """ + Finalizes computation of statistics after going through all the data on slave nodes. + Afterwards the statistics dictionary is ready for being written in a json-file. + """ + + if self.mode != "loc": + raise ValueError("Object is not in loc-mode. Probably some master-method has been called previously.") + + if self.stat_dict: raise ValueError("Statistics dictionary is not empty.") + + vars_uni, varsind = np.unique(varnames,return_index=True) + nvars = len(vars_uni) + + vars_uni, varsind, nvars = get_unique_vars(varnames) + + varmin, varmax, varavg = self.varmin[varsind], self.varmax[varsind], self.varavg[varsind,0] + + for i in range(nvars): + varavg[i] /= self.nfiles # for adjusting the (summed) average + + self.stat_dict[vars_uni[i]]=[] + self.stat_dict[vars_uni[i]].append({ + 'min': varmin[i,0].tolist(), + 'max': varmax[i,0].tolist(), + 'avg': varavg[i].tolist() + }) + self.stat_dict["common_stat"] = [ + {"nfiles":self.nfiles[0]}] + + def acc_stat_master(self,file_dir,file_id): + """ + Opens statistics-file (created by slave nodes) and accumulates its content. + """ + + if (int(file_id) <= 0): raise ValueError("Non-valid file_id passed.") + + if not self.mode: + self.mode = "master" + elif self.mode == "loc": + raise ValueError("Cannot switch to master-mode during runtime...") + else: + pass + + # sanity check: check if dictionary is initialized with unique values only + if self.stat_dict.keys() > set(self.stat_dict.keys()): + raise ValueError("Initialized dictionary contains duplicates of variales. Need unique collection instead.") + else: + pass + + file_name = os.path.join(file_dir,"stat_{0:0=2d}.json".format(int(file_id))) + + if not file_name in self.jsfiles: + print("Try to open: '"+file_name+"'") + + try: + with open(file_name) as js_file: + dict_in = json.load(js_file) + + # sanity check + if (len(dict_in.keys()) -1 != len(self.varmin)): + raise ValueError("Different number of variables found in json-file '"+js_file+"' as expected from statistics object.") + + self.varmin = np.fmin(self.varmin,Calc_data_stat.get_stat_allvars(dict_in,"min")) + self.varmax = np.fmax(self.varmax,Calc_data_stat.get_stat_allvars(dict_in,"max")) + + if (np.all(self.varavg == 0.) or self.nfiles[0] == 0): + self.varavg = Calc_data_stat.get_stat_allvars(dict_in,"avg") + self.nfiles[0] = Calc_data_stat.get_common_stat(dict_in,"nfiles") + self.jsfiles[0]= file_name + else: + self.varavg = np.append(self.varavg,Calc_data_stat.get_stat_allvars(dict_in,"avg"),axis=1) + self.nfiles.append(Calc_data_stat.get_common_stat(dict_in,"nfiles")) + self.jsfiles.append(file_name) + except IOError: + print("Cannot handle statistics file '"+file_name+"' to be processed.") + except ValueError: + print("Cannot retireve all required statistics from '"+file_name+"'") + else: + print("Statistics file '"+file_name+"' has already been processed. 
Thus, just pass here...") + pass + + def finalize_stat_master(self,path_out,vars_uni): + """ + Performs final compuattion of statistics after accumulation from slave nodes. + """ + if self.mode != "master": + raise ValueError("Object is not in master-mode. Probably some loc-method has been called previously.") + + if len(vars_uni) > len(set(vars_uni)): + raise ValueError("Input variable names are not unique.") + + js_file = os.path.join(path_out,"statistics.json") + nvars = len(vars_uni) + n_jsfiles = len(self.nfiles) + nfiles_all= np.sum(self.nfiles) + avg_wgt = np.array(self.nfiles,dtype=float)/float(nfiles_all) + + varmin, varmax = self.varmin, self.varmax + varavg = np.sum(np.multiply(self.varavg,avg_wgt),axis=1) # calculate weighted average + + for i in range(nvars): + self.stat_dict[vars_uni[i]]=[] + self.stat_dict[vars_uni[i]].append({ + 'min': varmin[i,0].tolist(), + 'max': varmax[i,0].tolist(), + 'avg': varavg[i].tolist() + }) + self.stat_dict["common_stat"] = [ + {"nfiles": int(nfiles_all), + "jsfiles": self.jsfiles + }] + + @staticmethod + def get_stat_allvars(stat_dict,stat_name): + """ + Unpacks statistics dictionary and returns values of stat_name of all variables contained in the dictionary. + """ + + # some sanity checks + if not stat_dict: raise ValueError("Input dictionary is still empty! Cannot access anything from it.") + if not "common_stat" in stat_dict.keys(): raise ValueError("Input dictionary does not seem to be a proper statistics dictionary as common_stat-element is missing.") + + stat_dict_filter = (stat_dict).copy() + stat_dict_filter.pop("common_stat") + + if not stat_dict_filter.keys(): raise ValueError("Input dictionary does not contain any variables.") + + try: + varstat = np.array([stat_dict_filter[i][0][stat_name] for i in [*stat_dict_filter.keys()]]) + if np.ndim(varstat) == 1: # avoid returning rank 1-arrays + return varstat.reshape(-1,1) + else: + return varstat + except: + raise ValueError("Could not find "+stat_name+" for all variables of input dictionary.") + + @staticmethod + def get_stat_vars(stat_dict,stat_name,vars_in): + """ + Retrieves requested statistics (stat_name) for all unique variables listed in allvars given statistics dictionary. + If more than one unique variable is processed, this method returns a list, whereas a scalar is returned else. + """ + + if not stat_dict: raise ValueError("Statistics dictionary is still empty! Cannot access anything from it.") + if not "common_stat" in stat_dict.keys(): raise ValueError("Input dictionary does not seem to be a proper statistics dictionary as common_stat-element is missing.") + + vars_uni,indrev = np.unique(vars_in,return_inverse=True) + + try: + if len(vars_uni) > 1: + return([stat_dict[var][0][stat_name] for var in vars_uni[indrev]]) + else: + return(stat_dict[vars_uni[0]][0][stat_name]) + except: + raise ValueError("Could not find "+stat_name+" for all variables of input dictionary.") + + @staticmethod + def get_common_stat(stat_dict,stat_name): + + if not stat_dict: raise ValueError("Input dictionary is still empty! 
Cannot access anything from it.") + if not "common_stat" in stat_dict.keys(): raise ValueError("Input dictionary does not seem to be a proper statistics dictionary as common_stat-element is missing.") + + common_stat_dict = stat_dict["common_stat"][0] + + try: + return(common_stat_dict[stat_name]) + except: + raise ValueError("Could not find "+stat_name+" in common_stat of input dictionary.") + + + def write_stat_json(self,path_out,file_id = -1): + """ + Writes statistics-dictionary of slave nodes to json-file (with job_id in the output name) + If file_id is passed (and greater than 0), parallelized peration on a slave node is assumed. + Else: method is invoked from master node, i.e. final json-file is created + """ + if (self.mode == "loc"): + if int(file_id) <= 0: raise ValueError("Object is in loc-mode, but no valid file_id passed") + # json-file from slave node + js_file = os.path.join(path_out,'stat_{0:0=2d}.json'.format(int(file_id))) + elif (self.mode == "master"): + if (int(file_id) > 0): print("Warning: Object is master-mode, but file_id passed which will be ignored.") + # (final) json-file from master node + js_file = os.path.join(path_out,'statistics.json') + else: + raise ValueError("Object seems to be initialized only, but no data has been processed so far.") + + try: + with open(js_file,'w') as stat_out: + json.dump(self.stat_dict,stat_out) + except ValueError: + print("Something went wrong when writing dictionary to json-file: '"+js_file+"''") + finally: + print("Created statistics json-file '"+js_file+"' successfully.") + +# ML 2020/05/15 E + + +def split_data_multiple_years(target_dir,partition,varnames): + """ + Collect all the X_*.hkl data across years and split them to training, val and testing datatset + """ + #target_dirs = [os.path.join(target_dir,year) for year in years] + #os.chdir(target_dir) + splits_dir = os.path.join(target_dir,"splits") + os.makedirs(splits_dir, exist_ok=True) + splits = {s: [] for s in list(partition.keys())} + # ML 2020/05/19 S + vars_uni, varsind, nvars = get_unique_vars(varnames) + stat_obj = Calc_data_stat(nvars) + + for split in partition.keys(): + values = partition[split] + files = [] + X = [] + Temporal_X = [] + for year in values.keys(): + file_dir = os.path.join(target_dir,year) + for month in values[year]: + month = "{0:0=2d}".format(month) + hickle_file = "X_{}.hkl".format(month) + #20200408:bing + temporal_file = "T_{}.pkl".format(month) + data_file = os.path.join(file_dir,hickle_file) + temporal_data_file = os.path.join(file_dir,temporal_file) + files.append(data_file) + data = hkl.load(data_file) + temporal_data = pickle.load(open(temporal_data_file,"rb")) + X = X + list(data) + Temporal_X = Temporal_X + list(temporal_data) + # process stat-file: + stat_obj.acc_stat_master(file_dir,int(month)) + X = np.array(X) + Temporal_X = np.array(Temporal_X) + print("==================={}=====================".format(split)) + print ("Sources for {} dataset are {}".format(split,files)) + print("Number of images in {} dataset is {} ".format(split,len(X))) + print ("dataset shape is {}".format(np.array(X).shape)) + hkl.dump(X, os.path.join(splits_dir , 'X_' + split + '.hkl')) + pickle.dump(Temporal_X, open(os.path.join(splits_dir,"T_"+split + ".pkl"),"wb")) + hkl.dump(files, os.path.join(splits_dir,'sources_' + split + '.hkl')) + + # write final statistics json-file + stat_obj.finalize_stat_master(target_dir,vars_uni) + stat_obj.write_stat_json(splits_dir) + + + + + + + + + + + + + + + + + + + + + + diff --git 
a/workflow_parallel_frame_prediction/README.md b/workflow_parallel_frame_prediction/README.md new file mode 100755 index 0000000000000000000000000000000000000000..434e591f8f1b94fda5b9df119226ca6d037f81dc
--- /dev/null
+++ b/workflow_parallel_frame_prediction/README.md
@@ -0,0 +1,56 @@
+# Workflow for Frame Prediction by Parallel Deep Learning
+
+## Workflow for parallel deep learning
+
+This project implements a workflow for parallel deep learning to predict the 2m temperature, based on Severin's master thesis [code link](https://github.com/severin1992/airtemprednet) [thesis link](https://b2drop.eudat.eu/s/RmTd8K3pLsDMFw6).
+
+The workflow consists of a sequence of steps (Data Extraction, Data Preprocessing, Training and Data Postprocessing) to implement video prediction; each step is parallelized to accelerate the whole prediction process.
+
+The workflow has been tested on the JSC supercomputers [JURECA](https://www.fz-juelich.de/ias/jsc/EN/Expertise/Supercomputers/JURECA/JURECA_node.html) and [JUWELS](https://www.fz-juelich.de/ias/jsc/EN/Expertise/Supercomputers/JUWELS/JUWELS_node.html).
+
+## Requirements
+* Keras
+* Horovod
+* Python 3.6
+* mpi4py
+
+## Usage
+
+1. Clone or download this repository.
+2. Install the required modules/packages on JURECA/JUWELS:
+
+   ```shell
+   source packageInstallation.sh
+   ```
+
+   Add the package directory to `PYTHONPATH`:
+   ```shell
+   export PYTHONPATH=/p/home/jusers/USERNAME/jureca/.local/lib/python3.6/site-packages:$PYTHONPATH
+   ```
+
+3. Configure the input, output and log directories in the .dat file of each step.
+
+4. Run the corresponding .sh file to submit the job.
+
+## Workflow example
+
diff --git a/workflow_parallel_frame_prediction/Training/data_utils.py b/workflow_parallel_frame_prediction/Training/data_utils.py new file mode 100755 index 0000000000000000000000000000000000000000..d048236ffb1b84addde4d28aca6165dea8815dae
--- /dev/null
+++ b/workflow_parallel_frame_prediction/Training/data_utils.py
@@ -0,0 +1,163 @@
+import hickle as hkl
+import numpy as np
+from keras import backend as K
+from keras.preprocessing.image import Iterator
+
+import inspect
+print(inspect.getmembers(hkl, predicate=inspect.ismethod))
+
+# Data generator that creates sequences for input into PredNet.
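+# A minimal usage sketch (the file names below are placeholders, not files shipped with
+# this repository): the generator loads the full hickle array into memory and yields
+# overlapping windows of nt consecutive frames.
+#
+#   gen = SequenceGenerator('X_train.hkl', 'sources_train.hkl', nt=15,
+#                           batch_size=8, shuffle=True)
+#   batch_x, batch_y = gen.next()
+#   # batch_x has shape (batch_size, nt) + frame shape;
+#   # batch_y is all zeros when output_mode == 'error'.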
+class SequenceGenerator(Iterator): + def __init__(self, data_file, source_file, nt, + batch_size=8, shuffle=False, seed=None, + output_mode='error', sequence_start_mode='all', N_seq=None, + data_format=K.image_data_format()): + self.X = hkl.load(data_file) # X will be like (n_images, nb_cols, nb_rows, nb_channels) + self.sources = hkl.load(source_file) # source for each image so when creating sequences can assure that consecutive frames are from same video + self.nt = nt + self.batch_size = batch_size + self.data_format = data_format + assert sequence_start_mode in {'all', 'unique'}, 'sequence_start_mode must be in {all, unique}' + self.sequence_start_mode = sequence_start_mode + assert output_mode in {'error', 'prediction'}, 'output_mode must be in {error, prediction}' + self.output_mode = output_mode + + if self.data_format == 'channels_first': + self.X = np.transpose(self.X, (0, 3, 1, 2)) + self.im_shape = self.X[0].shape + if self.sequence_start_mode == 'all': # allow for any possible sequence, starting from any frame + #bing + #self.possible_starts = np.array([i for i in range(self.X.shape[0] - self.nt) if self.sources[i] == self.sources[i + self.nt - 1]]) + self.possible_starts = np.array([i for i in range(self.X.shape[0] - self.nt)]) + elif self.sequence_start_mode == 'unique': #create sequences where each unique frame is in at most one sequence + curr_location = 0 + possible_starts = [] + while curr_location < self.X.shape[0] - self.nt + 1: + if self.sources[curr_location] == self.sources[curr_location + self.nt - 1]: + possible_starts.append(curr_location) + curr_location += self.nt + else: + curr_location += 1 + self.possible_starts = possible_starts + + if shuffle: + self.possible_starts = np.random.permutation(self.possible_starts) + if N_seq is not None and len(self.possible_starts) > N_seq: # select a subset of sequences if want to + self.possible_starts = self.possible_starts[:N_seq] + self.N_sequences = len(self.possible_starts) + print("N_sequences",self.N_sequences) + super(SequenceGenerator, self).__init__(len(self.possible_starts), batch_size, shuffle, seed) + + def __getitem__(self, null): + return self.next() + + def next(self): + with self.lock: + current_index = (self.batch_index * self.batch_size) % self.n + index_array, current_batch_size = next(self.index_generator), self.batch_size + batch_x = np.zeros((current_batch_size, self.nt) + self.im_shape, np.float32) + for i, idx in enumerate(index_array): + idx = self.possible_starts[idx] + batch_x[i] = self.preprocess(self.X[idx:idx+self.nt]) + if self.output_mode == 'error': # model outputs errors, so y should be zeros + batch_y = np.zeros(current_batch_size, np.float32) + elif self.output_mode == 'prediction': # output actual pixels + batch_y = batch_x + return batch_x, batch_y + + def preprocess(self, X): + ### Normalization after extrema cut off: ### + #cut maxs & mins to mean+3*std & mean-3*std of training set for each parameter + #x_cut = np.zeros(shape=X.shape) + #x_cut = X*1 #pass X by value and not by reference + #x_cut[:,:,:,0][X[:,:,:,0]>311.5]=311.5 #set T2 upper limit + #x_cut[:,:,:,0][X[:,:,:,0]<258.9]=258.9 #set T2 lower limit + #x_cut[:,:,:,1][X[:,:,:,1]>104635.2]=104635.2 #set GP upper limit + #x_cut[:,:,:,1][X[:,:,:,1]<98205.6]=98205.6 #set GP lower limit ###Caution: Drastical cut ### + #x_cut[:,:,:,2][X[:,:,:,2]>6209.5]=6209.5 #set GPH upper limit ###Caution: Unnecessary as it succeeds max GPH ### + #x_cut[:,:,:,2][X[:,:,:,2]<5005.8]=5005.8 #set GPH lower limit + #normalize X based on max and min 
values(equals upper and lower limits except highCutGPH) + #x_processed = np.zeros(shape=X.shape) + #x_processed[:,:,:,0] = (x_cut[:,:,:,0]-258.9)/(311.5-258.9) + #x_processed[:,:,:,1] = (x_cut[:,:,:,1]-98205.6)/(104635.2-98205.6) + #x_processed[:,:,:,2] = (x_cut[:,:,:,2]-5005.8)/(6007.097417091836-5005.8) #GPH max stays; see above + + ### 'Standard' normalization: (x-min(x))/(max(x)-min(x)) ### + #x_processed = np.zeros(shape=X.shape) + #x_processed[:,:,:,0] = (X[:,:,:,0]-235.2141571044922)/(321.46630859375-235.2141571044922) + #x_processed[:,:,:,1] = (X[:,:,:,1]-93401.125)/(105391.4375-93401.125) + #x_processed[:,:,:,2] = (X[:,:,:,2]-4836.070232780612)/(6007.097417091836-4836.070232780612) + + ### t2only 'Standard' normalization: (x-min(x))/(max(x)-min(x)) ### + #x_processed = np.zeros(shape=X.shape) + #x_processed[:,:,:,0] = (X[:,:,:,0]-235.2141571044922)/(321.46630859375-235.2141571044922) + #x_processed[:,:,:,1] = (X[:,:,:,1]-235.2141571044922)/(321.46630859375-235.2141571044922) + #x_processed[:,:,:,2] = (X[:,:,:,2]-235.2141571044922)/(321.46630859375-235.2141571044922) + + ### t2_2MSL_1 'standard' normalization: + #x_processed = np.zeros(shape=X.shape) + #x_processed[:,:,:,0] = (X[:,:,:,0]-235.2141571044922)/(321.46630859375-235.2141571044922) + #x_processed[:,:,:,1] = (X[:,:,:,1]-235.2141571044922)/(321.46630859375-235.2141571044922) + #x_processed[:,:,:,2] = (X[:,:,:,2]-93401.125)/(105391.4375-93401.125) + + ### t2_1MSL_2 'standard' normalization: + #x_processed = np.zeros(shape=X.shape) + #x_processed[:,:,:,0] = (X[:,:,:,0]-235.2141571044922)/(321.46630859375-235.2141571044922) + #x_processed[:,:,:,1] = (X[:,:,:,1]-93401.125)/(105391.4375-93401.125) + #x_processed[:,:,:,2] = (X[:,:,:,2]-93401.125)/(105391.4375-93401.125) + + ### t2_2gph500_1 'standard' normalization: + #x_processed = np.zeros(shape=X.shape) + #x_processed[:,:,:,0] = (X[:,:,:,0]-235.2141571044922)/(321.46630859375-235.2141571044922) + #x_processed[:,:,:,1] = (X[:,:,:,1]-235.2141571044922)/(321.46630859375-235.2141571044922) + #x_processed[:,:,:,2] = (X[:,:,:,2]-4836.070232780612)/(6007.097417091836-4836.070232780612) + ## t2_1gph500_2 'standard' normalization: + #x_processed = np.zeros(shape=X.shape) + #x_processed[:,:,:,0] = (X[:,:,:,0]-235.2141571044922)/(321.46630859375-235.2141571044922) + #x_processed[:,:,:,1] = (X[:,:,:,1]-4836.070232780612)/(6007.097417091836-4836.070232780612) + #x_processed[:,:,:,2] = (X[:,:,:,2]-4836.070232780612)/(6007.097417091836-4836.070232780612) + + ### No standardization for moving Objects test set: Just 0s and 1s + #x_processed = np.zeros(shape=X.shape) + #x_processed = X + + ### t2_1 'standard' normalization (got one dimension less, due to just one channel) + #x_processed = np.zeros(shape=X.shape) + #x_processed[:,:,:,0] = (X[:,:,:,0]-235.2141571044922)/(321.46630859375-235.2141571044922) + #x_processed[:,:,:,1] = X[:,:,:,1] + #x_processed[:,:,:,2] = X[:,:,:,2] + + ### t2_1 'standard' normalization (got one dimension less, due to just one channel) + x_processed = np.zeros(shape=X.shape) + x_processed[:,:,:,0] = (X[:,:,:,0]-235.2141571044922)/(321.46630859375-235.2141571044922) + x_processed[:,:,:,1] = (X[:,:,:,1]-235.2141571044922)/(321.46630859375-235.2141571044922) + x_processed[:,:,:,2] = X[:,:,:,2] + + ### Standardization: (x-mean)/standard_deviation ### + #Doesn't work due to some activation functions + #x_processed = np.zeros(shape=X.shape) + #x_processed[:,:,:,0] = (X[:,:,:,0]-285.1751264870658)/8.770013367617763 + #x_processed[:,:,:,1] = 
(X[:,:,:,1]-101420.4382666807)/1071.5999818175521 + #x_processed[:,:,:,2] = (X[:,:,:,2]-5607.662795353248)/200.62593105865764 + + ### Standardization+Normalization ### + # standardize:(x-mean)/standard_deviation + #x_preprocessed = np.zeros(shape=X.shape) + #x_preprocessed[:,:,:,0] = (X[:,:,:,0]-285.1751264870658)/8.770013367617763 + #x_preprocessed[:,:,:,1] = (X[:,:,:,1]-101420.4382666807)/1071.5999818175521 + #x_preprocessed[:,:,:,2] = (X[:,:,:,2]-5607.662795353248)/200.62593105865764 + # normalize:(x-min(x))/(max(x)-min(x)) + #x_processed = np.zeros(shape=X.shape) + #x_processed[:,:,:,0] = (x_preprocessed[:,:,:,0]-np.amin(x_preprocessed[:,:,:,0]))/(np.amax(x_preprocessed[:,:,:,0])-np.amin(x_preprocessed[:,:,:,0])) + #x_processed[:,:,:,1] = (x_preprocessed[:,:,:,1]-np.amin(x_preprocessed[:,:,:,1]))/(np.amax(x_preprocessed[:,:,:,1])-np.amin(x_preprocessed[:,:,:,1])) + #x_processed[:,:,:,2] = (x_preprocessed[:,:,:,2]-np.amin(x_preprocessed[:,:,:,2]))/(np.amax(x_preprocessed[:,:,:,2])-np.amin(x_preprocessed[:,:,:,2])) + + return x_processed.astype(np.float32) + #return X.astype(np.float32) / 255 + + def create_all(self): + X_all = np.zeros((self.N_sequences, self.nt) + self.im_shape, np.float32) + for i, idx in enumerate(self.possible_starts): + X_all[i] = self.preprocess(self.X[idx:idx+self.nt]) + return X_all diff --git a/workflow_parallel_frame_prediction/Training/devel_horovodJob.sh b/workflow_parallel_frame_prediction/Training/devel_horovodJob.sh new file mode 100644 index 0000000000000000000000000000000000000000..6b442c1861075ed3909435ffe70c2636ec0df115 --- /dev/null +++ b/workflow_parallel_frame_prediction/Training/devel_horovodJob.sh @@ -0,0 +1,43 @@ +#!/bin/bash +#SBATCH --account=deepacf +# budget account where contingent is taken from# TASKS = NODES * GPUS_PER_NODE +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=2 +#SBATCH --ntasks=2 +# can be omitted if --nodes and --ntasks-per-node +# are given +# SBATCH --cpus-per-task=1 +# for OpenMP/hybrid jobs only +#SBATCH --output=horovod-%j.out +# if keyword omitted: Default is slurm-%j.out in +# the submission directory (%j is replaced by +# the job ID). +#SBATCH --error=horovod-%j.err +# if keyword omitted: Default is slurm-%j.out in +# the submission directory. +#SBATCH --time=00:20:00 +#SBATCH --gres=gpu:2 +#SBATCH --partition=develgpus +#SBATCH --mail-user=b.gong@fz-juelich.de +#SBATCH --mail-type=ALL + +#create a folder to save the output +jutil env activate -p deepacf +module --force purge +module load Stages/Devel-2019a +module load GCC/8.3.0 +module load MVAPICH2/2.3.2-GDR +module load Stages/2019a +module load Horovod/0.16.2-GPU-Python-3.6.8 +module load Keras/2.2.4-GPU-Python-3.6.8 +#module load ParaStationMPI/5.2.2-1 +#module load h5py/2.9.0-Python-3.6.8 +# *** start of job script ***: +# Note: The current working directory at this point is +# the directory where sbatch was executed. +# export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK} +# *** start of job script *** +# Note: The current working directory at this point is +# the directory where sbatch was executed. 
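+# A rough sketch of what happens at launch (assumed behaviour, not verified here):
+# srun starts one python3.6 process per task (2 tasks and 2 GPUs on a single node in
+# this job); Horovod inside kitti_train_horovod.py then handles rank assignment and
+# gradient averaging across the two ranks.
+# Typical submission from the Training directory (assumed): sbatch devel_horovodJob.sh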
+# export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK} +srun --cpu_bind=none python3.6 kitti_train_horovod.py diff --git a/workflow_parallel_frame_prediction/Training/evaluate_multistep.py b/workflow_parallel_frame_prediction/Training/evaluate_multistep.py new file mode 100644 index 0000000000000000000000000000000000000000..a555a92973613868fc5dc7008fd0e6aa131133b6 --- /dev/null +++ b/workflow_parallel_frame_prediction/Training/evaluate_multistep.py @@ -0,0 +1,121 @@ +''' +Evaluate trained PredNet +Calculates mean-squared error and plots predictions. +''' + +import os +#import sys, argparse +import numpy as np +#from six.moves import cPickle +import matplotlib +matplotlib.use('Agg') +import matplotlib.pyplot as plt +import matplotlib.gridspec as gridspec + +from keras import backend as K +from keras.models import Model, model_from_json +from keras.layers import Input, Dense, Flatten + +from prednet import PredNet +from data_utils import SequenceGenerator +from kitti_settings import * +#from scipy.misc import imsave + +##Just for checking how the shape is after generator.create_all() from Sequence Generator +#import hickle as hkl +## +n_plot = 10 #number of plots +batch_size = 10 +nt = 15 #number of timesteps used for sequences in training +numtests = 18 +extrap = 10 #frame number from where extrapolation will start to be produced + +#parser = argparse.ArgumentParser() +#parser.add_argument('-ft', help="fine-tune multistep: add extrap time") +#args=parser.parse_args() + +weights_file = os.path.join(WEIGHTS_DIR, 'tensorflow_weights/prednet_kitti_weights-extrapfinetuned.hdf5') +json_file = os.path.join(WEIGHTS_DIR, 'prednet_kitti_model-extrapfinetuned.json') +test_file = os.path.join(DATA_DIR, 'X_test.hkl') +test_sources = os.path.join(DATA_DIR, 'sources_test.hkl') + +#if args.ft is not None: +# extrap = int(args.ft) +# nt = extrap + 5 +# weights_file = os.path.join(MODELS_DIR, 'prednet_ee_weights-extrapfinetuned.hdf5') +# json_file = os.path.join(MODELS_DIR, 'prednet_ee_model-extrapfinetuned.json') + +# Load trained model +f = open(json_file, 'r') +json_string = f.read() +f.close() +train_model = model_from_json(json_string, custom_objects = {'PredNet': PredNet}) +train_model.load_weights(weights_file) + +# Create testing model (to output predictions) +layer_config = train_model.layers[1].get_config() +layer_config['output_mode'] = 'prediction' #'prediction' +layer_config['extrap_start_time'] = extrap; +data_format = layer_config['data_format'] if 'data_format' in layer_config else layer_config['dim_ordering'] +test_prednet = PredNet(weights=train_model.layers[1].get_weights(), **layer_config) +input_shape = list(train_model.layers[0].batch_input_shape[1:]) +input_shape[0] = nt +inputs = Input(shape=tuple(input_shape)) +predictions = test_prednet(inputs) +test_model = Model(inputs=inputs, outputs=predictions) + +test_generator = SequenceGenerator(test_file, test_sources, nt, sequence_start_mode='unique', data_format=data_format) # orig: unique +X_test = test_generator.create_all() +X_hat = test_model.predict(X_test, batch_size) +if data_format == 'channels_first': + X_test = np.transpose(X_test, (0, 1, 3, 4, 2)) + X_hat = np.transpose(X_hat, (0, 1, 3, 4, 2)) + +# Compare MSE of PredNet predictions vs. using last frame. 
Write results to prediction_scores.txt +shapeXhat = str(X_hat.shape) #Just have a look at the shapes to be sure we are calculating the right MSE +shapeXtest = str(X_test.shape) +mse_model = np.mean( (X_test[:, 1:,:,:,0] - X_hat[:, 1:,:,:,0])**2 ) # look at all timesteps except the first +mse_model_last = np.mean( (X_test[:, 9,:,:,0] - X_hat[:, 14,:,:,0])**2 ) +#mse_prev = np.mean( (X_test[:, :-1,:,:,0] - X_test[:, 1:,:,:,0])**2 ) +mse_prev = np.mean( (X_test[:, 9,:,:,0] - X_test[:, 14,:,:,0])**2 ) +if not os.path.exists(RESULTS_SAVE_DIR): os.mkdir(RESULTS_SAVE_DIR) +f = open(os.path.join(RESULTS_SAVE_DIR, 'prediction_scores.txt'), 'w') +f.write("Model MSE: %f\n" % mse_model) +f.write("Model MSE from only last prediction in sequence in comparison with extrap start time: %f\n" % mse_model_last) +f.write("Previous Frame MSE last frame vs extrap start time: %f" % mse_prev) +f.write("Shape of X_test: " + shapeXtest) +f.write("") +f.write("Shape of X_hat: " + shapeXhat) +f.close() + +# Plot some predictions +aspect_ratio = float(X_hat.shape[2]) / X_hat.shape[3] +plt.figure(figsize = (nt, 2*aspect_ratio)) +gs = gridspec.GridSpec(2, nt) +gs.update(wspace=0., hspace=0.) +plot_save_dir = os.path.join(RESULTS_SAVE_DIR, 'prediction_plots/') +if not os.path.exists(plot_save_dir): os.mkdir(plot_save_dir) +plot_idx = np.random.permutation(X_test.shape[0])[:n_plot] +for i in plot_idx: + for t in range(nt): + plt.subplot(gs[t]) + plt.imshow(X_test[i,t,:,:,0], interpolation='none') + plt.tick_params(axis='both', which='both', bottom='off', top='off', left='off', right='off', labelbottom='off', labelleft='off') + if t==0: plt.ylabel('Actual', fontsize=10) + + plt.subplot(gs[t + nt]) + plt.imshow(X_hat[i,t,:,:,0], interpolation='none') + plt.tick_params(axis='both', which='both', bottom='off', top='off', left='off', right='off', labelbottom='off', labelleft='off') + if t==0: plt.ylabel('Predicted', fontsize=10) + + plt.savefig(plot_save_dir + 'plot_' + str(i) + '.jpg') + plt.clf() + +#abe +#for test in range(numtests): +# testdir = "tile-" + str(test) +# testdir = os.path.join(plot_save_dir, testdir) +# if not os.path.exists( testdir ) : os.mkdir( testdir ) +# for t in range(nt): +# imsave( testdir + "/pred-%02d.jpg" % (t,), X_hat[test,t] ) +# imsave( testdir + "/orig-%02d.jpg" % (t,), X_test[test,t]) \ No newline at end of file diff --git a/workflow_parallel_frame_prediction/Training/hickle/bin/f2py b/workflow_parallel_frame_prediction/Training/hickle/bin/f2py new file mode 100755 index 0000000000000000000000000000000000000000..fcc774fba52f3705ff41babc8dbb21dae36d2c29 --- /dev/null +++ b/workflow_parallel_frame_prediction/Training/hickle/bin/f2py @@ -0,0 +1,4 @@ +#!/usr/local/software/jureca/Stages/2018b/software/Python/3.6.6-GCCcore-7.3.0/bin/python +# EASY-INSTALL-SCRIPT: 'numpy==1.15.2','f2py' +__requires__ = 'numpy==1.15.2' +__import__('pkg_resources').run_script('numpy==1.15.2', 'f2py') diff --git a/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/__pycache__/site.cpython-36.pyc b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/__pycache__/site.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..abc2e28feeb37e7e377c2cf5eadc076e5edb1e93 Binary files /dev/null and b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/__pycache__/site.cpython-36.pyc differ diff --git a/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/easy-install.pth 
b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/easy-install.pth new file mode 100755 index 0000000000000000000000000000000000000000..09ac282550d7bba3d89ef3a91ea75877f66f0384 --- /dev/null +++ b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/easy-install.pth @@ -0,0 +1,4 @@ +./hickle-3.4.3-py3.6.egg +/usr/local/software/jureca/Stages/2018b/software/h5py/2.8.0-ipsmpi-2018b-Python-3.6.6/lib/python3.6/site-packages/h5py-2.8.0-py3.6-linux-x86_64.egg +/usr/local/software/jureca/Stages/2018b/software/SciPy-Stack/2018b-gcccoremkl-7.3.0-2019.0.117-Python-3.6.6/lib/python3.6/site-packages/numpy-1.15.2-py3.6-linux-x86_64.egg +/usr/local/software/jureca/Stages/2018b/software/Python/3.6.6-GCCcore-7.3.0/lib/python3.6/site-packages/six-1.11.0-py3.6.egg diff --git a/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/EGG-INFO/PKG-INFO b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/EGG-INFO/PKG-INFO new file mode 100755 index 0000000000000000000000000000000000000000..5f8214504c72f2cfb7307cf8259de678fba12236 --- /dev/null +++ b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/EGG-INFO/PKG-INFO @@ -0,0 +1,207 @@ +Metadata-Version: 2.1 +Name: hickle +Version: 3.4.3 +Summary: Hickle - a HDF5 based version of pickle +Home-page: http://github.com/telegraphic/hickle +Author: Danny Price +Author-email: dan@thetelegraphic.com +License: UNKNOWN +Download-URL: https://github.com/telegraphic/hickle/archive/3.4.3.tar.gz +Description: [](https://travis-ci.org/telegraphic/hickle) + [](http://joss.theoj.org/papers/0c6638f84a1a574913ed7c6dd1051847) + + + Hickle + ====== + + Hickle is a [HDF5](https://www.hdfgroup.org/solutions/hdf5/) based clone of `pickle`, with a twist: instead of serializing to a pickle file, + Hickle dumps to a HDF5 file (Hierarchical Data Format). It is designed to be a "drop-in" replacement for pickle (for common data objects), but is + really an amalgam of `h5py` and `dill`/`pickle` with extended functionality. + + That is: `hickle` is a neat little way of dumping python variables to HDF5 files that can be read in most programming + languages, not just Python. Hickle is fast, and allows for transparent compression of your data (LZF / GZIP). + + Why use Hickle? + --------------- + + While `hickle` is designed to be a drop-in replacement for `pickle` (or something like `json`), it works very differently. + Instead of serializing / json-izing, it instead stores the data using the excellent [h5py](https://www.h5py.org/) module. + + The main reasons to use hickle are: + + 1. It's faster than pickle and cPickle. + 2. It stores data in HDF5. + 3. You can easily compress your data. + + The main reasons not to use hickle are: + + 1. You don't want to store your data in HDF5. While hickle can serialize arbitrary python objects, this functionality is provided only for convenience, and you're probably better off just using the pickle module. + 2. You want to convert your data in human-readable JSON/YAML, in which case, you should do that instead. + + So, if you want your data in HDF5, or if your pickling is taking too long, give hickle a try. + Hickle is particularly good at storing large numpy arrays, thanks to `h5py` running under the hood. + + Documentation + ------------- + + Documentation for hickle can be found at [telegraphic.github.io/hickle/](http://telegraphic.github.io/hickle/). 
+ + + Usage example + ------------- + + Hickle is nice and easy to use, and should look very familiar to those of you who have pickled before. + + In short, `hickle` provides two methods: a [hickle.load](http://telegraphic.github.io/hickle/toc.html#hickle.load) + method, for loading hickle files, and a [hickle.dump](http://telegraphic.github.io/hickle/toc.html#hickle.dump) + method, for dumping data into HDF5. Here's a complete example: + + ```python + import os + import hickle as hkl + import numpy as np + + # Create a numpy array of data + array_obj = np.ones(32768, dtype='float32') + + # Dump to file + hkl.dump(array_obj, 'test.hkl', mode='w') + + # Dump data, with compression + hkl.dump(array_obj, 'test_gzip.hkl', mode='w', compression='gzip') + + # Compare filesizes + print('uncompressed: %i bytes' % os.path.getsize('test.hkl')) + print('compressed: %i bytes' % os.path.getsize('test_gzip.hkl')) + + # Load data + array_hkl = hkl.load('test_gzip.hkl') + + # Check the two are the same file + assert array_hkl.dtype == array_obj.dtype + assert np.all((array_hkl, array_obj)) + ``` + + ### HDF5 compression options + + A major benefit of `hickle` over `pickle` is that it allows fancy HDF5 features to + be applied, by passing on keyword arguments on to `h5py`. So, you can do things like: + ```python + hkl.dump(array_obj, 'test_lzf.hkl', mode='w', compression='lzf', scaleoffset=0, + chunks=(100, 100), shuffle=True, fletcher32=True) + ``` + A detailed explanation of these keywords is given at http://docs.h5py.org/en/latest/high/dataset.html, + but we give a quick rundown below. + + In HDF5, datasets are stored as B-trees, a tree data structure that has speed benefits over contiguous + blocks of data. In the B-tree, data are split into [chunks](http://docs.h5py.org/en/latest/high/dataset.html#chunked-storage), + which is leveraged to allow [dataset resizing](http://docs.h5py.org/en/latest/high/dataset.html#resizable-datasets) and + compression via [filter pipelines](http://docs.h5py.org/en/latest/high/dataset.html#filter-pipeline). Filters such as + `shuffle` and `scaleoffset` move your data around to improve compression ratios, and `fletcher32` computes a checksum. + These file-level options are abstracted away from the data model. + + Recent changes + -------------- + + * December 2018: Accepted to Journal of Open-Source Software (JOSS). + * June 2018: Major refactor and support for Python 3. + * Aug 2016: Added support for scipy sparse matrices `bsr_matrix`, `csr_matrix` and `csc_matrix`. + + Performance comparison + ---------------------- + + Hickle runs a lot faster than pickle with its default settings, and a little faster than pickle with `protocol=2` set: + + ```Python + In [1]: import numpy as np + + In [2]: x = np.random.random((2000, 2000)) + + In [3]: import pickle + + In [4]: f = open('foo.pkl', 'w') + + In [5]: %time pickle.dump(x, f) # slow by default + CPU times: user 2 s, sys: 274 ms, total: 2.27 s + Wall time: 2.74 s + + In [6]: f = open('foo.pkl', 'w') + + In [7]: %time pickle.dump(x, f, protocol=2) # actually very fast + CPU times: user 18.8 ms, sys: 36 ms, total: 54.8 ms + Wall time: 55.6 ms + + In [8]: import hickle + + In [9]: f = open('foo.hkl', 'w') + + In [10]: %time hickle.dump(x, f) # a bit faster + dumping <type 'numpy.ndarray'> to file <HDF5 file "foo.hkl" (mode r+)> + CPU times: user 764 us, sys: 35.6 ms, total: 36.4 ms + Wall time: 36.2 ms + ``` + + So if you do continue to use pickle, add the `protocol=2` keyword (thanks @mrocklin for pointing this out). 
+ + For storing python dictionaries of lists, hickle beats the python json encoder, but is slower than uJson. For a dictionary with 64 entries, each containing a 4096 length list of random numbers, the times are: + + + json took 2633.263 ms + uJson took 138.482 ms + hickle took 232.181 ms + + + It should be noted that these comparisons are of course not fair: storing in HDF5 will not help you convert something into JSON, nor will it help you serialize a string. But for quick storage of the contents of a python variable, it's a pretty good option. + + Installation guidelines (for Linux and Mac OS). + ----------------------------------------------- + + ### Easy method + Install with `pip` by running `pip install hickle` from the command line. + + ### Manual install + + 1. You should have Python 2.7 and above installed + + 2. Install h5py + (Official page: http://docs.h5py.org/en/latest/build.html) + + 3. Install hdf5 + (Official page: http://www.hdfgroup.org/ftp/HDF5/current/src/unpacked/release_docs/INSTALL) + + 4. Download `hickle`: + via terminal: git clone https://github.com/telegraphic/hickle.git + via manual download: Go to https://github.com/telegraphic/hickle and on right hand side you will find `Download ZIP` file + + 5. cd to your downloaded `hickle` directory + + 6. Then run the following command in the `hickle` directory: + `python setup.py install` + + ### Testing + + Once installed from source, run `python setup.py test` to check it's all working. + + + Bugs & contributing + -------------------- + + Contributions and bugfixes are very welcome. Please check out our [contribution guidelines](https://github.com/telegraphic/hickle/blob/master/CONTRIBUTING.md) + for more details on how to contribute to development. + + + Referencing hickle + ------------------ + + If you use `hickle` in academic research, we would be grateful if you could reference [our paper](http://joss.theoj.org/papers/0c6638f84a1a574913ed7c6dd1051847) in the [Journal of Open-Source Software (JOSS)](http://joss.theoj.org/about). + + ``` + Price et al., (2018). Hickle: A HDF5-based python pickle replacement. 
Journal of Open Source Software, 3(32), 1115, https://doi.org/10.21105/joss.01115 + ``` + +Keywords: pickle,hdf5,data storage,data export +Platform: Cross platform (Linux +Platform: Mac OSX +Platform: Windows) +Requires-Python: >=2.7 +Description-Content-Type: text/markdown diff --git a/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/EGG-INFO/SOURCES.txt b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/EGG-INFO/SOURCES.txt new file mode 100755 index 0000000000000000000000000000000000000000..bf56f059f14d80d641efba6de75e401b4410786f --- /dev/null +++ b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/EGG-INFO/SOURCES.txt @@ -0,0 +1,52 @@ +.gitignore +.nojekyll +.pylintrc +.travis.yml +CODE_OF_CONDUCT.md +CONTRIBUTING.md +LICENSE +README.md +_config.yml +paper.bib +paper.md +requirements.txt +setup.cfg +setup.py +docs/Makefile +docs/make_docs.sh +docs/source/conf.py +docs/source/index.md +docs/source/toc.rst +docs/source/_static/empty.txt +docs/source/_templates/empty.txt +hickle/__init__.py +hickle/helpers.py +hickle/hickle.py +hickle/hickle_legacy.py +hickle/hickle_legacy2.py +hickle/lookup.py +hickle.egg-info/PKG-INFO +hickle.egg-info/SOURCES.txt +hickle.egg-info/dependency_links.txt +hickle.egg-info/not-zip-safe +hickle.egg-info/requires.txt +hickle.egg-info/top_level.txt +hickle/loaders/__init__.py +hickle/loaders/load_astropy.py +hickle/loaders/load_numpy.py +hickle/loaders/load_pandas.py +hickle/loaders/load_python.py +hickle/loaders/load_python3.py +hickle/loaders/load_scipy.py +tests/__init__.py +tests/test_astropy.py +tests/test_hickle.py +tests/test_hickle_helpers.py +tests/test_legacy_load.py +tests/test_scipy.py +tests/legacy_hkls/generate_test_hickle.py +tests/legacy_hkls/hickle_1_1_0.hkl +tests/legacy_hkls/hickle_1_3_2.hkl +tests/legacy_hkls/hickle_1_4_0.hkl +tests/legacy_hkls/hickle_2_0_5.hkl +tests/legacy_hkls/hickle_2_1_0.hkl \ No newline at end of file diff --git a/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/EGG-INFO/dependency_links.txt b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/EGG-INFO/dependency_links.txt new file mode 100755 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/EGG-INFO/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/EGG-INFO/not-zip-safe b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/EGG-INFO/not-zip-safe new file mode 100755 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/EGG-INFO/not-zip-safe @@ -0,0 +1 @@ + diff --git a/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/EGG-INFO/requires.txt b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/EGG-INFO/requires.txt new file mode 100755 index 0000000000000000000000000000000000000000..8ccd55587b619ea766f8d1a76bc06739e176f552 --- /dev/null +++ 
b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/EGG-INFO/requires.txt @@ -0,0 +1,2 @@ +numpy +h5py diff --git a/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/EGG-INFO/top_level.txt b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/EGG-INFO/top_level.txt new file mode 100755 index 0000000000000000000000000000000000000000..ce3b9fb874814125f842378fab0204ff0e9184a3 --- /dev/null +++ b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/EGG-INFO/top_level.txt @@ -0,0 +1,2 @@ +hickle +tests diff --git a/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/__init__.py b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..46e2ea2c6d0f5578529b3e40e060b1a244420772 --- /dev/null +++ b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/__init__.py @@ -0,0 +1,4 @@ +from .hickle import dump, load +from .hickle import __version__ + + diff --git a/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/__pycache__/__init__.cpython-36.pyc b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..638a67eaaa3ab784f6e31d96cc63e6c3a1acc1e7 Binary files /dev/null and b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/__pycache__/__init__.cpython-36.pyc differ diff --git a/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/__pycache__/helpers.cpython-36.pyc b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/__pycache__/helpers.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..944de5cd7f681a49a8d9fbf2024be8e218cadb71 Binary files /dev/null and b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/__pycache__/helpers.cpython-36.pyc differ diff --git a/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/__pycache__/hickle.cpython-36.pyc b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/__pycache__/hickle.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ff8228b3857699bdd27288d585d63a1bcfa08c69 Binary files /dev/null and b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/__pycache__/hickle.cpython-36.pyc differ diff --git a/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/__pycache__/hickle_legacy.cpython-36.pyc b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/__pycache__/hickle_legacy.cpython-36.pyc new file mode 100755 index 0000000000000000000000000000000000000000..e81a055331c0c861fbe8dbf300783bb85bcdd730 Binary files /dev/null and 
b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/__pycache__/hickle_legacy.cpython-36.pyc differ diff --git a/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/__pycache__/hickle_legacy2.cpython-36.pyc b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/__pycache__/hickle_legacy2.cpython-36.pyc new file mode 100755 index 0000000000000000000000000000000000000000..477aa15d70a77296c4b1d4b98e55aa747dd6552f Binary files /dev/null and b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/__pycache__/hickle_legacy2.cpython-36.pyc differ diff --git a/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/__pycache__/lookup.cpython-36.pyc b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/__pycache__/lookup.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6cd877598197f890a4cef46feab4f938a2529c61 Binary files /dev/null and b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/__pycache__/lookup.cpython-36.pyc differ diff --git a/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/helpers.py b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/helpers.py new file mode 100755 index 0000000000000000000000000000000000000000..6c3d7f9f3853101723380f4658487978605f0cf3 --- /dev/null +++ b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/helpers.py @@ -0,0 +1,113 @@ +import re +import six + +def get_type_and_data(h_node): + """ Helper function to return the py_type and data block for a HDF node """ + py_type = h_node.attrs["type"][0] + data = h_node[()] +# if h_node.shape == (): +# data = h_node.value +# else: +# data = h_node[:] + return py_type, data + +def get_type(h_node): + """ Helper function to return the py_type for a HDF node """ + py_type = h_node.attrs["type"][0] + return py_type + +def sort_keys(key_list): + """ Take a list of strings and sort it by integer value within string + + Args: + key_list (list): List of keys + + Returns: + key_list_sorted (list): List of keys, sorted by integer + """ + + # Py3 h5py returns an irritating KeysView object + # Py3 also complains about bytes and strings, convert all keys to bytes + if six.PY3: + key_list2 = [] + for key in key_list: + if isinstance(key, str): + key = bytes(key, 'ascii') + key_list2.append(key) + key_list = key_list2 + + # Check which keys contain a number + numbered_keys = [re.search(b'\d+', key) for key in key_list] + + # Sort the keys on number if they have it, or normally if not + if(len(key_list) and not numbered_keys.count(None)): + to_int = lambda x: int(re.search(b'\d+', x).group(0)) + return(sorted(key_list, key=to_int)) + else: + return(sorted(key_list)) + + +def check_is_iterable(py_obj): + """ Check whether a python object is iterable. 
+ + Note: this treats unicode and string as NON ITERABLE + + Args: + py_obj: python object to test + + Returns: + iter_ok (bool): True if item is iterable, False is item is not + """ + if six.PY2: + string_types = (str, unicode) + else: + string_types = (str, bytes, bytearray) + if isinstance(py_obj, string_types): + return False + try: + iter(py_obj) + return True + except TypeError: + return False + + +def check_is_hashable(py_obj): + """ Check if a python object is hashable + + Note: this function is currently not used, but is useful for future + development. + + Args: + py_obj: python object to test + """ + + try: + py_obj.__hash__() + return True + except TypeError: + return False + + +def check_iterable_item_type(iter_obj): + """ Check if all items within an iterable are the same type. + + Args: + iter_obj: iterable object + + Returns: + iter_type: type of item contained within the iterable. If + the iterable has many types, a boolean False is returned instead. + + References: + http://stackoverflow.com/questions/13252333/python-check-if-all-elements-of-a-list-are-the-same-type + """ + iseq = iter(iter_obj) + + try: + first_type = type(next(iseq)) + except StopIteration: + return False + except Exception as ex: + return False + else: + return first_type if all((type(x) is first_type) for x in iseq) else False diff --git a/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/hickle.py b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/hickle.py new file mode 100755 index 0000000000000000000000000000000000000000..24b38c3e1283618c9ce2c4d97b6960334cc08530 --- /dev/null +++ b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/hickle.py @@ -0,0 +1,611 @@ +# encoding: utf-8 +""" +# hickle.py + +Created by Danny Price 2016-02-03. + +Hickle is a HDF5 based clone of Pickle. Instead of serializing to a pickle +file, Hickle dumps to a HDF5 file. It is designed to be as similar to pickle in +usage as possible, providing a load() and dump() function. + +## Notes + +Hickle has two main advantages over Pickle: +1) LARGE PICKLE HANDLING. Unpickling a large pickle is slow, as the Unpickler +reads the entire pickle thing and loads it into memory. In comparison, HDF5 +files are designed for large datasets. Things are only loaded when accessed. + +2) CROSS PLATFORM SUPPORT. Attempting to unpickle a pickle pickled on Windows +on Linux and vice versa is likely to fail with errors like "Insecure string +pickle". HDF5 files will load fine, as long as both machines have +h5py installed. 
+ +""" + +from __future__ import absolute_import, division, print_function +import sys +import os +from pkg_resources import get_distribution, DistributionNotFound +from ast import literal_eval + +import numpy as np +import h5py as h5 + + +from .helpers import get_type, sort_keys, check_is_iterable, check_iterable_item_type +from .lookup import types_dict, hkl_types_dict, types_not_to_sort, \ + container_types_dict, container_key_types_dict +from .lookup import check_is_ndarray_like + + +try: + from exceptions import Exception + from types import NoneType +except ImportError: + pass # above imports will fail in python3 + +from six import PY2, PY3, string_types, integer_types +import io + +# Make several aliases for Python2/Python3 compatibility +if PY3: + file = io.TextIOWrapper + +# Import a default 'pickler' +# Not the nicest import code, but should work on Py2/Py3 +try: + import dill as pickle +except ImportError: + try: + import cPickle as pickle + except ImportError: + import pickle + +import warnings + +try: + __version__ = get_distribution('hickle').version +except DistributionNotFound: + __version__ = '0.0.0 - please install via pip/setup.py' + +################## +# Error handling # +################## + +class FileError(Exception): + """ An exception raised if the file is fishy """ + def __init__(self): + return + + def __str__(self): + return ("Cannot open file. Please pass either a filename " + "string, a file object, or a h5py.File") + + +class ClosedFileError(Exception): + """ An exception raised if the file is fishy """ + def __init__(self): + return + + def __str__(self): + return ("HDF5 file has been closed. Please pass either " + "a filename string, a file object, or an open h5py.File") + + +class NoMatchError(Exception): + """ An exception raised if the object type is not understood (or + supported)""" + def __init__(self): + return + + def __str__(self): + return ("Error: this type of python object cannot be converted into a " + "hickle.") + + +class ToDoError(Exception): + """ An exception raised for non-implemented functionality""" + def __init__(self): + return + + def __str__(self): + return "Error: this functionality hasn't been implemented yet." + + +class SerializedWarning(UserWarning): + """ An object type was not understood + + The data will be serialized using pickle. + """ + pass + + +###################### +# H5PY file wrappers # +###################### + +class H5GroupWrapper(h5.Group): + """ Group wrapper that provides a track_times kwarg. + + track_times is a boolean flag that can be set to False, so that two + files created at different times will have identical MD5 hashes. + """ + def create_dataset(self, *args, **kwargs): + kwargs['track_times'] = getattr(self, 'track_times', True) + return super(H5GroupWrapper, self).create_dataset(*args, **kwargs) + + def create_group(self, *args, **kwargs): + group = super(H5GroupWrapper, self).create_group(*args, **kwargs) + group.__class__ = H5GroupWrapper + group.track_times = getattr(self, 'track_times', True) + return group + + +class H5FileWrapper(h5.File): + """ Wrapper for h5py File that provides a track_times kwarg. + + track_times is a boolean flag that can be set to False, so that two + files created at different times will have identical MD5 hashes. 
+ """ + def create_dataset(self, *args, **kwargs): + kwargs['track_times'] = getattr(self, 'track_times', True) + return super(H5FileWrapper, self).create_dataset(*args, **kwargs) + + def create_group(self, *args, **kwargs): + group = super(H5FileWrapper, self).create_group(*args, **kwargs) + group.__class__ = H5GroupWrapper + group.track_times = getattr(self, 'track_times', True) + return group + + +def file_opener(f, mode='r', track_times=True): + """ A file opener helper function with some error handling. This can open + files through a file object, a h5py file, or just the filename. + + Args: + f (file, h5py.File, or string): File-identifier, e.g. filename or file object. + mode (str): File open mode. Only required if opening by filename string. + track_times (bool): Track time in HDF5; turn off if you want hickling at + different times to produce identical files (e.g. for MD5 hash check). + + """ + + # Assume that we will have to close the file after dump or load + close_flag = True + + # Were we handed a file object or just a file name string? + if isinstance(f, (file, io.TextIOWrapper)): + filename, mode = f.name, f.mode + f.close() + h5f = h5.File(filename, mode) + elif isinstance(f, string_types): + filename = f + h5f = h5.File(filename, mode) + elif isinstance(f, (H5FileWrapper, h5._hl.files.File)): + try: + filename = f.filename + except ValueError: + raise ClosedFileError + h5f = f + # Since this file was already open, do not close the file afterward + close_flag = False + else: + print(f.__class__) + raise FileError + + h5f.__class__ = H5FileWrapper + h5f.track_times = track_times + return(h5f, close_flag) + + +########### +# DUMPERS # +########### + + +def _dump(py_obj, h_group, call_id=0, **kwargs): + """ Dump a python object to a group within a HDF5 file. + + This function is called recursively by the main dump() function. + + Args: + py_obj: python object to dump. + h_group (h5.File.group): group to dump data into. + call_id (int): index to identify object's relative location in the iterable. + """ + + # Get list of dumpable dtypes + dumpable_dtypes = [] + for lst in [[bool, complex, bytes, float], string_types, integer_types]: + dumpable_dtypes.extend(lst) + + # Firstly, check if item is a numpy array. If so, just dump it. + if check_is_ndarray_like(py_obj): + create_hkl_dataset(py_obj, h_group, call_id, **kwargs) + + # Next, check if item is a dict + elif isinstance(py_obj, dict): + create_hkl_dataset(py_obj, h_group, call_id, **kwargs) + + # If not, check if item is iterable + elif check_is_iterable(py_obj): + item_type = check_iterable_item_type(py_obj) + + # item_type == False implies multiple types. Create a dataset + if item_type is False: + h_subgroup = create_hkl_group(py_obj, h_group, call_id) + for ii, py_subobj in enumerate(py_obj): + _dump(py_subobj, h_subgroup, call_id=ii, **kwargs) + + # otherwise, subitems have same type. Check if subtype is an iterable + # (e.g. list of lists), or not (e.g. list of ints, which should be treated + # as a single dataset). 
+ else: + if item_type in dumpable_dtypes: + create_hkl_dataset(py_obj, h_group, call_id, **kwargs) + else: + h_subgroup = create_hkl_group(py_obj, h_group, call_id) + for ii, py_subobj in enumerate(py_obj): + _dump(py_subobj, h_subgroup, call_id=ii, **kwargs) + + # item is not iterable, so create a dataset for it + else: + create_hkl_dataset(py_obj, h_group, call_id, **kwargs) + + +def dump(py_obj, file_obj, mode='w', track_times=True, path='/', **kwargs): + """ Write a pickled representation of obj to the open file object file. + + Args: + obj (object): python object o store in a Hickle + file: file object, filename string, or h5py.File object + file in which to store the object. A h5py.File or a filename is also + acceptable. + mode (str): optional argument, 'r' (read only), 'w' (write) or 'a' (append). + Ignored if file is a file object. + compression (str): optional argument. Applies compression to dataset. Options: None, gzip, + lzf (+ szip, if installed) + track_times (bool): optional argument. If set to False, repeated hickling will produce + identical files. + path (str): path within hdf5 file to save data to. Defaults to root / + """ + + # Make sure that file is not closed unless modified + # This is to avoid trying to close a file that was never opened + close_flag = False + + try: + # Open the file + h5f, close_flag = file_opener(file_obj, mode, track_times) + h5f.attrs["CLASS"] = b'hickle' + h5f.attrs["VERSION"] = get_distribution('hickle').version + h5f.attrs["type"] = [b'hickle'] + # Log which version of python was used to generate the hickle file + pv = sys.version_info + py_ver = "%i.%i.%i" % (pv[0], pv[1], pv[2]) + h5f.attrs["PYTHON_VERSION"] = py_ver + + h_root_group = h5f.get(path) + + if h_root_group is None: + h_root_group = h5f.create_group(path) + h_root_group.attrs["type"] = [b'hickle'] + + _dump(py_obj, h_root_group, **kwargs) + except NoMatchError: + fname = h5f.filename + h5f.close() + try: + os.remove(fname) + except OSError: + warnings.warn("Dump failed. Could not remove %s" % fname) + finally: + raise NoMatchError + finally: + # Close the file if requested. + # Closing a file twice will not cause any problems + if close_flag: + h5f.close() + + +def create_dataset_lookup(py_obj): + """ What type of object are we trying to pickle? This is a python + dictionary based equivalent of a case statement. It returns the correct + helper function for a given data type. + + Args: + py_obj: python object to look-up what function to use to dump to disk + + Returns: + match: function that should be used to dump data to a new dataset + """ + t = type(py_obj) + types_lookup = {dict: create_dict_dataset} + types_lookup.update(types_dict) + + match = types_lookup.get(t, no_match) + + return match + + + +def create_hkl_dataset(py_obj, h_group, call_id=0, **kwargs): + """ Create a dataset within the hickle HDF5 file + + Args: + py_obj: python object to dump. + h_group (h5.File.group): group to dump data into. + call_id (int): index to identify object's relative location in the iterable. + + """ + #lookup dataset creator type based on python object type + create_dataset = create_dataset_lookup(py_obj) + + # do the creation + create_dataset(py_obj, h_group, call_id, **kwargs) + + +def create_hkl_group(py_obj, h_group, call_id=0): + """ Create a new group within the hickle file + + Args: + h_group (h5.File.group): group to dump data into. + call_id (int): index to identify object's relative location in the iterable. 
+ + """ + h_subgroup = h_group.create_group('data_%i' % call_id) + h_subgroup.attrs['type'] = [str(type(py_obj)).encode('ascii', 'ignore')] + return h_subgroup + + +def create_dict_dataset(py_obj, h_group, call_id=0, **kwargs): + """ Creates a data group for each key in dictionary + + Notes: + This is a very important function which uses the recursive _dump + method to build up hierarchical data models stored in the HDF5 file. + As this is critical to functioning, it is kept in the main hickle.py + file instead of in the loaders/ directory. + + Args: + py_obj: python object to dump; should be dictionary + h_group (h5.File.group): group to dump data into. + call_id (int): index to identify object's relative location in the iterable. + """ + h_dictgroup = h_group.create_group('data_%i' % call_id) + h_dictgroup.attrs['type'] = [str(type(py_obj)).encode('ascii', 'ignore')] + + for key, py_subobj in py_obj.items(): + if isinstance(key, string_types): + h_subgroup = h_dictgroup.create_group("%r" % (key)) + else: + h_subgroup = h_dictgroup.create_group(str(key)) + h_subgroup.attrs["type"] = [b'dict_item'] + + h_subgroup.attrs["key_type"] = [str(type(key)).encode('ascii', 'ignore')] + + _dump(py_subobj, h_subgroup, call_id=0, **kwargs) + + +def no_match(py_obj, h_group, call_id=0, **kwargs): + """ If no match is made, raise an exception + + Args: + py_obj: python object to dump; default if item is not matched. + h_group (h5.File.group): group to dump data into. + call_id (int): index to identify object's relative location in the iterable. + """ + pickled_obj = pickle.dumps(py_obj) + d = h_group.create_dataset('data_%i' % call_id, data=[pickled_obj]) + d.attrs["type"] = [b'pickle'] + + warnings.warn("%s type not understood, data have been serialized" % type(py_obj), + SerializedWarning) + + + +############# +## LOADERS ## +############# + +class PyContainer(list): + """ A group-like object into which to load datasets. + + In order to build up a tree-like structure, we need to be able + to load datasets into a container with an append() method. + Python tuples and sets do not allow this. This class provides + a list-like object that be converted into a list, tuple, set or dict. + """ + def __init__(self): + super(PyContainer, self).__init__() + self.container_type = None + self.name = None + self.key_type = None + + def convert(self): + """ Convert from PyContainer to python core data type. + + Returns: self, either as a list, tuple, set or dict + (or other type specified in lookup.py) + """ + + if self.container_type in container_types_dict.keys(): + convert_fn = container_types_dict[self.container_type] + return convert_fn(self) + if self.container_type == str(dict).encode('ascii', 'ignore'): + keys = [] + for item in self: + key = item.name.split('/')[-1] + key_type = item.key_type[0] + if key_type in container_key_types_dict.keys(): + to_type_fn = container_key_types_dict[key_type] + key = to_type_fn(key) + keys.append(key) + + items = [item[0] for item in self] + return dict(zip(keys, items)) + else: + return self + +def no_match_load(key): + """ If no match is made when loading, need to raise an exception + """ + raise RuntimeError("Cannot load %s data type" % key) + #pass + +def load_dataset_lookup(key): + """ What type of object are we trying to unpickle? This is a python + dictionary based equivalent of a case statement. It returns the type + a given 'type' keyword in the hickle file. 
+ + Args: + py_obj: python object to look-up what function to use to dump to disk + + Returns: + match: function that should be used to dump data to a new dataset + """ + + match = hkl_types_dict.get(key, no_match_load) + + return match + +def load(fileobj, path='/', safe=True): + """ Load a hickle file and reconstruct a python object + + Args: + fileobj: file object, h5py.File, or filename string + safe (bool): Disable automatic depickling of arbitrary python objects. + DO NOT set this to False unless the file is from a trusted source. + (see http://www.cs.jhu.edu/~s/musings/pickle.html for an explanation) + + path (str): path within hdf5 file to save data to. Defaults to root / + """ + + # Make sure that the file is not closed unless modified + # This is to avoid trying to close a file that was never opened + close_flag = False + + try: + h5f, close_flag = file_opener(fileobj) + h_root_group = h5f.get(path) + try: + assert 'CLASS' in h5f.attrs.keys() + assert 'VERSION' in h5f.attrs.keys() + VER = h5f.attrs['VERSION'] + try: + VER_MAJOR = int(VER) + except ValueError: + VER_MAJOR = int(VER[0]) + if VER_MAJOR == 1: + if PY2: + warnings.warn("Hickle file versioned as V1, attempting legacy loading...") + from . import hickle_legacy + return hickle_legacy.load(fileobj, safe) + else: + raise RuntimeError("Cannot open file. This file was likely" + " created with Python 2 and an old hickle version.") + elif VER_MAJOR == 2: + if PY2: + warnings.warn("Hickle file appears to be old version (v2), attempting " + "legacy loading...") + from . import hickle_legacy2 + return hickle_legacy2.load(fileobj, path=path, safe=safe) + else: + raise RuntimeError("Cannot open file. This file was likely" + " created with Python 2 and an old hickle version.") + # There is an unfortunate period of time where hickle 2.1.0 claims VERSION = int(3) + # For backward compatibility we really need to catch this. + # Actual hickle v3 files are versioned as A.B.C (e.g. 3.1.0) + elif VER_MAJOR == 3 and VER == VER_MAJOR: + if PY2: + warnings.warn("Hickle file appears to be old version (v2.1.0), attempting " + "legacy loading...") + from . import hickle_legacy2 + return hickle_legacy2.load(fileobj, path=path, safe=safe) + else: + raise RuntimeError("Cannot open file. This file was likely" + " created with Python 2 and an old hickle version.") + elif VER_MAJOR >= 3: + py_container = PyContainer() + py_container.container_type = 'hickle' + py_container = _load(py_container, h_root_group) + return py_container[0][0] + + except AssertionError: + if PY2: + warnings.warn("Hickle file is not versioned, attempting legacy loading...") + from . import hickle_legacy + return hickle_legacy.load(fileobj, safe) + else: + raise RuntimeError("Cannot open file. This file was likely" + " created with Python 2 and an old hickle version.") + finally: + # Close the file if requested. + # Closing a file twice will not cause any problems + if close_flag: + h5f.close() + +def load_dataset(h_node): + """ Load a dataset, converting into its correct python type + + Args: + h_node (h5py dataset): h5py dataset object to read + + Returns: + data: reconstructed python object from loaded data + """ + py_type = get_type(h_node) + + try: + load_fn = load_dataset_lookup(py_type) + return load_fn(h_node) + except: + raise + #raise RuntimeError("Hickle type %s not understood." 
% py_type) + +def _load(py_container, h_group): + """ Load a hickle file + + Recursive funnction to load hdf5 data into a PyContainer() + + Args: + py_container (PyContainer): Python container to load data into + h_group (h5 group or dataset): h5py object, group or dataset, to spider + and load all datasets. + """ + + group_dtype = h5._hl.group.Group + dataset_dtype = h5._hl.dataset.Dataset + + #either a file, group, or dataset + if isinstance(h_group, (H5FileWrapper, group_dtype)): + + py_subcontainer = PyContainer() + try: + py_subcontainer.container_type = bytes(h_group.attrs['type'][0]) + except KeyError: + raise + #py_subcontainer.container_type = '' + py_subcontainer.name = h_group.name + + if py_subcontainer.container_type == b'dict_item': + py_subcontainer.key_type = h_group.attrs['key_type'] + + if py_subcontainer.container_type not in types_not_to_sort: + h_keys = sort_keys(h_group.keys()) + else: + h_keys = h_group.keys() + + for h_name in h_keys: + h_node = h_group[h_name] + py_subcontainer = _load(py_subcontainer, h_node) + + sub_data = py_subcontainer.convert() + py_container.append(sub_data) + + else: + # must be a dataset + subdata = load_dataset(h_group) + py_container.append(subdata) + + return py_container diff --git a/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/hickle_legacy.py b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/hickle_legacy.py new file mode 100755 index 0000000000000000000000000000000000000000..61a171fde3d39304d78d1ddede9656dd7ad50940 --- /dev/null +++ b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/hickle_legacy.py @@ -0,0 +1,535 @@ +# encoding: utf-8 +""" +# hickle_legacy.py + +Created by Danny Price 2012-05-28. + +Hickle is a HDF5 based clone of Pickle. Instead of serializing to a +pickle file, Hickle dumps to a HDF5 file. It is designed to be as similar +to pickle in usage as possible. + +## Notes + +This is a legacy handler, for hickle v1 files. +If V2 reading fails, this will be called as a fail-over. + +""" + +import os +import sys +import numpy as np +import h5py as h5 + +if sys.version_info.major == 3: + NoneType = type(None) +else: + from types import NoneType + +__version__ = "1.3.0" +__author__ = "Danny Price" + +#################### +## Error handling ## +#################### + + +class FileError(Exception): + """ An exception raised if the file is fishy""" + + def __init__(self): + return + + def __str__(self): + print("Error: cannot open file. 
Please pass either a filename string, a file object, " + "or a h5py.File") + + +class NoMatchError(Exception): + """ An exception raised if the object type is not understood (or supported)""" + + def __init__(self): + return + + def __str__(self): + print("Error: this type of python object cannot be converted into a hickle.") + + +class ToDoError(Exception): + """ An exception raised for non-implemented functionality""" + + def __init__(self): + return + + def __str__(self): + print("Error: this functionality hasn't been implemented yet.") + + +class H5GroupWrapper(h5.Group): + def create_dataset(self, *args, **kwargs): + kwargs['track_times'] = getattr(self, 'track_times', True) + return super(H5GroupWrapper, self).create_dataset(*args, **kwargs) + + def create_group(self, *args, **kwargs): + group = super(H5GroupWrapper, self).create_group(*args, **kwargs) + group.__class__ = H5GroupWrapper + group.track_times = getattr(self, 'track_times', True) + return group + + +class H5FileWrapper(h5.File): + def create_dataset(self, *args, **kwargs): + kwargs['track_times'] = getattr(self, 'track_times', True) + return super(H5FileWrapper, self).create_dataset(*args, **kwargs) + + def create_group(self, *args, **kwargs): + group = super(H5FileWrapper, self).create_group(*args, **kwargs) + group.__class__ = H5GroupWrapper + group.track_times = getattr(self, 'track_times', True) + return group + + +def file_opener(f, mode='r', track_times=True): + """ A file opener helper function with some error handling. + + This can open files through a file object, a h5py file, or just the filename. + """ + # Were we handed a file object or just a file name string? + if isinstance(f, file): + filename, mode = f.name, f.mode + f.close() + h5f = h5.File(filename, mode) + + elif isinstance(f, h5._hl.files.File): + h5f = f + elif isinstance(f, str): + filename = f + h5f = h5.File(filename, mode) + else: + raise FileError + + h5f.__class__ = H5FileWrapper + h5f.track_times = track_times + return h5f + + +############# +## dumpers ## +############# + +def dump_ndarray(obj, h5f, **kwargs): + """ dumps an ndarray object to h5py file""" + h5f.create_dataset('data', data=obj, **kwargs) + h5f.create_dataset('type', data=['ndarray']) + + +def dump_np_dtype(obj, h5f, **kwargs): + """ dumps an np dtype object to h5py file""" + h5f.create_dataset('data', data=obj) + h5f.create_dataset('type', data=['np_dtype']) + + +def dump_np_dtype_dict(obj, h5f, **kwargs): + """ dumps an np dtype object within a group""" + h5f.create_dataset('data', data=obj) + h5f.create_dataset('_data', data=['np_dtype']) + + +def dump_masked(obj, h5f, **kwargs): + """ dumps an ndarray object to h5py file""" + h5f.create_dataset('data', data=obj, **kwargs) + h5f.create_dataset('mask', data=obj.mask, **kwargs) + h5f.create_dataset('type', data=['masked']) + + +def dump_list(obj, h5f, **kwargs): + """ dumps a list object to h5py file""" + + # Check if there are any numpy arrays in the list + contains_numpy = any(isinstance(el, np.ndarray) for el in obj) + + if contains_numpy: + _dump_list_np(obj, h5f, **kwargs) + else: + h5f.create_dataset('data', data=obj, **kwargs) + h5f.create_dataset('type', data=['list']) + + +def _dump_list_np(obj, h5f, **kwargs): + """ Dump a list of numpy objects to file """ + + np_group = h5f.create_group('data') + h5f.create_dataset('type', data=['np_list']) + + ii = 0 + for np_item in obj: + np_group.create_dataset("%s" % ii, data=np_item, **kwargs) + ii += 1 + + +def dump_tuple(obj, h5f, **kwargs): + """ dumps a list object to 
h5py file""" + + # Check if there are any numpy arrays in the list + contains_numpy = any(isinstance(el, np.ndarray) for el in obj) + + if contains_numpy: + _dump_tuple_np(obj, h5f, **kwargs) + else: + h5f.create_dataset('data', data=obj, **kwargs) + h5f.create_dataset('type', data=['tuple']) + + +def _dump_tuple_np(obj, h5f, **kwargs): + """ Dump a tuple of numpy objects to file """ + + np_group = h5f.create_group('data') + h5f.create_dataset('type', data=['np_tuple']) + + ii = 0 + for np_item in obj: + np_group.create_dataset("%s" % ii, data=np_item, **kwargs) + ii += 1 + + +def dump_set(obj, h5f, **kwargs): + """ dumps a set object to h5py file""" + obj = list(obj) + h5f.create_dataset('data', data=obj, **kwargs) + h5f.create_dataset('type', data=['set']) + + +def dump_string(obj, h5f, **kwargs): + """ dumps a list object to h5py file""" + h5f.create_dataset('data', data=[obj], **kwargs) + h5f.create_dataset('type', data=['string']) + + +def dump_none(obj, h5f, **kwargs): + """ Dump None type to file """ + h5f.create_dataset('data', data=[0], **kwargs) + h5f.create_dataset('type', data=['none']) + + +def dump_unicode(obj, h5f, **kwargs): + """ dumps a list object to h5py file""" + dt = h5.special_dtype(vlen=unicode) + ll = len(obj) + dset = h5f.create_dataset('data', shape=(ll, ), dtype=dt, **kwargs) + dset[:ll] = obj + h5f.create_dataset('type', data=['unicode']) + + +def _dump_dict(dd, hgroup, **kwargs): + for key in dd: + if type(dd[key]) in (str, int, float, unicode, bool): + # Figure out type to be stored + types = {str: 'str', int: 'int', float: 'float', + unicode: 'unicode', bool: 'bool', NoneType: 'none'} + _key = types.get(type(dd[key])) + + # Store along with dtype info + if _key == 'unicode': + dd[key] = str(dd[key]) + + hgroup.create_dataset("%s" % key, data=[dd[key]], **kwargs) + hgroup.create_dataset("_%s" % key, data=[_key]) + + elif type(dd[key]) in (type(np.array([1])), type(np.ma.array([1]))): + + if hasattr(dd[key], 'mask'): + hgroup.create_dataset("_%s" % key, data=["masked"]) + hgroup.create_dataset("%s" % key, data=dd[key].data, **kwargs) + hgroup.create_dataset("_%s_mask" % key, data=dd[key].mask, **kwargs) + else: + hgroup.create_dataset("_%s" % key, data=["ndarray"]) + hgroup.create_dataset("%s" % key, data=dd[key], **kwargs) + + elif type(dd[key]) is list: + hgroup.create_dataset("%s" % key, data=dd[key], **kwargs) + hgroup.create_dataset("_%s" % key, data=["list"]) + + elif type(dd[key]) is tuple: + hgroup.create_dataset("%s" % key, data=dd[key], **kwargs) + hgroup.create_dataset("_%s" % key, data=["tuple"]) + + elif type(dd[key]) is set: + hgroup.create_dataset("%s" % key, data=list(dd[key]), **kwargs) + hgroup.create_dataset("_%s" % key, data=["set"]) + + elif isinstance(dd[key], dict): + new_group = hgroup.create_group("%s" % key) + _dump_dict(dd[key], new_group, **kwargs) + + elif type(dd[key]) is NoneType: + hgroup.create_dataset("%s" % key, data=[0], **kwargs) + hgroup.create_dataset("_%s" % key, data=["none"]) + + else: + if type(dd[key]).__module__ == np.__name__: + #print type(dd[key]) + hgroup.create_dataset("%s" % key, data=dd[key]) + hgroup.create_dataset("_%s" % key, data=["np_dtype"]) + #new_group = hgroup.create_group("%s" % key) + #dump_np_dtype_dict(dd[key], new_group) + else: + raise NoMatchError + + +def dump_dict(obj, h5f='', **kwargs): + """ dumps a dictionary to h5py file """ + h5f.create_dataset('type', data=['dict']) + hgroup = h5f.create_group('data') + _dump_dict(obj, hgroup, **kwargs) + + +def no_match(obj, h5f, *args, **kwargs): + 
""" If no match is made, raise an exception """ + try: + import dill as cPickle + except ImportError: + import cPickle + + pickled_obj = cPickle.dumps(obj) + h5f.create_dataset('type', data=['pickle']) + h5f.create_dataset('data', data=[pickled_obj]) + + print("Warning: %s type not understood, data have been serialized" % type(obj)) + #raise NoMatchError + + +def dumper_lookup(obj): + """ What type of object are we trying to pickle? + + This is a python dictionary based equivalent of a case statement. + It returns the correct helper function for a given data type. + """ + t = type(obj) + + types = { + list: dump_list, + tuple: dump_tuple, + set: dump_set, + dict: dump_dict, + str: dump_string, + unicode: dump_unicode, + NoneType: dump_none, + np.ndarray: dump_ndarray, + np.ma.core.MaskedArray: dump_masked, + np.float16: dump_np_dtype, + np.float32: dump_np_dtype, + np.float64: dump_np_dtype, + np.int8: dump_np_dtype, + np.int16: dump_np_dtype, + np.int32: dump_np_dtype, + np.int64: dump_np_dtype, + np.uint8: dump_np_dtype, + np.uint16: dump_np_dtype, + np.uint32: dump_np_dtype, + np.uint64: dump_np_dtype, + np.complex64: dump_np_dtype, + np.complex128: dump_np_dtype, + } + + match = types.get(t, no_match) + return match + + +def dump(obj, file, mode='w', track_times=True, **kwargs): + """ Write a pickled representation of obj to the open file object file. + + Parameters + ---------- + obj: object + python object o store in a Hickle + file: file object, filename string, or h5py.File object + file in which to store the object. A h5py.File or a filename is also acceptable. + mode: string + optional argument, 'r' (read only), 'w' (write) or 'a' (append). Ignored if file + is a file object. + compression: str + optional argument. Applies compression to dataset. Options: None, gzip, lzf (+ szip, + if installed) + track_times: bool + optional argument. If set to False, repeated hickling will produce identical files. + """ + + try: + # See what kind of object to dump + dumper = dumper_lookup(obj) + # Open the file + h5f = file_opener(file, mode, track_times) + print("dumping %s to file %s" % (type(obj), repr(h5f))) + dumper(obj, h5f, **kwargs) + h5f.close() + except NoMatchError: + fname = h5f.filename + h5f.close() + try: + os.remove(fname) + except: + print("Warning: dump failed. Could not remove %s" % fname) + finally: + raise NoMatchError + + +############# +## loaders ## +############# + +def load(file, safe=True): + """ Load a hickle file and reconstruct a python object + + Parameters + ---------- + file: file object, h5py.File, or filename string + + safe (bool): Disable automatic depickling of arbitrary python objects. + DO NOT set this to False unless the file is from a trusted source. 
+ (see http://www.cs.jhu.edu/~s/musings/pickle.html for an explanation) + """ + + try: + h5f = file_opener(file) + dtype = h5f["type"][0] + + if dtype == 'dict': + group = h5f["data"] + data = load_dict(group) + elif dtype == 'pickle': + data = load_pickle(h5f, safe) + elif dtype == 'np_list': + group = h5f["data"] + data = load_np_list(group) + elif dtype == 'np_tuple': + group = h5f["data"] + data = load_np_tuple(group) + elif dtype == 'masked': + data = np.ma.array(h5f["data"][:], mask=h5f["mask"][:]) + elif dtype == 'none': + data = None + else: + if dtype in ('string', 'unicode'): + data = h5f["data"][0] + else: + try: + data = h5f["data"][:] + except ValueError: + data = h5f["data"] + types = { + 'list': list, + 'set': set, + 'unicode': unicode, + 'string': str, + 'ndarray': load_ndarray, + 'np_dtype': load_np_dtype + } + + mod = types.get(dtype, no_match) + data = mod(data) + finally: + if 'h5f' in locals(): + h5f.close() + return data + + +def load_pickle(h5f, safe=True): + """ Deserialize and load a pickled object within a hickle file + + WARNING: Pickle has + + Parameters + ---------- + h5f: h5py.File object + + safe (bool): Disable automatic depickling of arbitrary python objects. + DO NOT set this to False unless the file is from a trusted source. + (see http://www.cs.jhu.edu/~s/musings/pickle.html for an explanation) + """ + + if not safe: + try: + import dill as cPickle + except ImportError: + import cPickle + + data = h5f["data"][:] + data = cPickle.loads(data[0]) + return data + else: + print("\nWarning: Object is of an unknown type, and has not been loaded") + print(" for security reasons (it could be malicious code). If") + print(" you wish to continue, manually set safe=False\n") + + +def load_np_list(group): + """ load a numpy list """ + np_list = [] + for key in sorted(group.keys()): + data = group[key][:] + np_list.append(data) + return np_list + + +def load_np_tuple(group): + """ load a tuple containing numpy arrays """ + return tuple(load_np_list(group)) + + +def load_ndarray(arr): + """ Load a numpy array """ + # Nothing to be done! 
+ return arr + + +def load_np_dtype(arr): + """ Load a numpy array """ + # Just return first value + return arr.value + + +def load_dict(group): + """ Load dictionary """ + + dd = {} + for key in group.keys(): + if isinstance(group[key], h5._hl.group.Group): + new_group = group[key] + dd[key] = load_dict(new_group) + elif not key.startswith("_"): + _key = "_%s" % key + + if group[_key][0] == 'np_dtype': + dd[key] = group[key].value + elif group[_key][0] in ('str', 'int', 'float', 'unicode', 'bool'): + dd[key] = group[key][0] + elif group[_key][0] == 'masked': + key_ma = "_%s_mask" % key + dd[key] = np.ma.array(group[key][:], mask=group[key_ma]) + else: + dd[key] = group[key][:] + + # Convert numpy constructs back to string + dtype = group[_key][0] + types = {'str': str, 'int': int, 'float': float, + 'unicode': unicode, 'bool': bool, 'list': list, 'none' : NoneType} + try: + mod = types.get(dtype) + if dtype == 'none': + dd[key] = None + else: + dd[key] = mod(dd[key]) + except: + pass + return dd + + +def load_large(file): + """ Load a large hickle file (returns the h5py object not the data) + + Parameters + ---------- + file: file object, h5py.File, or filename string + """ + + h5f = file_opener(file) + return h5f diff --git a/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/hickle_legacy2.py b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/hickle_legacy2.py new file mode 100755 index 0000000000000000000000000000000000000000..4d018fde9a161713213b00190267439257cb876d --- /dev/null +++ b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/hickle_legacy2.py @@ -0,0 +1,672 @@ +# encoding: utf-8 +""" +# hickle_legacy2.py + +Created by Danny Price 2016-02-03. + +This is a legacy handler, for hickle v2 files. +If V3 reading fails, this will be called as a fail-over. + +""" + +import os +import numpy as np +import h5py as h5 +import re + +try: + from exceptions import Exception + from types import NoneType +except ImportError: + pass # above imports will fail in python3 + +import warnings +__version__ = "2.0.4" +__author__ = "Danny Price" + + +################## +# Error handling # +################## + +class FileError(Exception): + """ An exception raised if the file is fishy """ + def __init__(self): + return + + def __str__(self): + return ("Cannot open file. Please pass either a filename " + "string, a file object, or a h5py.File") + + +class ClosedFileError(Exception): + """ An exception raised if the file is fishy """ + def __init__(self): + return + + def __str__(self): + return ("HDF5 file has been closed. Please pass either " + "a filename string, a file object, or an open h5py.File") + + +class NoMatchError(Exception): + """ An exception raised if the object type is not understood (or + supported)""" + def __init__(self): + return + + def __str__(self): + return ("Error: this type of python object cannot be converted into a " + "hickle.") + + +class ToDoError(Exception): + """ An exception raised for non-implemented functionality""" + def __init__(self): + return + + def __str__(self): + return "Error: this functionality hasn't been implemented yet." + + +###################### +# H5PY file wrappers # +###################### + +class H5GroupWrapper(h5.Group): + """ Group wrapper that provides a track_times kwarg. 
+ + track_times is a boolean flag that can be set to False, so that two + files created at different times will have identical MD5 hashes. + """ + def create_dataset(self, *args, **kwargs): + kwargs['track_times'] = getattr(self, 'track_times', True) + return super(H5GroupWrapper, self).create_dataset(*args, **kwargs) + + def create_group(self, *args, **kwargs): + group = super(H5GroupWrapper, self).create_group(*args, **kwargs) + group.__class__ = H5GroupWrapper + group.track_times = getattr(self, 'track_times', True) + return group + + +class H5FileWrapper(h5.File): + """ Wrapper for h5py File that provides a track_times kwarg. + + track_times is a boolean flag that can be set to False, so that two + files created at different times will have identical MD5 hashes. + """ + def create_dataset(self, *args, **kwargs): + kwargs['track_times'] = getattr(self, 'track_times', True) + return super(H5FileWrapper, self).create_dataset(*args, **kwargs) + + def create_group(self, *args, **kwargs): + group = super(H5FileWrapper, self).create_group(*args, **kwargs) + group.__class__ = H5GroupWrapper + group.track_times = getattr(self, 'track_times', True) + return group + + +def file_opener(f, mode='r', track_times=True): + """ A file opener helper function with some error handling. This can open + files through a file object, a h5py file, or just the filename. + + Args: + f (file, h5py.File, or string): File-identifier, e.g. filename or file object. + mode (str): File open mode. Only required if opening by filename string. + track_times (bool): Track time in HDF5; turn off if you want hickling at + different times to produce identical files (e.g. for MD5 hash check). + + """ + # Were we handed a file object or just a file name string? + if isinstance(f, file): + filename, mode = f.name, f.mode + f.close() + h5f = h5.File(filename, mode) + elif isinstance(f, str) or isinstance(f, unicode): + filename = f + h5f = h5.File(filename, mode) + elif isinstance(f, H5FileWrapper) or isinstance(f, h5._hl.files.File): + try: + filename = f.filename + except ValueError: + raise ClosedFileError() + h5f = f + else: + print(type(f)) + raise FileError + + h5f.__class__ = H5FileWrapper + h5f.track_times = track_times + return h5f + + +########### +# DUMPERS # +########### + +def check_is_iterable(py_obj): + """ Check whether a python object is iterable. + + Note: this treats unicode and string as NON ITERABLE + + Args: + py_obj: python object to test + + Returns: + iter_ok (bool): True if item is iterable, False is item is not + """ + if type(py_obj) in (str, unicode): + return False + try: + iter(py_obj) + return True + except TypeError: + return False + + +def check_iterable_item_type(iter_obj): + """ Check if all items within an iterable are the same type. + + Args: + iter_obj: iterable object + + Returns: + iter_type: type of item contained within the iterable. If + the iterable has many types, a boolean False is returned instead. 
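Editorial aside: a tiny illustration of this helper, based on its definition just below; _dump() uses the result to decide between writing one dataset and recursing into a subgroup.

check_iterable_item_type([1, 2, 3])          # returns the type int -> stored as a single dataset
check_iterable_item_type([1, 'a', None])     # returns False -> a group with one dataset per item
check_iterable_item_type([[1, 2], [3, 4]])   # returns the type list -> recursed into as a group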
+ + References: + http://stackoverflow.com/questions/13252333/python-check-if-all-elements-of-a-list-are-the-same-type + """ + iseq = iter(iter_obj) + first_type = type(next(iseq)) + return first_type if all((type(x) is first_type) for x in iseq) else False + + +def check_is_numpy_array(py_obj): + """ Check if a python object is a numpy array (masked or regular) + + Args: + py_obj: python object to check whether it is a numpy array + + Returns + is_numpy (bool): Returns True if it is a numpy array, else False if it isn't + """ + + is_numpy = type(py_obj) in (type(np.array([1])), type(np.ma.array([1]))) + + return is_numpy + + +def _dump(py_obj, h_group, call_id=0, **kwargs): + """ Dump a python object to a group within a HDF5 file. + + This function is called recursively by the main dump() function. + + Args: + py_obj: python object to dump. + h_group (h5.File.group): group to dump data into. + call_id (int): index to identify object's relative location in the iterable. + """ + + dumpable_dtypes = set([bool, int, float, long, complex, str, unicode]) + + # Firstly, check if item is a numpy array. If so, just dump it. + if check_is_numpy_array(py_obj): + create_hkl_dataset(py_obj, h_group, call_id, **kwargs) + + # next, check if item is iterable + elif check_is_iterable(py_obj): + item_type = check_iterable_item_type(py_obj) + + # item_type == False implies multiple types. Create a dataset + if item_type is False: + h_subgroup = create_hkl_group(py_obj, h_group, call_id) + for ii, py_subobj in enumerate(py_obj): + _dump(py_subobj, h_subgroup, call_id=ii, **kwargs) + + # otherwise, subitems have same type. Check if subtype is an iterable + # (e.g. list of lists), or not (e.g. list of ints, which should be treated + # as a single dataset). + else: + if item_type in dumpable_dtypes: + create_hkl_dataset(py_obj, h_group, call_id, **kwargs) + else: + h_subgroup = create_hkl_group(py_obj, h_group, call_id) + for ii, py_subobj in enumerate(py_obj): + #print py_subobj, h_subgroup, ii + _dump(py_subobj, h_subgroup, call_id=ii, **kwargs) + + # item is not iterable, so create a dataset for it + else: + create_hkl_dataset(py_obj, h_group, call_id, **kwargs) + + +def dump(py_obj, file_obj, mode='w', track_times=True, path='/', **kwargs): + """ Write a pickled representation of obj to the open file object file. + + Args: + obj (object): python object o store in a Hickle + file: file object, filename string, or h5py.File object + file in which to store the object. A h5py.File or a filename is also + acceptable. + mode (str): optional argument, 'r' (read only), 'w' (write) or 'a' (append). + Ignored if file is a file object. + compression (str): optional argument. Applies compression to dataset. Options: None, gzip, + lzf (+ szip, if installed) + track_times (bool): optional argument. If set to False, repeated hickling will produce + identical files. + path (str): path within hdf5 file to save data to. Defaults to root / + """ + + try: + # Open the file + h5f = file_opener(file_obj, mode, track_times) + h5f.attrs["CLASS"] = 'hickle' + h5f.attrs["VERSION"] = 2 + h5f.attrs["type"] = ['hickle'] + + h_root_group = h5f.get(path) + + if h_root_group is None: + h_root_group = h5f.create_group(path) + h_root_group.attrs["type"] = ['hickle'] + + _dump(py_obj, h_root_group, **kwargs) + h5f.close() + except NoMatchError: + fname = h5f.filename + h5f.close() + try: + os.remove(fname) + except OSError: + warnings.warn("Dump failed. 
Could not remove %s" % fname) + finally: + raise NoMatchError + + +def create_dataset_lookup(py_obj): + """ What type of object are we trying to pickle? This is a python + dictionary based equivalent of a case statement. It returns the correct + helper function for a given data type. + + Args: + py_obj: python object to look-up what function to use to dump to disk + + Returns: + match: function that should be used to dump data to a new dataset + """ + t = type(py_obj) + + types = { + dict: create_dict_dataset, + list: create_listlike_dataset, + tuple: create_listlike_dataset, + set: create_listlike_dataset, + str: create_stringlike_dataset, + unicode: create_stringlike_dataset, + int: create_python_dtype_dataset, + float: create_python_dtype_dataset, + long: create_python_dtype_dataset, + bool: create_python_dtype_dataset, + complex: create_python_dtype_dataset, + NoneType: create_none_dataset, + np.ndarray: create_np_array_dataset, + np.ma.core.MaskedArray: create_np_array_dataset, + np.float16: create_np_dtype_dataset, + np.float32: create_np_dtype_dataset, + np.float64: create_np_dtype_dataset, + np.int8: create_np_dtype_dataset, + np.int16: create_np_dtype_dataset, + np.int32: create_np_dtype_dataset, + np.int64: create_np_dtype_dataset, + np.uint8: create_np_dtype_dataset, + np.uint16: create_np_dtype_dataset, + np.uint32: create_np_dtype_dataset, + np.uint64: create_np_dtype_dataset, + np.complex64: create_np_dtype_dataset, + np.complex128: create_np_dtype_dataset + } + + match = types.get(t, no_match) + return match + + +def create_hkl_dataset(py_obj, h_group, call_id=0, **kwargs): + """ Create a dataset within the hickle HDF5 file + + Args: + py_obj: python object to dump. + h_group (h5.File.group): group to dump data into. + call_id (int): index to identify object's relative location in the iterable. + + """ + #lookup dataset creator type based on python object type + create_dataset = create_dataset_lookup(py_obj) + + # do the creation + create_dataset(py_obj, h_group, call_id, **kwargs) + + +def create_hkl_group(py_obj, h_group, call_id=0): + """ Create a new group within the hickle file + + Args: + h_group (h5.File.group): group to dump data into. + call_id (int): index to identify object's relative location in the iterable. + + """ + h_subgroup = h_group.create_group('data_%i' % call_id) + h_subgroup.attrs["type"] = [str(type(py_obj))] + return h_subgroup + + +def create_listlike_dataset(py_obj, h_group, call_id=0, **kwargs): + """ Dumper for list, set, tuple + + Args: + py_obj: python object to dump; should be list-like + h_group (h5.File.group): group to dump data into. + call_id (int): index to identify object's relative location in the iterable. + """ + dtype = str(type(py_obj)) + obj = list(py_obj) + d = h_group.create_dataset('data_%i' % call_id, data=obj, **kwargs) + d.attrs["type"] = [dtype] + + +def create_np_dtype_dataset(py_obj, h_group, call_id=0, **kwargs): + """ dumps an np dtype object to h5py file + + Args: + py_obj: python object to dump; should be a numpy scalar, e.g. np.float16(1) + h_group (h5.File.group): group to dump data into. + call_id (int): index to identify object's relative location in the iterable. 
+ """ + d = h_group.create_dataset('data_%i' % call_id, data=py_obj, **kwargs) + d.attrs["type"] = ['np_dtype'] + d.attrs["np_dtype"] = str(d.dtype) + + +def create_python_dtype_dataset(py_obj, h_group, call_id=0, **kwargs): + """ dumps a python dtype object to h5py file + + Args: + py_obj: python object to dump; should be a python type (int, float, bool etc) + h_group (h5.File.group): group to dump data into. + call_id (int): index to identify object's relative location in the iterable. + """ + d = h_group.create_dataset('data_%i' % call_id, data=py_obj, + dtype=type(py_obj), **kwargs) + d.attrs["type"] = ['python_dtype'] + d.attrs['python_subdtype'] = str(type(py_obj)) + + +def create_dict_dataset(py_obj, h_group, call_id=0, **kwargs): + """ Creates a data group for each key in dictionary + + Args: + py_obj: python object to dump; should be dictionary + h_group (h5.File.group): group to dump data into. + call_id (int): index to identify object's relative location in the iterable. + """ + h_dictgroup = h_group.create_group('data_%i' % call_id) + h_dictgroup.attrs["type"] = ['dict'] + for key, py_subobj in py_obj.items(): + h_subgroup = h_dictgroup.create_group(key) + h_subgroup.attrs["type"] = ['dict_item'] + _dump(py_subobj, h_subgroup, call_id=0, **kwargs) + + +def create_np_array_dataset(py_obj, h_group, call_id=0, **kwargs): + """ dumps an ndarray object to h5py file + + Args: + py_obj: python object to dump; should be a numpy array or np.ma.array (masked) + h_group (h5.File.group): group to dump data into. + call_id (int): index to identify object's relative location in the iterable. + """ + if isinstance(py_obj, type(np.ma.array([1]))): + d = h_group.create_dataset('data_%i' % call_id, data=py_obj, **kwargs) + #m = h_group.create_dataset('mask_%i' % call_id, data=py_obj.mask, **kwargs) + m = h_group.create_dataset('data_%i_mask' % call_id, data=py_obj.mask, **kwargs) + d.attrs["type"] = ['ndarray_masked_data'] + m.attrs["type"] = ['ndarray_masked_mask'] + else: + d = h_group.create_dataset('data_%i' % call_id, data=py_obj, **kwargs) + d.attrs["type"] = ['ndarray'] + + +def create_stringlike_dataset(py_obj, h_group, call_id=0, **kwargs): + """ dumps a list object to h5py file + + Args: + py_obj: python object to dump; should be string-like (unicode or string) + h_group (h5.File.group): group to dump data into. + call_id (int): index to identify object's relative location in the iterable. + """ + if isinstance(py_obj, str): + d = h_group.create_dataset('data_%i' % call_id, data=[py_obj], **kwargs) + d.attrs["type"] = ['string'] + else: + dt = h5.special_dtype(vlen=unicode) + dset = h_group.create_dataset('data_%i' % call_id, shape=(1, ), dtype=dt, **kwargs) + dset[0] = py_obj + dset.attrs['type'] = ['unicode'] + + +def create_none_dataset(py_obj, h_group, call_id=0, **kwargs): + """ Dump None type to file + + Args: + py_obj: python object to dump; must be None object + h_group (h5.File.group): group to dump data into. + call_id (int): index to identify object's relative location in the iterable. + """ + d = h_group.create_dataset('data_%i' % call_id, data=[0], **kwargs) + d.attrs["type"] = ['none'] + + +def no_match(py_obj, h_group, call_id=0, **kwargs): + """ If no match is made, raise an exception + + Args: + py_obj: python object to dump; default if item is not matched. + h_group (h5.File.group): group to dump data into. + call_id (int): index to identify object's relative location in the iterable. 
+ """ + try: + import dill as cPickle + except ImportError: + import cPickle + + pickled_obj = cPickle.dumps(py_obj) + d = h_group.create_dataset('data_%i' % call_id, data=[pickled_obj]) + d.attrs["type"] = ['pickle'] + + warnings.warn("%s type not understood, data have been " + "serialized" % type(py_obj)) + + +############# +## LOADERS ## +############# + +class PyContainer(list): + """ A group-like object into which to load datasets. + + In order to build up a tree-like structure, we need to be able + to load datasets into a container with an append() method. + Python tuples and sets do not allow this. This class provides + a list-like object that be converted into a list, tuple, set or dict. + """ + def __init__(self): + super(PyContainer, self).__init__() + self.container_type = None + self.name = None + + def convert(self): + """ Convert from PyContainer to python core data type. + + Returns: self, either as a list, tuple, set or dict + """ + if self.container_type == "<type 'list'>": + return list(self) + if self.container_type == "<type 'tuple'>": + return tuple(self) + if self.container_type == "<type 'set'>": + return set(self) + if self.container_type == "dict": + keys = [str(item.name.split('/')[-1]) for item in self] + items = [item[0] for item in self] + return dict(zip(keys, items)) + else: + return self + + +def load(fileobj, path='/', safe=True): + """ Load a hickle file and reconstruct a python object + + Args: + fileobj: file object, h5py.File, or filename string + safe (bool): Disable automatic depickling of arbitrary python objects. + DO NOT set this to False unless the file is from a trusted source. + (see http://www.cs.jhu.edu/~s/musings/pickle.html for an explanation) + + path (str): path within hdf5 file to save data to. Defaults to root / + """ + + try: + h5f = file_opener(fileobj) + h_root_group = h5f.get(path) + + try: + assert 'CLASS' in h5f.attrs.keys() + assert 'VERSION' in h5f.attrs.keys() + py_container = PyContainer() + py_container.container_type = 'hickle' + py_container = _load(py_container, h_root_group) + return py_container[0][0] + except AssertionError: + import hickle_legacy + return hickle_legacy.load(fileobj, safe) + finally: + if 'h5f' in locals(): + h5f.close() + + +def load_dataset(h_node): + """ Load a dataset, converting into its correct python type + + Args: + h_node (h5py dataset): h5py dataset object to read + + Returns: + data: reconstructed python object from loaded data + """ + py_type = h_node.attrs["type"][0] + + if h_node.shape == (): + data = h_node.value + else: + data = h_node[:] + + if py_type == "<type 'list'>": + #print self.name + return list(data) + elif py_type == "<type 'tuple'>": + return tuple(data) + elif py_type == "<type 'set'>": + return set(data) + elif py_type == "np_dtype": + subtype = h_node.attrs["np_dtype"] + data = np.array(data, dtype=subtype) + return data + elif py_type == 'ndarray': + return np.array(data) + elif py_type == 'ndarray_masked_data': + try: + mask_path = h_node.name + "_mask" + h_root = h_node.parent + mask = h_root.get(mask_path)[:] + except IndexError: + mask = h_root.get(mask_path) + except ValueError: + mask = h_root.get(mask_path) + data = np.ma.array(data, mask=mask) + return data + elif py_type == 'python_dtype': + subtype = h_node.attrs["python_subdtype"] + type_dict = { + "<type 'int'>": int, + "<type 'float'>": float, + "<type 'long'>": long, + "<type 'bool'>": bool, + "<type 'complex'>": complex + } + tcast = type_dict.get(subtype) + return tcast(data) + elif py_type == 'string': + 
return str(data[0]) + elif py_type == 'unicode': + return unicode(data[0]) + elif py_type == 'none': + return None + else: + print(h_node.name, py_type, h_node.attrs.keys()) + return data + + +def sort_keys(key_list): + """ Take a list of strings and sort it by integer value within string + + Args: + key_list (list): List of keys + + Returns: + key_list_sorted (list): List of keys, sorted by integer + """ + to_int = lambda x: int(re.search('\d+', x).group(0)) + keys_by_int = sorted([(to_int(key), key) for key in key_list]) + return [ii[1] for ii in keys_by_int] + + +def _load(py_container, h_group): + """ Load a hickle file + + Recursive funnction to load hdf5 data into a PyContainer() + + Args: + py_container (PyContainer): Python container to load data into + h_group (h5 group or dataset): h5py object, group or dataset, to spider + and load all datasets. + """ + + group_dtype = h5._hl.group.Group + dataset_dtype = h5._hl.dataset.Dataset + + #either a file, group, or dataset + if isinstance(h_group, H5FileWrapper) or isinstance(h_group, group_dtype): + py_subcontainer = PyContainer() + py_subcontainer.container_type = h_group.attrs['type'][0] + py_subcontainer.name = h_group.name + + if py_subcontainer.container_type != 'dict': + h_keys = sort_keys(h_group.keys()) + else: + h_keys = h_group.keys() + + for h_name in h_keys: + h_node = h_group[h_name] + py_subcontainer = _load(py_subcontainer, h_node) + + sub_data = py_subcontainer.convert() + py_container.append(sub_data) + + else: + # must be a dataset + subdata = load_dataset(h_group) + py_container.append(subdata) + + #print h_group.name, py_container + return py_container diff --git a/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/loaders/__init__.py b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/loaders/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..3be6bd298581fb3086bb5a261de72a56970faddf --- /dev/null +++ b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/loaders/__init__.py @@ -0,0 +1 @@ +from __future__ import absolute_import \ No newline at end of file diff --git a/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/loaders/__pycache__/__init__.cpython-36.pyc b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/loaders/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..059bbcb18d24b4ed243c011342d5220fc0ca9b4b Binary files /dev/null and b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/loaders/__pycache__/__init__.cpython-36.pyc differ diff --git a/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/loaders/__pycache__/load_astropy.cpython-36.pyc b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/loaders/__pycache__/load_astropy.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3856511eb477a1bd3a48b33dd2325efe9afb4735 Binary files /dev/null and b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/loaders/__pycache__/load_astropy.cpython-36.pyc differ diff --git 
a/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/loaders/__pycache__/load_numpy.cpython-36.pyc b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/loaders/__pycache__/load_numpy.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0b431b401afea544b1894a3b08b282d070c988e1 Binary files /dev/null and b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/loaders/__pycache__/load_numpy.cpython-36.pyc differ diff --git a/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/loaders/__pycache__/load_pandas.cpython-36.pyc b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/loaders/__pycache__/load_pandas.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2df2075b66902547e8435e4a81eba6a175408411 Binary files /dev/null and b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/loaders/__pycache__/load_pandas.cpython-36.pyc differ diff --git a/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/loaders/__pycache__/load_python.cpython-36.pyc b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/loaders/__pycache__/load_python.cpython-36.pyc new file mode 100755 index 0000000000000000000000000000000000000000..ada924cd472a1993704ccdda86ab632d87e62aa2 Binary files /dev/null and b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/loaders/__pycache__/load_python.cpython-36.pyc differ diff --git a/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/loaders/__pycache__/load_python3.cpython-36.pyc b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/loaders/__pycache__/load_python3.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..53f514453416469ddfd6d8843477414904a3276f Binary files /dev/null and b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/loaders/__pycache__/load_python3.cpython-36.pyc differ diff --git a/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/loaders/__pycache__/load_scipy.cpython-36.pyc b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/loaders/__pycache__/load_scipy.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aff088cfb6e7c7c16bc2fd5a3ae5dd05f77bdd41 Binary files /dev/null and b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/loaders/__pycache__/load_scipy.cpython-36.pyc differ diff --git a/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/loaders/load_astropy.py b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/loaders/load_astropy.py new file mode 100755 index 0000000000000000000000000000000000000000..dd8efce655c2223262b42868cbb1d9ba5c580acb --- /dev/null +++ 
b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/loaders/load_astropy.py @@ -0,0 +1,237 @@ +import numpy as np +from astropy.units import Quantity +from astropy.coordinates import Angle, SkyCoord +from astropy.constants import Constant, EMConstant +from astropy.table import Table +from astropy.time import Time + +from hickle.helpers import get_type_and_data +import six + +def create_astropy_quantity(py_obj, h_group, call_id=0, **kwargs): + """ dumps an astropy quantity + + Args: + py_obj: python object to dump; should be a python type (int, float, bool etc) + h_group (h5.File.group): group to dump data into. + call_id (int): index to identify object's relative location in the iterable. + """ + # kwarg compression etc does not work on scalars + d = h_group.create_dataset('data_%i' % call_id, data=py_obj.value, + dtype='float64') #, **kwargs) + d.attrs["type"] = [b'astropy_quantity'] + if six.PY3: + unit = bytes(str(py_obj.unit), 'ascii') + else: + unit = str(py_obj.unit) + d.attrs['unit'] = [unit] + +def create_astropy_angle(py_obj, h_group, call_id=0, **kwargs): + """ dumps an astropy quantity + + Args: + py_obj: python object to dump; should be a python type (int, float, bool etc) + h_group (h5.File.group): group to dump data into. + call_id (int): index to identify object's relative location in the iterable. + """ + # kwarg compression etc does not work on scalars + d = h_group.create_dataset('data_%i' % call_id, data=py_obj.value, + dtype='float64') #, **kwargs) + d.attrs["type"] = [b'astropy_angle'] + if six.PY3: + unit = str(py_obj.unit).encode('ascii') + else: + unit = str(py_obj.unit) + d.attrs['unit'] = [unit] + +def create_astropy_skycoord(py_obj, h_group, call_id=0, **kwargs): + """ dumps an astropy quantity + + Args: + py_obj: python object to dump; should be a python type (int, float, bool etc) + h_group (h5.File.group): group to dump data into. + call_id (int): index to identify object's relative location in the iterable. + """ + # kwarg compression etc does not work on scalars + lat = py_obj.data.lat.value + lon = py_obj.data.lon.value + dd = np.column_stack((lon, lat)) + + d = h_group.create_dataset('data_%i' % call_id, data=dd, + dtype='float64') #, **kwargs) + d.attrs["type"] = [b'astropy_skycoord'] + if six.PY3: + lon_unit = str(py_obj.data.lon.unit).encode('ascii') + lat_unit = str(py_obj.data.lat.unit).encode('ascii') + else: + lon_unit = str(py_obj.data.lon.unit) + lat_unit = str(py_obj.data.lat.unit) + d.attrs['lon_unit'] = [lon_unit] + d.attrs['lat_unit'] = [lat_unit] + +def create_astropy_time(py_obj, h_group, call_id=0, **kwargs): + """ dumps an astropy Time object + + Args: + py_obj: python object to dump; should be a python type (int, float, bool etc) + h_group (h5.File.group): group to dump data into. + call_id (int): index to identify object's relative location in the iterable. 
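Editorial aside: a hedged sketch of how the Time handler above is meant to behave end to end (assumes astropy is installed and the vendored hickle is importable; the file name is made up). The format and scale are written as HDF5 attributes and handed back to Time() on load.

import hickle as hkl
from astropy.time import Time

t = Time([2451544.5, 2451545.5], format='jd', scale='utc')   # array-valued, so this handler applies
hkl.dump(t, 'times.hkl')
t2 = hkl.load('times.hkl')
print(t2.format, t2.scale)   # 'jd' 'utc', restored from the dataset attributes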
+ """ + + # kwarg compression etc does not work on scalars + data = py_obj.value + dtype = str(py_obj.value.dtype) + + # Need to catch string times + if '<U' in dtype: + dtype = dtype.replace('<U', '|S') + print(dtype) + data = [] + for item in py_obj.value: + data.append(str(item).encode('ascii')) + + d = h_group.create_dataset('data_%i' % call_id, data=data, dtype=dtype) #, **kwargs) + d.attrs["type"] = [b'astropy_time'] + if six.PY2: + fmt = str(py_obj.format) + scale = str(py_obj.scale) + else: + fmt = str(py_obj.format).encode('ascii') + scale = str(py_obj.scale).encode('ascii') + d.attrs['format'] = [fmt] + d.attrs['scale'] = [scale] + +def create_astropy_constant(py_obj, h_group, call_id=0, **kwargs): + """ dumps an astropy constant + + Args: + py_obj: python object to dump; should be a python type (int, float, bool etc) + h_group (h5.File.group): group to dump data into. + call_id (int): index to identify object's relative location in the iterable. + """ + # kwarg compression etc does not work on scalars + d = h_group.create_dataset('data_%i' % call_id, data=py_obj.value, + dtype='float64') #, **kwargs) + d.attrs["type"] = [b'astropy_constant'] + d.attrs["unit"] = [str(py_obj.unit)] + d.attrs["abbrev"] = [str(py_obj.abbrev)] + d.attrs["name"] = [str(py_obj.name)] + d.attrs["reference"] = [str(py_obj.reference)] + d.attrs["uncertainty"] = [py_obj.uncertainty] + + if py_obj.system: + d.attrs["system"] = [py_obj.system] + + +def create_astropy_table(py_obj, h_group, call_id=0, **kwargs): + """ Dump an astropy Table + + Args: + py_obj: python object to dump; should be a python type (int, float, bool etc) + h_group (h5.File.group): group to dump data into. + call_id (int): index to identify object's relative location in the iterable. + """ + data = py_obj.as_array() + d = h_group.create_dataset('data_%i' % call_id, data=data, dtype=data.dtype, **kwargs) + d.attrs['type'] = [b'astropy_table'] + + if six.PY3: + colnames = [bytes(cn, 'ascii') for cn in py_obj.colnames] + else: + colnames = py_obj.colnames + d.attrs['colnames'] = colnames + for key, value in py_obj.meta.items(): + d.attrs[key] = value + + +def load_astropy_quantity_dataset(h_node): + py_type, data = get_type_and_data(h_node) + unit = h_node.attrs["unit"][0] + q = Quantity(data, unit) + return q + +def load_astropy_time_dataset(h_node): + py_type, data = get_type_and_data(h_node) + if six.PY3: + fmt = h_node.attrs["format"][0].decode('ascii') + scale = h_node.attrs["scale"][0].decode('ascii') + else: + fmt = h_node.attrs["format"][0] + scale = h_node.attrs["scale"][0] + q = Time(data, format=fmt, scale=scale) + return q + +def load_astropy_angle_dataset(h_node): + py_type, data = get_type_and_data(h_node) + unit = h_node.attrs["unit"][0] + q = Angle(data, unit) + return q + +def load_astropy_skycoord_dataset(h_node): + py_type, data = get_type_and_data(h_node) + lon_unit = h_node.attrs["lon_unit"][0] + lat_unit = h_node.attrs["lat_unit"][0] + q = SkyCoord(data[:,0], data[:, 1], unit=(lon_unit, lat_unit)) + return q + +def load_astropy_constant_dataset(h_node): + py_type, data = get_type_and_data(h_node) + unit = h_node.attrs["unit"][0] + abbrev = h_node.attrs["abbrev"][0] + name = h_node.attrs["name"][0] + ref = h_node.attrs["reference"][0] + unc = h_node.attrs["uncertainty"][0] + + system = None + if "system" in h_node.attrs.keys(): + system = h_node.attrs["system"][0] + + c = Constant(abbrev, name, data, unit, unc, ref, system) + return c + +def load_astropy_table(h_node): + py_type, data = get_type_and_data(h_node) + 
metadata = dict(h_node.attrs.items()) + metadata.pop('type') + metadata.pop('colnames') + + if six.PY3: + colnames = [cn.decode('ascii') for cn in h_node.attrs["colnames"]] + else: + colnames = h_node.attrs["colnames"] + + t = Table(data, names=colnames, meta=metadata) + return t + +def check_is_astropy_table(py_obj): + return isinstance(py_obj, Table) + +def check_is_astropy_quantity_array(py_obj): + if isinstance(py_obj, Quantity) or isinstance(py_obj, Time) or \ + isinstance(py_obj, Angle) or isinstance(py_obj, SkyCoord): + if py_obj.isscalar: + return False + else: + return True + else: + return False + + +##################### +# Lookup dictionary # +##################### + +class_register = [ + [Quantity, b'astropy_quantity', create_astropy_quantity, load_astropy_quantity_dataset, + True, check_is_astropy_quantity_array], + [Time, b'astropy_time', create_astropy_time, load_astropy_time_dataset, + True, check_is_astropy_quantity_array], + [Angle, b'astropy_angle', create_astropy_angle, load_astropy_angle_dataset, + True, check_is_astropy_quantity_array], + [SkyCoord, b'astropy_skycoord', create_astropy_skycoord, load_astropy_skycoord_dataset, + True, check_is_astropy_quantity_array], + [Constant, b'astropy_constant', create_astropy_constant, load_astropy_constant_dataset, + True, None], + [Table, b'astropy_table', create_astropy_table, load_astropy_table, + True, check_is_astropy_table] +] diff --git a/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/loaders/load_numpy.py b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/loaders/load_numpy.py new file mode 100755 index 0000000000000000000000000000000000000000..7a31b12e235b07cccb6b1f0045ca9ccbfb874454 --- /dev/null +++ b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/loaders/load_numpy.py @@ -0,0 +1,145 @@ +# encoding: utf-8 +""" +# load_numpy.py + +Utilities and dump / load handlers for handling numpy and scipy arrays + +""" +import six +import numpy as np + + +from hickle.helpers import get_type_and_data + + +def check_is_numpy_array(py_obj): + """ Check if a python object is a numpy array (masked or regular) + + Args: + py_obj: python object to check whether it is a numpy array + + Returns + is_numpy (bool): Returns True if it is a numpy array, else False if it isn't + """ + + is_numpy = type(py_obj) in (type(np.array([1])), type(np.ma.array([1]))) + + return is_numpy + + +def create_np_scalar_dataset(py_obj, h_group, call_id=0, **kwargs): + """ dumps an np dtype object to h5py file + + Args: + py_obj: python object to dump; should be a numpy scalar, e.g. np.float16(1) + h_group (h5.File.group): group to dump data into. + call_id (int): index to identify object's relative location in the iterable. + """ + + # DO NOT PASS KWARGS TO SCALAR DATASETS! + d = h_group.create_dataset('data_%i' % call_id, data=py_obj) # **kwargs) + d.attrs["type"] = [b'np_scalar'] + + if six.PY2: + d.attrs["np_dtype"] = str(d.dtype) + else: + d.attrs["np_dtype"] = bytes(str(d.dtype), 'ascii') + + +def create_np_dtype(py_obj, h_group, call_id=0, **kwargs): + """ dumps an np dtype object to h5py file + + Args: + py_obj: python object to dump; should be a numpy scalar, e.g. np.float16(1) + h_group (h5.File.group): group to dump data into. + call_id (int): index to identify object's relative location in the iterable. 
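Editorial aside: a short sketch of the numpy scalar round trip implemented by create_np_scalar_dataset() above together with load_np_scalar_dataset() further down (illustrative file name; assumes the vendored package is importable).

import numpy as np
import hickle as hkl

x = np.float32(0.5)
hkl.dump(x, 'scalar.hkl')
y = hkl.load('scalar.hkl')
# The dtype string stored in the 'np_dtype' attribute is used to rebuild
# the scalar, so precision is preserved.
print(y, y.dtype)   # 0.5 float32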
+ """ + d = h_group.create_dataset('data_%i' % call_id, data=[str(py_obj)]) + d.attrs["type"] = [b'np_dtype'] + + +def create_np_array_dataset(py_obj, h_group, call_id=0, **kwargs): + """ dumps an ndarray object to h5py file + + Args: + py_obj: python object to dump; should be a numpy array or np.ma.array (masked) + h_group (h5.File.group): group to dump data into. + call_id (int): index to identify object's relative location in the iterable. + """ + if isinstance(py_obj, type(np.ma.array([1]))): + d = h_group.create_dataset('data_%i' % call_id, data=py_obj, **kwargs) + #m = h_group.create_dataset('mask_%i' % call_id, data=py_obj.mask, **kwargs) + m = h_group.create_dataset('data_%i_mask' % call_id, data=py_obj.mask, **kwargs) + d.attrs["type"] = [b'ndarray_masked_data'] + m.attrs["type"] = [b'ndarray_masked_mask'] + else: + d = h_group.create_dataset('data_%i' % call_id, data=py_obj, **kwargs) + d.attrs["type"] = [b'ndarray'] + + + + +####################### +## Lookup dictionary ## +####################### + +types_dict = { + np.ndarray: create_np_array_dataset, + np.ma.core.MaskedArray: create_np_array_dataset, + np.float16: create_np_scalar_dataset, + np.float32: create_np_scalar_dataset, + np.float64: create_np_scalar_dataset, + np.int8: create_np_scalar_dataset, + np.int16: create_np_scalar_dataset, + np.int32: create_np_scalar_dataset, + np.int64: create_np_scalar_dataset, + np.uint8: create_np_scalar_dataset, + np.uint16: create_np_scalar_dataset, + np.uint32: create_np_scalar_dataset, + np.uint64: create_np_scalar_dataset, + np.complex64: create_np_scalar_dataset, + np.complex128: create_np_scalar_dataset, + np.dtype: create_np_dtype +} + +def load_np_dtype_dataset(h_node): + py_type, data = get_type_and_data(h_node) + data = np.dtype(data[0]) + return data + +def load_np_scalar_dataset(h_node): + py_type, data = get_type_and_data(h_node) + subtype = h_node.attrs["np_dtype"] + data = np.array([data], dtype=subtype)[0] + return data + +def load_ndarray_dataset(h_node): + py_type, data = get_type_and_data(h_node) + return np.array(data, copy=False) + +def load_ndarray_masked_dataset(h_node): + py_type, data = get_type_and_data(h_node) + try: + mask_path = h_node.name + "_mask" + h_root = h_node.parent + mask = h_root.get(mask_path)[:] + except IndexError: + mask = h_root.get(mask_path) + except ValueError: + mask = h_root.get(mask_path) + data = np.ma.array(data, mask=mask) + return data + +def load_nothing(h_hode): + pass + +hkl_types_dict = { + b"np_dtype" : load_np_dtype_dataset, + b"np_scalar" : load_np_scalar_dataset, + b"ndarray" : load_ndarray_dataset, + b"numpy.ndarray" : load_ndarray_dataset, + b"ndarray_masked_data" : load_ndarray_masked_dataset, + b"ndarray_masked_mask" : load_nothing # Loaded autormatically +} + + diff --git a/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/loaders/load_pandas.py b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/loaders/load_pandas.py new file mode 100755 index 0000000000000000000000000000000000000000..0b5185533dafe9d2f8b2c45405967d7489ce7caf --- /dev/null +++ b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/loaders/load_pandas.py @@ -0,0 +1,4 @@ +import pandas as pd + +# TODO: populate with classes to load +class_register = [] \ No newline at end of file diff --git 
a/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/loaders/load_python.py b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/loaders/load_python.py new file mode 100755 index 0000000000000000000000000000000000000000..58de921ed13e2e9b0c57ad724e94fa2ac9a3268f --- /dev/null +++ b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/loaders/load_python.py @@ -0,0 +1,141 @@ +# encoding: utf-8 +""" +# load_python.py + +Handlers for dumping and loading built-in python types. +NB: As these are for built-in types, they are critical to the functioning of hickle. + +""" + +from hickle.helpers import get_type_and_data + +import sys +if sys.version_info.major == 3: + unicode = type(str) + str = type(bytes) + long = type(int) + NoneType = type(None) +else: + from types import NoneType + +import h5py as h5 + +def create_listlike_dataset(py_obj, h_group, call_id=0, **kwargs): + """ Dumper for list, set, tuple + + Args: + py_obj: python object to dump; should be list-like + h_group (h5.File.group): group to dump data into. + call_id (int): index to identify object's relative location in the iterable. + """ + dtype = str(type(py_obj)) + obj = list(py_obj) + d = h_group.create_dataset('data_%i' % call_id, data=obj, **kwargs) + d.attrs["type"] = [dtype] + + +def create_python_dtype_dataset(py_obj, h_group, call_id=0, **kwargs): + """ dumps a python dtype object to h5py file + + Args: + py_obj: python object to dump; should be a python type (int, float, bool etc) + h_group (h5.File.group): group to dump data into. + call_id (int): index to identify object's relative location in the iterable. + """ + # kwarg compression etc does not work on scalars + d = h_group.create_dataset('data_%i' % call_id, data=py_obj, + dtype=type(py_obj)) #, **kwargs) + d.attrs["type"] = ['python_dtype'] + d.attrs['python_subdtype'] = str(type(py_obj)) + + +def create_stringlike_dataset(py_obj, h_group, call_id=0, **kwargs): + """ dumps a list object to h5py file + + Args: + py_obj: python object to dump; should be string-like (unicode or string) + h_group (h5.File.group): group to dump data into. + call_id (int): index to identify object's relative location in the iterable. + """ + if isinstance(py_obj, str): + d = h_group.create_dataset('data_%i' % call_id, data=[py_obj], **kwargs) + d.attrs["type"] = ['string'] + else: + dt = h5.special_dtype(vlen=unicode) + dset = h_group.create_dataset('data_%i' % call_id, shape=(1, ), dtype=dt, **kwargs) + dset[0] = py_obj + dset.attrs['type'] = ['unicode'] + + +def create_none_dataset(py_obj, h_group, call_id=0, **kwargs): + """ Dump None type to file + + Args: + py_obj: python object to dump; must be None object + h_group (h5.File.group): group to dump data into. + call_id (int): index to identify object's relative location in the iterable. 
+ """ + d = h_group.create_dataset('data_%i' % call_id, data=[0], **kwargs) + d.attrs["type"] = ['none'] + + +def load_list_dataset(h_node): + py_type, data = get_type_and_data(h_node) + return list(data) + +def load_tuple_dataset(h_node): + py_type, data = get_type_and_data(h_node) + return tuple(data) + +def load_set_dataset(h_node): + py_type, data = get_type_and_data(h_node) + return set(data) + +def load_string_dataset(h_node): + py_type, data = get_type_and_data(h_node) + return str(data[0]) + +def load_unicode_dataset(h_node): + py_type, data = get_type_and_data(h_node) + return unicode(data[0]) + +def load_none_dataset(h_node): + return None + +def load_python_dtype_dataset(h_node): + py_type, data = get_type_and_data(h_node) + subtype = h_node.attrs["python_subdtype"] + type_dict = { + "<type 'int'>": int, + "<type 'float'>": float, + "<type 'long'>": long, + "<type 'bool'>": bool, + "<type 'complex'>": complex + } + tcast = type_dict.get(subtype) + return tcast(data) + +types_dict = { + list: create_listlike_dataset, + tuple: create_listlike_dataset, + set: create_listlike_dataset, + str: create_stringlike_dataset, + unicode: create_stringlike_dataset, + int: create_python_dtype_dataset, + float: create_python_dtype_dataset, + long: create_python_dtype_dataset, + bool: create_python_dtype_dataset, + complex: create_python_dtype_dataset, + NoneType: create_none_dataset, +} + +hkl_types_dict = { + "<type 'list'>" : load_list_dataset, + "<type 'tuple'>" : load_tuple_dataset, + "<type 'set'>" : load_set_dataset, + "python_dtype" : load_python_dtype_dataset, + "string" : load_string_dataset, + "unicode" : load_unicode_dataset, + "none" : load_none_dataset +} + diff --git a/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/loaders/load_python3.py b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/loaders/load_python3.py new file mode 100755 index 0000000000000000000000000000000000000000..c6b173fd07af42735dd05dd7acb9c42e1c651e38 --- /dev/null +++ b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/loaders/load_python3.py @@ -0,0 +1,201 @@ +# encoding: utf-8 +""" +# load_python.py + +Handlers for dumping and loading built-in python types. +NB: As these are for built-in types, they are critical to the functioning of hickle. + +""" + +import six +from hickle.helpers import get_type_and_data + +try: + from exceptions import Exception +except ImportError: + pass # above imports will fail in python3 + +try: + ModuleNotFoundError # This fails on Py3.5 and below +except NameError: + ModuleNotFoundError = ImportError + +import h5py as h5 + + +def get_py3_string_type(h_node): + """ Helper function to return the python string type for items in a list. + + Notes: + Py3 string handling is a bit funky and doesn't play too nicely with HDF5. + We needed to add metadata to say if the strings in a list started off as + bytes, string, etc. This helper loads + + """ + try: + py_type = h_node.attrs["py3_string_type"][0] + return py_type + except: + return None + +def create_listlike_dataset(py_obj, h_group, call_id=0, **kwargs): + """ Dumper for list, set, tuple + + Args: + py_obj: python object to dump; should be list-like + h_group (h5.File.group): group to dump data into. + call_id (int): index to identify object's relative location in the iterable. 
+ """ + dtype = str(type(py_obj)) + obj = list(py_obj) + + # h5py does not handle Py3 'str' objects well. Need to catch this + # Only need to check first element as this method + # is only called if all elements have same dtype + py3_str_type = None + if type(obj[0]) in (str, bytes): + py3_str_type = bytes(str(type(obj[0])), 'ascii') + + if type(obj[0]) is str: + #print(py3_str_type) + #print(obj, "HERE") + obj = [bytes(oo, 'utf8') for oo in obj] + #print(obj, "HERE") + + + d = h_group.create_dataset('data_%i' % call_id, data=obj, **kwargs) + d.attrs["type"] = [bytes(dtype, 'ascii')] + + # Need to add some metadata to aid in unpickling if it's a string type + if py3_str_type is not None: + d.attrs["py3_string_type"] = [py3_str_type] + + + +def create_python_dtype_dataset(py_obj, h_group, call_id=0, **kwargs): + """ dumps a python dtype object to h5py file + + Args: + py_obj: python object to dump; should be a python type (int, float, bool etc) + h_group (h5.File.group): group to dump data into. + call_id (int): index to identify object's relative location in the iterable. + """ + # kwarg compression etc does not work on scalars + d = h_group.create_dataset('data_%i' % call_id, data=py_obj, + dtype=type(py_obj)) #, **kwargs) + d.attrs["type"] = [b'python_dtype'] + d.attrs['python_subdtype'] = bytes(str(type(py_obj)), 'ascii') + + +def create_stringlike_dataset(py_obj, h_group, call_id=0, **kwargs): + """ dumps a list object to h5py file + + Args: + py_obj: python object to dump; should be string-like (unicode or string) + h_group (h5.File.group): group to dump data into. + call_id (int): index to identify object's relative location in the iterable. + """ + if isinstance(py_obj, bytes): + d = h_group.create_dataset('data_%i' % call_id, data=[py_obj], **kwargs) + d.attrs["type"] = [b'bytes'] + elif isinstance(py_obj, str): + dt = h5.special_dtype(vlen=str) + dset = h_group.create_dataset('data_%i' % call_id, shape=(1, ), dtype=dt, **kwargs) + dset[0] = py_obj + dset.attrs['type'] = [b'string'] + +def create_none_dataset(py_obj, h_group, call_id=0, **kwargs): + """ Dump None type to file + + Args: + py_obj: python object to dump; must be None object + h_group (h5.File.group): group to dump data into. + call_id (int): index to identify object's relative location in the iterable. + """ + d = h_group.create_dataset('data_%i' % call_id, data=[0], **kwargs) + d.attrs["type"] = [b'none'] + + +def load_list_dataset(h_node): + py_type, data = get_type_and_data(h_node) + py3_str_type = get_py3_string_type(h_node) + + if py3_str_type == b"<class 'bytes'>": + # Yuck. 
Convert numpy._bytes -> str -> bytes + return [bytes(str(item, 'utf8'), 'utf8') for item in data] + if py3_str_type == b"<class 'str'>": + return [str(item, 'utf8') for item in data] + else: + return list(data) + +def load_tuple_dataset(h_node): + data = load_list_dataset(h_node) + return tuple(data) + +def load_set_dataset(h_node): + data = load_list_dataset(h_node) + return set(data) + +def load_bytes_dataset(h_node): + py_type, data = get_type_and_data(h_node) + return bytes(data[0]) + +def load_string_dataset(h_node): + py_type, data = get_type_and_data(h_node) + return str(data[0]) + +def load_unicode_dataset(h_node): + py_type, data = get_type_and_data(h_node) + return unicode(data[0]) + +def load_none_dataset(h_node): + return None + +def load_pickled_data(h_node): + py_type, data = get_type_and_data(h_node) + try: + import cPickle as pickle + except ModuleNotFoundError: + import pickle + return pickle.loads(data[0]) + + +def load_python_dtype_dataset(h_node): + py_type, data = get_type_and_data(h_node) + subtype = h_node.attrs["python_subdtype"] + type_dict = { + b"<class 'int'>": int, + b"<class 'float'>": float, + b"<class 'bool'>": bool, + b"<class 'complex'>": complex + } + + tcast = type_dict.get(subtype) + return tcast(data) + + + +types_dict = { + list: create_listlike_dataset, + tuple: create_listlike_dataset, + set: create_listlike_dataset, + bytes: create_stringlike_dataset, + str: create_stringlike_dataset, + #bytearray: create_stringlike_dataset, + int: create_python_dtype_dataset, + float: create_python_dtype_dataset, + bool: create_python_dtype_dataset, + complex: create_python_dtype_dataset, + type(None): create_none_dataset, +} + +hkl_types_dict = { + b"<class 'list'>" : load_list_dataset, + b"<class 'tuple'>" : load_tuple_dataset, + b"<class 'set'>" : load_set_dataset, + b"bytes" : load_bytes_dataset, + b"python_dtype" : load_python_dtype_dataset, + b"string" : load_string_dataset, + b"pickle" : load_pickled_data, + b"none" : load_none_dataset, +} diff --git a/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/loaders/load_scipy.py b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/loaders/load_scipy.py new file mode 100755 index 0000000000000000000000000000000000000000..ab09fe23c69ea791371e4b6a808b553c84195289 --- /dev/null +++ b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/loaders/load_scipy.py @@ -0,0 +1,92 @@ +import six +import scipy +from scipy import sparse + +from hickle.helpers import get_type_and_data + +def check_is_scipy_sparse_array(py_obj): + """ Check if a python object is a scipy sparse array + + Args: + py_obj: python object to check whether it is a sparse array + + Returns + is_numpy (bool): Returns True if it is a sparse array, else False if it isn't + """ + t_csr = type(scipy.sparse.csr_matrix([0])) + t_csc = type(scipy.sparse.csc_matrix([0])) + t_bsr = type(scipy.sparse.bsr_matrix([0])) + is_sparse = type(py_obj) in (t_csr, t_csc, t_bsr) + + return is_sparse + + +def create_sparse_dataset(py_obj, h_group, call_id=0, **kwargs): + """ dumps an sparse array to h5py file + + Args: + py_obj: python object to dump; should be a numpy array or np.ma.array (masked) + h_group (h5.File.group): group to dump data into. + call_id (int): index to identify object's relative location in the iterable. 
+ """ + h_sparsegroup = h_group.create_group('data_%i' % call_id) + data = h_sparsegroup.create_dataset('data', data=py_obj.data, **kwargs) + indices = h_sparsegroup.create_dataset('indices', data=py_obj.indices, **kwargs) + indptr = h_sparsegroup.create_dataset('indptr', data=py_obj.indptr, **kwargs) + shape = h_sparsegroup.create_dataset('shape', data=py_obj.shape, **kwargs) + + if isinstance(py_obj, type(sparse.csr_matrix([0]))): + type_str = 'csr' + elif isinstance(py_obj, type(sparse.csc_matrix([0]))): + type_str = 'csc' + elif isinstance(py_obj, type(sparse.bsr_matrix([0]))): + type_str = 'bsr' + + if six.PY2: + h_sparsegroup.attrs["type"] = [b'%s_matrix' % type_str] + data.attrs["type"] = [b"%s_matrix_data" % type_str] + indices.attrs["type"] = [b"%s_matrix_indices" % type_str] + indptr.attrs["type"] = [b"%s_matrix_indptr" % type_str] + shape.attrs["type"] = [b"%s_matrix_shape" % type_str] + else: + h_sparsegroup.attrs["type"] = [bytes(str('%s_matrix' % type_str), 'ascii')] + data.attrs["type"] = [bytes(str("%s_matrix_data" % type_str), 'ascii')] + indices.attrs["type"] = [bytes(str("%s_matrix_indices" % type_str), 'ascii')] + indptr.attrs["type"] = [bytes(str("%s_matrix_indptr" % type_str), 'ascii')] + shape.attrs["type"] = [bytes(str("%s_matrix_shape" % type_str), 'ascii')] + +def load_sparse_matrix_data(h_node): + + py_type, data = get_type_and_data(h_node) + h_root = h_node.parent + indices = h_root.get('indices')[:] + indptr = h_root.get('indptr')[:] + shape = h_root.get('shape')[:] + + if py_type == b'csc_matrix_data': + smat = sparse.csc_matrix((data, indices, indptr), dtype=data.dtype, shape=shape) + elif py_type == b'csr_matrix_data': + smat = sparse.csr_matrix((data, indices, indptr), dtype=data.dtype, shape=shape) + elif py_type == b'bsr_matrix_data': + smat = sparse.bsr_matrix((data, indices, indptr), dtype=data.dtype, shape=shape) + return smat + + + + + +class_register = [ + [scipy.sparse.csr_matrix, b'csr_matrix_data', create_sparse_dataset, load_sparse_matrix_data, False, check_is_scipy_sparse_array], + [scipy.sparse.csc_matrix, b'csc_matrix_data', create_sparse_dataset, load_sparse_matrix_data, False, check_is_scipy_sparse_array], + [scipy.sparse.bsr_matrix, b'bsr_matrix_data', create_sparse_dataset, load_sparse_matrix_data, False, check_is_scipy_sparse_array], +] + +exclude_register = [] + +# Need to ignore things like csc_matrix_indices which are loaded automatically +for mat_type in ('csr', 'csc', 'bsr'): + for attrib in ('indices', 'indptr', 'shape'): + hkl_key = "%s_matrix_%s" % (mat_type, attrib) + if not six.PY2: + hkl_key = hkl_key.encode('ascii') + exclude_register.append(hkl_key) diff --git a/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/lookup.py b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/lookup.py new file mode 100755 index 0000000000000000000000000000000000000000..99d13df9315be642540e46efc44d8e3d293de708 --- /dev/null +++ b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/hickle/lookup.py @@ -0,0 +1,238 @@ +""" +#lookup.py + +This file contains all the mappings between hickle/HDF5 metadata and python types. +There are four dictionaries and one set that are populated here: + +1) types_dict +types_dict: mapping between python types and dataset creation functions, e.g. 
+ types_dict = { + list: create_listlike_dataset, + int: create_python_dtype_dataset, + np.ndarray: create_np_array_dataset + } + +2) hkl_types_dict +hkl_types_dict: mapping between hickle metadata and dataset loading functions, e.g. + hkl_types_dict = { + "<type 'list'>" : load_list_dataset, + "<type 'tuple'>" : load_tuple_dataset + } + +3) container_types_dict +container_types_dict: mapping required to convert the PyContainer object in hickle.py + back into the required native type. PyContainer is required as + some iterable types are immutable (do not have an append() function). + Here is an example: + container_types_dict = { + "<type 'list'>": list, + "<type 'tuple'>": tuple + } + +4) container_key_types_dict +container_key_types_dict: mapping specifically for converting hickled dict data back into + a dictionary with the same key type. While python dictionary keys + can be any hashable object, in HDF5 a unicode/string is required + for a dataset name. Example: + container_key_types_dict = { + "<type 'str'>": str, + "<type 'unicode'>": unicode + } + +5) types_not_to_sort +type_not_to_sort is a list of hickle type attributes that may be hierarchical, +but don't require sorting by integer index. + +## Extending hickle to add support for other classes and types + +The process to add new load/dump capabilities is as follows: + +1) Create a file called load_[newstuff].py in loaders/ +2) In the load_[newstuff].py file, define your create_dataset and load_dataset functions, + along with all required mapping dictionaries. +3) Add an import call here, and populate the lookup dictionaries with update() calls: + # Add loaders for [newstuff] + try: + from .loaders.load_[newstuff[ import types_dict as ns_types_dict + from .loaders.load_[newstuff[ import hkl_types_dict as ns_hkl_types_dict + types_dict.update(ns_types_dict) + hkl_types_dict.update(ns_hkl_types_dict) + ... 
(Add container_types_dict etc if required) + except ImportError: + raise +""" + +import six +from ast import literal_eval + +def return_first(x): + """ Return first element of a list """ + return x[0] + +def load_nothing(h_hode): + pass + +types_dict = {} + +hkl_types_dict = {} + +types_not_to_sort = [b'dict', b'csr_matrix', b'csc_matrix', b'bsr_matrix'] + +container_types_dict = { + b"<type 'list'>": list, + b"<type 'tuple'>": tuple, + b"<type 'set'>": set, + b"<class 'list'>": list, + b"<class 'tuple'>": tuple, + b"<class 'set'>": set, + b"csr_matrix": return_first, + b"csc_matrix": return_first, + b"bsr_matrix": return_first + } + +# Technically, any hashable object can be used, for now sticking with built-in types +container_key_types_dict = { + b"<type 'str'>": literal_eval, + b"<type 'float'>": float, + b"<type 'bool'>": bool, + b"<type 'int'>": int, + b"<type 'complex'>": complex, + b"<type 'tuple'>": literal_eval, + b"<class 'str'>": literal_eval, + b"<class 'float'>": float, + b"<class 'bool'>": bool, + b"<class 'int'>": int, + b"<class 'complex'>": complex, + b"<class 'tuple'>": literal_eval + } + +if six.PY2: + container_key_types_dict[b"<type 'unicode'>"] = literal_eval + container_key_types_dict[b"<type 'long'>"] = long + +# Add loaders for built-in python types +if six.PY2: + from .loaders.load_python import types_dict as py_types_dict + from .loaders.load_python import hkl_types_dict as py_hkl_types_dict +else: + from .loaders.load_python3 import types_dict as py_types_dict + from .loaders.load_python3 import hkl_types_dict as py_hkl_types_dict + +types_dict.update(py_types_dict) +hkl_types_dict.update(py_hkl_types_dict) + +# Add loaders for numpy types +from .loaders.load_numpy import types_dict as np_types_dict +from .loaders.load_numpy import hkl_types_dict as np_hkl_types_dict +from .loaders.load_numpy import check_is_numpy_array +types_dict.update(np_types_dict) +hkl_types_dict.update(np_hkl_types_dict) + +####################### +## ND-ARRAY checking ## +####################### + +ndarray_like_check_fns = [ + check_is_numpy_array +] + +def check_is_ndarray_like(py_obj): + is_ndarray_like = False + for ii, check_fn in enumerate(ndarray_like_check_fns): + is_ndarray_like = check_fn(py_obj) + if is_ndarray_like: + break + return is_ndarray_like + + + + +####################### +## loading optional ## +####################### + +def register_class(myclass_type, hkl_str, dump_function, load_function, + to_sort=True, ndarray_check_fn=None): + """ Register a new hickle class. + + Args: + myclass_type type(class): type of class + dump_function (function def): function to write data to HDF5 + load_function (function def): function to load data from HDF5 + is_iterable (bool): Is the item iterable? + hkl_str (str): String to write to HDF5 file to describe class + to_sort (bool): If the item is iterable, does it require sorting? + ndarray_check_fn (function def): function to use to check if + + """ + types_dict.update({myclass_type: dump_function}) + hkl_types_dict.update({hkl_str: load_function}) + if to_sort == False: + types_not_to_sort.append(hkl_str) + if ndarray_check_fn is not None: + ndarray_like_check_fns.append(ndarray_check_fn) + +def register_class_list(class_list): + """ Register multiple classes in a list + + Args: + class_list (list): A list, where each item is an argument to + the register_class() function. 
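As a concrete companion to the extension recipe in the module docstring: the snippet below is only a sketch, the class Point and the b'point' tag are hypothetical, but the dump/load signatures mirror the loaders in this package and the call matches the register_class() signature defined above.

import numpy as np

class Point(object):
    """ Toy class used only to illustrate register_class(). """
    def __init__(self, x, y):
        self.x, self.y = x, y

def create_point_dataset(py_obj, h_group, call_id=0, **kwargs):
    # store both coordinates in one small dataset and tag it with the type string
    d = h_group.create_dataset('data_%i' % call_id, data=np.array([py_obj.x, py_obj.y]))
    d.attrs['type'] = [b'point']

def load_point_dataset(h_node):
    x, y = h_node[()]                 # read the two-element dataset back
    return Point(x, y)

# register_class(myclass_type, hkl_str, dump_function, load_function, to_sort, ndarray_check_fn)
register_class(Point, b'point', create_point_dataset, load_point_dataset, True, None)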
+ + Notes: This just runs the code: + for item in mylist: + register_class(*item) + """ + for class_item in class_list: + register_class(*class_item) + +def register_class_exclude(hkl_str_to_ignore): + """ Tell loading funciton to ignore any HDF5 dataset with attribute 'type=XYZ' + + Args: + hkl_str_to_ignore (str): attribute type=string to ignore and exclude from loading. + """ + hkl_types_dict[hkl_str_to_ignore] = load_nothing + +def register_exclude_list(exclude_list): + """ Ignore HDF5 datasets with attribute type='XYZ' from loading + + ArgsL + exclude_list (list): List of strings, which correspond to hdf5/hickle + type= attributes not to load. + """ + for hkl_str in exclude_list: + register_class_exclude(hkl_str) + +######################## +## Scipy sparse array ## +######################## + +try: + from .loaders.load_scipy import class_register, exclude_register + register_class_list(class_register) + register_exclude_list(exclude_register) +except ImportError: + pass +except NameError: + pass + +#################### +## Astropy stuff ## +#################### + +try: + from .loaders.load_astropy import class_register + register_class_list(class_register) +except ImportError: + pass + +################## +## Pandas stuff ## +################## + +try: + from .loaders.load_pandas import class_register + register_class_list(class_register) +except ImportError: + pass diff --git a/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/tests/__init__.py b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/tests/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/tests/__pycache__/__init__.cpython-36.pyc b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/tests/__pycache__/__init__.cpython-36.pyc new file mode 100755 index 0000000000000000000000000000000000000000..86d97d222a8780ca54c672085ff548dfa6a28be0 Binary files /dev/null and b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/tests/__pycache__/test_astropy.cpython-36.pyc b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/tests/__pycache__/test_astropy.cpython-36.pyc new file mode 100755 index 0000000000000000000000000000000000000000..8c672a706793402cf52bedb977e67f6a6b91ef51 Binary files /dev/null and b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/tests/__pycache__/test_astropy.cpython-36.pyc differ diff --git a/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/tests/__pycache__/test_hickle.cpython-36.pyc b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/tests/__pycache__/test_hickle.cpython-36.pyc new file mode 100755 index 0000000000000000000000000000000000000000..046d30d0527994b989f6c055be5a3cb573b6163f Binary files /dev/null and b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/tests/__pycache__/test_hickle.cpython-36.pyc 
differ diff --git a/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/tests/__pycache__/test_hickle_helpers.cpython-36.pyc b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/tests/__pycache__/test_hickle_helpers.cpython-36.pyc new file mode 100755 index 0000000000000000000000000000000000000000..d10d047823c6177abba7330f72046787d86a5e46 Binary files /dev/null and b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/tests/__pycache__/test_hickle_helpers.cpython-36.pyc differ diff --git a/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/tests/__pycache__/test_legacy_load.cpython-36.pyc b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/tests/__pycache__/test_legacy_load.cpython-36.pyc new file mode 100755 index 0000000000000000000000000000000000000000..387c9076b34bdbd560ab83a5798a241b2adc1ece Binary files /dev/null and b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/tests/__pycache__/test_legacy_load.cpython-36.pyc differ diff --git a/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/tests/__pycache__/test_scipy.cpython-36.pyc b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/tests/__pycache__/test_scipy.cpython-36.pyc new file mode 100755 index 0000000000000000000000000000000000000000..6f7dbb8741560fc88a3fb0bd2356c226bae2cd69 Binary files /dev/null and b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/tests/__pycache__/test_scipy.cpython-36.pyc differ diff --git a/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/tests/test_astropy.py b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/tests/test_astropy.py new file mode 100755 index 0000000000000000000000000000000000000000..2086ec37456b2bbcde77fbed2d5370b67ee89381 --- /dev/null +++ b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/tests/test_astropy.py @@ -0,0 +1,133 @@ +import hickle as hkl +from astropy.units import Quantity +from astropy.time import Time +from astropy.coordinates import Angle, SkyCoord +from astropy.constants import Constant, EMConstant, G +from astropy.table import Table +import numpy as np +from py.path import local + +# Set the current working directory to the temporary directory +local.get_temproot().chdir() + +def test_astropy_quantity(): + + for uu in ['m^3', 'm^3 / s', 'kg/pc']: + a = Quantity(7, unit=uu) + + hkl.dump(a, "test_ap.h5") + b = hkl.load("test_ap.h5") + + assert a == b + assert a.unit == b.unit + + a *= a + hkl.dump(a, "test_ap.h5") + b = hkl.load("test_ap.h5") + assert a == b + assert a.unit == b.unit + +def TODO_test_astropy_constant(): + hkl.dump(G, "test_ap.h5") + gg = hkl.load("test_ap.h5") + + print(G) + print(gg) + +def test_astropy_table(): + t = Table([[1, 2], [3, 4]], names=('a', 'b'), meta={'name': 'test_thing'}) + + hkl.dump({'a': t}, "test_ap.h5") + t2 = hkl.load("test_ap.h5")['a'] + + print(t) + print(t.meta) + print(t2) + print(t2.meta) + + print(t.dtype, t2.dtype) + assert t.meta == t2.meta + assert t.dtype == t2.dtype + + assert 
np.allclose(t['a'].astype('float32'), t2['a'].astype('float32')) + assert np.allclose(t['b'].astype('float32'), t2['b'].astype('float32')) + +def test_astropy_quantity_array(): + a = Quantity([1,2,3], unit='m') + + hkl.dump(a, "test_ap.h5") + b = hkl.load("test_ap.h5") + + assert np.allclose(a.value, b.value) + assert a.unit == b.unit + +def test_astropy_time_array(): + times = ['1999-01-01T00:00:00.123456789', '2010-01-01T00:00:00'] + t1 = Time(times, format='isot', scale='utc') + hkl.dump(t1, "test_ap2.h5") + t2 = hkl.load("test_ap2.h5") + + print(t1) + print(t2) + assert t1.value.shape == t2.value.shape + for ii in range(len(t1)): + assert t1.value[ii] == t2.value[ii] + assert t1.format == t2.format + assert t1.scale == t2.scale + + times = [58264, 58265, 58266] + t1 = Time(times, format='mjd', scale='utc') + hkl.dump(t1, "test_ap2.h5") + t2 = hkl.load("test_ap2.h5") + + print(t1) + print(t2) + assert t1.value.shape == t2.value.shape + assert np.allclose(t1.value, t2.value) + assert t1.format == t2.format + assert t1.scale == t2.scale + +def test_astropy_angle(): + for uu in ['radian', 'degree']: + a = Angle(1.02, unit=uu) + + hkl.dump(a, "test_ap.h5") + b = hkl.load("test_ap.h5") + assert a == b + assert a.unit == b.unit + +def test_astropy_angle_array(): + a = Angle([1,2,3], unit='degree') + + hkl.dump(a, "test_ap.h5") + b = hkl.load("test_ap.h5") + + assert np.allclose(a.value, b.value) + assert a.unit == b.unit + +def test_astropy_skycoord(): + ra = Angle(['1d20m', '1d21m'], unit='degree') + dec = Angle(['33d0m0s', '33d01m'], unit='degree') + radec = SkyCoord(ra, dec) + hkl.dump(radec, "test_ap.h5") + radec2 = hkl.load("test_ap.h5") + assert np.allclose(radec.ra.value, radec2.ra.value) + assert np.allclose(radec.dec.value, radec2.dec.value) + + ra = Angle(['1d20m', '1d21m'], unit='hourangle') + dec = Angle(['33d0m0s', '33d01m'], unit='degree') + radec = SkyCoord(ra, dec) + hkl.dump(radec, "test_ap.h5") + radec2 = hkl.load("test_ap.h5") + assert np.allclose(radec.ra.value, radec2.ra.value) + assert np.allclose(radec.dec.value, radec2.dec.value) + +if __name__ == "__main__": + test_astropy_quantity() + #test_astropy_constant() + test_astropy_table() + test_astropy_quantity_array() + test_astropy_time_array() + test_astropy_angle() + test_astropy_angle_array() + test_astropy_skycoord() diff --git a/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/tests/test_hickle.py b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/tests/test_hickle.py new file mode 100755 index 0000000000000000000000000000000000000000..5491054239372a3b5d42c9e6f07b6fc5701ed933 --- /dev/null +++ b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/tests/test_hickle.py @@ -0,0 +1,826 @@ +#! /usr/bin/env python +# encoding: utf-8 +""" +# test_hickle.py + +Unit tests for hickle module. 
+ +""" + +import h5py +import hashlib +import numpy as np +import os +import six +import time +from pprint import pprint + +from py.path import local + +import hickle +from hickle.hickle import * + + +# Set current working directory to the temporary directory +local.get_temproot().chdir() + +NESTED_DICT = { + "level1_1": { + "level2_1": [1, 2, 3], + "level2_2": [4, 5, 6] + }, + "level1_2": { + "level2_1": [1, 2, 3], + "level2_2": [4, 5, 6] + }, + "level1_3": { + "level2_1": { + "level3_1": [1, 2, 3], + "level3_2": [4, 5, 6] + }, + "level2_2": [4, 5, 6] + } +} + +DUMP_CACHE = [] # Used in test_track_times() + + +def test_string(): + """ Dumping and loading a string """ + if six.PY2: + filename, mode = 'test.h5', 'w' + string_obj = "The quick brown fox jumps over the lazy dog" + dump(string_obj, filename, mode) + string_hkl = load(filename) + #print "Initial list: %s"%list_obj + #print "Unhickled data: %s"%list_hkl + assert type(string_obj) == type(string_hkl) == str + assert string_obj == string_hkl + else: + pass + + +def test_unicode(): + """ Dumping and loading a unicode string """ + if six.PY2: + filename, mode = 'test.h5', 'w' + u = unichr(233) + unichr(0x0bf2) + unichr(3972) + unichr(6000) + dump(u, filename, mode) + u_hkl = load(filename) + + assert type(u) == type(u_hkl) == unicode + assert u == u_hkl + # For those interested, uncomment below to see what those codes are: + # for i, c in enumerate(u_hkl): + # print i, '%04x' % ord(c), unicodedata.category(c), + # print unicodedata.name(c) + else: + pass + + +def test_unicode2(): + if six.PY2: + a = u"unicode test" + dump(a, 'test.hkl', mode='w') + + z = load('test.hkl') + assert a == z + assert type(a) == type(z) == unicode + pprint(z) + else: + pass + +def test_list(): + """ Dumping and loading a list """ + filename, mode = 'test_list.h5', 'w' + list_obj = [1, 2, 3, 4, 5] + dump(list_obj, filename, mode=mode) + list_hkl = load(filename) + #print(f'Initial list: {list_obj}') + #print(f'Unhickled data: {list_hkl}') + try: + assert type(list_obj) == type(list_hkl) == list + assert list_obj == list_hkl + import h5py + a = h5py.File(filename) + a.close() + + except AssertionError: + print("ERR:", list_obj, list_hkl) + import h5py + + raise() + + +def test_set(): + """ Dumping and loading a list """ + filename, mode = 'test_set.h5', 'w' + list_obj = set([1, 0, 3, 4.5, 11.2]) + dump(list_obj, filename, mode) + list_hkl = load(filename) + #print "Initial list: %s"%list_obj + #print "Unhickled data: %s"%list_hkl + try: + assert type(list_obj) == type(list_hkl) == set + assert list_obj == list_hkl + except AssertionError: + print(type(list_obj)) + print(type(list_hkl)) + #os.remove(filename) + raise + + +def test_numpy(): + """ Dumping and loading numpy array """ + filename, mode = 'test.h5', 'w' + dtypes = ['float32', 'float64', 'complex64', 'complex128'] + + for dt in dtypes: + array_obj = np.ones(8, dtype=dt) + dump(array_obj, filename, mode) + array_hkl = load(filename) + try: + assert array_hkl.dtype == array_obj.dtype + assert np.all((array_hkl, array_obj)) + except AssertionError: + print(array_hkl) + print(array_obj) + raise + + +def test_masked(): + """ Test masked numpy array """ + filename, mode = 'test.h5', 'w' + a = np.ma.array([1,2,3,4], dtype='float32', mask=[0,1,0,0]) + + dump(a, filename, mode) + a_hkl = load(filename) + + try: + assert a_hkl.dtype == a.dtype + assert np.all((a_hkl, a)) + except AssertionError: + print(a_hkl) + print(a) + raise + + +def test_dict(): + """ Test dictionary dumping and loading """ + filename, 
mode = 'test.h5', 'w' + + dd = { + 'name' : b'Danny', + 'age' : 28, + 'height' : 6.1, + 'dork' : True, + 'nums' : [1, 2, 3], + 'narr' : np.array([1,2,3]), + #'unic' : u'dan[at]thetelegraphic.com' + } + + + dump(dd, filename, mode) + dd_hkl = load(filename) + + for k in dd.keys(): + try: + assert k in dd_hkl.keys() + + if type(dd[k]) is type(np.array([1])): + assert np.all((dd[k], dd_hkl[k])) + else: + #assert dd_hkl[k] == dd[k] + pass + assert type(dd_hkl[k]) == type(dd[k]) + except AssertionError: + print(k) + print(dd_hkl[k]) + print(dd[k]) + print(type(dd_hkl[k]), type(dd[k])) + raise + + +def test_empty_dict(): + """ Test empty dictionary dumping and loading """ + filename, mode = 'test.h5', 'w' + + dump({}, filename, mode) + assert load(filename) == {} + + +def test_compression(): + """ Test compression on datasets""" + + filename, mode = 'test.h5', 'w' + dtypes = ['int32', 'float32', 'float64', 'complex64', 'complex128'] + + comps = [None, 'gzip', 'lzf'] + + for dt in dtypes: + for cc in comps: + array_obj = np.ones(32768, dtype=dt) + dump(array_obj, filename, mode, compression=cc) + print(cc, os.path.getsize(filename)) + array_hkl = load(filename) + try: + assert array_hkl.dtype == array_obj.dtype + assert np.all((array_hkl, array_obj)) + except AssertionError: + print(array_hkl) + print(array_obj) + raise + + +def test_dict_int_key(): + """ Test for dictionaries with integer keys """ + filename, mode = 'test.h5', 'w' + + dd = { + 0: "test", + 1: "test2" + } + + dump(dd, filename, mode) + dd_hkl = load(filename) + + +def test_dict_nested(): + """ Test for dictionaries with integer keys """ + filename, mode = 'test.h5', 'w' + + dd = NESTED_DICT + + dump(dd, filename, mode) + dd_hkl = load(filename) + + ll_hkl = dd_hkl["level1_3"]["level2_1"]["level3_1"] + ll = dd["level1_3"]["level2_1"]["level3_1"] + assert ll == ll_hkl + + +def test_masked_dict(): + """ Test dictionaries with masked arrays """ + + filename, mode = 'test.h5', 'w' + + dd = { + "data" : np.ma.array([1,2,3], mask=[True, False, False]), + "data2" : np.array([1,2,3,4,5]) + } + + dump(dd, filename, mode) + dd_hkl = load(filename) + + for k in dd.keys(): + try: + assert k in dd_hkl.keys() + if type(dd[k]) is type(np.array([1])): + assert np.all((dd[k], dd_hkl[k])) + elif type(dd[k]) is type(np.ma.array([1])): + print(dd[k].data) + print(dd_hkl[k].data) + assert np.allclose(dd[k].data, dd_hkl[k].data) + assert np.allclose(dd[k].mask, dd_hkl[k].mask) + + assert type(dd_hkl[k]) == type(dd[k]) + + except AssertionError: + print(k) + print(dd_hkl[k]) + print(dd[k]) + print(type(dd_hkl[k]), type(dd[k])) + raise + + +def test_np_float(): + """ Test for singular np dtypes """ + filename, mode = 'np_float.h5', 'w' + + dtype_list = (np.float16, np.float32, np.float64, + np.complex64, np.complex128, + np.int8, np.int16, np.int32, np.int64, + np.uint8, np.uint16, np.uint32, np.uint64) + + for dt in dtype_list: + + dd = dt(1) + dump(dd, filename, mode) + dd_hkl = load(filename) + assert dd == dd_hkl + assert dd.dtype == dd_hkl.dtype + + dd = {} + for dt in dtype_list: + dd[str(dt)] = dt(1.0) + dump(dd, filename, mode) + dd_hkl = load(filename) + + print(dd) + for dt in dtype_list: + assert dd[str(dt)] == dd_hkl[str(dt)] + + +def md5sum(filename, blocksize=65536): + """ Compute MD5 sum for a given file """ + hash = hashlib.md5() + + with open(filename, "r+b") as f: + for block in iter(lambda: f.read(blocksize), ""): + hash.update(block) + return hash.hexdigest() + + +def caching_dump(obj, filename, *args, **kwargs): + """ Save arguments 
of all dump calls """ + DUMP_CACHE.append((obj, filename, args, kwargs)) + return hickle_dump(obj, filename, *args, **kwargs) + + +def test_track_times(): + """ Verify that track_times = False produces identical files """ + hashes = [] + for obj, filename, mode, kwargs in DUMP_CACHE: + if isinstance(filename, hickle.H5FileWrapper): + filename = str(filename.file_name) + kwargs['track_times'] = False + caching_dump(obj, filename, mode, **kwargs) + hashes.append(md5sum(filename)) + + time.sleep(1) + + for hash1, (obj, filename, mode, kwargs) in zip(hashes, DUMP_CACHE): + if isinstance(filename, hickle.H5FileWrapper): + filename = str(filename.file_name) + caching_dump(obj, filename, mode, **kwargs) + hash2 = md5sum(filename) + print(hash1, hash2) + assert hash1 == hash2 + + +def test_comp_kwargs(): + """ Test compression with some kwargs for shuffle and chunking """ + + filename, mode = 'test.h5', 'w' + dtypes = ['int32', 'float32', 'float64', 'complex64', 'complex128'] + + comps = [None, 'gzip', 'lzf'] + chunks = [(100, 100), (250, 250)] + shuffles = [True, False] + scaleoffsets = [0, 1, 2] + + for dt in dtypes: + for cc in comps: + for ch in chunks: + for sh in shuffles: + for so in scaleoffsets: + kwargs = { + 'compression' : cc, + 'dtype': dt, + 'chunks': ch, + 'shuffle': sh, + 'scaleoffset': so + } + #array_obj = np.random.random_integers(low=-8192, high=8192, size=(1000, 1000)).astype(dt) + array_obj = NESTED_DICT + dump(array_obj, filename, mode, compression=cc) + print(kwargs, os.path.getsize(filename)) + array_hkl = load(filename) + + +def test_list_numpy(): + """ Test converting a list of numpy arrays """ + + filename, mode = 'test.h5', 'w' + + a = np.ones(1024) + b = np.zeros(1000) + c = [a, b] + + dump(c, filename, mode) + dd_hkl = load(filename) + + print(dd_hkl) + + assert isinstance(dd_hkl, list) + assert isinstance(dd_hkl[0], np.ndarray) + + +def test_tuple_numpy(): + """ Test converting a list of numpy arrays """ + + filename, mode = 'test.h5', 'w' + + a = np.ones(1024) + b = np.zeros(1000) + c = (a, b, a) + + dump(c, filename, mode) + dd_hkl = load(filename) + + print(dd_hkl) + + assert isinstance(dd_hkl, tuple) + assert isinstance(dd_hkl[0], np.ndarray) + + +def test_none(): + """ Test None type hickling """ + + filename, mode = 'test.h5', 'w' + + a = None + + dump(a, filename, mode) + dd_hkl = load(filename) + print(a) + print(dd_hkl) + + assert isinstance(dd_hkl, type(None)) + + +def test_dict_none(): + """ Test None type hickling """ + + filename, mode = 'test.h5', 'w' + + a = {'a': 1, 'b' : None} + + dump(a, filename, mode) + dd_hkl = load(filename) + print(a) + print(dd_hkl) + + assert isinstance(a['b'], type(None)) + + +def test_file_open_close(): + """ https://github.com/telegraphic/hickle/issues/20 """ + import h5py + f = h5py.File('test.hdf', 'w') + a = np.arange(5) + + dump(a, 'test.hkl') + dump(a, 'test.hkl') + + dump(a, f, mode='w') + f.close() + try: + dump(a, f, mode='w') + except hickle.hickle.ClosedFileError: + print("Tests: Closed file exception caught") + + +def test_list_order(): + """ https://github.com/telegraphic/hickle/issues/26 """ + d = [np.arange(n + 1) for n in range(20)] + hickle.dump(d, 'test.h5') + d_hkl = hickle.load('test.h5') + + try: + for ii, xx in enumerate(d): + assert d[ii].shape == d_hkl[ii].shape + for ii, xx in enumerate(d): + assert np.allclose(d[ii], d_hkl[ii]) + except AssertionError: + print(d[ii], d_hkl[ii]) + raise + + +def test_embedded_array(): + """ See https://github.com/telegraphic/hickle/issues/24 """ + + d_orig = 
[[np.array([10., 20.]), np.array([10, 20, 30])], [np.array([10, 2]), np.array([1.])]] + hickle.dump(d_orig, 'test.h5') + d_hkl = hickle.load('test.h5') + + for ii, xx in enumerate(d_orig): + for jj, yy in enumerate(xx): + assert np.allclose(d_orig[ii][jj], d_hkl[ii][jj]) + + print(d_hkl) + print(d_orig) + + +################ +## NEW TESTS ## +################ + + +def generate_nested(): + a = [1, 2, 3] + b = [a, a, a] + c = [a, b, 's'] + d = [a, b, c, c, a] + e = [d, d, d, d, 1] + f = {'a' : a, 'b' : b, 'e' : e} + g = {'f' : f, 'a' : e, 'd': d} + h = {'h': g, 'g' : f} + z = [f, a, b, c, d, e, f, g, h, g, h] + a = np.array([1, 2, 3, 4]) + b = set([1, 2, 3, 4, 5]) + c = (1, 2, 3, 4, 5) + d = np.ma.array([1, 2, 3, 4, 5, 6, 7, 8]) + z = {'a': a, 'b': b, 'c': c, 'd': d, 'z': z} + return z + + +def test_is_iterable(): + a = [1, 2, 3] + b = 1 + + assert check_is_iterable(a) == True + assert check_is_iterable(b) == False + + +def test_check_iterable_item_type(): + + a = [1, 2, 3] + b = [a, a, a] + c = [a, b, 's'] + + type_a = check_iterable_item_type(a) + type_b = check_iterable_item_type(b) + type_c = check_iterable_item_type(c) + + assert type_a is int + assert type_b is list + assert type_c == False + + +def test_dump_nested(): + """ Dump a complicated nested object to HDF5 + """ + z = generate_nested() + dump(z, 'test.hkl', mode='w') + + +def test_with_dump(): + lst = [1] + tpl = (1) + dct = {1: 1} + arr = np.array([1]) + + with h5py.File('test.hkl') as file: + dump(lst, file, path='/lst') + dump(tpl, file, path='/tpl') + dump(dct, file, path='/dct') + dump(arr, file, path='/arr') + + +def test_with_load(): + lst = [1] + tpl = (1) + dct = {1: 1} + arr = np.array([1]) + + with h5py.File('test.hkl') as file: + assert load(file, '/lst') == lst + assert load(file, '/tpl') == tpl + assert load(file, '/dct') == dct + assert load(file, '/arr') == arr + + +def test_load(): + + a = set([1, 2, 3, 4]) + b = set([5, 6, 7, 8]) + c = set([9, 10, 11, 12]) + z = (a, b, c) + z = [z, z] + z = (z, z, z, z, z) + + print("Original:") + pprint(z) + dump(z, 'test.hkl', mode='w') + + print("\nReconstructed:") + z = load('test.hkl') + pprint(z) + + +def test_sort_keys(): + keys = [b'data_0', b'data_1', b'data_2', b'data_3', b'data_10'] + keys_sorted = [b'data_0', b'data_1', b'data_2', b'data_3', b'data_10'] + + print(keys) + print(keys_sorted) + assert sort_keys(keys) == keys_sorted + + +def test_ndarray(): + + a = np.array([1,2,3]) + b = np.array([2,3,4]) + z = (a, b) + + print("Original:") + pprint(z) + dump(z, 'test.hkl', mode='w') + + print("\nReconstructed:") + z = load('test.hkl') + pprint(z) + + +def test_ndarray_masked(): + + a = np.ma.array([1,2,3]) + b = np.ma.array([2,3,4], mask=[True, False, True]) + z = (a, b) + + print("Original:") + pprint(z) + dump(z, 'test.hkl', mode='w') + + print("\nReconstructed:") + z = load('test.hkl') + pprint(z) + + +def test_simple_dict(): + a = {'key1': 1, 'key2': 2} + + dump(a, 'test.hkl') + z = load('test.hkl') + + pprint(a) + pprint(z) + + +def test_complex_dict(): + a = {'akey': 1, 'akey2': 2} + if six.PY2: + # NO LONG TYPE IN PY3! 
+ b = {'bkey': 2.0, 'bkey3': long(3.0)} + else: + b = a + c = {'ckey': "hello", "ckey2": "hi there"} + z = {'zkey1': a, 'zkey2': b, 'zkey3': c} + + print("Original:") + pprint(z) + dump(z, 'test.hkl', mode='w') + + print("\nReconstructed:") + z = load('test.hkl') + pprint(z) + +def test_multi_hickle(): + a = {'a': 123, 'b': [1, 2, 4]} + + if os.path.exists("test.hkl"): + os.remove("test.hkl") + dump(a, "test.hkl", path="/test", mode="w") + dump(a, "test.hkl", path="/test2", mode="r+") + dump(a, "test.hkl", path="/test3", mode="r+") + dump(a, "test.hkl", path="/test4", mode="r+") + + a = load("test.hkl", path="/test") + b = load("test.hkl", path="/test2") + c = load("test.hkl", path="/test3") + d = load("test.hkl", path="/test4") + +def test_complex(): + """ Test complex value dtype is handled correctly + + https://github.com/telegraphic/hickle/issues/29 """ + + data = {"A":1.5, "B":1.5 + 1j, "C":np.linspace(0,1,4) + 2j} + dump(data, "test.hkl") + data2 = load("test.hkl") + for key in data.keys(): + assert type(data[key]) == type(data2[key]) + +def test_nonstring_keys(): + """ Test that keys are reconstructed back to their original datatypes + https://github.com/telegraphic/hickle/issues/36 + """ + if six.PY2: + u = unichr(233) + unichr(0x0bf2) + unichr(3972) + unichr(6000) + + data = {u'test': 123, + 'def': 456, + 'hik' : np.array([1,2,3]), + u: u, + 0: 0, + True: 'hi', + 1.1 : 'hey', + #2L : 'omg', + 1j: 'complex_hashable', + (1, 2): 'boo', + ('A', 17.4, 42): [1, 7, 'A'], + (): '1313e was here', + '0': 0 + } + #data = {'0': 123, 'def': 456} + print(data) + dump(data, "test.hkl") + data2 = load("test.hkl") + print(data2) + + for key in data.keys(): + assert key in data2.keys() + + print(data2) + else: + pass + +def test_scalar_compression(): + """ Test bug where compression causes a crash on scalar datasets + + (Scalars are incompressible!) + https://github.com/telegraphic/hickle/issues/37 + """ + data = {'a' : 0, 'b' : np.float(2), 'c' : True} + + dump(data, "test.hkl", compression='gzip') + data2 = load("test.hkl") + + print(data2) + for key in data.keys(): + assert type(data[key]) == type(data2[key]) + +def test_bytes(): + """ Dumping and loading a string. 
PYTHON3 ONLY """ + if six.PY3: + filename, mode = 'test.h5', 'w' + string_obj = b"The quick brown fox jumps over the lazy dog" + dump(string_obj, filename, mode) + string_hkl = load(filename) + #print "Initial list: %s"%list_obj + #print "Unhickled data: %s"%list_hkl + print(type(string_obj)) + print(type(string_hkl)) + assert type(string_obj) == type(string_hkl) == bytes + assert string_obj == string_hkl + else: + pass + +def test_np_scalar(): + """ Numpy scalar datatype + + https://github.com/telegraphic/hickle/issues/50 + """ + + fid='test.h5py' + r0={'test': np.float64(10.)} + s = dump(r0, fid) + r = load(fid) + print(r) + assert type(r0['test']) == type(r['test']) + +if __name__ == '__main__': + """ Some tests and examples """ + test_sort_keys() + + test_np_scalar() + test_scalar_compression() + test_complex() + test_file_open_close() + test_dict_none() + test_none() + test_masked_dict() + test_list() + test_set() + test_numpy() + test_dict() + test_empty_dict() + test_compression() + test_masked() + test_dict_nested() + test_comp_kwargs() + test_list_numpy() + test_tuple_numpy() + test_track_times() + test_list_order() + test_embedded_array() + test_np_float() + + if six.PY2: + test_unicode() + test_unicode2() + test_string() + test_nonstring_keys() + + if six.PY3: + test_bytes() + + + # NEW TESTS + test_is_iterable() + test_check_iterable_item_type() + test_dump_nested() + test_with_dump() + test_with_load() + test_load() + test_sort_keys() + test_ndarray() + test_ndarray_masked() + test_simple_dict() + test_complex_dict() + test_multi_hickle() + test_dict_int_key() + + # Cleanup + print("ALL TESTS PASSED!") \ No newline at end of file diff --git a/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/tests/test_hickle_helpers.py b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/tests/test_hickle_helpers.py new file mode 100755 index 0000000000000000000000000000000000000000..253839e97c96e484b7a66ad9d174648d281d1c66 --- /dev/null +++ b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/tests/test_hickle_helpers.py @@ -0,0 +1,63 @@ +#! /usr/bin/env python +# encoding: utf-8 +""" +# test_hickle_helpers.py + +Unit tests for hickle module -- helper functions. 
+ +""" + +import numpy as np +try: + import scipy + from scipy import sparse + _has_scipy = True +except ImportError: + _has_scipy = False + +from hickle.helpers import check_is_hashable, check_is_iterable, check_iterable_item_type + +from hickle.loaders.load_numpy import check_is_numpy_array +if _has_scipy: + from hickle.loaders.load_scipy import check_is_scipy_sparse_array + + + +def test_check_is_iterable(): + assert check_is_iterable([1,2,3]) is True + assert check_is_iterable(1) is False + + +def test_check_is_hashable(): + assert check_is_hashable(1) is True + assert check_is_hashable([1,2,3]) is False + + +def test_check_iterable_item_type(): + assert check_iterable_item_type([1,2,3]) is int + assert check_iterable_item_type([int(1), float(1)]) is False + assert check_iterable_item_type([]) is False + + +def test_check_is_numpy_array(): + assert check_is_numpy_array(np.array([1,2,3])) is True + assert check_is_numpy_array(np.ma.array([1,2,3])) is True + assert check_is_numpy_array([1,2]) is False + + +def test_check_is_scipy_sparse_array(): + t_csr = scipy.sparse.csr_matrix([0]) + t_csc = scipy.sparse.csc_matrix([0]) + t_bsr = scipy.sparse.bsr_matrix([0]) + assert check_is_scipy_sparse_array(t_csr) is True + assert check_is_scipy_sparse_array(t_csc) is True + assert check_is_scipy_sparse_array(t_bsr) is True + assert check_is_scipy_sparse_array(np.array([1])) is False + +if __name__ == "__main__": + test_check_is_hashable() + test_check_is_iterable() + test_check_is_numpy_array() + test_check_iterable_item_type() + if _has_scipy: + test_check_is_scipy_sparse_array() \ No newline at end of file diff --git a/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/tests/test_legacy_load.py b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/tests/test_legacy_load.py new file mode 100755 index 0000000000000000000000000000000000000000..e849bcf6594c7139357659f8cf0721ef777da3b0 --- /dev/null +++ b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/tests/test_legacy_load.py @@ -0,0 +1,30 @@ +import glob +import warnings +import hickle as hkl +import h5py +import six + +def test_legacy_load(): + if six.PY2: + filelist = sorted(glob.glob('legacy_hkls/*.hkl')) + + # Make all warnings show + warnings.simplefilter("always") + + for filename in filelist: + try: + print(filename) + a = hkl.load(filename) + except: + with h5py.File(filename) as a: + print(a.attrs.items()) + print(a.items()) + for key, item in a.items(): + print(item.attrs.items()) + raise + else: + print("Legacy loading only works in Py2. 
Sorry.") + pass + +if __name__ == "__main__": + test_legacy_load() \ No newline at end of file diff --git a/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/tests/test_scipy.py b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/tests/test_scipy.py new file mode 100755 index 0000000000000000000000000000000000000000..ab78311d3eb543f4d3515b6aef2eba4e5ea2a175 --- /dev/null +++ b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/hickle-3.4.3-py3.6.egg/tests/test_scipy.py @@ -0,0 +1,57 @@ +import numpy as np +from scipy.sparse import csr_matrix, csc_matrix, bsr_matrix + +import hickle +from hickle.loaders.load_scipy import check_is_scipy_sparse_array + +from py.path import local + +# Set the current working directory to the temporary directory +local.get_temproot().chdir() + + +def test_is_sparse(): + sm0 = csr_matrix((3, 4), dtype=np.int8) + sm1 = csc_matrix((1, 2)) + + assert check_is_scipy_sparse_array(sm0) + assert check_is_scipy_sparse_array(sm1) + + +def test_sparse_matrix(): + sm0 = csr_matrix((3, 4), dtype=np.int8).toarray() + + row = np.array([0, 0, 1, 2, 2, 2]) + col = np.array([0, 2, 2, 0, 1, 2]) + data = np.array([1, 2, 3, 4, 5, 6]) + sm1 = csr_matrix((data, (row, col)), shape=(3, 3)) + sm2 = csc_matrix((data, (row, col)), shape=(3, 3)) + + indptr = np.array([0, 2, 3, 6]) + indices = np.array([0, 2, 2, 0, 1, 2]) + data = np.array([1, 2, 3, 4, 5, 6]).repeat(4).reshape(6, 2, 2) + sm3 = bsr_matrix((data,indices, indptr), shape=(6, 6)) + + hickle.dump(sm1, 'test_sp.h5') + sm1_h = hickle.load('test_sp.h5') + hickle.dump(sm2, 'test_sp2.h5') + sm2_h = hickle.load('test_sp2.h5') + hickle.dump(sm3, 'test_sp3.h5') + sm3_h = hickle.load('test_sp3.h5') + + assert isinstance(sm1_h, csr_matrix) + assert isinstance(sm2_h, csc_matrix) + assert isinstance(sm3_h, bsr_matrix) + + assert np.allclose(sm1_h.data, sm1.data) + assert np.allclose(sm2_h.data, sm2.data) + assert np.allclose(sm3_h.data, sm3.data) + + assert sm1_h. shape == sm1.shape + assert sm2_h. shape == sm2.shape + assert sm3_h. shape == sm3.shape + + +if __name__ == "__main__": + test_sparse_matrix() + test_is_sparse() \ No newline at end of file diff --git a/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/site.py b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/site.py new file mode 100755 index 0000000000000000000000000000000000000000..0d2d2ff8da3960ecdaa6591fcee836c186fb8c91 --- /dev/null +++ b/workflow_parallel_frame_prediction/Training/hickle/lib/python3.6/site-packages/site.py @@ -0,0 +1,74 @@ +def __boot(): + import sys + import os + PYTHONPATH = os.environ.get('PYTHONPATH') + if PYTHONPATH is None or (sys.platform == 'win32' and not PYTHONPATH): + PYTHONPATH = [] + else: + PYTHONPATH = PYTHONPATH.split(os.pathsep) + + pic = getattr(sys, 'path_importer_cache', {}) + stdpath = sys.path[len(PYTHONPATH):] + mydir = os.path.dirname(__file__) + + for item in stdpath: + if item == mydir or not item: + continue # skip if current dir. 
on Windows, or my own directory + importer = pic.get(item) + if importer is not None: + loader = importer.find_module('site') + if loader is not None: + # This should actually reload the current module + loader.load_module('site') + break + else: + try: + import imp # Avoid import loop in Python >= 3.3 + stream, path, descr = imp.find_module('site', [item]) + except ImportError: + continue + if stream is None: + continue + try: + # This should actually reload the current module + imp.load_module('site', stream, path, descr) + finally: + stream.close() + break + else: + raise ImportError("Couldn't find the real 'site' module") + + known_paths = dict([(makepath(item)[1], 1) for item in sys.path]) # 2.2 comp + + oldpos = getattr(sys, '__egginsert', 0) # save old insertion position + sys.__egginsert = 0 # and reset the current one + + for item in PYTHONPATH: + addsitedir(item) + + sys.__egginsert += oldpos # restore effective old position + + d, nd = makepath(stdpath[0]) + insert_at = None + new_path = [] + + for item in sys.path: + p, np = makepath(item) + + if np == nd and insert_at is None: + # We've hit the first 'system' path entry, so added entries go here + insert_at = len(new_path) + + if np in known_paths or insert_at is None: + new_path.append(item) + else: + # new path after the insert point, back-insert it + new_path.insert(insert_at, item) + insert_at += 1 + + sys.path[:] = new_path + + +if __name__ == 'site': + __boot() + del __boot diff --git a/workflow_parallel_frame_prediction/Training/horovodJob.sh b/workflow_parallel_frame_prediction/Training/horovodJob.sh new file mode 100644 index 0000000000000000000000000000000000000000..236a08d9913dadfee3c7b1a76b4256797ec11533 --- /dev/null +++ b/workflow_parallel_frame_prediction/Training/horovodJob.sh @@ -0,0 +1,44 @@ +#!/bin/bash +#SBATCH --account=deepacf +# budget account where contingent is taken from# TASKS = NODES * GPUS_PER_NODE +#SBATCH --nodes=3 +#SBATCH --ntasks-per-node=4 +#SBATCH --ntasks=12 +# can be omitted if --nodes and --ntasks-per-node +# are given +# SBATCH --cpus-per-task=1 +# for OpenMP/hybrid jobs only +#SBATCH --output=horovod-%j.out +# if keyword omitted: Default is slurm-%j.out in +# the submission directory (%j is replaced by +# the job ID). +#SBATCH --error=horovod-%j.err +# if keyword omitted: Default is slurm-%j.out in +# the submission directory. +#SBATCH --time=20:00:00 +#SBATCH --gres=gpu:4 +#SBATCH --partition=gpus +#SBATCH --mail-user=b.gong@fz-juelich.de +#SBATCH --mail-type=ALL + +#create a folder to save the output +jutil env activate -p deepacf +module --force purge +module load Stages/Devel-2019a +module load GCC/8.3.0 +module load MVAPICH2/2.3.2-GDR +module load Stages/2019a +module load Horovod/0.16.2-GPU-Python-3.6.8 +module load Keras/2.2.4-GPU-Python-3.6.8 + +#module load ParaStationMPI/5.2.2-1 +#module load h5py/2.9.0-Python-3.6.8 +# *** start of job script ***: +# Note: The current working directory at this point is +# the directory where sbatch was executed. +# export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK} +# *** start of job script *** +# Note: The current working directory at this point is +# the directory where sbatch was executed. 
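# For reference (an assumption, not part of the original script): a job file with the
# directives above would typically be submitted with `sbatch horovodJob.sh`, requesting
# 3 nodes x 4 GPUs = 12 tasks, i.e. one Horovod rank per GPU as pinned in the training script.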
+# export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK} +srun --cpu_bind=none python3.6 kitti_train_horovod.py diff --git a/workflow_parallel_frame_prediction/Training/keras_utils.py b/workflow_parallel_frame_prediction/Training/keras_utils.py new file mode 100755 index 0000000000000000000000000000000000000000..ededcc74fed982654d82cfb610b79224f1e08554 --- /dev/null +++ b/workflow_parallel_frame_prediction/Training/keras_utils.py @@ -0,0 +1,58 @@ +import os +import numpy as np + +from keras import backend as K +from keras.legacy.interfaces import generate_legacy_interface, recurrent_args_preprocessor +from keras.models import model_from_json + +legacy_prednet_support = generate_legacy_interface( + allowed_positional_args=['stack_sizes', 'R_stack_sizes', + 'A_filt_sizes', 'Ahat_filt_sizes', 'R_filt_sizes'], + conversions=[('dim_ordering', 'data_format'), + ('consume_less', 'implementation')], + value_conversions={'dim_ordering': {'tf': 'channels_last', + 'th': 'channels_first', + 'default': None}, + 'consume_less': {'cpu': 0, + 'mem': 1, + 'gpu': 2}}, + preprocessor=recurrent_args_preprocessor) + +# Convert old Keras (1.2) json models and weights to Keras 2.0 +def convert_model_to_keras2(old_json_file, old_weights_file, new_json_file, new_weights_file): + from prednet import PredNet + # If using tensorflow, it doesn't allow you to load the old weights. + if K.backend() != 'theano': + os.environ['KERAS_BACKEND'] = backend + reload(K) + + f = open(old_json_file, 'r') + json_string = f.read() + f.close() + model = model_from_json(json_string, custom_objects = {'PredNet': PredNet}) + model.load_weights(old_weights_file) + + weights = model.layers[1].get_weights() + if weights[0].shape[0] == model.layers[1].stack_sizes[1]: + for i, w in enumerate(weights): + if w.ndim == 4: + weights[i] = np.transpose(w, (2, 3, 1, 0)) + model.set_weights(weights) + + model.save_weights(new_weights_file) + json_string = model.to_json() + with open(new_json_file, "w") as f: + f.write(json_string) + + +if __name__ == '__main__': + old_dir = './model_data/' + new_dir = './model_data_keras2/' + if not os.path.exists(new_dir): + os.mkdir(new_dir) + for w_tag in ['', '-Lall', '-extrapfinetuned']: + m_tag = '' if w_tag == '-Lall' else w_tag + convert_model_to_keras2(old_dir + 'prednet_kitti_model' + m_tag + '.json', + old_dir + 'prednet_kitti_weights' + w_tag + '.hdf5', + new_dir + 'prednet_kitti_model' + m_tag + '.json', + new_dir + 'prednet_kitti_weights' + w_tag + '.hdf5') diff --git a/workflow_parallel_frame_prediction/Training/kitti_settings.py b/workflow_parallel_frame_prediction/Training/kitti_settings.py new file mode 100755 index 0000000000000000000000000000000000000000..547671117e573c9e41096ebc4775b925bb99a87f --- /dev/null +++ b/workflow_parallel_frame_prediction/Training/kitti_settings.py @@ -0,0 +1,19 @@ +# Where KITTI data will be saved if you run process_kitti.py +# If you directly download the processed data, change to the path of the data. 
+## Changed logic: Now this is the path where the processed data lies: X_train,val,test +#DATA_DIR = './kitti_data/' +#data directory for training data 2015 and 2016 +#DATA_DIR = '/p/project/cjjsc42/severin/try3' +#data directory for moving objects: +#DATA_DIR = '/p/home/jusers/hussmann1/jureca/movingObjects/se_nw' +#data directory for featuretesting: +##DATA_DIR = './testTry2' +DATA_DIR = '/p/scratch/cjjsc42/bing/PredNet/processData/splits' +# Where model weights and config will be saved if you run kitti_train.py +# If you directly download the trained weights, change to appropriate path. +WEIGHTS_DIR = './model_data_keras2/' +#WEIGHTS_DIR = '/p/project/cjjsc42/bing/ml-severin/model_data_keras2' + +# Where results (prediction plots and evaluation file) will be saved. +#RESULTS_SAVE_DIR = './kitti_results' + diff --git a/workflow_parallel_frame_prediction/Training/kitti_train_horovod.py b/workflow_parallel_frame_prediction/Training/kitti_train_horovod.py new file mode 100755 index 0000000000000000000000000000000000000000..72539927f6a0dd9c88df47f4738b11c7124c39bc --- /dev/null +++ b/workflow_parallel_frame_prediction/Training/kitti_train_horovod.py @@ -0,0 +1,119 @@ +''' +Train PredNet on KITTI sequences. (Geiger et al. 2013, http://www.cvlibs.net/datasets/kitti/) +''' + +import os +import numpy as np +np.random.seed(123) +#from six.moves import cPickle + +from keras import backend as K +from keras.models import Model +from keras.layers import Input, Dense, Flatten +from keras.layers import LSTM +from keras.layers import TimeDistributed +from keras.callbacks import LearningRateScheduler, ModelCheckpoint +from keras.optimizers import Adam +from prednet import PredNet +from data_utils import SequenceGenerator +from kitti_settings import * +import datetime +import horovod.keras as hvd +import keras +import tensorflow as tf +#Horovod:initialize horovod +hvd.init() +#Horovod: pin GPU to be used for process local rank (one GPU per process) +config = tf.ConfigProto() +config.gpu_options.allow_growth = True +config.gpu_options.visible_device_list = str(hvd.local_rank()) +K.set_session(tf.Session(config=config)) + +print("horovode size", hvd.size()) + +save_model = True# if weights will be saved +weights_file = os.path.join(WEIGHTS_DIR, 'prednet_kitti_weights.hdf5') # where weights will be saved +json_file = os.path.join(WEIGHTS_DIR, 'prednet_kitti_model.json') +if not os.path.exists(WEIGHTS_DIR): os.mkdir(WEIGHTS_DIR) +# Data files +train_file = os.path.join(DATA_DIR, 'X_train.hkl') +train_sources = os.path.join(DATA_DIR, 'sources_train.hkl') +val_file = os.path.join(DATA_DIR, 'X_val.hkl') +val_sources = os.path.join(DATA_DIR, 'sources_val.hkl') + +# Training parameters +nb_epoch = 10 #original: 150; for all tests so far set to 100; t2onlyMax: 150 +batch_size = 15 +samples_per_epoch = 500 #original: 500; for all tests so far set to 300; t2onlyMax: 500 +N_seq_val = 80 # number of sequences to use for validation ##original: 100; for all tests so far set to 65; t2onlyMax: 80 + +# Model parameters +n_channels, im_height, im_width = (3, 128, 160) +input_shape = (n_channels, im_height, im_width) if K.image_data_format() == 'channels_first' else (im_height, im_width, n_channels) +stack_sizes = (n_channels, 48, 96, 192) +R_stack_sizes = stack_sizes +A_filt_sizes = (3, 3, 3) +Ahat_filt_sizes = (3, 3, 3, 3) +R_filt_sizes = (3, 3, 3, 3) +layer_loss_weights = np.array([1., 0., 0., 0.]) # weighting for each layer in final loss; "L_0" model: [1, 0, 0, 0], "L_all": [1, 0.1, 0.1, 0.1] +layer_loss_weights = 
np.expand_dims(layer_loss_weights, 1) +nt = 10 # number of timesteps used for sequences in training +time_loss_weights = 1./ (nt - 1) * np.ones((nt,1)) # equally weight all timesteps except the first +time_loss_weights[0] = 0 + +prednet = PredNet(stack_sizes, R_stack_sizes, + A_filt_sizes, Ahat_filt_sizes, R_filt_sizes, + output_mode='error', return_sequences=True) +inputs = Input(shape=(nt,) + input_shape) +errors = prednet(inputs) # errors will be (batch_size, nt, nb_layers) +errors_by_time = TimeDistributed(Dense(1, trainable=False), weights=[layer_loss_weights, np.zeros(1)], trainable=False)(errors) # calculate weighted error by layer +errors_by_time = Flatten()(errors_by_time) # will be (batch_size, nt) +final_errors = Dense(1, weights=[time_loss_weights, np.zeros(1)], trainable=False)(errors_by_time) # weight errors by time +model = Model(inputs=inputs, outputs=final_errors) +#Horovod:ajust learning rate based on number of GPUs +opt = keras.optimizers.Adam(0.01 * hvd.size()) +#Horovod: add horovod DistributedOptimizer +opt = hvd.DistributedOptimizer(opt) +#Horovode: use hvd.DistributedOptimizer to compute gradients +model.compile(loss="mean_absolute_error", optimizer=opt, metrics=["accuracy"]) + + + +train_generator = SequenceGenerator(train_file, train_sources, nt, batch_size=batch_size, shuffle=True) +val_generator = SequenceGenerator(val_file, val_sources, nt, batch_size=batch_size, N_seq=N_seq_val) + +#lr_schedule = lambda epoch: 0.001 if epoch < 75 else 0.0001 # start with lr of 0.001 and then drop to 0.0001 after 75 epochs +callbacks = [hvd.callbacks.BroadcastGlobalVariablesCallback(0), + #hvd.callbacks.MetricAverageCallback(), + hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5,verbose=1) + ] +#bing: original save_model is True +if hvd.rank() == 0: + if save_model: + print("===========The model will be saved =======") + callbacks.append(ModelCheckpoint(filepath=weights_file, monitor='val_loss', save_best_only=True)) + +#the start training time +a = datetime.datetime.now() + +history = model.fit_generator(generator=train_generator,steps_per_epoch=samples_per_epoch/(batch_size*hvd.size()), epochs=nb_epoch, callbacks=callbacks, + validation_data=val_generator, validation_steps=N_seq_val/(batch_size*hvd.size())) + + +b = datetime.datetime.now() + +#the training time +t = b-a + +stats = list(train_generator.X.shape) +stats.append(t) + +print("training time is",stats) + +if save_model: + json_string = model.to_json() + with open(json_file, "w") as f: + f.write(json_string) + + + diff --git a/workflow_parallel_frame_prediction/Training/minMaxExtractor.py b/workflow_parallel_frame_prediction/Training/minMaxExtractor.py new file mode 100644 index 0000000000000000000000000000000000000000..5c1216208fae071c9dcf62e2152722dd626cd15e --- /dev/null +++ b/workflow_parallel_frame_prediction/Training/minMaxExtractor.py @@ -0,0 +1,76 @@ +import hickle as hkl +import numpy as np +import matplotlib.pyplot as plt + +#x_train = hkl.load('/Users/Severin/Desktop/X_train.hkl') #load X_train produces on jureca +x_train = hkl.load('/p/project/cjjsc42/severin/try3/X_train.hkl') #load X_train produces on jureca +print('Shape of X:') +print(x_train.shape) +print('') + +#Print example +#t2_cutout = x_train[100,:,:,0] +#printt2cutout = plt.pcolormesh(t2_cutout[::-1,:], shading='bottom', cmap=plt.cm.jet) +#plt.savefig('t2_cutout') +#Extract Max min values: +maxT2 = np.amax(x_train[:,:,:,0]) # numpy.amax() returns the maximum of an array or maximum along an axis. 
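The minMaxExtractor script that follows computes min/max/mean/std and the three-sigma cut-offs with a separate copy-pasted block per channel. A loop over the channel axis gives the same numbers; the helper below is only a sketch and assumes x_train keeps the (samples, height, width, 3) layout with channels T2, GP and GPH used throughout this patch:

# Sketch only: the same statistics as the per-channel blocks below, written as a loop.
import numpy as np

def channel_stats(x, names=("T2", "GP", "GPH")):
    stats = {}
    for c, name in enumerate(names):
        band = x[..., c]
        mean, std = float(np.mean(band)), float(np.std(band))
        stats[name] = {
            "min": float(np.amin(band)),
            "max": float(np.amax(band)),
            "mean": mean,
            "std": std,
            "highCut": mean + 3 * std,   # mean +/- 3*std, as in the blocks below
            "lowCut": mean - 3 * std,
        }
    return stats

# for name, s in channel_stats(x_train).items():
#     print(name, s)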
+print('maxT2: ' + str(maxT2)) +minT2 = np.amin(x_train[:,:,:,0]) +print('minT2: ' + str(minT2)) +meanT2 = np.mean(x_train[:,:,:,0]) +print('meanT2: ' + str(meanT2)) +stdT2 = np.std(x_train[:,:,:,0]) +print('stdT2: ' + str(stdT2)) +highCutT2 = meanT2 + 3 * stdT2 +print('highCutT2: ' + str(highCutT2)) +lowCutT2 = meanT2 - 3 * stdT2 +print('lowCutT2: ' + str(lowCutT2)) +print('') + +maxGP = np.amax(x_train[:,:,:,1]) +print('maxGP: ' + str(maxGP)) +minGP = np.amin(x_train[:,:,:,1]) +print('minGP: ' + str(minGP)) +meanGP = np.mean(x_train[:,:,:,1]) +print('meanGP: ' + str(meanGP)) +stdGP = np.std(x_train[:,:,:,1]) +print('stdGP: ' + str(stdGP)) +highCutGP = meanGP + 3 * stdGP +print('highCutGP: ' + str(highCutGP)) +lowCutGP = meanGP - 3 * stdGP +print('lowCutGP: ' + str(lowCutGP)) +print('') + +maxGPH = np.amax(x_train[:,:,:,2]) +print('maxGPH: ' + str(maxGPH)) +minGPH = np.amin(x_train[:,:,:,2]) +print('minGPH: ' + str(minGPH)) +meanGPH = np.mean(x_train[:,:,:,2]) +print('meanGP: ' + str(meanGPH)) +stdGPH = np.std(x_train[:,:,:,2]) +print('stdGPH: ' + str(stdGPH)) +highCutGPH = meanGPH + 3 * stdGPH +print('highCutGPH: ' + str(highCutGPH)) +lowCutGPH = meanGPH - 3 * stdGPH +print('lowCutGPH: ' + str(lowCutGPH)) +print('') + +# Formel zum normalisieren: z = (x-min(x))/(max(x)-min(x)) +#x_trainNormalized2 = np.zeros(shape=x_train.shape) +#print('Empty shape:') +#print(x_trainNormalized2.shape) +#x_trainNormalized2[:,:,:,0] = (x_train[:,:,:,0]-minT2)/(maxT2-minT2) +#x_trainNormalized2[:,:,:,1] = (x_train[:,:,:,1]-minGP)/(maxGP-minGP) +#x_trainNormalized2[:,:,:,2] = (x_train[:,:,:,2]-minGPH)/(maxGPH-minGPH) + +#print('MaxMin values of normalized dataset:') +#print('T2:') +#print(np.amax(x_trainNormalized2[:,:,:,0])) +#print(np.amin(x_trainNormalized2[:,:,:,0])) +#print('GP:') +#print(np.amax(x_trainNormalized2[:,:,:,1])) +#print(np.amin(x_trainNormalized2[:,:,:,1])) +#print('GPH:') +#print(np.amax(x_trainNormalized2[:,:,:,2])) +#print(np.amin(x_trainNormalized2[:,:,:,2])) +#print(x_trainNormalized2) \ No newline at end of file diff --git a/workflow_parallel_frame_prediction/Training/prednet.py b/workflow_parallel_frame_prediction/Training/prednet.py new file mode 100755 index 0000000000000000000000000000000000000000..b5a0208ae137666c9bc284b21d6affe04d721053 --- /dev/null +++ b/workflow_parallel_frame_prediction/Training/prednet.py @@ -0,0 +1,311 @@ +import numpy as np + +from keras import backend as K +from keras import activations +from keras.layers import Recurrent +from keras.layers import Conv2D, UpSampling2D, MaxPooling2D +from keras.engine import InputSpec +from keras_utils import legacy_prednet_support + +class PredNet(Recurrent): + '''PredNet architecture - Lotter 2016. + Stacked convolutional LSTM inspired by predictive coding principles. + + # Arguments + stack_sizes: number of channels in targets (A) and predictions (Ahat) in each layer of the architecture. + Length is the number of layers in the architecture. + First element is the number of channels in the input. + Ex. (3, 16, 32) would correspond to a 3 layer architecture that takes in RGB images and has 16 and 32 + channels in the second and third layers, respectively. + R_stack_sizes: number of channels in the representation (R) modules. + Length must equal length of stack_sizes, but the number of channels per layer can be different. + A_filt_sizes: filter sizes for the target (A) modules. + Has length of 1 - len(stack_sizes). + Ex. 
(3, 3) would mean that targets for layers 2 and 3 are computed by a 3x3 convolution of the errors (E) + from the layer below (followed by max-pooling) + Ahat_filt_sizes: filter sizes for the prediction (Ahat) modules. + Has length equal to length of stack_sizes. + Ex. (3, 3, 3) would mean that the predictions for each layer are computed by a 3x3 convolution of the + representation (R) modules at each layer. + R_filt_sizes: filter sizes for the representation (R) modules. + Has length equal to length of stack_sizes. + Corresponds to the filter sizes for all convolutions in the LSTM. + pixel_max: the maximum pixel value. + Used to clip the pixel-layer prediction. + error_activation: activation function for the error (E) units. + A_activation: activation function for the target (A) and prediction (A_hat) units. + LSTM_activation: activation function for the cell and hidden states of the LSTM. + LSTM_inner_activation: activation function for the gates in the LSTM. + output_mode: either 'error', 'prediction', 'all' or layer specification (ex. R2, see below). + Controls what is outputted by the PredNet. + If 'error', the mean response of the error (E) units of each layer will be outputted. + That is, the output shape will be (batch_size, nb_layers). + If 'prediction', the frame prediction will be outputted. + If 'all', the output will be the frame prediction concatenated with the mean layer errors. + The frame prediction is flattened before concatenation. + Nomenclature of 'all' is kept for backwards compatibility, but should not be confused with returning all of the layers of the model + For returning the features of a particular layer, output_mode should be of the form unit_type + layer_number. + For instance, to return the features of the LSTM "representational" units in the lowest layer, output_mode should be specificied as 'R0'. + The possible unit types are 'R', 'Ahat', 'A', and 'E' corresponding to the 'representation', 'prediction', 'target', and 'error' units respectively. + extrap_start_time: time step for which model will start extrapolating. + Starting at this time step, the prediction from the previous time step will be treated as the "actual" + data_format: 'channels_first' or 'channels_last'. + It defaults to the `image_data_format` value found in your + Keras config file at `~/.keras/keras.json`. 
+ + # References + - [Deep predictive coding networks for video prediction and unsupervised learning](https://arxiv.org/abs/1605.08104) + - [Long short-term memory](http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf) + - [Convolutional LSTM network: a machine learning approach for precipitation nowcasting](http://arxiv.org/abs/1506.04214) + - [Predictive coding in the visual cortex: a functional interpretation of some extra-classical receptive-field effects](http://www.nature.com/neuro/journal/v2/n1/pdf/nn0199_79.pdf) + ''' + @legacy_prednet_support + def __init__(self, stack_sizes, R_stack_sizes, + A_filt_sizes, Ahat_filt_sizes, R_filt_sizes, + pixel_max=1., error_activation='relu', A_activation='relu', + LSTM_activation='tanh', LSTM_inner_activation='hard_sigmoid', + output_mode='error', extrap_start_time=None, + data_format=K.image_data_format(), **kwargs): + self.stack_sizes = stack_sizes + self.nb_layers = len(stack_sizes) + assert len(R_stack_sizes) == self.nb_layers, 'len(R_stack_sizes) must equal len(stack_sizes)' + self.R_stack_sizes = R_stack_sizes + assert len(A_filt_sizes) == (self.nb_layers - 1), 'len(A_filt_sizes) must equal len(stack_sizes) - 1' + self.A_filt_sizes = A_filt_sizes + assert len(Ahat_filt_sizes) == self.nb_layers, 'len(Ahat_filt_sizes) must equal len(stack_sizes)' + self.Ahat_filt_sizes = Ahat_filt_sizes + assert len(R_filt_sizes) == (self.nb_layers), 'len(R_filt_sizes) must equal len(stack_sizes)' + self.R_filt_sizes = R_filt_sizes + + self.pixel_max = pixel_max + self.error_activation = activations.get(error_activation) + self.A_activation = activations.get(A_activation) + self.LSTM_activation = activations.get(LSTM_activation) + self.LSTM_inner_activation = activations.get(LSTM_inner_activation) + + default_output_modes = ['prediction', 'error', 'all'] + layer_output_modes = [layer + str(n) for n in range(self.nb_layers) for layer in ['R', 'E', 'A', 'Ahat']] + assert output_mode in default_output_modes + layer_output_modes, 'Invalid output_mode: ' + str(output_mode) + self.output_mode = output_mode + if self.output_mode in layer_output_modes: + self.output_layer_type = self.output_mode[:-1] + self.output_layer_num = int(self.output_mode[-1]) + else: + self.output_layer_type = None + self.output_layer_num = None + self.extrap_start_time = extrap_start_time + + assert data_format in {'channels_last', 'channels_first'}, 'data_format must be in {channels_last, channels_first}' + self.data_format = data_format + self.channel_axis = -3 if data_format == 'channels_first' else -1 + self.row_axis = -2 if data_format == 'channels_first' else -3 + self.column_axis = -1 if data_format == 'channels_first' else -2 + super(PredNet, self).__init__(**kwargs) + self.input_spec = [InputSpec(ndim=5)] + + def compute_output_shape(self, input_shape): + if self.output_mode == 'prediction': + out_shape = input_shape[2:] + elif self.output_mode == 'error': + out_shape = (self.nb_layers,) + elif self.output_mode == 'all': + out_shape = (np.prod(input_shape[2:]) + self.nb_layers,) + else: + stack_str = 'R_stack_sizes' if self.output_layer_type == 'R' else 'stack_sizes' + stack_mult = 2 if self.output_layer_type == 'E' else 1 + out_stack_size = stack_mult * getattr(self, stack_str)[self.output_layer_num] + out_nb_row = input_shape[self.row_axis] / 2**self.output_layer_num + out_nb_col = input_shape[self.column_axis] / 2**self.output_layer_num + if self.data_format == 'channels_first': + out_shape = (out_stack_size, out_nb_row, out_nb_col) + else: + out_shape = (out_nb_row, 
out_nb_col, out_stack_size) + + if self.return_sequences: + return (input_shape[0], input_shape[1]) + out_shape + else: + return (input_shape[0],) + out_shape + + def get_initial_state(self, x): + input_shape = self.input_spec[0].shape + init_nb_row = input_shape[self.row_axis] + init_nb_col = input_shape[self.column_axis] + + base_initial_state = K.zeros_like(x) # (samples, timesteps) + image_shape + non_channel_axis = -1 if self.data_format == 'channels_first' else -2 + for _ in range(2): + base_initial_state = K.sum(base_initial_state, axis=non_channel_axis) + base_initial_state = K.sum(base_initial_state, axis=1) # (samples, nb_channels) + + initial_states = [] + states_to_pass = ['r', 'c', 'e'] + nlayers_to_pass = {u: self.nb_layers for u in states_to_pass} + if self.extrap_start_time is not None: + states_to_pass.append('ahat') # pass prediction in states so can use as actual for t+1 when extrapolating + nlayers_to_pass['ahat'] = 1 + for u in states_to_pass: + for l in range(nlayers_to_pass[u]): + ds_factor = 2 ** l + nb_row = init_nb_row // ds_factor + nb_col = init_nb_col // ds_factor + if u in ['r', 'c']: + stack_size = self.R_stack_sizes[l] + elif u == 'e': + stack_size = 2 * self.stack_sizes[l] + elif u == 'ahat': + stack_size = self.stack_sizes[l] + output_size = stack_size * nb_row * nb_col # flattened size + + reducer = K.zeros((input_shape[self.channel_axis], output_size)) # (nb_channels, output_size) + initial_state = K.dot(base_initial_state, reducer) # (samples, output_size) + if self.data_format == 'channels_first': + output_shp = (-1, stack_size, nb_row, nb_col) + else: + output_shp = (-1, nb_row, nb_col, stack_size) + initial_state = K.reshape(initial_state, output_shp) + initial_states += [initial_state] + + if K._BACKEND == 'theano': + from theano import tensor as T + # There is a known issue in the Theano scan op when dealing with inputs whose shape is 1 along a dimension. + # In our case, this is a problem when training on grayscale images, and the below line fixes it. 
+ initial_states = [T.unbroadcast(init_state, 0, 1) for init_state in initial_states] + + if self.extrap_start_time is not None: + initial_states += [K.variable(0, int if K.backend() != 'tensorflow' else 'int32')] # the last state will correspond to the current timestep + return initial_states + + def build(self, input_shape): + self.input_spec = [InputSpec(shape=input_shape)] + self.conv_layers = {c: [] for c in ['i', 'f', 'c', 'o', 'a', 'ahat']} + + for l in range(self.nb_layers): + for c in ['i', 'f', 'c', 'o']: + act = self.LSTM_activation if c == 'c' else self.LSTM_inner_activation + self.conv_layers[c].append(Conv2D(self.R_stack_sizes[l], self.R_filt_sizes[l], padding='same', activation=act, data_format=self.data_format)) + + act = 'relu' if l == 0 else self.A_activation + self.conv_layers['ahat'].append(Conv2D(self.stack_sizes[l], self.Ahat_filt_sizes[l], padding='same', activation=act, data_format=self.data_format)) + + if l < self.nb_layers - 1: + self.conv_layers['a'].append(Conv2D(self.stack_sizes[l+1], self.A_filt_sizes[l], padding='same', activation=self.A_activation, data_format=self.data_format)) + + self.upsample = UpSampling2D(data_format=self.data_format) + self.pool = MaxPooling2D(data_format=self.data_format) + + self.trainable_weights = [] + nb_row, nb_col = (input_shape[-2], input_shape[-1]) if self.data_format == 'channels_first' else (input_shape[-3], input_shape[-2]) + for c in sorted(self.conv_layers.keys()): + for l in range(len(self.conv_layers[c])): + ds_factor = 2 ** l + if c == 'ahat': + nb_channels = self.R_stack_sizes[l] + elif c == 'a': + nb_channels = 2 * self.stack_sizes[l] + else: + nb_channels = self.stack_sizes[l] * 2 + self.R_stack_sizes[l] + if l < self.nb_layers - 1: + nb_channels += self.R_stack_sizes[l+1] + in_shape = (input_shape[0], nb_channels, nb_row // ds_factor, nb_col // ds_factor) + if self.data_format == 'channels_last': in_shape = (in_shape[0], in_shape[2], in_shape[3], in_shape[1]) + with K.name_scope('layer_' + c + '_' + str(l)): + self.conv_layers[c][l].build(in_shape) + self.trainable_weights += self.conv_layers[c][l].trainable_weights + + self.states = [None] * self.nb_layers*3 + + if self.extrap_start_time is not None: + self.t_extrap = K.variable(self.extrap_start_time, int if K.backend() != 'tensorflow' else 'int32') + self.states += [None] * 2 # [previous frame prediction, timestep] + + def step(self, a, states): + r_tm1 = states[:self.nb_layers] + c_tm1 = states[self.nb_layers:2*self.nb_layers] + e_tm1 = states[2*self.nb_layers:3*self.nb_layers] + + if self.extrap_start_time is not None: + t = states[-1] + a = K.switch(t >= self.t_extrap, states[-2], a) # if past self.extrap_start_time, the previous prediction will be treated as the actual + + c = [] + r = [] + e = [] + + # Update R units starting from the top + for l in reversed(range(self.nb_layers)): + inputs = [r_tm1[l], e_tm1[l]] + if l < self.nb_layers - 1: + inputs.append(r_up) + + inputs = K.concatenate(inputs, axis=self.channel_axis) + i = self.conv_layers['i'][l].call(inputs) + f = self.conv_layers['f'][l].call(inputs) + o = self.conv_layers['o'][l].call(inputs) + _c = f * c_tm1[l] + i * self.conv_layers['c'][l].call(inputs) + _r = o * self.LSTM_activation(_c) + c.insert(0, _c) + r.insert(0, _r) + + if l > 0: + r_up = self.upsample.call(_r) + + # Update feedforward path starting from the bottom + for l in range(self.nb_layers): + ahat = self.conv_layers['ahat'][l].call(r[l]) + if l == 0: + ahat = K.minimum(ahat, self.pixel_max) + frame_prediction = ahat + + # compute 
errors + e_up = self.error_activation(ahat - a) + e_down = self.error_activation(a - ahat) + + e.append(K.concatenate((e_up, e_down), axis=self.channel_axis)) + + if self.output_layer_num == l: + if self.output_layer_type == 'A': + output = a + elif self.output_layer_type == 'Ahat': + output = ahat + elif self.output_layer_type == 'R': + output = r[l] + elif self.output_layer_type == 'E': + output = e[l] + + if l < self.nb_layers - 1: + a = self.conv_layers['a'][l].call(e[l]) + a = self.pool.call(a) # target for next layer + + if self.output_layer_type is None: + if self.output_mode == 'prediction': + output = frame_prediction + else: + for l in range(self.nb_layers): + layer_error = K.mean(K.batch_flatten(e[l]), axis=-1, keepdims=True) + all_error = layer_error if l == 0 else K.concatenate((all_error, layer_error), axis=-1) + if self.output_mode == 'error': + output = all_error + else: + output = K.concatenate((K.batch_flatten(frame_prediction), all_error), axis=-1) + + states = r + c + e + if self.extrap_start_time is not None: + states += [frame_prediction, t + 1] + return output, states + + def get_config(self): + config = {'stack_sizes': self.stack_sizes, + 'R_stack_sizes': self.R_stack_sizes, + 'A_filt_sizes': self.A_filt_sizes, + 'Ahat_filt_sizes': self.Ahat_filt_sizes, + 'R_filt_sizes': self.R_filt_sizes, + 'pixel_max': self.pixel_max, + 'error_activation': self.error_activation.__name__, + 'A_activation': self.A_activation.__name__, + 'LSTM_activation': self.LSTM_activation.__name__, + 'LSTM_inner_activation': self.LSTM_inner_activation.__name__, + 'data_format': self.data_format, + 'extrap_start_time': self.extrap_start_time, + 'output_mode': self.output_mode} + base_config = super(PredNet, self).get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/workflow_parallel_frame_prediction/Training/process_netCDF.py b/workflow_parallel_frame_prediction/Training/process_netCDF.py new file mode 100644 index 0000000000000000000000000000000000000000..2cb822ec6afa49a2be6c6411b64c20e5c0fe71ff --- /dev/null +++ b/workflow_parallel_frame_prediction/Training/process_netCDF.py @@ -0,0 +1,169 @@ +''' +Code for processing staged ERA5 data +''' + +import os +#import requests +#from bs4 import BeautifulSoup +#import urllib.request +import numpy as np +#from imageio import imread +#from scipy.misc import imresize +import hickle as hkl +from netCDF4 import Dataset +from kitti_settings import * + +#TODO: Not optimal with DATA_DIR and filingPath: In original process_kitti.py +# there's just DATA_DIR (which is specified in kitti_settings.py) and in there +# the processed data will be stores. The raw data lies also in there in a subfolder + +#Path of ERA5 Data +DATA_DIR = './testData2' +print(DATA_DIR) +#Path where to save processed data +filingPath = './testTry2' + +# ToDo: Define a convenient function to create a list containing all files. 
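The ToDo above could be closed with a small helper like the sketch below (not part of this patch; filtering on the '.nc' suffix is an assumption, made so that stray files such as the .DS_Store mentioned further down never enter the split):

# Sketch for the ToDo above: build a sorted list of input files in DATA_DIR.
import os

def list_input_files(data_dir, suffix=".nc"):
    return sorted(
        f for f in os.listdir(data_dir)
        if f.endswith(suffix) and os.path.isfile(os.path.join(data_dir, f))
    )

# imageList = list_input_files(DATA_DIR)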
+imageList = list(os.walk(DATA_DIR, topdown=False))[-1][-1] +imageList = sorted(imageList) +print('Image List:') +print(imageList) +print('Length of Image List: ' + str(len(imageList))) +#scp hussmann1@jureca.fz-juelich.de:/p/project/cjjsc42/severin/data/era5_extract_481.nc ~/Desktop/netCDFdata + +# ToDo: Define properly the train, val and test index +# Here just for testing and taking weird .DS_Store file into consideration +# http://www.apfelwiki.de/Main/DSStore +#train_recordings = imageList[1:6] +#val_recordings = imageList[7:9] +#test_recordings = imageList[-2:] + +#Train,Val,Test size in percentage +partition = [0.8, 0.05, 0.15] +#determine correct indices +train_begin = 0 +train_end = round(partition[0]*len(imageList))-1 +val_begin = train_end + 1 +val_end = train_end + round(partition[1]*len(imageList)) +test_begin = val_end + 1 +test_end = len(imageList)-1 +print('Indices of Train, Val and test: '+ str(train_begin) + ' ' + str(val_begin) + ' ' + str(test_begin)) +#slightly adapting start and end because starts at the first index given and stops before(!) the last. +train_recordings = imageList[train_begin:val_begin] +val_recordings = imageList[val_begin:test_begin] +test_recordings = imageList[test_begin:test_end] + +#adapted for feature testing: just first year (2015); Otherwise would take too long and some weird mistake in some data in 2016 +#in total: 17544 +#half: 8772 +#train: 0-6900 +#val:6901-7000 +#test:7001-8772 +#train_recordings = imageList[0:1000] +#val_recordings = imageList[6901:7000] +#test_recordings = imageList[7001:8772] + +print('Now everything together:') +print('Train:') +print(train_recordings) +print('Val:') +print(val_recordings) +print('Test:') +print(test_recordings) + +desired_im_sz = (128, 160) +# Create image datasets. +# Processes images and saves them in train, val, test splits. +def process_data(): + splits = {s: [] for s in ['train', 'test', 'val']} + splits['val'] = val_recordings + splits['test'] = test_recordings + splits['train'] = train_recordings + for split in splits: + source_list = [DATA_DIR] * len(splits[split]) # corresponds to recording that image came from + print(splits[split]) + print(source_list) + print((len(splits[split])==(len(source_list)))) + print('The list of ' + split + ' has length: ' + str(len(source_list))) + print( 'Creating ' + split + ' data: ' + str(len(source_list)) + ' images') + + #X = np.zeros((len(splits[split]),) + desired_im_sz + (3,), np.uint8) + #print(X) + #print('shape of X' + str(X.shape)) + + ##### TODO: iterate over split and read every .nc file, cut out array, + ##### overlay arrays for RGB like style. + ##### Save everything after for loop. 
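The TODO above is implemented by the loop that follows: open each netCDF file, cut the European window out of T2, MSL and gph500, and stack channels into a (128, 160, 3) array. Factored out as a stand-alone helper it could look like the sketch below; the window indices and variable names are copied from that loop, and the zero third channel reproduces the currently active 't2_2' stacking variant:

# Sketch of the per-file step used in the loop below (not part of the patch).
import numpy as np
from netCDF4 import Dataset

def load_eu_stack(nc_path):
    with Dataset(nc_path, mode="r") as nc:
        t2 = nc.variables["T2"][0, 74:202, 550:710]
        # msl = nc.variables["MSL"][0, 74:202, 550:710]        # unused in the 't2_2' variant
        # gph500 = nc.variables["gph500"][0, 74:202, 550:710]  # unused in the 't2_2' variant
    empty = np.zeros(t2.shape)
    return np.stack([t2, t2, empty], axis=2)   # -> (128, 160, 3)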
+ EU_stack_list = [0] * (len(splits[split])) + + for i, im_file in enumerate(splits[split]): + im_path = os.path.join(DATA_DIR, im_file) + print('Open following dataset: ' + im_path) + im = Dataset(im_path, mode = 'r') + #print(im) + t2 = im.variables['T2'][0,:,:] + msl = im.variables['MSL'][0,:,:] + gph500 = im.variables['gph500'][0,:,:] + im.close() + EU_t2 = t2[74:202, 550:710] + EU_msl = msl[74:202, 550:710] + EU_gph500 = gph500[74:202, 550:710] + print(EU_t2.shape, EU_msl.shape, EU_gph500.shape) + #Normal stack: T2, MSL & GPH500 + #EU_stack = np.stack([EU_t2, EU_msl, EU_gph500],axis=2) + #Stack T2 only: + #EU_stack = np.stack([EU_t2, EU_t2, EU_t2],axis=2) + #EU_stack_list[i]=EU_stack + #Stack T2*2 MSL*1: + #EU_stack = np.stack([EU_t2, EU_t2, EU_msl],axis=2) + #EU_stack_list[i]=EU_stack + #EU_stack = np.stack([EU_t2, EU_msl, EU_msl],axis=2) + #EU_stack_list[i]=EU_stack + #Stack T2*2 gph500*1: + #EU_stack = np.stack([EU_t2, EU_t2, EU_gph500],axis=2) + #EU_stack_list[i]=EU_stack + #Stack T2*1 gph500*2 + #EU_stack = np.stack([EU_t2, EU_gph500, EU_gph500],axis=2) + #EU_stack_list[i]=EU_stack + #print(EU_stack.shape) + #X[i]=EU_stack #this should be unnecessary + #t2_1 stack. Stack t2 with two empty arrays + #empty_image = np.zeros(shape = (128, 160)) + #EU_stack = np.stack([EU_t2, empty_image, empty_image],axis=2) + #EU_stack_list[i]=EU_stack + #t2_2 stack. Stack t2 with one empty array + empty_image = np.zeros(shape = (128, 160)) + EU_stack = np.stack([EU_t2, EU_t2, empty_image],axis=2) + EU_stack_list[i]=EU_stack + #print('Does ist work? ') + #print(EU_stack_list[i][:,:,0]==EU_t2) + #print(EU_stack[:,:,1]==EU_msl) + X = np.array(EU_stack_list) + print('Shape of X: ' + str(X.shape)) + hkl.dump(X, os.path.join(filingPath, 'X_' + split + '.hkl')) #Not optimal! 
+        hkl.dump(source_list, os.path.join(filingPath, 'sources_' + split + '.hkl'))
+
+
+        #for category, folder in splits[split]:
+        #    im_dir = os.path.join(DATA_DIR, 'raw/', category, folder, folder[:10], folder, 'image_03/data/')
+        #    files = list(os.walk(im_dir, topdown=False))[-1][-1]
+        #    im_list += [im_dir + f for f in sorted(files)]
+        #    multiply path of respective recording with lengths of its files in order to ensure
+        #    that each entry in X_train.hkl corresponds with an entry of source_list/ sources_train.hkl
+        #    source_list += [category + '-' + folder] * len(files)
+
+        #print( 'Creating ' + split + ' data: ' + str(len(im_list)) + ' images')
+        #X = np.zeros((len(im_list),) + desired_im_sz + (3,), np.uint8)
+        # enumerate allows us to loop over something and have an automatic counter
+        #for i, im_file in enumerate(im_list):
+        #    im = imread(im_file)
+        #    X[i] = process_im(im, desired_im_sz)
+
+        #hkl.dump(X, os.path.join(DATA_DIR, 'X_' + split + '.hkl'))
+        #hkl.dump(source_list, os.path.join(DATA_DIR, 'sources_' + split + '.hkl'))
+
+
+if __name__ == '__main__':
+    #download_data()
+    #extract_data()
+    process_data()
diff --git a/workflow_parallel_frame_prediction/Workflow.png b/workflow_parallel_frame_prediction/Workflow.png
new file mode 100644
index 0000000000000000000000000000000000000000..5edc5ed807f496597f52d978d2cd28532e689b2b
Binary files /dev/null and b/workflow_parallel_frame_prediction/Workflow.png differ
diff --git a/workflow_parallel_frame_prediction/__init__.py b/workflow_parallel_frame_prediction/__init__.py
new file mode 100755
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/workflow_parallel_frame_prediction/main.sh b/workflow_parallel_frame_prediction/main.sh
new file mode 100644
index 0000000000000000000000000000000000000000..066b202d0daaf60555afd5a2898372c3a06eba0c
--- /dev/null
+++ b/workflow_parallel_frame_prediction/main.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+EXTRACT_PATH="./DataExtraction/Stager_devel_N_24_Bing.sh"
+PREPROCESS_PATH="./DataPreprocess/Stager_devel_N_24_process_netCDF.sh"
+TRAINING_PATH="./Training/horovodJob.sh"
+POSTPROCESS_PATH="./DataPostprocess/Stager_devel_N_24_evaluation.sh"
+
+echo -e "============ Parallel Data Extraction ==========\n"
+
+sbatch "$EXTRACT_PATH"
+
+echo -e "============= Parallel Data Preprocessing =========\n"
+
+
+sbatch "$PREPROCESS_PATH"
+
+
+echo -e "============= Parallel Training ================\n"
+
+sbatch "$TRAINING_PATH"
+
+
+echo -e "============= Parallel Postprocessing ===============\n"
+
+sbatch "$POSTPROCESS_PATH"
+
+
+
diff --git a/workflow_parallel_frame_prediction/setup.py b/workflow_parallel_frame_prediction/setup.py
new file mode 100755
index 0000000000000000000000000000000000000000..207b869dd947101b9160c90a1511037f44611552
--- /dev/null
+++ b/workflow_parallel_frame_prediction/setup.py
@@ -0,0 +1,11 @@
+
+from setuptools import setup
+
+setup(
+    name='Parallel_Workflow_PredNet',
+    author="Bing Gong, Amirpasha Mozaffari, Severin Hussmann",
+    description="This is the parallel workflow for PredNet",
+    copyright="Copyright 2019, The ESDE project",
+    version='1.0.0',
+    author_email="b.gong@fz-juelich.de",
+)
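main.sh submits the four stages as independent Slurm jobs, so nothing prevents, say, the training job from starting before preprocessing has produced X_train.hkl. One way to serialise the stages without touching the individual job scripts is Slurm's job-dependency mechanism; the driver below is a hypothetical alternative (not part of this patch):

# chain_jobs.py -- hypothetical alternative to main.sh (not part of this patch).
# Submits each stage with --dependency=afterok:<previous job>, so a stage only
# starts after the previous one finished successfully.
import subprocess

STAGES = [
    "./DataExtraction/Stager_devel_N_24_Bing.sh",
    "./DataPreprocess/Stager_devel_N_24_process_netCDF.sh",
    "./Training/horovodJob.sh",
    "./DataPostprocess/Stager_devel_N_24_evaluation.sh",
]

def submit(script, after=None):
    cmd = ["sbatch", "--parsable"]
    if after is not None:
        cmd.append("--dependency=afterok:{}".format(after))
    cmd.append(script)
    out = subprocess.check_output(cmd).decode().strip()
    return out.split(";")[0]  # --parsable prints "jobid[;cluster]"

if __name__ == "__main__":
    job_id = None
    for script in STAGES:
        job_id = submit(script, after=job_id)
        print("submitted {} as job {}".format(script, job_id))

With afterok each stage runs only if the previous job exits with status 0; afterany would continue the chain even after a failed stage.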