Skip to content
Snippets Groups Projects
Commit 46a6f9ca authored by Michael Langguth's avatar Michael Langguth
Browse files

Merge branch 'michael_issue053_advanced_dir_handling' into test_zam347

parents c789dbb4 c9de71eb
Branches
Tags
No related merge requests found
from mpi4py import MPI from mpi4py import MPI
import argparse import argparse
from process_netCDF_v2 import * from process_netCDF_v2 import *
from metadata import MetaData
import json import json
#add parser arguments #add parser arguments
...@@ -12,7 +13,9 @@ parser.add_argument("--varnames","-vars",dest="varnames", nargs = '+') ...@@ -12,7 +13,9 @@ parser.add_argument("--varnames","-vars",dest="varnames", nargs = '+')
# help="--partition allows to control the splitting of the processed data in training, test and validation data. Pass a dictionary-like string.") # help="--partition allows to control the splitting of the processed data in training, test and validation data. Pass a dictionary-like string.")
args = parser.parse_args() args = parser.parse_args()
target_dir = args.destination_dir # ML 2020/06/08: Dirty workaround as long as data-splitting is done with this separate Python-script
# called from the same parent Shell-/Batch-script as 'mpi_stager_v2_process_netCDF.py'
target_dir = os.path.join(MetaData.get_destdir_jsontmp(),"hickle")
varnames = args.varnames varnames = args.varnames
#partition = args.partition #partition = args.partition
......
...@@ -9,6 +9,7 @@ from external_function import load_distributor ...@@ -9,6 +9,7 @@ from external_function import load_distributor
from external_function import hash_directory from external_function import hash_directory
from external_function import md5 from external_function import md5
from process_netCDF_v2 import * from process_netCDF_v2 import *
from metadata import MetaData as MetaData
import os import os
import argparse import argparse
import json import json
...@@ -18,6 +19,8 @@ def main(): ...@@ -18,6 +19,8 @@ def main():
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("--source_dir", type=str, default="/p/scratch/deepacf/bing/extractedData/") parser.add_argument("--source_dir", type=str, default="/p/scratch/deepacf/bing/extractedData/")
parser.add_argument("--destination_dir", type=str, default="/p/scratch/deepacf/bing/processData_size_64_64_3_3t_norm") parser.add_argument("--destination_dir", type=str, default="/p/scratch/deepacf/bing/processData_size_64_64_3_3t_norm")
parser.add_argument("--script_dir","-scr_dir",dest="script_dir",type=str)
parser.add_argument("--years", "-y", dest="years")
parser.add_argument("--checksum_status", type=int, default=0) parser.add_argument("--checksum_status", type=int, default=0)
parser.add_argument("--rsync_status", type=int, default=1) parser.add_argument("--rsync_status", type=int, default=1)
parser.add_argument("--vars", nargs="+",default = ["T2","T2","T2"]) #"MSL","gph500" parser.add_argument("--vars", nargs="+",default = ["T2","T2","T2"]) #"MSL","gph500"
...@@ -28,8 +31,10 @@ def main(): ...@@ -28,8 +31,10 @@ def main():
args = parser.parse_args() args = parser.parse_args()
current_path = os.getcwd() current_path = os.getcwd()
source_dir = args.source_dir years = args.years
source_dir = os.path.join(args.source_dir,str(years))+"/"
destination_dir = args.destination_dir destination_dir = args.destination_dir
scr_dir = args.script_dir
checksum_status = args.checksum_status checksum_status = args.checksum_status
rsync_status = args.rsync_status rsync_status = args.rsync_status
...@@ -100,12 +105,35 @@ def main(): ...@@ -100,12 +105,35 @@ def main():
sys.exit(1) sys.exit(1)
if not os.path.exists(destination_dir): # check if the Destination dir. is existing # ML 2020/04/26
# Expand destination_dir-variable by searching for netCDF-files in source_dir and processing the file from the first list element to obtain all relevant (meta-)data.
if my_rank == 0: if my_rank == 0:
data_files_list = glob.glob(source_dir+"/**/*.nc",recursive=True)
if not data_files_list: raise ValueError("Could not find any data to be processed in '"+source_dir+"'")
md = MetaData(suffix_indir=destination_dir,data_filename=data_files_list[0],slices=slices,variables=vars)
# modify Batch scripts if metadata has been retrieved for the first time (md.status = "new")
if (md.status == "new"):
md.write_dirs_to_batch_scripts(scr_dir+"/DataPreprocess_to_tf.sh")
md.write_dirs_to_batch_scripts(scr_dir+"/generate_era5.sh")
md.write_dirs_to_batch_scripts(scr_dir+"/train_era5.sh")
# ML 2020/06/08: Dirty workaround as long as data-splitting is done with a separate Python-script
# called from the same parent Shell-/Batch-script
# -> work with temporary json-file in working directory
md.write_destdir_jsontmp(os.path.join(md.expdir,md.expname),tmp_dir=current_path)
#else: nothing to do
destination_dir= os.path.join(md.expdir,md.expname,"hickle",years)
# ...and create directory if necessary
if not os.path.exists(destination_dir): # check if the Destination dir. is existing
logging.critical('The Destination does not exist') logging.critical('The Destination does not exist')
logging.info('Create new destination dir') logging.info('Create new destination dir')
os.makedirs(destination_dir,exist_ok=True) os.makedirs(destination_dir,exist_ok=True)
# ML 2020/04/24 E
if my_rank == 0: # node is master: if my_rank == 0: # node is master:
# ==================================== Master : Directory scanner ================================= # # ==================================== Master : Directory scanner ================================= #
...@@ -189,6 +217,8 @@ def main(): ...@@ -189,6 +217,8 @@ def main():
#os.system(rsync_str) #os.system(rsync_str)
#process_era5_in_dir(job, src_dir=source_dir, target_dir=destination_dir) #process_era5_in_dir(job, src_dir=source_dir, target_dir=destination_dir)
# ML 2020/06/09: workaround to get correct destination_dir obtained by the master node
destination_dir = os.path.join(MetaData.get_destdir_jsontmp(tmp_dir=current_path),"hickle",years)
process_netCDF_in_dir(job_name=job, src_dir=source_dir, target_dir=destination_dir,slices=slices,vars=vars) process_netCDF_in_dir(job_name=job, src_dir=source_dir, target_dir=destination_dir,slices=slices,vars=vars)
if checksum_status == 1: if checksum_status == 1:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment