Commit 5e40d8d1 authored by Tom Ridley

Update for D6.5

parent 96311d12
CIAO_s3_remote_read.py
@@ -12,7 +12,10 @@ import boto3
import json
import multiprocessing as mp
######################################################################
########################Function Definition###########################
######################################################################
#Class wrapper is required so the S3 details and the read function can be handed to the multiprocessing pool
class s3_details: #Variables for defining S3 file details and how much to read from file
def __init__(self, s3_endpoint, bucket_name, access_id, access_key, region, dimension, target, window_width, prefix):
self.s3_endpoint = s3_endpoint
@@ -25,29 +28,39 @@ class s3_details: #Variables for defining S3 file details and how much to
self.window_width = window_width
self.prefix = prefix
def read_variable(self,column_to_read): #Downloads a subsection of the separated SC dataset via S3: a window of width window_width around the target centre element. Works with either 2D or 3D data, based on the "dim.json" file
print(self.s3_endpoint+"/"+self.bucket_name+"/"+self.prefix+"_"+column_to_read+".h5")
with h5py.File(self.s3_endpoint+"/"+self.bucket_name+"/"+self.prefix+"_"+column_to_read+".h5", driver='ros3', secret_id=bytes(self.access_id, encoding='utf-8'), secret_key=bytes(self.access_key, encoding='utf-8'), aws_region=bytes(self.region, encoding = 'utf-8')) as f:#Encoding is important for h5py to properly understand the strings
if self.dimension == 1: #2D case
data_part = f[column_to_read][0,self.target[1]-self.window_width:self.target[1]+self.window_width+1,self.target[2]-self.window_width:self.target[2]+self.window_width+1] #download the target element and window_width around it only, for the 2D case
elif self.dimension == 3: #3D case
data_part = f[column_to_read][self.target[0]-self.window_width:self.target[0]+self.window_width+1,self.target[1]-self.window_width:self.target[1]+self.window_width+1,self.target[2]-self.window_width:self.target[2]+self.window_width+1] #download the target element and window_width around it only, for the 3D case
return data_part
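#Example usage (hypothetical values): with dimension 3, target (100,100,100) and window_width 10,
#read_variable("T") returns a (21, 21, 21) array of temperature around the target element:
#   s3file = s3_details(s3_endpoint, bucket_name, access_id, access_key, region, 3, (100,100,100), 10, "frame_0001")
#   temperature_window = s3file.read_variable("T")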
######################################################################
######################S3 Connection Details###########################
######################################################################
s3_endpoint = "https://s3-coec.jsc.fz-juelich.de" #S3 Endpoint to connect to. As supplied, this is the best available for CoEC
bucket_name = "smalltarget" #Set bucket to target. Script expects all files inside to be from the same simulation
access_id = "" #Set accesss_id for S3 account
access_key = "" #Set secret access_key for S3 account
region = "us-east-1" #Set region for S3 endpoint.
######################################################################
##########################Target Window###############################
######################################################################
window_width=10 #Set number of elements either side of the target element to download in each dimension
#target_type = "manual" #Use a single target element for all frames of simulation output
#target_type = "manual_list" #Manually define a list of target elements for all frames of simulation output
target_type = "auto" #Use the pre-generated hottest element list for all frames of simulation output
target = (100,100,100) # If using the target_type manual option, define the target element for all frames here
manual_target=[] #If using the manual_list option, populate this list (see the example below)
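#Example (hypothetical values) for target_type = "manual_list": one 3-element target per file in prefix_list, e.g.
#   manual_target = [[100, 100, 100], [102, 100, 98], [104, 101, 96]]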
######################################################################
#####################Functional Section###############################
######################################################################
##########################Read Metadata ##############################
with open("prefix_list.json", 'r') as f: #Read .json for list of filenames
prefix_list = json.load(f)
#print(prefix_list)
@@ -57,23 +70,26 @@ with open("dim.json", 'r') as f: #Read .json for dimensions o
dim = json.load(f)
with open("T_max_arg_list.json", 'r') as f: #Read .json for list of hottest elements of each simulation frame.
T_max_arg_list = json.load(f)
dimension = dim[0] #Use information from .json file to determine 2D or 3D simulation
#############################Targeting###############################
if target_type == "auto":
print "Target list populated from .json file"
print("Target list populated from .json file")
target_list = T_max_arg_list
elif target_type == "manual":
print("Target list populated from single target element for all files")
target_list = [target] *len(prefix_list)
elif target_type == "manual_list": #If use the manual list option, you will have to create a list the same length as the number of files
target= [] #with a 3-element list as each element within it.
dimension = dim[0] #Use information from .json file to determine 2D or 3D simulation
print("Target list populated from manually supplied element list")
target_list= manual_target #with a 3-element list as each element within it.
#############################Download###############################
pool = mp.Pool(len(var_list)) #Declare pool of worker processes, one for each variable to download
for count, prefix in enumerate(prefix_list):
target = target_list[count] #Set the target centre element
s3file = s3_details(s3_endpoint,bucket_name,access_id,access_key,region, dimension, target ,window_width,prefix) #Set details of download
data_payload=np.array(pool.map(s3file.read_variable,var_list)) #Download all variables in parallel
np.save(prefix+"_out", data_payload) #Save array to file, named after each original file. All variables saved to one array, in the corrected order constructed in the first half of the workflow
pool.close() #Close MultiProcessing Thread Pool
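#The saved arrays can be loaded later for analysis, e.g. (assuming a hypothetical prefix "frame_0001"):
#   data = np.load("frame_0001_out.npy")            #shape: (len(var_list), 2*window_width+1, ...)
#   temperature_window = data[var_list.index("T")]  #variables are stored in the order given by var_list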
......
CIAO_split_and_upload.py
import glob
import os
# import boto3
# import botocore
import h5py
import numpy as np
print(h5py.__version__)
@@ -11,17 +9,21 @@ import time
import boto3
import json
######################################################################
#########################Folder Details###############################
######################################################################
#Define the target directory containing the HDF5 files to split and upload, and the sub-directory for the split files
datadir = "/p/scratch/ccstao/ridley2/coec/hdf5_playground/test_target_small" #Supply target directory
splitdir = "test_loc" #Supply sub-directory for split HDF5 files
######################################################################
######################S3 Connection Details###########################
######################################################################
s3_endpoint = "https://s3-coec.jsc.fz-juelich.de" #Set S3 endpoint to access - as supplied, this is the best endpoint available for CoEC
bucket_name = "smalltarget" #Set a bucket to upload to. This will need to be created via s3cmd or otherwise before using the script
#Choose a different bucket for files from different simulations, script assumes that all files to be uploaded in the folder have the same structure
@@ -30,19 +32,35 @@ access_key = ""
region = "us-east-1" #Set region - leave for CoEC endpoint, others may need different region settings
######################################################################
######################Functional Section##############################
######################################################################
######################File List Construction##########################
file_list = glob.glob(os.path.join(datadir, "*.h5")) #Generate list of files in datadir directory
prefix_list= [i.split('/')[-1][:-3] for i in file_list] #Generate list of filenames without filetype
save_loc=datadir+'/'+splitdir+'/' #Variable for storing sub-directory absolute path
if not os.path.exists(save_loc): #Check if sub-directory exists and make it if it doesn't
os.makedirs(save_loc)
with open(save_loc + "prefix_list.json",'w') as f: #Dump list of filenames without filetype to .json file
json.dump(prefix_list, f, indent=2)
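#Example (hypothetical filename): "<datadir>/frame_0001.h5" gives the prefix "frame_0001";
#its split files are later written as "frame_0001_T.h5", "frame_0001_H2.h5", etc.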
######################File Structure Read#############################
#Assumes that all files in folder have the same structure
var_list = []
hf = h5py.File(file_list[0], 'r') #Read the file structure from the first file in the file list
dataset_name = list(hf.keys())[1] #Finds the top level name of the internal HDF5 structure, as this changes per simulation
field_keys = list(hf[dataset_name+'/data/scalars/SC'].attrs.keys()) #Retrieves the attribute keys of the SC dataset; the matching column variable names are read on the next line
field_names = list(hf[dataset_name+'/data/scalars/SC'].attrs.values())
field_names_organised = [] #h5py returns the columns in an arbitrary order by default, so create a list to hold the correctly-ordered variables of the SC dataset; the order changes per simulation
t_loc=0
for i in range(len(field_names)):
var_list.insert(int(field_keys[i].split(' ')[1])-1,field_names[i][0].decode("utf-8")) #Insert the variable at the correct location in the list. The first argument is the index from the column name, -1 to make it zero-indexed; the second is the name of the variable or chemical species in the SC dataset, decoded so it is stored properly by Python
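#Worked example (hypothetical key name): an attribute key such as "Variable 3" with value b"T" places "T" at index 2
#of var_list, restoring the original column order of the SC dataset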
for i in range(len(var_list)):
if var_list[i]=="T":
t_loc=i #Detect which index contains Temperature, for flame jet detection
with open(save_loc + "var_list.json",'w') as f: #Dump variable list to .json file
json.dump(var_list, f, indent=2)
......@@ -50,19 +68,21 @@ dim = hf[dataset_name+"/data/scalars/SC/"][0].shape
with open(save_loc + "dim.json",'w') as f: #Dump the dimensions of the first variable to a .json file
json.dump(dim, f, indent=2)
######################Pre-Processing##################################
T_max_arg_list=[] #Create empty list to store location of hottest elements of each file
for readcount, h5file in enumerate(file_list): #Loop through files and save separate HDF5 files for each variable
with h5py.File(h5file, "r") as f_src:
T_max_arg_list.append(np.unravel_index(f_src[dataset_name+"/data/scalars/SC"][t_loc][()].argmax(),dim)) #Add location of hottest element to list. [t_loc] selects the temperature column of the dataset, [()] converts the dataset to an nparray, argmax finds the index of the highest-temperature element, and np.unravel_index changes that flat index into a multi-dimensional location
for writecount, var in enumerate(var_list): #Loop through the variables of the SC dataset and write each to its own split file
with h5py.File(save_loc+ prefix_list[readcount] +'_'+ var+'.h5', "w") as f_dest:
payload = f_src[dataset_name+"/data/scalars/SC/"][writecount]
dset = f_dest.create_dataset(var, data=payload)
with open(save_loc + "T_max_arg_list.json",'w') as f: #Dump list of location of hottest elements of each file
json.dump(T_max_arg_list, f,default=int, indent=2)
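#Worked example (hypothetical values): for dim = [128, 256, 256], argmax returns a flat index such as 4227077,
#which np.unravel_index converts back to the 3D location (64, 128, 5) stored in T_max_arg_list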
############################Upload#####################################
session = boto3.session.Session() #Open S3 session with the given parameters; the upload client is created from it below
s3_client = session.client(
service_name='s3',
endpoint_url = s3_endpoint,
......
README.md
# S3 CIAO workflow
Two-part workflow for the partial download of a specific dataset (SC) from CIAO-output HDF5 files, developed in the context of CoEC. The solution consists of:
- CIAO_split_and_upload.py
  - Takes a folder of HDF5 files, splits the SC dataset into its constituent variables (for faster file access) and uploads them to a specified S3 endpoint and bucket, along with metadata .json files.
- CIAO_s3_remote_read.py
  - Takes the S3 bucket and saves a numpy array for each file, downloading only a subsection of each file in a window around a target centre element, either defined manually by the user or targeted around the hottest element of each simulation timestep (see the sketch below).
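At its core, the read script performs a ranged read through h5py's `ros3` driver, so only the requested window is transferred from S3. Below is a minimal sketch of that idea; the endpoint, bucket, credentials, file name and target are placeholder values.

```python
import h5py

# Placeholder connection details and split-file name; substitute your own
s3_endpoint = "https://s3-coec.jsc.fz-juelich.de"
bucket_name = "smalltarget"
access_id, access_key, region = "<access_id>", "<access_key>", "us-east-1"
url = s3_endpoint + "/" + bucket_name + "/" + "frame_0001_T.h5"

target = (100, 100, 100)  # centre element, e.g. the hottest element of the frame
w = 10                    # window_width: elements either side of the centre

with h5py.File(url, driver='ros3',
               secret_id=bytes(access_id, encoding='utf-8'),
               secret_key=bytes(access_key, encoding='utf-8'),
               aws_region=bytes(region, encoding='utf-8')) as f:
    # Each split file holds one dataset named after its variable ("T" here);
    # slicing it downloads only the window around the target element
    window = f["T"][target[0]-w:target[0]+w+1,
                    target[1]-w:target[1]+w+1,
                    target[2]-w:target[2]+w+1]
print(window.shape)  # (21, 21, 21) for 3D data
```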
## Advantages
- Avoids downloading entire output files, which can reach significant sizes.
- Enables collaboration: other sites can download just the flame-tip data, as usually only a subset of data around this area is required.
- Targeting can be manual or automatic.
- The approach can be adapted to process similarly structured HDF5 output from other programs.
## Requirements
- Required Python modules:
  - boto3
  - h5py
  - numpy
  - json (part of the Python standard library, no installation needed)
## Basic Usage instructions
- Edit CIAO_split_and_upload.py:
  - Edit the "Folder Details" section:
    - Define datadir - the directory that contains only the HDF5 files you want to split and upload.
      - All files in the folder should ideally be from the same simulation. The script assumes each file has the same structure: the top-level group is based on the experiment name, and the SC dataset's constituent variables depend on the chemical species simulated.
    - Define splitdir, the sub-directory within datadir to write split files to.
      - Split files are not deleted automatically, in case they are useful later; delete them manually if necessary.
  - Edit the "S3 Connection Details" section:
    - Define the S3 endpoint - "https://s3-coec.jsc.fz-juelich.de" is likely to be the best endpoint for CoEC purposes, with region "us-east-1".
      - JUDAC object storage (https://apps.fz-juelich.de/jsc/hps/judac/object-storage.html) is also available, using `region = "just"`, but its performance in testing was lower than the s3-coec endpoint. The same page documents how to gain access with an existing JSC account.
    - Define bucket_name - the intended use of the script assumes a new bucket for each set of simulations to share.
      - Create the bucket with s3cmd or a similar tool before running the script.
      - If the bucket already exists, files with the same name, such as the .json metadata files, will be overwritten. Ensure you have created a new bucket to avoid this.
    - Set access_id and access_key to your S3 endpoint credentials; the access key is also sometimes called the secret key.
- Run CIAO_split_and_upload.py
  - This creates files named after the original file (minus the file extension) plus the variable name from the SC dataset (H2, T, Enthalpy, etc.). These split files are uploaded to the specified S3 bucket.
  - Additionally, the following metadata .json files are created:
    - `prefix_list.json` - List of original filenames without file extensions. Used by the read script to know which files to access.
    - `var_list.json` - List of variables in SC, generated from the first file in `prefix_list` and assumed to be the same for all files in the folder/bucket.
    - `dim.json` - Dimensions of each SC variable, generated from the first file in `prefix_list` and assumed to be the same for all files in the folder/bucket. Used by the read script to determine whether the files are two- or three-dimensional.
    - `T_max_arg_list.json` - List of the elements with the highest temperature for each timestep of the CIAO simulation. Used by the read script to target the window to download.
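The metadata files are plain JSON, so they can be inspected directly before running the read script. A short sketch follows; the values shown in the comments are illustrative, not from a real run.

```python
import json

# Read the metadata written by CIAO_split_and_upload.py (run from the split sub-directory)
with open("prefix_list.json") as f:
    prefix_list = json.load(f)       # e.g. ["frame_0001", "frame_0002"]
with open("var_list.json") as f:
    var_list = json.load(f)          # e.g. ["H2", "O2", "T", "Enthalpy"]
with open("dim.json") as f:
    dim = json.load(f)               # shape of one SC variable, e.g. [1, 256, 256] for 2D data
with open("T_max_arg_list.json") as f:
    T_max_arg_list = json.load(f)    # one hottest-element location per file

print(len(prefix_list), "files,", len(var_list), "variables each, grid", dim)
```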
\ No newline at end of file