Commit 5e40d8d1 authored by Tom Ridley

Update for D6.5

parent 96311d12
CIAO_s3_remote_read.py
@@ -12,7 +12,10 @@ import boto3
import json
import multiprocessing as mp
######################################################################
########################Function Definition###########################
######################################################################
#Class wrapper is required so the S3 details and the read function can be handed to the multiprocessing pool
class s3_details: #Variables for defining S3 file details and how much to read from file
def __init__(self, s3_endpoint, bucket_name, access_id, access_key, region, dimension, target, window_width, prefix):
self.s3_endpoint = s3_endpoint
@@ -25,29 +28,39 @@ class s3_details: #Variables for defining S3 file details and how much to
self.window_width = window_width
self.prefix = prefix
def read_variable(self,column_to_read): #Downloads a subsection of the separated SC dataset via S3: a window of width window_width around the target centre element. Works with either 2D or 3D data, based on the "dim.json" file
print(self.s3_endpoint+"/"+self.bucket_name+"/"+self.prefix+"_"+column_to_read+".h5")
with h5py.File(self.s3_endpoint+"/"+self.bucket_name+"/"+self.prefix+"_"+column_to_read+".h5", driver='ros3', secret_id=bytes(self.access_id, encoding='utf-8'), secret_key=bytes(self.access_key, encoding='utf-8'), aws_region=bytes(self.region, encoding = 'utf-8')) as f:#Encoding is important for h5py to properly understand the strings
if self.dimension == 1: #2D case
data_part = f[column_to_read][0,self.target[1]-self.window_width:self.target[1]+self.window_width+1,self.target[2]-self.window_width:self.target[2]+self.window_width+1] #download the target element and window_width around it only, for the 2D case
elif self.dimension == 3: #3D case
data_part = f[column_to_read][self.target[0]-self.window_width:self.target[0]+self.window_width+1,self.target[1]-self.window_width:self.target[1]+self.window_width+1,self.target[2]-self.window_width:self.target[2]+self.window_width+1] #download the target element and window_width around it only, for the 3D case
return data_part
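#Example usage (hypothetical values): with dimension 3, target (100,100,100) and window_width 10,
#read_variable("T") returns a (21, 21, 21) array of temperature around the target element:
#   s3file = s3_details(s3_endpoint, bucket_name, access_id, access_key, region, 3, (100,100,100), 10, "frame_0001")
#   temperature_window = s3file.read_variable("T")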
######################################################################
######################S3 Connection Details###########################
######################################################################
s3_endpoint = "https://s3-coec.jsc.fz-juelich.de" #S3 Endpoint to connect to. As supplied, this is the best available for CoEC
bucket_name = "smalltarget" #Set bucket to target. Script expects all files inside to be from the same simulation
access_id = "" #Set accesss_id for S3 account
access_key = "" #Set secret access_key for S3 account
region = "us-east-1" #Set region for S3 endpoint.
######################################################################
##########################Target Window###############################
######################################################################
window_width=10 #Set number of elements either side of the target element to download in each dimension
#target_type = "manual" #Use a single target element for all frames of simulation output
#target_type = "manual_list" #Manually define a list of target elements for all frames of simulation output
target_type = "auto" #Use the pre-generated hottest element list for all frames of simulation output
target = (100,100,100) # If using the target_type manual option, define the target element for all frames here
manual_target=[] #If using the manual_list option, populate this list (see the example below)
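#Example (hypothetical values) for target_type = "manual_list": one 3-element target per file in prefix_list, e.g.
#   manual_target = [[100, 100, 100], [102, 100, 98], [104, 101, 96]]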
######################################################################
#####################Functional Section###############################
######################################################################
##########################Read Metadata ##############################
with open("prefix_list.json", 'r') as f: #Read .json for list of filenames
prefix_list = json.load(f)
#print(prefix_list)
@@ -57,23 +70,26 @@ with open("dim.json", 'r') as f: #Read .json for dimensions o
dim = json.load(f)
with open("T_max_arg_list.json", 'r') as f: #Read .json for list of hottest elements of each simulation frame.
T_max_arg_list = json.load(f)
dimension = dim[0] #Use information from .json file to determine 2D or 3D simulation
#############################Targeting###############################
if target_type == "auto":
print "Target list populated from .json file"
print("Target list populated from .json file")
target_list = T_max_arg_list
elif target_type == "manual":
print("Target list populated from single target element for all files")
target_list = [target] *len(prefix_list)
elif target_type == "manual_list": #If use the manual list option, you will have to create a list the same length as the number of files
target= [] #with a 3-element list as each element within it.
dimension = dim[0] #Use information from .json file to determine 2D or 3D simulation
print("Target list populated from manually supplied element list")
target_list= manual_target #with a 3-element list as each element within it.
#############################Download###############################
pool = mp.Pool(len(var_list)) #Declare pool of worker processes, one for each variable to download
for count, prefix in enumerate(prefix_list):
target = target_list[count] #Set the target centre element
s3file = s3_details(s3_endpoint,bucket_name,access_id,access_key,region, dimension, target ,window_width,prefix) #Set details of download
data_payload=np.array(pool.map(s3file.read_variable,var_list)) #Download all variables in parallel
np.save(prefix+"_out", data_payload) #Save array to file, named after each original file. All variables saved to one array, in the corrected order constructed in the first half of the workflow
pool.close() #Close MultiProcessing Thread Pool
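#The saved arrays can be loaded later for analysis, e.g. (assuming a hypothetical prefix "frame_0001"):
#   data = np.load("frame_0001_out.npy")            #shape: (len(var_list), 2*window_width+1, ...)
#   temperature_window = data[var_list.index("T")]  #variables are stored in the order given by var_list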
......
CIAO_split_and_upload.py
import glob
import os
# import boto3
# import botocore
import h5py
import numpy as np
print(h5py.__version__)
@@ -11,17 +9,21 @@ import time
import boto3
import json
######################################################################
#########################Folder Details###############################
######################################################################
#Define the target directory containing the HDF5 files to split and upload, and the sub-directory for the split files
datadir = "/p/scratch/ccstao/ridley2/coec/hdf5_playground/test_target_small" #Supply target directory
splitdir = "test_loc" #Supply sub-directory for split HDF5 files
######################################################################
######################S3 Connection Details###########################
######################################################################
s3_endpoint = "https://s3-coec.jsc.fz-juelich.de" #Set S3 endpoint to access - as supplied, this is the best endpoint available for CoEC
bucket_name = "smalltarget" #Set a bucket to upload to. This will need to be created via s3cmd or otherwise before using the script
#Choose a different bucket for files from different simulations, script assumes that all files to be uploaded in the folder have the same structure
@@ -30,19 +32,35 @@ access_key = ""
region = "us-east-1" #Set region - leave for CoEC endpoint, others may need different region settings
######################################################################
######################Functional Section##############################
######################################################################
######################File List Construction##########################
file_list = glob.glob(os.path.join(datadir, "*.h5")) #Generate list of files in datadir directory
prefix_list= [i.split('/')[-1][:-3] for i in file_list] #Generate list of filenames without filetype
save_loc=datadir+'/'+splitdir+'/' #Variable for storing sub-directory absolute path
if not os.path.exists(save_loc): #Check if sub-directory exists and make it if it doesn't
os.makedirs(save_loc)
with open(save_loc + "prefix_list.json",'w') as f: #Dump list of filenames without filetype to .json file
json.dump(prefix_list, f, indent=2)
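#Example (hypothetical filename): "<datadir>/frame_0001.h5" gives the prefix "frame_0001";
#its split files are later written as "frame_0001_T.h5", "frame_0001_H2.h5", etc.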
######################File Structure Read#############################
#Assumes that all files in folder have the same structure
var_list = []
hf = h5py.File(file_list[0], 'r') #Read the file structure from the first file in the file list
dataset_name = list(hf.keys())[1] #Finds the top level name of the internal HDF5 structure, as this changes per simulation
field_keys = list(hf[dataset_name+'/data/scalars/SC'].attrs.keys()) #Retrieves the attribute keys of the SC dataset; the matching column variable names are read on the next line
field_names = list(hf[dataset_name+'/data/scalars/SC'].attrs.values())
field_names_organised = [] #h5py returns the columns in an arbitrary order by default, so create a list to hold the correctly-ordered variables of the SC dataset; the order changes per simulation
t_loc=0
for i in range(len(field_names)):
var_list.insert(int(field_keys[i].split(' ')[1])-1,field_names[i][0].decode("utf-8")) #Insert the variable at the correct location in the list. The first argument is the index from the column name, -1 to make it zero-indexed; the second is the name of the variable or chemical species in the SC dataset, decoded so it is stored properly by Python
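#Worked example (hypothetical key name): an attribute key such as "Variable 3" with value b"T" places "T" at index 2
#of var_list, restoring the original column order of the SC dataset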
for i in range(len(var_list)):
if var_list[i]=="T":
t_loc=i #Detect which index contains Temperature, for flame jet detection
with open(save_loc + "var_list.json",'w') as f: #Dump variable list to .json file
json.dump(var_list, f, indent=2)
......@@ -50,19 +68,21 @@ dim = hf[dataset_name+"/data/scalars/SC/"][0].shape
with open(save_loc + "dim.json",'w') as f: #Dump the dimensions of the first variable to a .json file
json.dump(dim, f, indent=2)
######################Pre-Processing##################################
T_max_arg_list=[] #Create empty list to store location of hottest elements of each file
for readcount, h5file in enumerate(file_list): #Loop through files and save separate HDF5 files for each variable
with h5py.File(h5file, "r") as f_src:
T_max_arg_list.append(np.unravel_index(f_src[dataset_name+"/data/scalars/SC"][t_loc][()].argmax(),dim)) #Add location of hottest element to list. [t_loc] selects the temperature column of the dataset, [()] converts the dataset to an nparray, argmax finds the index of the highest-temperature element, and np.unravel_index changes that flat index into a multi-dimensional location
for writecount, var in enumerate(var_list): #Loop through the variables of the SC dataset and write each to its own split file
with h5py.File(save_loc+ prefix_list[readcount] +'_'+ var+'.h5', "w") as f_dest:
payload = f_src[dataset_name+"/data/scalars/SC/"][writecount]
dset = f_dest.create_dataset(var, data=payload)
with open(save_loc + "T_max_arg_list.json",'w') as f: #Dump list of location of hottest elements of each file
json.dump(T_max_arg_list, f,default=int, indent=2)
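#Worked example (hypothetical values): for dim = [128, 256, 256], argmax returns a flat index such as 4227077,
#which np.unravel_index converts back to the 3D location (64, 128, 5) stored in T_max_arg_list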
############################Upload#####################################
session = boto3.session.Session() #Open S3 session with the given parameters; the upload client is created from it below
s3_client = session.client(
service_name='s3',
endpoint_url = s3_endpoint,
......
README.md
# S3 CIAO workflow
Two-part workflow for the partial download of a specific dataset (SC) from CIAO-output HDF5 files, developed in the context of CoEC. The solution consists of:
- CIAO_split_and_upload.py
  - Takes a folder of HDF5 files, splits the SC dataset into its constituent variables (for faster file access) and uploads them to a specified S3 endpoint and bucket, along with metadata .json files.
- CIAO_s3_remote_read.py
  - Takes the S3 bucket and saves a numpy array for each file, downloading only a subsection of each file in a window around a target centre element, either defined manually by the user or targeted around the hottest element of each simulation timestep (see the sketch below).
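At its core, the read script performs a ranged read through h5py's `ros3` driver, so only the requested window is transferred from S3. Below is a minimal sketch of that idea; the endpoint, bucket, credentials, file name and target are placeholder values.

```python
import h5py

# Placeholder connection details and split-file name; substitute your own
s3_endpoint = "https://s3-coec.jsc.fz-juelich.de"
bucket_name = "smalltarget"
access_id, access_key, region = "<access_id>", "<access_key>", "us-east-1"
url = s3_endpoint + "/" + bucket_name + "/" + "frame_0001_T.h5"

target = (100, 100, 100)  # centre element, e.g. the hottest element of the frame
w = 10                    # window_width: elements either side of the centre

with h5py.File(url, driver='ros3',
               secret_id=bytes(access_id, encoding='utf-8'),
               secret_key=bytes(access_key, encoding='utf-8'),
               aws_region=bytes(region, encoding='utf-8')) as f:
    # Each split file holds one dataset named after its variable ("T" here);
    # slicing it downloads only the window around the target element
    window = f["T"][target[0]-w:target[0]+w+1,
                    target[1]-w:target[1]+w+1,
                    target[2]-w:target[2]+w+1]
print(window.shape)  # (21, 21, 21) for 3D data
```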
## Advantages
- Avoids downloading entire output files, which can reach significant sizes.
- Enables collaboration: other sites can download just the flame-tip data, as usually only a subset of data around this area is required.
- Targeting can be manual or automatic.
- The approach can be adapted to process similarly structured HDF5 output from other programs.
## Requirements
- Required Python modules:
  - boto3
  - h5py
  - numpy
  - json (part of the Python standard library, no installation needed)
## Basic Usage instructions
- Edit CIAO_split_and_upload.py:
  - Edit the "Folder Details" section:
    - Define datadir - the directory that contains only the HDF5 files you want to split and upload.
      - All files in the folder should ideally be from the same simulation. The script assumes each file has the same structure: the top-level group is based on the experiment name, and the SC dataset's constituent variables depend on the chemical species simulated.
    - Define splitdir, the sub-directory within datadir to write split files to.
      - Split files are not deleted automatically, in case they are useful later; delete them manually if necessary.
  - Edit the "S3 Connection Details" section:
    - Define the S3 endpoint - "https://s3-coec.jsc.fz-juelich.de" is likely to be the best endpoint for CoEC purposes, with region "us-east-1".
      - JUDAC object storage (https://apps.fz-juelich.de/jsc/hps/judac/object-storage.html) is also available, using `region = "just"`, but its performance in testing was lower than the s3-coec endpoint. The same page documents how to gain access with an existing JSC account.
    - Define bucket_name - the intended use of the script assumes a new bucket for each set of simulations to share.
      - Create the bucket with s3cmd or a similar tool before running the script.
      - If the bucket already exists, files with the same name, such as the .json metadata files, will be overwritten. Ensure you have created a new bucket to avoid this.
    - Set access_id and access_key to your S3 endpoint credentials; the access key is also sometimes called the secret key.
- Run CIAO_split_and_upload.py
  - This creates files named after the original file (minus the file extension) plus the variable name from the SC dataset (H2, T, Enthalpy, etc.). These split files are uploaded to the specified S3 bucket.
  - Additionally, the following metadata .json files are created:
    - `prefix_list.json` - List of original filenames without file extensions. Used by the read script to know which files to access.
    - `var_list.json` - List of variables in SC, generated from the first file in `prefix_list` and assumed to be the same for all files in the folder/bucket.
    - `dim.json` - Dimensions of each SC variable, generated from the first file in `prefix_list` and assumed to be the same for all files in the folder/bucket. Used by the read script to determine whether the files are two- or three-dimensional.
    - `T_max_arg_list.json` - List of the elements with the highest temperature for each timestep of the CIAO simulation. Used by the read script to target the window to download.
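The metadata files are plain JSON, so they can be inspected directly before running the read script. A short sketch follows; the values shown in the comments are illustrative, not from a real run.

```python
import json

# Read the metadata written by CIAO_split_and_upload.py (run from the split sub-directory)
with open("prefix_list.json") as f:
    prefix_list = json.load(f)       # e.g. ["frame_0001", "frame_0002"]
with open("var_list.json") as f:
    var_list = json.load(f)          # e.g. ["H2", "O2", "T", "Enthalpy"]
with open("dim.json") as f:
    dim = json.load(f)               # shape of one SC variable, e.g. [1, 256, 256] for 2D data
with open("T_max_arg_list.json") as f:
    T_max_arg_list = json.load(f)    # one hottest-element location per file

print(len(prefix_list), "files,", len(var_list), "variables each, grid", dim)
```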
\ No newline at end of file