Commit 5b3d7a79 authored by Bing Gong

Merge branch 'bing_issue#188_restructure_ambs' into develop

parents 4ed09385 b445f890
Pipeline #121747 failed with stages in 17 seconds
%% Cell type:code id: tags:
``` python
import xarray as xr
import numpy as np
filenames_t850 = [
    "data_t850/temperature_850hPa_1979_5.625deg.nc",
    "data_t850/temperature_850hPa_1980_5.625deg.nc"
]
filenames_z500 = [
    "data_z500/geopotential_500hPa_1979_5.625deg.nc",
    "data_z500/geopotential_500hPa_1980_5.625deg.nc"
]
filenames = [*filenames_t850, *filenames_z500]
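# coords="minimal" loads only the coordinate variables needed for the merge, and
# compat="override" takes overlapping variables from the first dataset without
# cross-checking them, which keeps combining the t850 and z500 files cheap.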
ds = xr.open_mfdataset(filenames, coords="minimal", compat="override")
ds = ds.drop_vars("level")
```
%% Cell type:code id: tags:
``` python
da = ds.to_array(dim="variables").squeeze()
dims = ["time", "lat", "lon", "variables"]
da = da.transpose(*dims)
def generator(iterable):
iterator = iter(iterable)
yield from iterator
da.shape[1:]
```
%% Output
(32, 64, 2)
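%% Cell type:markdown id: tags:
The `generator` helper above is defined but not used yet. Below is a minimal sketch of how it could feed the stacked array into a `tf.data` pipeline (this assumes TensorFlow is installed; the batch size of 4 is purely illustrative):
%% Cell type:code id: tags:
``` python
import tensorflow as tf

def sample_gen():
    # yield one (lat, lon, variables) field per time step
    yield from generator(da.values)

dataset = tf.data.Dataset.from_generator(
    sample_gen, output_types=tf.float32, output_shapes=da.shape[1:])
dataset = dataset.batch(4)  # batches of shape (4, 32, 64, 2)
```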
%% Cell type:code id: tags:
``` python
```
#!/bin/bash
# Name of virtual environment
#VIRT_ENV_NAME="vp_new_structure"
VIRT_ENV_NAME="env_hdfml"
# Name of virtual environment
VIRT_ENV_NAME="venv2_hdfml"
if [ -z "${VIRTUAL_ENV}" ]; then
  if [[ -f ../video_prediction_tools/${VIRT_ENV_NAME}/bin/activate ]]; then
@@ -21,6 +20,7 @@ fi
#python -m pytest test_prepare_era5_data.py
##Test for preprocess_step1
#python -m pytest test_process_netCDF_v2.py
#source ../video_prediction_tools/env_setup/modules_preprocess+extract.sh
source ../video_prediction_tools/env_setup/modules_train.sh
##Test for preprocess moving mnist
#python -m pytest test_prepare_moving_mnist_data.py
@@ -33,5 +33,5 @@ source ../video_prediction_tools/env_setup/modules_train.sh
#rm /p/project/deepacf/deeprain/video_prediction_shared_folder/models/test/*
#python -m pytest test_train_model_era5.py
#python -m pytest test_vanilla_vae_model.py
python -m pytest test_visualize_postprocess.py
python -m pytest test_gzprcp_data.py
#python -m pytest test_meta_postprocess.py
# Name of virtual environment
VIRT_ENV_NAME="venv_hdfml"
CONTAINER_IMG="../video_prediction_tools/HPC_scripts/tensorflow_21.09-tf1-py3.sif"
WRAPPER="./wrapper_container.sh"
# sanity checks
if [[ ! -f ${CONTAINER_IMG} ]]; then
  echo "ERROR: Cannot find required TF1.15 container image '${CONTAINER_IMG}'."
  exit 1
fi

if [[ ! -f ${WRAPPER} ]]; then
  echo "ERROR: Cannot find wrapper-script '${WRAPPER}' for TF1.15 container image."
  exit 1
fi
#source ../video_prediction_tools/env_setup/modules_preprocess+extract.sh
singularity exec --nv "${CONTAINER_IMG}" "${WRAPPER}" ${VIRT_ENV_NAME} python3 -m pytest test_era5_data.py
__email__ = "b.gong@fz-juelich.de"
__author__ = "Bing Gong, Scarlet Stadtler,Michael Langguth"
__author__ = "Bing Gong"
from video_prediction.datasets.era5_dataset import *
import pytest
import xarray as xr
import os
import tensorflow as tf
import numpy as np
import json
import datetime
input_dir = "/p/project/deepacf/deeprain/video_prediction_shared_folder/preprocessedData/test"
datasplit_config = "/p/project/deepacf/deeprain/bing/ambs/video_prediction_tools/data_split/cv_test.json"
hparams_dict_config = "/p/project/deepacf/deeprain/bing/ambs/video_prediction_tools/hparams/era5/convLSTM/model_hparams.json"
sequences_per_file = 10
mode = "val"
input_dir = "/p/project/deepacf/deeprain/video_prediction_shared_folder/test_data_roshni"
datasplit_config = "/p/project/deepacf/deeprain/bing/ambs/video_prediction_tools/data_split/test/cv_test.json"
hparams_dict_config = "/p/project/deepacf/deeprain/bing/ambs/video_prediction_tools/hparams/era5/convLSTM/model_hparams_template.json"
mode = "test"
@pytest.fixture(scope="module")
def era5_dataset_case2():
return ERA5Dataset(input_dir=input_dir,mode=mode,
datasplit_config=datasplit_config,hparams_dict_config=hparams_dict_config,seed=1234)
def test_init_era5_dataset(era5_dataset_case2):
assert era5_dataset_case2.hparams.max_epochs == 20
assert era5_dataset_case2.mode == mode
def era5_dataset_case1():
return ERA5Dataset(input_dir=input_dir, datasplit_config=datasplit_config, hparams_dict_config=hparams_dict_config,
mode="test", seed=1234, nsamples_ref=1000)
def test_get_tfrecords_filesnames(era5_dataset_case2):
era5_dataset_case2.get_tfrecords_filesnames_base_datasplit()
assert era5_dataset_case2.filenames[0] == os.path.join(input_dir,"tfrecords","sequence_Y_2017_M_2_0_to_9.tfrecords")# def test_check_pkl_tfrecords_consistency(era5_dataset_case1):
def test_get_example_info(era5_dataset_case2):
era5_dataset_case2.get_tfrecords_filesnames_base_datasplit()
era5_dataset_case2.get_example_info()
assert era5_dataset_case2.image_shape[0] == 160
assert era5_dataset_case2.image_shape[1] == 128
assert era5_dataset_case2.image_shape[2] == 3
def test_init_era5_dataset(era5_dataset_case1):
era5_dataset_case1.get_hparams()
assert era5_dataset_case1.max_epochs == 20
assert era5_dataset_case1.mode == mode
assert era5_dataset_case1.batch_size == 4
def test_get_filenames_from_datasplit(era5_dataset_case1):
flname= os.path.join(era5_dataset_case1.input_dir, "era5_vars4ambs_201901.nc")
n_files = len(era5_dataset_case1.filenames)
check = flname in era5_dataset_case1.filenames
assert check == True
assert n_files == 12
def test_make_dataset(era5_dataset_case1):
    # Get the data from the nc files directly
    data_arr = era5_dataset_case1.load_data_from_nc()
    assert len(data_arr) != 0
    ds = xr.open_mfdataset(era5_dataset_case1.filenames)
    len_dt = len(ds["time"].values)  # count number of images/samples in the test dataset
    da = ds.to_array(dim="variables").squeeze()
    dims = ["time", "lat", "lon"]
    data_arr = np.squeeze(da.values)  # [vars, samples, lat, lon]
    max_vars, min_vars = da.max(dim=dims).values, da.min(dim=dims).values  # three dimensions
    print("data_arr shape", data_arr.shape)

    # normalise the data for the first variable
    def norm_var(x, min_value, max_value):
        return (x - min_value) / (max_value - min_value)

    assert np.max(data_arr[0]) == max_vars[0]
    # manually calculate the normalization of the data
    dt_norm = norm_var(data_arr[0], np.min(data_arr[0]), np.max(data_arr[0]))
    print("dt_norm", dt_norm.shape)
    s1 = dt_norm[0]   # the first sample, first timestamp
    s2 = dt_norm[23]  # the first sample, last timestamp
    s3 = dt_norm[1]   # the second sample, first timestamp
    s4 = dt_norm[24]  # the second sample, last timestamp

    # Get the data from the make_dataset function
    test_dataset = era5_dataset_case1.make_dataset()
    test_iterator = test_dataset.make_one_shot_iterator()
    # The `Iterator.string_handle()` method returns a tensor that can be evaluated
    # and used to feed the `handle` placeholder.
    test_handle = test_iterator.string_handle()
    iterator = tf.data.Iterator.from_string_handle(test_handle, test_dataset.output_types, test_dataset.output_shapes)
    inputs = iterator.get_next()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        # get a batch of samples from the dataset
        dt = sess.run(inputs)  # [batch_size, sequence_len, n_vars, lon, lat]
        assert dt.shape[0] == 4
        assert dt.shape[1] == 24
        print("shape of dt", dt.shape)
        # get the first sample from the dataset
        s1t = dt[0, 0, 0]
        s2t = dt[0, 23, 0]
        # get the second sample from the dataset
        s3t = dt[1, 0, 0]
        s4t = dt[1, 23, 0]
        # s2t = sess.run(inputs)[0,:,0]
        # compare the data from the nc files with the output of make_dataset
        assert np.sum(s1 - s1t) < 0.0001
        assert np.sum(s2 - s2t) < 0.0001
        assert np.sum(s3 - s3t) < 0.0001
        assert np.sum(s4 - s4t) < 0.0001
__email__ = "b.gong@fz-juelich.de"
from video_prediction.datasets.gzprcp_dataset import *
import pytest
import tensorflow as tf
import xarray as xr
input_dir = "/p/largedata/jjsc42/project/deeprain/project_data/10min_AWS_prcp"
datasplit_config = "/p/project/deepacf/deeprain/bing/ambs/video_prediction_tools/data_split/gzprcp/datasplit.json"
hparams_dict_config = "/p/project/deepacf/deeprain/bing/ambs/video_prediction_tools/hparams/gzprcp/convLSTM_gan/model_hparams_template.json"
sequences_per_file = 10
mode = "test"
@pytest.fixture(scope="module")
def gzprcp_dataset_case1():
dataset = GzprcpDataset(input_dir=input_dir, datasplit_config=datasplit_config, hparams_dict_config=hparams_dict_config,
mode="test", seed=1234, nsamples_ref=1000)
dataset.get_hparams()
dataset.get_filenames_from_datasplit()
dataset.load_data_from_nc()
return dataset
def test_init_gzprcp_dataset(gzprcp_dataset_case1):
# gzprcp_dataset_case1.get_hparams()
print('gzprcp_dataset_case1.max_epochs: {}'.format(gzprcp_dataset_case1.max_epochs))
print('gzprcp_dataset_case1.mode: {}'.format(gzprcp_dataset_case1.mode))
print('gzprcp_dataset_case1.batch_size: {}'.format(gzprcp_dataset_case1.batch_size))
print('gzprcp_dataset_case1.k: {}'.format(gzprcp_dataset_case1.k))
print('gzprcp_dataset_case1.filenames: {}'.format(gzprcp_dataset_case1.filenames))
assert gzprcp_dataset_case1.max_epochs == 8
assert gzprcp_dataset_case1.mode == mode
assert gzprcp_dataset_case1.batch_size == 32
assert gzprcp_dataset_case1.k == 0.01
# assert gzprcp_dataset_case1.filenames[0] == 'GZ_prcp_2019.nc'
def test_load_data_from_nc(gzprcp_dataset_case1):
train_tf_dataset = gzprcp_dataset_case1.make_dataset()
train_iterator = train_tf_dataset.make_one_shot_iterator()
# The `Iterator.string_handle()` method returns a tensor that can be evaluated
# and used to feed the `handle` placeholder.
train_handle = train_iterator.string_handle()
iterator = tf.data.Iterator.from_string_handle(train_handle, train_tf_dataset.output_types, train_tf_dataset.output_shapes)
inputs = iterator.get_next()
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
sess.run(tf.local_variables_initializer())
for step in range(2):
sess.run(inputs)
# df = xr.open_mfdataset(era5_dataset_case1.filenames)
# if __name__ == '__main__':
# dataset = ERA5Dataset(input_dir: str = None, datasplit_config: str = None, hparams_dict_config: str = None,
# mode: str = "train", seed: int = None, nsamples_ref: int = None)
# for next_element in dataset.take(2):
# # time_s = time.time()
# # tf.print(next_element.shape)
# pass
@@ -4,11 +4,8 @@ __author__ = "Bing Gong"
__date__ = "2021-03-03"
from data_preprocess.prepare_era5_data import *
import pytest
import numpy as np
import json
import os
year="2007"
@@ -23,8 +20,6 @@ def dataExtraction_case1(year=year,job_name=job_name,src_dir=src_dir,target_dir=
    return ERA5DataExtraction(year, job_name, src_dir, target_dir, varslist_json)

def test_init(dataExtraction_case1):
    assert dataExtraction_case1.job_name == 1
    assert dataExtraction_case1.src_dir == src_dir
#!/bin/bash -x
## Controlling Batch-job
#SBATCH --account=<your_project>
#SBATCH --account=deepacf
#SBATCH --nodes=1
#SBATCH --ntasks=13
##SBATCH --ntasks-per-node=12
#SBATCH --cpus-per-task=1
#SBATCH --output=DataPreprocess_era5_step1-out.%j
#SBATCH --error=DataPreprocess_era5_step1-err.%j
#SBATCH --time=04:20:00
#SBATCH --gres=gpu:0
#SBATCH --output=log_out.%j
#SBATCH --error=log_err.%j
#SBATCH --time=00:10:00
#SBATCH --partition=batch
#SBATCH --mail-type=ALL
#SBATCH --mail-user=me@somewhere.com
######### Template identifier (don't remove) #########
echo "Do not run the template scripts"
exit 99
######### Template identifier (don't remove) #########
ml Stages/2022
ml GCCcore/.11.2.0
ml GCC/11.2.0
ml ParaStationMPI/5.5.0-1
ml Python/3.9.6
ml SciPy-bundle/2021.10
ml xarray/0.20.1
ml netcdf4-python/1.5.7
ml dask/2021.9.1
# Name of virtual environment
VIRT_ENV_NAME="my_venv"
# Activate virtual environment if needed (and possible)
"""
if [ -z ${VIRTUAL_ENV} ]; then
  if [[ -f ../virtual_envs/${VIRT_ENV_NAME}/bin/activate ]]; then
    echo "Activating virtual environment..."
@@ -33,27 +39,25 @@ if [ -z ${VIRTUAL_ENV} ]; then
fi
# Loading modules
source ../env_setup/modules_preprocess+extract.sh
"""
source_dir=/p/scratch/deepacf/inbound_data/weatherbench
destination_dir=/p/project/deepacf/deeprain/video_prediction_shared_folder/weatherbench_test/extracted
data_extraction_dir=/p/project/deepacf/deeprain/grasse/ambs/video_prediction_tools/data_preprocess
variables='[{"name":"temperature","lvl":[850],"interpolation":"p"},{"name":"geopotential","lvl":[500],"interpolation":"p"}]'
years=("2013" "2014" "2015" "2016" "2017")
# select years and variables for dataset and define target domain
years=( "2015" )
variables=( "t2" "t2" "t2" )
sw_corner=( -999.9 -999.9)
nyx=( -999 -999 )
cd ${data_extraction_dir}
# set some paths
# note that destination_dir is adjusted at runtime based on the data
source_dir=/my/path/to/extracted/data/
destination_dir=/my/path/to/pickle/files
# execute Python-scripts
for year in "${years[@]}"; do
echo "start preprocessing data for year ${year}"
srun python ../main_scripts/main_preprocess_data_step1.py \
--source_dir ${source_dir} --destination_dir ${destination_dir} --years "${year}" \
--vars "${variables[0]}" "${variables[1]}" "${variables[2]}" \
--sw_corner "${sw_corner[0]}" "${sw_corner[1]}" --nyx "${nyx[0]}" "${nyx[1]}"
done
# Name of virtual environment
venv_dir=".venv"
python -m venv --system-site-packages ${venv_dir}
. ${venv_dir}/bin/activate
#pip3 install --no-cache-dir pytz
#pip3 install --no-cache-dir python-dateutil
export PYTHONPATH=${data_extraction_dir}:$PYTHONPATH
export PYTHONPATH="${data_extraction_dir}/..":$PYTHONPATH
python3 ../main_scripts/main_data_extraction.py ${source_dir} ${dest_dir} ${years[@]} ${variables}
#srun python ../../workflow_parallel_frame_prediction/DataPreprocess/mpi_split_data_multi_years.py --destination_dir ${destination_dir} --varnames T2 MSL gph500
rm -r ${venv_dir}
@@ -3,13 +3,13 @@
#SBATCH --account=<your_project>
#SBATCH --nodes=1
#SBATCH --ntasks=13
##SBATCH --ntasks-per-node=13
##SBATCH --ntasks-per-node=12
#SBATCH --cpus-per-task=1
#SBATCH --output=data_extraction_era5-out.%j
#SBATCH --error=data_extraction_era5-err.%j
#SBATCH --output=DataExtraction_era5_step1-out.%j
#SBATCH --error=DataExtraction_era5_step1-err.%j
#SBATCH --time=04:20:00
#SBATCH --partition=batch
#SBATCH --gres=gpu:0
#SBATCH --partition=batch
#SBATCH --mail-type=ALL
#SBATCH --mail-user=me@somewhere.com
@@ -22,7 +22,7 @@ exit 99
VIRT_ENV_NAME="my_venv"
# Activate virtual environment if needed (and possible)
if [ -z ${VIRTUAL_ENV} ]; then
if [ -z "${VIRTUAL_ENV}" ]; then
  if [[ -f ../virtual_envs/${VIRT_ENV_NAME}/bin/activate ]]; then
    echo "Activating virtual environment..."
    source ../virtual_envs/${VIRT_ENV_NAME}/bin/activate
@@ -34,16 +34,21 @@ fi
# Loading modules
source ../env_setup/modules_preprocess+extract.sh
# Declare path-variables (dest_dir will be set and configured automatically via generate_runscript.py)
source_dir=/my/path/to/era5
# select years and variables for dataset and define target domain
years=( 2017 )
months=( "all" )
var_dict='{"2t": {"sf": ""}, "tcc": {"sf": ""}, "t": {"ml": "p85000."}}'
sw_corner=(38.4 0.0)
nyx=(56 92)
# set some paths
# note that destination_dir is adjusted at runtime based on the data
source_dir=/my/path/to/era5/data
destination_dir=/my/path/to/extracted/data
varmap_file=/my/path/to/varmapping/file
years=( "2015" )
# execute Python-script
srun python ../main_scripts/main_era5_data_extraction.py -src_dir "${source_dir}" \
-dest_dir "${destination_dir}" -y "${years[@]}" -m "${months[@]}" \
-swc "${sw_corner[@]}" -nyx "${nyx[@]}" -v "${var_dict}"
# Run data extraction
for year in "${years[@]}"; do
echo "Perform ERA5-data extraction for year ${year}"
srun python ../main_scripts/main_data_extraction.py --source_dir ${source_dir} --target_dir ${destination_dir} \
--year ${year} --varslist_path ${varmap_file}
done
#!/bin/bash -x
#SBATCH --account=<your_project>
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --output=train_model_era5-out.%j
#SBATCH --error=train_model_era5-err.%j
#SBATCH --time=24:00:00
#SBATCH --gres=gpu:1
#SBATCH --partition=some_partition
#SBATCH --mail-type=ALL
#SBATCH --mail-user=me@somewhere.com
######### Template identifier (don't remove) #########
echo "Do not run the template scripts"
exit 99
######### Template identifier (don't remove) #########
# auxiliary variables
WORK_DIR="$(pwd)"
BASE_DIR=$(dirname "$WORK_DIR")
# Name of virtual environment
VIRT_ENV_NAME="my_venv"
# !!! ADAPT DEPENDING ON USAGE OF CONTAINER !!!
# For container usage, the following lines must be active (uncommented)
# Name of container image (must be available in working directory)
CONTAINER_IMG="${WORK_DIR}/tensorflow_21.09-tf1-py3.sif"
WRAPPER="${BASE_DIR}/env_setup/wrapper_container.sh"
# sanity checks
if [[ ! -f ${CONTAINER_IMG} ]]; then
  echo "ERROR: Cannot find required TF1.15 container image '${CONTAINER_IMG}'."
  exit 1
fi

if [[ ! -f ${WRAPPER} ]]; then
  echo "ERROR: Cannot find wrapper-script '${WRAPPER}' for TF1.15 container image."
  exit 1
fi
# clean-up modules to avoid conflicts between host and container settings
module purge
# declare directory-variables which will be modified by generate_runscript.py
source_dir=/my/path/to/tfrecords/files
destination_dir=/my/model/output/path
# valid identifiers for model-argument are: convLSTM, savp, mcnet and vae
model=convLSTM
datasplit_dict=${destination_dir}/data_split.json
model_hparams=${destination_dir}/model_hparams.json
# run training in container
export CUDA_VISIBLE_DEVICES=0
## One node, single GPU
srun --mpi=pspmix --cpu-bind=none \
  singularity exec --nv "${CONTAINER_IMG}" "${WRAPPER}" ${VIRT_ENV_NAME} \
  python3 "${BASE_DIR}"/main_scripts/main_train_models.py --input_dir ${source_dir} --datasplit_dict ${datasplit_dict} \
  --dataset weatherbench --model ${model} --model_hparams_dict ${model_hparams} --output_dir ${destination_dir}/
# WITHOUT container usage, uncomment the following lines (and comment out the container lines above)
# Activate virtual environment if needed (and possible)
#if [ -z ${VIRTUAL_ENV} ]; then
# if [[ -f ../virtual_envs/${VIRT_ENV_NAME}/bin/activate ]]; then