diff --git a/Jupyter_Notebooks/calc_climatolgical_mean.ipynb b/Jupyter_Notebooks/calc_climatolgical_mean.ipynb
index 76bbe5f944a9b136fddff4fa8c66cd98398196d9..40f25392f7e987c42947a1cb9b946ada99d07aed 100644
--- a/Jupyter_Notebooks/calc_climatolgical_mean.ipynb
+++ b/Jupyter_Notebooks/calc_climatolgical_mean.ipynb
@@ -2,8 +2,8 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 2,
-   "id": "simple-gasoline",
+   "execution_count": 15,
+   "id": "governing-strengthening",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -15,8 +15,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
-   "id": "protected-protest",
+   "execution_count": 16,
+   "id": "colonial-chocolate",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -25,13 +25,14 @@
     "datadir = \"/p/scratch/deepacf/video_prediction_shared_folder/preprocessedData/T2monthly\"\n",
     "\n",
     "datafile = \"1970-1999_t2m.nc\"\n",
     "\n",
     "datafile= os.path.join(datadir, datafile)\n",
-    "\n"
+    "\n",
+    "datafile=\"/p/scratch/deepacf/video_prediction_shared_folder/preprocessedData/T2monthly/t2m_1970_1999.nc\""
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
-   "id": "eleven-arbor",
+   "execution_count": 17,
+   "id": "naval-behalf",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -42,20 +43,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
-   "id": "married-living",
+   "execution_count": 18,
+   "id": "photographic-avenue",
    "metadata": {},
    "outputs": [],
    "source": [
     "ntimes = len(coords[\"time\"])\n",
     "\n",
-    "t2m_all = t2m_all.chunk({\"time\": ntimes, \"lat\":10, \"lon\":10})"
+    "t2m_all = t2m_all.chunk({\"time\": ntimes, \"lat\":100, \"lon\":100})"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
-   "id": "lined-japan",
+   "execution_count": 19,
+   "id": "vocal-cholesterol",
    "metadata": {},
    "outputs": [
     {
@@ -124,7 +125,21 @@
      "To avoid creating the large chunks, set the option\n",
      "    >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):\n",
      "    ...     array[indexer]\n",
-     "  return self.array[key]\n",
+     "  return self.array[key]\n"
+    ]
+   },
+   {
+    "name": "stdout",
+    "output_type": "stream",
+    "text": [
+     "Registering averaging took 1.08\n",
+     "Performing averaging took 1.08\n"
+    ]
+   },
+   {
+    "name": "stderr",
+    "output_type": "stream",
+    "text": [
      "/p/software/hdfml/stages/2020/software/Jupyter/2020.2.6-gcccoremkl-9.3.0-2020.2.254-Python-3.8.5/lib/python3.8/site-packages/xarray/core/indexing.py:1369: PerformanceWarning: Slicing is producing a large chunk. To accept the large\n",
      "chunk and silence this warning, set the option\n",
      "    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):\n",
@@ -171,14 +186,6 @@
      "    ...     array[indexer]\n",
      "  return self.array[key]\n"
     ]
-   },
-   {
-    "name": "stdout",
-    "output_type": "stream",
-    "text": [
-     "Registering averaging took 40.18\n",
-     "Performing averaging took 40.18\n"
-    ]
    }
   ],
   "source": [
@@ -199,42 +206,84 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "id": "passing-benchmark",
+   "execution_count": 20,
+   "id": "intelligent-florida",
    "metadata": {},
    "outputs": [],
    "source": [
     "t2m_hourly = t2m_hourly.compute()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "divided-feelings",
+   "metadata": {},
+   "source": [
+    "This works, but it takes about 3 minutes to process 30 years of data. <br>\n",
+    "However, the same operation is possible with CDO and only takes 36s to finish on Juwels. <br>\n",
+    "The following two shell commands (after loading CDO 1.9.8 and ecCodes 2.18.0) are:\n",
+    "```\n",
+    "clim_files=($(for year in {1970..1999}; do echo \"${year}_t2m.grb\"; done))\n",
+    "cdo -t ecmwf -f nc ensavg ${clim_files[@]} mutilyears_1970-1999.nc\n",
+    "```\n",
+    "In the following, we check the correctness of the data by computing the difference between the data from the CDO-generated file and the data produced above. We choose the mean temperature in January at 12 UTC as an example."
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "id": "medium-constitution",
+   "execution_count": 40,
+   "id": "authorized-ethics",
    "metadata": {},
    "outputs": [],
    "source": [
-    "print(t2m_hourly)"
+    "datafile_cdo = os.path.join(datadir, \"mutilyears_1970-1999.nc\")\n",
+    "\n",
+    "with xr.open_dataset(datafile_cdo) as dfile:\n",
+    "    t2m_hourly_cdo = dfile[\"T2M\"]\n",
+    "    "
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "id": "floating-affiliation",
+   "execution_count": 41,
+   "id": "greater-resistance",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<xarray.DataArray ()>\n",
+      "array(0.00097656, dtype=float32)\n",
+      "Coordinates:\n",
+      "    hour     int64 12\n",
+      "    month    int64 1\n",
+      "    time     datetime64[ns] 1979-01-01T12:00:00\n"
+     ]
+    }
+   ],
    "source": [
-    "from dask.diagnostics import ProgressBar\n",
+    "import numpy as np\n",
+    "test1 = t2m_hourly.sel(month=1, hour=12)\n",
+    "test2 = t2m_hourly_cdo.sel(time=\"1979-01-01 12:00\")\n",
+    "\n",
+    "diff = np.abs(test1-test2)\n",
     "\n",
-    "delayed_obj = t2m_hourly.to_netcdf(os.path.join(datadir, \"climatology_t2m_1970_1999.nc\"), compute=False)\n",
-    "with ProgressBar():\n",
-    "    results = delayed_obj.compute()"
+    "print(np.max(diff))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "western-thriller",
+   "metadata": {},
+   "source": [
+    "Thus, the maximum difference is of $\\mathcal{O}(10^{-3})$ K, which can be neglected for our application."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "administrative-employer",
+   "id": "hydraulic-appearance",
    "metadata": {},
    "outputs": [],
    "source": []
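
For orientation, below is a minimal sketch of how the averaging step could look in xarray. The source of the averaging cell lies outside the hunk context above, so the variable name `t2m`, the local file path, and the nested `groupby` idiom are assumptions rather than the notebook's verbatim code; the diff itself only confirms the chunking call and the final `t2m_hourly.compute()`.

```python
# Sketch of the presumed (month, hour) climatology computation with xarray/dask.
# Assumed: variable name "t2m" and a local copy of the input file; only the
# chunking call and the final compute() appear verbatim in the diff.
import xarray as xr

ds = xr.open_dataset("t2m_1970_1999.nc")  # hypothetical local path
t2m_all = ds["t2m"]

# As in the diff: keep all time steps in one chunk and tile space into
# 100x100 blocks (changed from 10x10).
ntimes = len(t2m_all["time"])
t2m_all = t2m_all.chunk({"time": ntimes, "lat": 100, "lon": 100})

# For each calendar month, average each hour of the day over all years.
# This yields the (month, hour, lat, lon) coordinates seen in the printed
# DataArray above. Everything stays lazy until compute() is called.
t2m_hourly = t2m_all.groupby("time.month").map(
    lambda da: da.groupby("time.hour").mean("time")
)

t2m_hourly = t2m_hourly.compute()  # the step timed at roughly 3 minutes
```

The larger spatial chunks also match the timing change recorded in the outputs: with 100x100 tiles the dask graph contains far fewer tasks than with 10x10 tiles, which is presumably why "Registering averaging" drops from about 40 s to about 1 s.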