Final version of calc_climatolgical_mean.ipynb with check for correctness of data from CDO-command.

78f4953c · Michael Langguth · 019e11b9 · 78f4953c
Commit 78f4953c authored 4 years ago by Michael Langguth
--- a/Jupyter_Notebooks/calc_climatolgical_mean.ipynb
+++ b/Jupyter_Notebooks/calc_climatolgical_mean.ipynb
@@ -2,8 +2,8 @@
 "cells": [
  {
   "cell_type": "code",
-   "execution_count": 2,
-   "id": "simple-gasoline",
+   "execution_count": 15,
+   "id": "governing-strengthening",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -15,8 +15,8 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
-   "id": "protected-protest",
+   "execution_count": 16,
+   "id": "colonial-chocolate",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -25,13 +25,14 @@
    "datafile = \"1970-1999_t2m.nc\"\n",
    "\n",
    "datafile= os.path.join(datadir, datafile)\n",
-    "\n"
+    "\n",
+    "datafile=\"/p/scratch/deepacf/video_prediction_shared_folder/preprocessedData/T2monthly/t2m_1970_1999.nc\""
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
-   "id": "eleven-arbor",
+   "execution_count": 17,
+   "id": "naval-behalf",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -42,20 +43,20 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
-   "id": "married-living",
+   "execution_count": 18,
+   "id": "photographic-avenue",
   "metadata": {},
   "outputs": [],
   "source": [
    "ntimes = len(coords[\"time\"])\n",
    "\n",
-    "t2m_all = t2m_all.chunk({\"time\": ntimes, \"lat\":10, \"lon\":10})"
+    "t2m_all = t2m_all.chunk({\"time\": ntimes, \"lat\":100, \"lon\":100})"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
-   "id": "lined-japan",
+   "execution_count": 19,
+   "id": "vocal-cholesterol",
   "metadata": {},
   "outputs": [
    {
@@ -124,7 +125,21 @@
      "To avoid creating the large chunks, set the option\n",
      "    >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):\n",
      "    ...     array[indexer]\n",
-      "  return self.array[key]\n",
+      "  return self.array[key]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Registering averaging took 1.08\n",
+      "Performing averaging took 1.08\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
      "/p/software/hdfml/stages/2020/software/Jupyter/2020.2.6-gcccoremkl-9.3.0-2020.2.254-Python-3.8.5/lib/python3.8/site-packages/xarray/core/indexing.py:1369: PerformanceWarning: Slicing is producing a large chunk. To accept the large\n",
      "chunk and silence this warning, set the option\n",
      "    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):\n",
@@ -171,14 +186,6 @@
      "    ...     array[indexer]\n",
      "  return self.array[key]\n"
     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Registering averaging took 40.18\n",
-      "Performing averaging took 40.18\n"
-     ]
    }
   ],
   "source": [
@@ -199,42 +206,84 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
-   "id": "passing-benchmark",
+   "execution_count": 20,
+   "id": "intelligent-florida",
   "metadata": {},
   "outputs": [],
   "source": [
    "t2m_hourly = t2m_hourly.compute()"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "id": "divided-feelings",
+   "metadata": {},
+   "source": [
+    "This works, but it takes about 3 minutes to process 30 years of data. <br>\n",
+    "However, the same operation is possible with CDO and only takes 36s to finish on Juwels. <br> \n",
+    "The two following shell commands (after loading CDO 1.9.8 and ecCodes 2.18.0) are:\n",
+    "```\n",
+    "clim_files=($(for year in {1991..2020}; do echo \"${year}_t2m.grb\"; done))\n",
+    "cdo -t ecmwf -f nc ensavg ${clim_files[@]} mutilyears_1991-2020.nc\n",
+    "```\n",
+    "In the following, we check the correctness of the data by computing the difference between the data from a CDO-generated file and the data produced above. We choose the mean temperature in January at 12 UTC as an example."
+   ]
+  },
  {
   "cell_type": "code",
-   "execution_count": null,
-   "id": "medium-constitution",
+   "execution_count": 40,
+   "id": "authorized-ethics",
   "metadata": {},
   "outputs": [],
   "source": [
-    "print(t2m_hourly)"
+    "datafile_cdo = os.path.join(datadir, \"mutilyears_1970-1999.nc\")\n",
+    "\n",
+    "with xr.open_dataset(datafile_cdo) as dfile:\n",
+    "    t2m_hourly_cdo = dfile[\"T2M\"]\n",
+    "   "
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": null,
-   "id": "floating-affiliation",
+   "execution_count": 41,
+   "id": "greater-resistance",
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<xarray.DataArray ()>\n",
+      "array(0.00097656, dtype=float32)\n",
+      "Coordinates:\n",
+      "    hour     int64 12\n",
+      "    month    int64 1\n",
+      "    time     datetime64[ns] 1979-01-01T12:00:00\n"
+     ]
+    }
+   ],
   "source": [
-    "from dask.diagnostics import ProgressBar\n",
+    "import numpy as np\n",
+    "test1 = t2m_hourly.sel(month=1, hour=12)\n",
+    "test2 = t2m_hourly_cdo.sel(time=\"1979-01-01 12:00\")\n",
+    "\n",
+    "diff = np.abs(test1-test2)\n",
    "\n",
-    "delayed_obj = t2m_hourly.to_netcdf(os.path.join(datadir, \"climatology_t2m_1970_1999.nc\"), compute=False)\n",
-    "with ProgressBar():\n",
-    "    results = delayed_obj.compute()"
+    "print(np.max(diff))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "western-thriller",
+   "metadata": {},
+   "source": [
+    "Thus, the maximum difference is of the order $\\mathcal{O}(10^{-3})$, which can be neglected for our application."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "administrative-employer",
+   "id": "hydraulic-appearance",
   "metadata": {},
   "outputs": [],
   "source": []

-%% Cell type:code id:simple-gasoline tags:
+%% Cell type:code id:governing-strengthening tags:

 ``` python
 import os, sys, time
 import xarray as xr
 import pandas as pd
 import datetime as dt
 ```

-%% Cell type:code id:protected-protest tags:
+%% Cell type:code id:colonial-chocolate tags:

 ``` python
 datadir = "/p/scratch/deepacf/video_prediction_shared_folder/preprocessedData/T2monthly"

 datafile = "1970-1999_t2m.nc"

 datafile= os.path.join(datadir, datafile)

+datafile="/p/scratch/deepacf/video_prediction_shared_folder/preprocessedData/T2monthly/t2m_1970_1999.nc"
 ```

-%% Cell type:code id:eleven-arbor tags:
+%% Cell type:code id:naval-behalf tags:

 ``` python
 # Open the dataset at `datafile` and pick the variable "var167" together with its coordinates.
 # NOTE(review): t2m_all is used after this with-block has closed the file; presumably the
 # data is read lazily and stays accessible -- verify for the xarray version in use.
 with xr.open_dataset(datafile) as dfile:
    t2m_all = dfile["var167"]
    coords = t2m_all.coords
 ```

-%% Cell type:code id:married-living tags:
+%% Cell type:code id:photographic-avenue tags:

 ``` python
 ntimes = len(coords["time"])

-t2m_all = t2m_all.chunk({"time": ntimes, "lat":10, "lon":10})
+t2m_all = t2m_all.chunk({"time": ntimes, "lat":100, "lon":100})
 ```

-%% Cell type:code id:lined-japan tags:
+%% Cell type:code id:vocal-cholesterol tags:

 ``` python
 # define a function with the hourly calculation:
 def hour_mean(x):
     """Group `x` by 'time.hour' and return the mean over 'time' of every group."""
     return x.groupby('time.hour').mean('time')

 # Reference time stamp for the two timing messages below.
 time0 = time.time()
 # Apply hour_mean to every 'time.month' group; the result is later indexed by month and hour.
 t2m_hourly = t2m_all.groupby("time.month").apply(hour_mean)

 print("Registering averaging took {0:.2f}".format(time.time()-time0))

 # NOTE(review): the evaluation below is commented out, so nothing happens between the two
 # prints and both report practically the same elapsed time.
 #print(t2m_hourly.values)

 print("Performing averaging took {0:.2f}".format(time.time()-time0))

 ```

 %% Output

    /p/software/hdfml/stages/2020/software/Jupyter/2020.2.6-gcccoremkl-9.3.0-2020.2.254-Python-3.8.5/lib/python3.8/site-packages/xarray/core/indexing.py:1369: PerformanceWarning: Slicing is producing a large chunk. To accept the large
    chunk and silence this warning, set the option
        >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
        ...     array[indexer]
    
    To avoid creating the large chunks, set the option
        >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):
        ...     array[indexer]
      return self.array[key]
    /p/software/hdfml/stages/2020/software/Jupyter/2020.2.6-gcccoremkl-9.3.0-2020.2.254-Python-3.8.5/lib/python3.8/site-packages/xarray/core/indexing.py:1369: PerformanceWarning: Slicing is producing a large chunk. To accept the large
    chunk and silence this warning, set the option
        >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
        ...     array[indexer]
    
    To avoid creating the large chunks, set the option
        >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):
        ...     array[indexer]
      return self.array[key]
    /p/software/hdfml/stages/2020/software/Jupyter/2020.2.6-gcccoremkl-9.3.0-2020.2.254-Python-3.8.5/lib/python3.8/site-packages/xarray/core/indexing.py:1369: PerformanceWarning: Slicing is producing a large chunk. To accept the large
    chunk and silence this warning, set the option
        >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
        ...     array[indexer]
    
    To avoid creating the large chunks, set the option
        >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):
        ...     array[indexer]
      return self.array[key]
    /p/software/hdfml/stages/2020/software/Jupyter/2020.2.6-gcccoremkl-9.3.0-2020.2.254-Python-3.8.5/lib/python3.8/site-packages/xarray/core/indexing.py:1369: PerformanceWarning: Slicing is producing a large chunk. To accept the large
    chunk and silence this warning, set the option
        >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
        ...     array[indexer]
    
    To avoid creating the large chunks, set the option
        >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):
        ...     array[indexer]
      return self.array[key]
    /p/software/hdfml/stages/2020/software/Jupyter/2020.2.6-gcccoremkl-9.3.0-2020.2.254-Python-3.8.5/lib/python3.8/site-packages/xarray/core/indexing.py:1369: PerformanceWarning: Slicing is producing a large chunk. To accept the large
    chunk and silence this warning, set the option
        >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
        ...     array[indexer]
    
    To avoid creating the large chunks, set the option
        >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):
        ...     array[indexer]
      return self.array[key]
    /p/software/hdfml/stages/2020/software/Jupyter/2020.2.6-gcccoremkl-9.3.0-2020.2.254-Python-3.8.5/lib/python3.8/site-packages/xarray/core/indexing.py:1369: PerformanceWarning: Slicing is producing a large chunk. To accept the large
    chunk and silence this warning, set the option
        >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
        ...     array[indexer]
    
    To avoid creating the large chunks, set the option
        >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):
        ...     array[indexer]
      return self.array[key]
    /p/software/hdfml/stages/2020/software/Jupyter/2020.2.6-gcccoremkl-9.3.0-2020.2.254-Python-3.8.5/lib/python3.8/site-packages/xarray/core/indexing.py:1369: PerformanceWarning: Slicing is producing a large chunk. To accept the large
    chunk and silence this warning, set the option
        >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
        ...     array[indexer]
    
    To avoid creating the large chunks, set the option
        >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):
        ...     array[indexer]
      return self.array[key]
+
+    Registering averaging took 1.08
+    Performing averaging took 1.08
+
    /p/software/hdfml/stages/2020/software/Jupyter/2020.2.6-gcccoremkl-9.3.0-2020.2.254-Python-3.8.5/lib/python3.8/site-packages/xarray/core/indexing.py:1369: PerformanceWarning: Slicing is producing a large chunk. To accept the large
    chunk and silence this warning, set the option
        >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
        ...     array[indexer]
    
    To avoid creating the large chunks, set the option
        >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):
        ...     array[indexer]
      return self.array[key]
    /p/software/hdfml/stages/2020/software/Jupyter/2020.2.6-gcccoremkl-9.3.0-2020.2.254-Python-3.8.5/lib/python3.8/site-packages/xarray/core/indexing.py:1369: PerformanceWarning: Slicing is producing a large chunk. To accept the large
    chunk and silence this warning, set the option
        >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
        ...     array[indexer]
    
    To avoid creating the large chunks, set the option
        >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):
        ...     array[indexer]
      return self.array[key]
    /p/software/hdfml/stages/2020/software/Jupyter/2020.2.6-gcccoremkl-9.3.0-2020.2.254-Python-3.8.5/lib/python3.8/site-packages/xarray/core/indexing.py:1369: PerformanceWarning: Slicing is producing a large chunk. To accept the large
    chunk and silence this warning, set the option
        >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
        ...     array[indexer]
    
    To avoid creating the large chunks, set the option
        >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):
        ...     array[indexer]
      return self.array[key]
    /p/software/hdfml/stages/2020/software/Jupyter/2020.2.6-gcccoremkl-9.3.0-2020.2.254-Python-3.8.5/lib/python3.8/site-packages/xarray/core/indexing.py:1369: PerformanceWarning: Slicing is producing a large chunk. To accept the large
    chunk and silence this warning, set the option
        >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
        ...     array[indexer]
    
    To avoid creating the large chunks, set the option
        >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):
        ...     array[indexer]
      return self.array[key]
    /p/software/hdfml/stages/2020/software/Jupyter/2020.2.6-gcccoremkl-9.3.0-2020.2.254-Python-3.8.5/lib/python3.8/site-packages/xarray/core/indexing.py:1369: PerformanceWarning: Slicing is producing a large chunk. To accept the large
    chunk and silence this warning, set the option
        >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
        ...     array[indexer]
    
    To avoid creating the large chunks, set the option
        >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):
        ...     array[indexer]
      return self.array[key]

-    Registering averaging took 40.18
-    Performing averaging took 40.18
-
-%% Cell type:code id:passing-benchmark tags:
+%% Cell type:code id:intelligent-florida tags:

 ``` python
 t2m_hourly = t2m_hourly.compute()
 ```

-%% Cell type:code id:medium-constitution tags:
+%% Cell type:markdown id:divided-feelings tags:
+
+This works, but it takes about 3 minutes to process 30 years of data. <br>
+However, the same operation is possible with CDO and only takes 36s to finish on Juwels. <br>
+The two following shell commands (after loading CDO 1.9.8 and ecCodes 2.18.0) are:
+```
+clim_files=($(for year in {1991..2020}; do echo "${year}_t2m.grb"; done))
+cdo -t ecmwf -f nc ensavg ${clim_files[@]} mutilyears_1991-2020.nc
+```
+In the following, we check the correctness of the data by computing the difference between the data from a CDO-generated file and the data produced above. We choose the mean temperature in January at 12 UTC as an example.
+
+%% Cell type:code id:authorized-ethics tags:

 ``` python
-print(t2m_hourly)
+datafile_cdo = os.path.join(datadir, "mutilyears_1970-1999.nc")
+
+with xr.open_dataset(datafile_cdo) as dfile:
+    t2m_hourly_cdo = dfile["T2M"]
+
 ```

-%% Cell type:code id:floating-affiliation tags:
+%% Cell type:code id:greater-resistance tags:

 ``` python
-from dask.diagnostics import ProgressBar
+import numpy as np
+test1 = t2m_hourly.sel(month=1, hour=12)
+test2 = t2m_hourly_cdo.sel(time="1979-01-01 12:00")
+
+diff = np.abs(test1-test2)

-delayed_obj = t2m_hourly.to_netcdf(os.path.join(datadir, "climatology_t2m_1970_1999.nc"), compute=False)
-with ProgressBar():
-    results = delayed_obj.compute()
+print(np.max(diff))
 ```

-%% Cell type:code id:administrative-employer tags:
+%% Output
+
+    <xarray.DataArray ()>
+    array(0.00097656, dtype=float32)
+    Coordinates:
+        hour     int64 12
+        month    int64 1
+        time     datetime64[ns] 1979-01-01T12:00:00
+
+%% Cell type:markdown id:western-thriller tags:
+
+Thus, the maximum difference is of the order $\mathcal{O}(10^{-3})$, which can be neglected for our application.
+
+%% Cell type:code id:hydraulic-appearance tags:

 ``` python
 ```