Commit 7d2b4c5b authored by Simon Grasse

implement data model for parsing dataset config

parent 6c3f1e47
data_extraction/data_info.py

-from typing import Dict, Any, List, Tuple, Union, Literal
+from typing import List, Tuple, Union, Literal
 import json
 import logging
 from pydantic import BaseModel, validator, root_validator, conint, PositiveInt, conlist
-from utils.dataset_utils import DATASETS, get_dataset_info, get_vars
+from utils.dataset_utils import DATASETS, get_dataset_info, get_vars, INTERPOLATION_UNITS
 logging.basicConfig(level=logging.DEBUG)
@@ -13,7 +13,7 @@ class VariableInfo(BaseModel):
     dataset: str
     name: str
     lvl: List[int]
-    interpolation: Literal["z","p"]
+    interpolation: Union[Literal[*INTERPOLATION_UNITS], None]  # TODO align units with units defined in InterpInfo
     @validator("name")
     def check_variable_name(cls, name, values):
@@ -22,17 +22,35 @@ class VariableInfo(BaseModel):
             raise ValueError(f"no variable '{name}' available for dataset {values['dataset']}")
         return name
@validator("interpolation")
def check_interpolation_availability(cls, interpolation, values):
info = get_dataset_info(values["dataset"])
if not interpolation in info.levels.interpolation_units:
raise ValueError(f"no information on how to interpolate dataset {values['dataset']} in unit {interpolation}")
     @root_validator(skip_on_failure=True)
     def check_lvl_availability(cls, values):
-        variables = get_dataset_info(values["dataset"])["variables"]
-        variables = list(filter(
-            lambda v: v["name"] == values["name"] and set(values["lvl"]).issubset(v["lvl"]),
-            variables
-        ))
-        if not len(variables) > 0:
-            raise ValueError(f"variable {variables[0]['name']} at lvl {values['lvl']} is not available for dataset {values['dataset']}.")
-        return values
+        dataset, name, lvls, interpolation = values.values()
+        info = get_dataset_info(dataset)
+        variable = info.get_var(name)
+
+        if variable.level_type == "sfc":  # TODO mark convention for surface variables
+            if len(lvls) > 0:
+                raise ValueError(f"surface variable {name} does not take explicit lvls")
+        elif variable.level_type == "ml":
+            # lvls missing from the native model levels must be interpolated
+            diff = set(lvls).difference(info.levels.ml)
+            interp_info = info.levels.get_interp(interpolation)
+            out_of_range = [lvl for lvl in diff if not interp_info.start <= lvl <= interp_info.end]
+            if len(out_of_range) > 0:
+                raise ValueError(f"Cannot interpolate {name} for lvls {out_of_range}: out of range")
+        elif variable.level_type == "pl":
+            if not set(lvls).issubset(info.levels.pl):
+                diff = set(lvls).difference(info.levels.pl)
+                raise ValueError(f"variable {name} not available for lvls: {diff}")
+        return values
     def __str__(self):
         return "_".join(f"{self.name}-{l}{self.interpolation}" for l in self.lvl)
@@ -56,7 +74,7 @@ class DomainInfo(BaseModel):
     @validator("years", pre=True)
     def all_years(cls, years, values):
         if years == "all":
-            return get_dataset_info(values["name"])["years"]
+            return get_dataset_info(values["name"]).years
         else:
             return years
@@ -127,3 +145,23 @@ class DomainInfo(BaseModel):
     @property
     def coords_ne(self) -> Tuple[float]:
         return DomainInfo._nxy_to_coords(self.coords_sw, self.nyx, self.resolution)
+    @property
+    def lat_range(self) -> Tuple[float]:
+        return (self.coords_sw[0], self.coords_ne[0])
+
+    @property
+    def lon_range(self) -> Tuple[float]:
+        return (self.coords_sw[1], self.coords_ne[1])
+
+    @property
+    def years_count(self) -> int:
+        return len(self.years)
+
+    @property
+    def months_count(self) -> int:
+        return self.years_count * len(self.months)
+
+    @property
+    def variable_names(self) -> List[str]:
+        return [var.name for var in self.variables]
\ No newline at end of file
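For orientation, a minimal sketch of how the reworked validators are exercised (not part of the commit): the dataset name, variable name, and levels below are made-up examples that assume a matching JSON info file exists under config/datasets/info/.

from pydantic import ValidationError

from data_extraction.data_info import VariableInfo

try:
    # hypothetical config entry: "weatherbench" and "temperature" are assumed values
    var = VariableInfo(
        dataset="weatherbench",
        name="temperature",
        lvl=[500, 850],   # assumed native "pl" levels, so no interpolation is needed
    )
    print(str(var))       # underscore-joined "<name>-<lvl><unit>" tokens
except ValidationError as err:
    # raised when the variable, its levels, or the interpolation unit
    # are not available for the chosen dataset
    print(err)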
@@ -14,7 +14,6 @@ from pydantic import ValidationError
 from data_extraction.weatherbench import ExtractWeatherbench
 from data_extraction.era5 import ExtractERA5
 from data_extraction.data_info import VariableInfo, DomainInfo
-from utils.dataset_utils import DATASETS, get_dataset_info
 logging.basicConfig(level=logging.DEBUG)
utils/dataset_utils.py

@@ -7,6 +7,7 @@ functions providing info about available options
 Provides: * DATASET_META_LOCATION
     * DATASETS
     * get_dataset_info
+    * DatasetInfo
 """
 # import sys
@@ -14,46 +15,114 @@ Provides: * DATASET_META_LOCATION
 import json
 from pathlib import Path
-from typing import Dict, Any, List, Tuple
-from dataclasses import dataclass
+from functools import cache
+from enum import StrEnum, auto
+from typing import Dict, Any, List, Tuple, Literal
+from pydantic import BaseModel, PositiveInt, conint, PositiveFloat, root_validator, ValidationError
 DATASET_META_LOCATION = Path(__file__).parent.parent / "config" / "datasets" / "info"
 DATASETS = [path.stem for path in DATASET_META_LOCATION.iterdir() if path.name.endswith(".json")]
 DATE_TEMPLATE = "{year}-{month:02d}"
+INTERPOLATION_UNITS = ["hpa", "m", "p"]
+
+class VariableInfo(BaseModel):
+    name: str
+    level_type: Literal["sfc", "pl", "ml"]
+
+    def __eq__(self, other):
+        return str(self) == str(other)
+
+    def __hash__(self):
+        return hash(self.name)
+
+    def __str__(self):
+        return self.name
+
+class InterpInfo(BaseModel):
+    unit: Literal[*INTERPOLATION_UNITS]
+    start: PositiveInt
+    end: PositiveInt
+
+    @root_validator
+    def start_lt_end(cls, values):
+        if not values["start"] < values["end"]:
+            raise ValueError(
+                f"Interpolation: unit {values['unit']}: start value must be smaller than end value."
+            )
+        return values
+
+    def __eq__(self, other):
+        return str(self) == str(other)
+
+    def __hash__(self):
+        return hash(self.unit)
+
+    def __str__(self):
+        return self.unit
+
+class LevelInfo(BaseModel):
+    ml: List[PositiveInt]
+    pl: List[PositiveInt]
+    interpolation: List[InterpInfo]
+
+    @property
+    def interpolation_units(self) -> List[str]:
+        # note: @cache cannot wrap a @property; the list is cheap to rebuild
+        return [i_info.unit for i_info in self.interpolation]
+
+    def get_interp(self, unit: Literal[*INTERPOLATION_UNITS]) -> InterpInfo:
+        # InterpInfo compares equal to its unit string, so index() works here
+        return self.interpolation[self.interpolation.index(unit)]
+
+class Resolution(BaseModel):
+    deg: PositiveFloat
+    nx: PositiveInt
+    ny: PositiveInt
+
+class GridInfo(BaseModel):
+    grid_type: Literal["lonlat"]
+    xname: str
+    xunits: Literal["degree"]  # maybe unnecessary?
+    yname: str
+    yunits: Literal["degree"]  # maybe unnecessary?
+    grid_spacing: List[Resolution]
+
+class DatasetInfo(BaseModel):
+    variables: List[VariableInfo]
+    levels: LevelInfo
+    grid: GridInfo
+    years: List[conint(ge=1979)]
+
+    @property
+    def var_names(self) -> List[str]:
+        return [var.name for var in self.variables]
+
+    def get_var(self, name: str) -> VariableInfo:
+        return self.variables[self.var_names.index(name)]
 def get_filename_template(name: str) -> str:
     return f"{name}_{DATE_TEMPLATE}.nc"
-def get_dataset_info(name: str) -> Dict[str,Any]:
+@cache
+def get_dataset_info(name: str) -> DatasetInfo:
     """Extract metainformation about dataset from corresponding JSON file."""
     file = DATASET_META_LOCATION / f"{name}.json"
     try:
         with open(file, "r") as f:
-            return json.load(f) # TODO: input validation => specify schema / pydantic ?
+            return DatasetInfo(**json.load(f))
     except FileNotFoundError as e:
-        raise ValueError(f"Information on dataset '{dataset}' doesnt exist.")
+        raise ValueError(f"Cannot access {name} information: {file} not available")
+    except ValidationError as e:
+        raise ValueError(f"Cannot access {name} information: invalid format of {file}\n{str(e)}")
 def get_vars(name: str) -> List[str]:
     """Extract names of available variables."""
-    info = get_dataset_info(name)
-    return [variable["name"] for variable in info["variables"]]
+    return get_dataset_info(name).var_names
\ No newline at end of file
-var_schema = {
-    "type": "object",
-    "properties": {
-        "name": {"type": "string"},
-        "lvl": {
-            "type": "array",
-            "items": {
-                "type": "integer"
-            },
-            "minItems": 1,
-            "uniqueItems": True,
-        },
-        "interpolation": {"type": "string", "enum": ["p", "z"]}
-    },
-    "required": ["name"],
-    "additionalProperties": False
-}
\ No newline at end of file
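For reference, a hypothetical dataset info file that the new DatasetInfo model would accept, e.g. config/datasets/info/weatherbench.json (not part of the commit; only the field names follow the models above, all concrete values are invented):

{
    "variables": [
        {"name": "t2m", "level_type": "sfc"},
        {"name": "temperature", "level_type": "pl"}
    ],
    "levels": {
        "ml": [1, 10, 50, 90],
        "pl": [300, 500, 850],
        "interpolation": [
            {"unit": "hpa", "start": 100, "end": 1000},
            {"unit": "m", "start": 2, "end": 5000}
        ]
    },
    "grid": {
        "grid_type": "lonlat",
        "xname": "lon",
        "xunits": "degree",
        "yname": "lat",
        "yunits": "degree",
        "grid_spacing": [{"deg": 5.625, "nx": 64, "ny": 32}]
    },
    "years": [1979, 1980, 1981]
}

And a sketch of reading it through the new accessors, under the same assumed file:

from utils.dataset_utils import get_dataset_info, get_vars

info = get_dataset_info("weatherbench")        # parsed and validated once, then served from @cache
print(get_vars("weatherbench"))                # ["t2m", "temperature"]
print(info.get_var("temperature").level_type)  # "pl"
print(info.levels.get_interp("hpa").end)       # 1000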