diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 0000000000000000000000000000000000000000..41cfce4606f16207d02e441b809350f1a4ba6d24 --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1,5 @@ +# use automatic fixing from static linter / formatter (hatch fmt => ruff) +3c05a2029bbc078520ae6f7a22ad713593f6aeb8 + +# bulk formatting using ruff +a246594d04d22d62df1726e678dd2c45be205a5b diff --git a/README.md b/README.md index 25ba0b6317a1ce6c465d73ad9b21b92a2d8ab647..15f12d0c16de2e5446eb7de551ee9f682aa5692b 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,7 @@ The documentation and this README are work in progress. # Requirements +This project requires Python 3.10 or 3.11. TBD, see pyproject.toml @@ -23,7 +24,7 @@ TBD, see pyproject.toml Move to the folder you want to download this project to. We now need to download the source code from the [repository](https://gitlab.jsc.fz-juelich.de/esde/toar-public/toargridding/-/tree/dev?ref_type=heads) either as ZIP file or via git: -## Download with GIT +## 1) Download with GIT Clone the project from its git repository: ```bash git clone https://gitlab.jsc.fz-juelich.de/esde/toar-public/toargridding.git ``` @@ -34,24 +35,33 @@ cd toargridding git checkout testing ``` -## Installing Dependencies and Setting up Virtual Environment +## 2) Installing Dependencies and Setting up Virtual Environment -The handling of required packages is done with [poetry](https://python-poetry.org/). -After installing poetry, you can simply install all required dependencies for this project by running poetry in the project directory: +Set up a virtual environment to avoid conflicts with other projects. You can use your preferred tool or run: ```bash -poetry install +python -m venv .venv +source .venv/bin/activate ``` -This also creates a virtual environment, which ensures that the dependencies of different projects do not interfere. -To run a jupyter notebook in the virtual environment execute +The second command activates the virtual environment. To deactivate it, call +```bash +deactivate +``` +To install all required dependencies, call +```bash +pip install -e . +``` + + +To run scripts or notebooks use: ```bash #for selecting a notebook over the file browser in your webbrowser: -poetry run jupyter notebook +jupyter notebook #or for directly opening a notebook: -poetry run jupyter notebook [/path/to/notebookname.ipynb] +jupyter notebook [/path/to/notebookname.ipynb] ``` and to run a script use ```bash -poetry run python [/path/to/scriptname.py] +python [/path/to/scriptname.py] ``` # How does this tool work? @@ -94,13 +104,13 @@ For VS Code, please ensure to select the kernel of the virtual environment [see] Running the provided examples with the python environment created by poetry can be done by ```bash -poetry run jupyter notebook +jupyter notebook ``` as pointed out previously. ## High level function ```bash -poetry run jupyter notebook example/produce_data_manyStations.ipynb +jupyter notebook example/produce_data_manyStations.ipynb #(please see next notebook for a faster example) ``` This notebook provides an example on how to download data, apply gridding and save the results as [netCDF files](https://docs.xarray.dev/en/stable/user-guide/io.html). @@ -114,7 +124,7 @@ The subsequent requests function as a progress report and allow working with the As the gridding is done offline, it will be executed for already downloaded files, whenever the notebook is rerun.
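For orientation, the high-level call that this notebook wraps looks roughly like the sketch below. `get_gridded_toar_data`, `RegularGrid` and `TimeSample` are the names introduced under `src/toargridding/` in this diff; the `AnalysisService` setup, the grid resolution and the variable/statistic names are illustrative assumptions and may differ from the notebook:
```python
from datetime import datetime

from toargridding.gridding import get_gridded_toar_data
from toargridding.grids import RegularGrid
from toargridding.metadata import TimeSample


def grid_daily_means(analysis_service):
    """Minimal sketch: daily means on a regular global grid for one month."""
    # regular lat/lon grid; the resolutions are example values
    grid = RegularGrid(lat_resolution=1.9, lon_resolution=2.5)
    # the sampling string must be one of the values supported by toargridding (e.g. "daily")
    time = TimeSample(start=datetime(2016, 3, 1), end=datetime(2016, 3, 31), sampling="daily")
    # analysis_service: an AnalysisService instance set up as in the example notebooks
    datasets, metadatas = get_gridded_toar_data(
        analysis_service,
        grid,
        time=time,
        variables=["mole_fraction_of_ozone_in_air"],  # CF standard name or TOAR variable name
        stats=["mean"],
    )
    # datasets holds one xarray.Dataset per variable/statistic combination,
    # metadatas the corresponding request metadata (e.g. for file naming)
    return datasets, metadatas
```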
Please note, that the file name for the gridded data also contains the date of creation. ```bash -poetry run jupyter notebook example/produce_data_withOptional.ipynb +jupyter notebook example/produce_data_withOptional.ipynb ``` This example is based on the previous one but uses additional arguments to reduce the number of stations per request. As an example, different classifications of the stations are used: first the "toar1_category" and second the "type_of_area". Details can be found in [documentation of the FastAPI REST interface](https://toar-data.fz-juelich.de/api/v2/#stationmeta) or the [user guide](https://toar-data.fz-juelich.de/sphinx/TOAR_UG_Vol03_Database/build/latex/toardatabase--userguide.pdf). @@ -123,7 +133,7 @@ The selection of only a limited number of stations leads to significant faster r ## Retrieving data ```bash -poetry run jupyter notebook example/get_sample_data_manual.ipynb +jupyter notebook example/get_sample_data_manual.ipynb ``` Downloads data from the TOAR database with a manual creation of the request to the TOAR database. The extracted data are written to disc. No further processing or gridding is done. @@ -131,13 +141,13 @@ The result is a ZIP-file containing two CSV files. The first one contains the st ## Retrieving data ```bash -poetry run jupyter notebook example/get_sample_data.ipynb +jupyter notebook example/get_sample_data.ipynb ``` As a comparison to the previous example, this one performs the same request by using the interface of this project. ## Retrieving data and visualization ```bash -poetry run jupyter notebook example/quality_controll.ipynb +jupyter notebook example/quality_controll.ipynb ``` Notebook for downloading and visualization of data. The data are downloaded and reused for subsequent executions of this notebook. @@ -148,7 +158,7 @@ The gridding is done on the downloaded data. Gridded data are not saved to disc. ## Duration of Different Requests ```bash -poetry run jupyter notebook tests/benchmark.py +python tests/benchmark.py ``` This script requests datasets with different durations (days to month) from the TOAR Database and saves them to disc. It reports the duration for the different requests. @@ -174,7 +184,7 @@ At the moment time differences larger than one day are working, i.e. start and e This package comes with all required information. There is a first function to fetch an update of the available variables from the TAOR-DB.
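In addition to the command-line call shown below, the update can also be triggered from Python; a minimal sketch using the function added in `src/toargridding/setupFunctions.py` in this diff:
```python
from toargridding.setupFunctions import updateTOARVariables

# fetches the current variable list from the TOAR REST API and
# overwrites the packaged toar_variables.json metadata file
updateTOARVariables()
```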
This will override the original file: ```bash -poetry run python toargridding/setupFunctions.py + python toargridding/setupFunctions.py ``` # Documentation of Source Code: diff --git a/pyproject.toml b/pyproject.toml index 4137448d1088f24370bf540263e9b894cdddbc80..3a1df3c3901f48453b3a8f4bfe15e847cd84f922 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,27 +1,81 @@ -[tool.poetry] +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] name = "toargridding" -version = "0.1.0" +dynamic = ["version"] description = "" -authors = ["Simon Grasse <s.grasse@fz-juelich.de>"] readme = "README.md" +requires-python = ">=3.10,<3.12" +license = "MIT" +keywords = [] +authors = [ + { name = "Simon Grasse", email = "s.grasse@fz-juelich.de" }, + { name = "Carsten Hinz", email = "c.hinz@fz-juelich.de"}, +] +classifiers = [ + "Development Status :: 4 - Beta", + "Programming Language :: Python", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", +] +dependencies = [ + "requests", + "numpy", + "xarray", + "pandas", +] -[tool.poetry.dependencies] -python = "^3.10" -xarray = "^2023.10.1" -requests = "^2.31.0" -aiohttp = "^3.8.6" -sparse = "^0.14.0" +[project.urls] +Issues = "https://gitlab.jsc.fz-juelich.de/esde/toar-public/toargridding/-/issues" +Source = "https://gitlab.jsc.fz-juelich.de/esde/toar-public/toargridding" +[tool.hatch.version] +path = "src/toargridding/__about__.py" -[tool.poetry.group.dev.dependencies] -pytest = "^7.4.3" -pytest-cov = "^4.1.0" -ipykernel = "^6.26.0" -compliance-checker = "^5.1.0" -cartopy = "^0.22.0" -ruff = "^0.1.13" -jupyter = "^1.0.0" +[tool.hatch.envs.default] +installer = "uv" -[build-system] -requires = ["poetry-core"] -build-backend = "poetry.core.masonry.api" +[tool.hatch.envs.hatch-test] +installer = "uv" +extra-dependencies = [ + "compliance-checker", +] + +[tool.hatch.envs.interactive] +installer = "uv" +extra-dependencies = [ + "jupyter", + "ipykernel", +] + +[tool.hatch.envs.hatch-static-analysis] +installer = "uv" + +[tool.hatch.envs.types] +installer = "uv" +extra-dependencies = [ + "mypy>=1.0.0", +] +[tool.hatch.envs.types.scripts] +check = "mypy --install-types --non-interactive {args:src/toargridding tests}" + +[tool.coverage.run] +source_pkgs = ["toargridding", "tests"] +branch = true +parallel = true +omit = [ + "src/toargridding/__about__.py", +] + +[tool.coverage.paths] +toargridding = ["src/toargridding", "*/toargridding/src/toargridding"] +tests = ["tests", "*/toargridding/tests"] + +[tool.coverage.report] +exclude_lines = [ + "no cov", + "if __name__ == .__main__.:", + "if TYPE_CHECKING:", +] \ No newline at end of file diff --git a/ruff_defaults.toml b/ruff_defaults.toml new file mode 100644 index 0000000000000000000000000000000000000000..9b6d90358373e4e481668c0ce16db55f88527f82 --- /dev/null +++ b/ruff_defaults.toml @@ -0,0 +1,557 @@ +line-length = 88 + +[format] +docstring-code-format = true +docstring-code-line-length = 80 + +[lint] +select = [ + "A001", + "A002", + "A003", + "ARG001", + "ARG002", + "ARG003", + "ARG004", + "ARG005", + "ASYNC100", + "ASYNC101", + "ASYNC102", + "B002", + "B003", + "B004", + "B005", + "B006", + "B007", + "B008", + "B009", + "B010", + "B011", + "B012", + "B013", + "B014", + "B015", + "B016", + "B017", + "B018", + "B019", + "B020", + "B021", + "B022", + "B023", + "B024", + "B025", + "B026", + "B028", + "B029", + "B030", + "B031", + "B032", + "B033", + "B034", + "B035", + "B904", + "B905", + "BLE001", + "C400", + "C401", + "C402", + "C403", + "C404", 
+ "C405", + "C406", + "C408", + "C409", + "C410", + "C411", + "C413", + "C414", + "C415", + "C416", + "C417", + "C418", + "C419", + "COM818", + "DTZ001", + "DTZ002", + "DTZ003", + "DTZ004", + "DTZ005", + "DTZ006", + "DTZ007", + "DTZ011", + "DTZ012", + "E101", + "E401", + "E402", + "E701", + "E702", + "E703", + "E711", + "E712", + "E713", + "E714", + "E721", + "E722", + "E731", + "E741", + "E742", + "E743", + "E902", + "E999", + "EM101", + "EM102", + "EM103", + "EXE001", + "EXE002", + "EXE003", + "EXE004", + "EXE005", + "F401", + "F402", + "F403", + "F404", + "F405", + "F406", + "F407", + "F501", + "F502", + "F503", + "F504", + "F505", + "F506", + "F507", + "F508", + "F509", + "F521", + "F522", + "F523", + "F524", + "F525", + "F541", + "F601", + "F602", + "F621", + "F622", + "F631", + "F632", + "F633", + "F634", + "F701", + "F702", + "F704", + "F706", + "F707", + "F722", + "F811", + "F821", + "F822", + "F823", + "F841", + "F842", + "F901", + "FA100", + "FA102", + "FBT001", + "FBT002", + "FLY002", + "G001", + "G002", + "G003", + "G004", + "G010", + "G101", + "G201", + "G202", + "I001", + "I002", + "ICN001", + "ICN002", + "ICN003", + "INP001", + "INT001", + "INT002", + "INT003", + "ISC003", + "LOG001", + "LOG002", + "LOG007", + "LOG009", + "N801", + "N802", + "N803", + "N804", + "N805", + "N806", + "N807", + "N811", + "N812", + "N813", + "N814", + "N815", + "N816", + "N817", + "N818", + "N999", + "PERF101", + "PERF102", + "PERF401", + "PERF402", + "PGH005", + "PIE790", + "PIE794", + "PIE796", + "PIE800", + "PIE804", + "PIE807", + "PIE808", + "PIE810", + "PLC0105", + "PLC0131", + "PLC0132", + "PLC0205", + "PLC0208", + "PLC0414", + "PLC3002", + "PLE0100", + "PLE0101", + "PLE0116", + "PLE0117", + "PLE0118", + "PLE0237", + "PLE0241", + "PLE0302", + "PLE0307", + "PLE0604", + "PLE0605", + "PLE1142", + "PLE1205", + "PLE1206", + "PLE1300", + "PLE1307", + "PLE1310", + "PLE1507", + "PLE1700", + "PLE2502", + "PLE2510", + "PLE2512", + "PLE2513", + "PLE2514", + "PLE2515", + "PLR0124", + "PLR0133", + "PLR0206", + "PLR0402", + "PLR1701", + "PLR1711", + "PLR1714", + "PLR1722", + "PLR2004", + "PLR5501", + "PLW0120", + "PLW0127", + "PLW0129", + "PLW0131", + "PLW0406", + "PLW0602", + "PLW0603", + "PLW0711", + "PLW1508", + "PLW1509", + "PLW1510", + "PLW2901", + "PLW3301", + "PT001", + "PT002", + "PT003", + "PT006", + "PT007", + "PT008", + "PT009", + "PT010", + "PT011", + "PT012", + "PT013", + "PT014", + "PT015", + "PT016", + "PT017", + "PT018", + "PT019", + "PT020", + "PT021", + "PT022", + "PT023", + "PT024", + "PT025", + "PT026", + "PT027", + "PYI001", + "PYI002", + "PYI003", + "PYI004", + "PYI005", + "PYI006", + "PYI007", + "PYI008", + "PYI009", + "PYI010", + "PYI011", + "PYI012", + "PYI013", + "PYI014", + "PYI015", + "PYI016", + "PYI017", + "PYI018", + "PYI019", + "PYI020", + "PYI021", + "PYI024", + "PYI025", + "PYI026", + "PYI029", + "PYI030", + "PYI032", + "PYI033", + "PYI034", + "PYI035", + "PYI036", + "PYI041", + "PYI042", + "PYI043", + "PYI044", + "PYI045", + "PYI046", + "PYI047", + "PYI048", + "PYI049", + "PYI050", + "PYI051", + "PYI052", + "PYI053", + "PYI054", + "PYI055", + "PYI056", + "PYI058", + "RET503", + "RET504", + "RET505", + "RET506", + "RET507", + "RET508", + "RSE102", + "RUF001", + "RUF002", + "RUF003", + "RUF005", + "RUF006", + "RUF007", + "RUF008", + "RUF009", + "RUF010", + "RUF012", + "RUF013", + "RUF015", + "RUF016", + "RUF017", + "RUF018", + "RUF019", + "RUF020", + "RUF100", + "S101", + "S102", + "S103", + "S104", + "S105", + "S106", + "S107", + "S108", + "S110", + "S112", + "S113", + 
"S201", + "S202", + "S301", + "S302", + "S303", + "S304", + "S305", + "S306", + "S307", + "S308", + "S310", + "S311", + "S312", + "S313", + "S314", + "S315", + "S316", + "S317", + "S318", + "S319", + "S320", + "S321", + "S323", + "S324", + "S501", + "S502", + "S503", + "S504", + "S505", + "S506", + "S507", + "S508", + "S509", + "S601", + "S602", + "S604", + "S605", + "S606", + "S607", + "S608", + "S609", + "S611", + "S612", + "S701", + "S702", + "SIM101", + "SIM102", + "SIM103", + "SIM105", + "SIM107", + "SIM108", + "SIM109", + "SIM110", + "SIM112", + "SIM113", + "SIM114", + "SIM115", + "SIM116", + "SIM117", + "SIM118", + "SIM201", + "SIM202", + "SIM208", + "SIM210", + "SIM211", + "SIM212", + "SIM220", + "SIM221", + "SIM222", + "SIM223", + "SIM300", + "SIM910", + "SIM911", + "SLF001", + "SLOT000", + "SLOT001", + "SLOT002", + "T100", + "T201", + "T203", + "TCH001", + "TCH002", + "TCH003", + "TCH004", + "TCH005", + "TCH010", + "TD004", + "TD005", + "TD006", + "TD007", + "TID251", + "TID252", + "TID253", + "TRIO100", + "TRIO105", + "TRIO109", + "TRIO110", + "TRIO115", + "TRY002", + "TRY003", + "TRY004", + "TRY201", + "TRY300", + "TRY301", + "TRY302", + "TRY400", + "TRY401", + "UP001", + "UP003", + "UP004", + "UP005", + "UP006", + "UP007", + "UP008", + "UP009", + "UP010", + "UP011", + "UP012", + "UP013", + "UP014", + "UP015", + "UP017", + "UP018", + "UP019", + "UP020", + "UP021", + "UP022", + "UP023", + "UP024", + "UP025", + "UP026", + "UP027", + "UP028", + "UP029", + "UP030", + "UP031", + "UP032", + "UP033", + "UP034", + "UP035", + "UP036", + "UP037", + "UP038", + "UP039", + "UP040", + "UP041", + "W291", + "W292", + "W293", + "W505", + "W605", + "YTT101", + "YTT102", + "YTT103", + "YTT201", + "YTT202", + "YTT203", + "YTT204", + "YTT301", + "YTT302", + "YTT303", +] + +[lint.per-file-ignores] +"**/scripts/*" = [ + "INP001", + "T201", +] +"**/tests/**/*" = [ + "PLC1901", + "PLR2004", + "PLR6301", + "S", + "TID252", +] + +[lint.flake8-tidy-imports] +ban-relative-imports = "all" + +[lint.isort] +known-first-party = ["toargridding"] + +[lint.flake8-pytest-style] +fixture-parentheses = false +mark-parentheses = false diff --git a/src/toargridding/__about__.py b/src/toargridding/__about__.py new file mode 100644 index 0000000000000000000000000000000000000000..1cf6267ae58cd56ed02bc3d30a68746cf92c485e --- /dev/null +++ b/src/toargridding/__about__.py @@ -0,0 +1 @@ +VERSION = "0.1.0" diff --git a/toargridding/__init__.py b/src/toargridding/__init__.py similarity index 100% rename from toargridding/__init__.py rename to src/toargridding/__init__.py diff --git a/toargridding/defaultLogging.py b/src/toargridding/defaultLogging.py similarity index 75% rename from toargridding/defaultLogging.py rename to src/toargridding/defaultLogging.py index 36ca102c23de13c4556a12677ffba302ed4524f6..4f9a448494d4ab1dff4325ae47bf0870cf282b09 100644 --- a/toargridding/defaultLogging.py +++ b/src/toargridding/defaultLogging.py @@ -1,11 +1,11 @@ -import sys import logging +import sys from collections import namedtuple -from pathlib import Path from logging.handlers import SysLogHandler, TimedRotatingFileHandler +from pathlib import Path +handlerPair = namedtuple("registeredLogger", ["handler", "formatter"]) -handlerPair = namedtuple("registeredLogger", ["handler","formatter"]) class toargridding_defaultLogging: """! class to setup default loggers for toargridding @@ -15,7 +15,7 @@ class toargridding_defaultLogging: ---------- loggername: name of the logger to be used. 
default: toargridding - + Methods: ------- registerHandler: @@ -31,36 +31,40 @@ class toargridding_defaultLogging: getFormatter: get a registered formatter """ - def __init__(self, loggername : str = "toargridding" ): + + def __init__(self, loggername: str = "toargridding"): self.logger = logging.getLogger(loggername) self.logger.setLevel(logging.DEBUG) - self.registeredHandlers = {}# name : registeredLogger - def registerHandler(self, name : str, handler, formatter = None): + self.registeredHandlers = {} # name : registeredLogger + + def registerHandler(self, name: str, handler, formatter=None): """register a handler. Adds it to the logger and stores references in this class. The given formatter (if not None) is added to the handler. The handler is added to the logger. Handler and logger are stored for a possible later access. - + Throws an exception if the name of the handler is already known. """ if name in self.registeredHandlers: - raise ValueError(f"There is already a registered handler with the name {name}.") + msg = f"There is already a registered handler with the name {name}." + raise ValueError(msg) if formatter is not None: handler.setFormatter(formatter) self.logger.addHandler(handler) self.registeredHandlers[name] = handlerPair(handler, formatter) - def getHandler(self, name : str): - """get a handler for logging by its name used for the registration - """ + + def getHandler(self, name: str): + """get a handler for logging by its name used for the registration""" if name in self.registeredHandlers: return self.registeredHandlers[name].handler return None - def getFormatter(self, name : str): - """get a formatter by its name used for the registration - """ + + def getFormatter(self, name: str): + """get a formatter by its name used for the registration""" if name in self.registeredHandlers: return self.registeredHandlers[name].formatter return None + def addShellLogger(self, level=logging.INFO): """!adds a formatted logging to the shell to the "toargridding" logger. The handler is registered as "shell". @@ -72,25 +76,31 @@ class toargridding_defaultLogging: """ shell_handler = logging.StreamHandler() shell_handler.setLevel(level) - shell_formatter = logging.Formatter(fmt="%(asctime)s [%(levelname)s] - %(filename)s:%(lineno)d: '%(message)s'", datefmt="%Y-%m-%d %H:%M:%S") + shell_formatter = logging.Formatter( + fmt="%(asctime)s [%(levelname)s] - %(filename)s:%(lineno)d: '%(message)s'", datefmt="%Y-%m-%d %H:%M:%S" + ) self.registerHandler("shell", shell_handler, shell_formatter) + def addSysLogger(self, level=logging.WARNING): """!adds a formatted logging to the system log of a linux system to the "toargridding" logger. This logging is registered as "syslog". 
- + Parameters: ---------- level: set the verbosity level of the logger (default: warning) """ - syslog_handler = SysLogHandler(facility=SysLogHandler.LOG_DAEMON, address='/dev/log') - syslog_formatter = logging.Formatter(fmt="TOARGRIDDING [%(levelname)s] - %(filename)s:%(lineno)d: '%(message)s'") + syslog_handler = SysLogHandler(facility=SysLogHandler.LOG_DAEMON, address="/dev/log") + syslog_formatter = logging.Formatter( + fmt="TOARGRIDDING [%(levelname)s] - %(filename)s:%(lineno)d: '%(message)s'" + ) syslog_handler.setLevel(level) - self.registerHandler("syslog",syslog_handler, syslog_formatter) - def addRotatingLogFile(self, filename : Path, level=logging.INFO): + self.registerHandler("syslog", syslog_handler, syslog_formatter) + + def addRotatingLogFile(self, filename: Path, level=logging.INFO): """creation of a rotating file handler, that will change the files at midnight. The last 7 files logfiles are stored. - + Parameters: ---------- filename: @@ -101,12 +111,15 @@ class toargridding_defaultLogging: filename.parent.mkdir(parents=True, exist_ok=True) handler = TimedRotatingFileHandler(filename, when="midnight", backupCount=7) handler.setLevel(level) - formatter = logging.Formatter(fmt="%(asctime)s [%(levelname)s] - %(filename)s:%(lineno)d: '%(message)s'", datefmt="%Y-%m-%d %H:%M:%S") + formatter = logging.Formatter( + fmt="%(asctime)s [%(levelname)s] - %(filename)s:%(lineno)d: '%(message)s'", datefmt="%Y-%m-%d %H:%M:%S" + ) self.registerHandler("rotatingFile", handler, formatter) - def addRotatingLogFile_scriptName(self, scriptName : str | Path, level=logging.INFO): + + def addRotatingLogFile_scriptName(self, scriptName: str | Path, level=logging.INFO): """creation of an rotating log file by using the script name. In the /path/to/script a subdirectory log will be created. The logfile will be name [script basename].log - + Parameters: ---------- scriptName: @@ -116,24 +129,25 @@ class toargridding_defaultLogging: """ sn = Path(scriptName) if not sn.is_file: - raise ValueError(f"Expecting name to a script. {sn} is not a file.") + msg = f"Expecting name to a script. {sn} is not a file." 
+ raise ValueError(msg) path = sn.parent / "log" path.mkdir(exist_ok=True) fn = path / f"{sn.stem}.log" self.addRotatingLogFile(fn, level) + def logExceptions(self): """calling this function will redirect all uncaught exceptions to the logger - This is especially useful to write the exceptions to the system log + This is especially useful to write the exceptions to the system log """ sys.excepthook = self.handle_exception - def handle_exception(self, exc_type, exc_value, exc_traceback): - """function for passing uncaught exceptions to the logger - """ + def handle_exception(self, exc_type, exc_value, exc_traceback): + """function for passing uncaught exceptions to the logger""" if issubclass(exc_type, KeyboardInterrupt): sys.__excepthook__(exc_type, exc_value, exc_traceback) return - self.logger.error("Program terminated by the following exception:", exc_info=(exc_type, exc_value, exc_traceback)) - - + self.logger.error( + "Program terminated by the following exception:", exc_info=(exc_type, exc_value, exc_traceback) + ) diff --git a/toargridding/gridding.py b/src/toargridding/gridding.py similarity index 87% rename from toargridding/gridding.py rename to src/toargridding/gridding.py index d82ad03106800633b9e645540e533a2fb0f3a862..fcd5e33647b8f8d4b73e5441bc0fc2b6e79ea0c5 100644 --- a/toargridding/gridding.py +++ b/src/toargridding/gridding.py @@ -1,11 +1,11 @@ -from itertools import product from collections import namedtuple +from itertools import product import xarray as xr -from toargridding.toar_rest_client import AnalysisService from toargridding.grids import GridDefinition from toargridding.metadata import Metadata, TimeSample +from toargridding.toar_rest_client import AnalysisService GriddedResult = namedtuple("GriddedResult", ["dataset", "metadata"]) @@ -16,10 +16,10 @@ def get_gridded_toar_data( time: TimeSample, variables: list[str], stats: list[str], - **kwargs + **kwargs, ) -> tuple[list[xr.Dataset], list[Metadata]]: - """ API to download data as xarrays - + """API to download data as xarrays + The function creates all combinations of the variable and stats list Parameters: @@ -41,7 +41,8 @@ def get_gridded_toar_data( """ metadatas = [ - Metadata.construct(standard_name=var, time=time, stat=stat, moreOptions=kwargs) for var, stat in product(variables, stats) + Metadata.construct(standard_name=var, time=time, stat=stat, moreOptions=kwargs) + for var, stat in product(variables, stats) ] datasets = [] @@ -50,5 +51,5 @@ def get_gridded_toar_data( ds = grid.as_xarray(data) datasets.append(ds) - #TODO: return this as a list of tuples to keep data and metadata together? + # TODO: return this as a list of tuples to keep data and metadata together? 
return datasets, metadatas diff --git a/toargridding/grids.py b/src/toargridding/grids.py similarity index 77% rename from toargridding/grids.py rename to src/toargridding/grids.py index c19323586c4f6d7c7ebd0f4f6419f2a4f51e45bc..ca501eeef770df49207d28a206a1ac1e327e1d66 100644 --- a/toargridding/grids.py +++ b/src/toargridding/grids.py @@ -1,23 +1,21 @@ -import logging -from enum import Enum +import logging from abc import ABC, abstractmethod from collections import namedtuple +from enum import Enum -import xarray as xr -import pandas as pd import numpy as np - -from typing import Dict +import pandas as pd +import xarray as xr from pandas.core.groupby import DataFrameGroupBy from toargridding.metadata import ( - Variables, - Coordinates, - get_global_attributes, AnalysisRequestResult, + Coordinates, Metadata, + Variables, + get_global_attributes, ) -from toargridding.variables import Variable, Coordinate +from toargridding.variables import Coordinate, Variable logger = logging.getLogger(__name__) @@ -25,26 +23,28 @@ GridType = Enum("GridType", ["regular"]) """list of available grids. """ + class GridDefinition(ABC): """factory and base class for definition of different grids usage: GridDefinition.construct( GridType, dict( parameter : value ) ) The dict must contain all parameters required for the creation of the desired GridType """ + cell_index_name = "cell_index" def __init__(self): self.fill_value = np.nan # TO(pd.DataFrame)) -> xr.Dataset: - pass # TODO make CF-compliant + # TODO make CF-compliant @staticmethod def construct(grid_type: GridType, **kwargs): """creation of requested grid type - + usage: GridDefinition.construct( GridType, dict( parameter : value ) ) The dict must contain all parameters required for the creation of the desired GridType """ - match (grid_type): + match grid_type: case GridType.regular: return RegularGrid(**kwargs) case other: @@ -52,10 +52,8 @@ class GridDefinition(ABC): @property @abstractmethod - def description(self)->str: - """description of this grid - """ - pass + def description(self) -> str: + """description of this grid""" @staticmethod def from_netcdf(): @@ -63,29 +61,24 @@ class GridDefinition(ABC): not yet implemented """ - pass @abstractmethod - def as_xarray( - timeseries: dict[str, pd.DataFrame], metadata: pd.DataFrame - ) -> dict[str, xr.Dataset]: - """conversion of panda Dataframes to an xarray dataset + def as_xarray(self, data: AnalysisRequestResult) -> xr.Dataset: + """gridding of a request to the TOAR database This includes the required setup to store the results as netCDF file according to CF (https://cfconventions.org/cf-conventions/cf-conventions.html) """ - pass + @staticmethod def get_id(): - """provide an ID for this grid and its resolution. - """ - pass + """provide an ID for this grid and its resolution.""" class RegularGrid(GridDefinition): """definition of a regular grid with longitude and latitude. 
- + The grid covers the complete globe and is defined by providing resolution for latitude (lat_resolution) and longitude (lon_resolution) - + Argument: -------- lat: @@ -112,12 +105,8 @@ class RegularGrid(GridDefinition): super().__init__() # TODO make sure only sensible resolutions - self.lat = Coordinate.from_resolution( - Coordinates.latitude, lat_resolution, min=-90, max=90, wraps=False - ) - self.lon = Coordinate.from_resolution( - Coordinates.longitude, lon_resolution, min=-180, max=180, wraps=True - ) + self.lat = Coordinate.from_resolution(Coordinates.latitude, lat_resolution, min=-90, max=90, wraps=False) + self.lon = Coordinate.from_resolution(Coordinates.longitude, lon_resolution, min=-180, max=180, wraps=True) spatial_shape = (self.lon.size, self.lat.size) spatial_size = self.lon.size * self.lat.size self.dims = [ @@ -126,15 +115,12 @@ class RegularGrid(GridDefinition): Coordinates.longitude.name, ] - self._as_xy_index = np.dstack( - np.meshgrid(range(self.lat.size), range(self.lon.size)) - ).reshape(-1, 2) + self._as_xy_index = np.dstack(np.meshgrid(range(self.lat.size), range(self.lon.size))).reshape(-1, 2) self._as_i_index = np.arange(spatial_size).reshape(spatial_shape).T @property - def description(self)->str: - """get description of grid - """ + def description(self) -> str: + """get description of grid""" return f"regular global grid with lat/lon resolutions ({self.lat.step}, {self.lon.step})" @@ -149,9 +135,7 @@ class RegularGrid(GridDefinition): results of the request, including data, station coordinates and metadata of request """ - data_grouped_by_cell = self.group_data_by_cell( - data.stations_data, data.stations_coords - ) + data_grouped_by_cell = self.group_data_by_cell(data.stations_data, data.stations_coords) cell_statistics = self.get_cell_statistics(data_grouped_by_cell) dataset = self.create_dataset(cell_statistics, data.metadata) @@ -159,7 +143,7 @@ class RegularGrid(GridDefinition): def group_data_by_cell(self, data: pd.DataFrame, coords: pd.DataFrame) -> DataFrameGroupBy: """grouping of stations into cells - + This function converts the lat/lon coordinates of the stations into cell indices and groups stations belonging to one cell. 
Parameters: @@ -174,14 +158,12 @@ class RegularGrid(GridDefinition): # will convert cell_indices to float as some nans ar present print(data) - data_with_indices = data.join( - cell_indices.to_frame(GridDefinition.cell_index_name), how="outer" - ) + data_with_indices = data.join(cell_indices.to_frame(GridDefinition.cell_index_name), how="outer") print(data) return data_with_indices.groupby(GridDefinition.cell_index_name) - def get_cell_statistics(self, groups : DataFrameGroupBy) -> dict[str, pd.DataFrame]: + def get_cell_statistics(self, groups: DataFrameGroupBy) -> dict[str, pd.DataFrame]: """calculation of mean, std and number of stations per cell Parameters: @@ -189,9 +171,9 @@ class RegularGrid(GridDefinition): groups: time series data grouped by stations in a cell return: - dictionary with calculated quantities + dictionary with calculated quantities """ - + stats = { Variables.mean: groups.mean(), Variables.std: groups.std(), @@ -200,7 +182,7 @@ class RegularGrid(GridDefinition): return stats - def create_dataset(self, cell_statistics : Dict[str, pd.DataFrame], metadata: Metadata) -> xr.Dataset: + def create_dataset(self, cell_statistics: dict[str, pd.DataFrame], metadata: Metadata) -> xr.Dataset: """creation of data set and filling with results from the gridding Parameters: @@ -214,7 +196,7 @@ class RegularGrid(GridDefinition): """ time = Coordinate.from_data( - metadata.time.as_cf_index( cell_statistics[Variables.mean].columns ), + metadata.time.as_cf_index(cell_statistics[Variables.mean].columns), Coordinates.time, metadata, step=metadata.time.sampling, @@ -222,16 +204,20 @@ class RegularGrid(GridDefinition): gridded_ds = self.get_empty_grid(time, metadata) for variable, aggregated_data in cell_statistics.items(): - data_array_dict = self.get_data_array_dict( - time, aggregated_data, variable, metadata - ) + data_array_dict = self.get_data_array_dict(time, aggregated_data, variable, metadata) gridded_ds = gridded_ds.assign(data_array_dict) return gridded_ds - def get_data_array_dict(self, time : Coordinate, aggregated_data : pd.DataFrame, variable : Variables, metadata : Metadata) -> Dict[str, xr.DataArray]: + def get_data_array_dict( + self, + time: Coordinate, + aggregated_data: pd.DataFrame, + variable: Variables, + metadata: Metadata, + ) -> dict[str, xr.DataArray]: """conversion of data to a dict for assigning them to the Dataset - + Parameters: ---------- time: @@ -248,34 +234,31 @@ class RegularGrid(GridDefinition): gridded_variable = Variable.from_data(gridded_data, variable, metadata) return {variable.name: gridded_variable.as_data_array(self.dims)} - def create_gridded_data(self, time : Coordinate, grouped_timeseries : pd.DataFrame)->np.array: + def create_gridded_data(self, time: Coordinate, grouped_timeseries: pd.DataFrame) -> np.array: """converts the available cell data to a full lat/lon-temporal data cube. Parameters: ---------- - time: + time: temporal coordinate grouped_timeseries: data frame with station position and data - return: + return: 3D-array with axis time, latitude and longitude. Fields without data are nan (fill_value defined in GridDefinition init) """ - #CAVE: This function might involve black magic... + # CAVE: This function might involve black magic... values = np.empty((time.size, self.lat.size, self.lon.size)) values[...] 
= self.fill_value index = self._as_xy_index[grouped_timeseries.index.astype(int)] - values[:, index.T[0], index.T[1]] = grouped_timeseries.values.reshape( - -1, time.size - ).T + values[:, index.T[0], index.T[1]] = grouped_timeseries.values.reshape(-1, time.size).T return values - def as_cell_index(self, coords : pd.DataFrame) -> pd.Series: - """converts coordinates of stations into x and y indices of the regular grid - """ - + def as_cell_index(self, coords: pd.DataFrame) -> pd.Series: + """converts coordinates of stations into x and y indices of the regular grid""" + id_x = self.coord_to_index(coords[self.lat.name], self.lat.min, self.lat.step) id_y = self.coord_to_index(coords[self.lon.name], self.lon.min, self.lon.step, len(self.lon.data)) @@ -283,7 +266,7 @@ class RegularGrid(GridDefinition): return pd.Series(id_i, index=id_x.index) - def coord_to_index(self, coord : pd.Series, x0_axis : float, d_axis : float, maxBin4Wrap : int=None) -> np.array: + def coord_to_index(self, coord: pd.Series, x0_axis: float, d_axis: float, maxBin4Wrap: int = None) -> np.array: """converts a coordinate into a bin index on one axis Parameters: @@ -326,7 +309,7 @@ class RegularGrid(GridDefinition): ds = xr.Dataset(coords=coords, attrs=get_global_attributes(metadata)) return ds + def get_id(self): - """provide an ID for this grid and its resolution. - """ + """provide an ID for this grid and its resolution.""" return f"_regular{len(self.lat.data)}x{len(self.lon.data)}" diff --git a/toargridding/metadata.py b/src/toargridding/metadata.py similarity index 83% rename from toargridding/metadata.py rename to src/toargridding/metadata.py index 6997ce10a857b97ed1093fe76778f4984f739aee..6af0394335c0b941e20364ef3ee8b3439c2b4c88 100644 --- a/toargridding/metadata.py +++ b/src/toargridding/metadata.py @@ -1,22 +1,19 @@ -import logging - +import importlib.metadata +import logging +from dataclasses import dataclass, field from datetime import datetime, timedelta from enum import Enum -from dataclasses import dataclass, field import numpy as np import pandas as pd -from toargridding.toarstats_constants import STATISTICS_LIST, ALLOWED_SAMPLING_VALUES -from toargridding.static_metadata import global_cf_attributes, TOARVariable - -from typing import Dict +from toargridding.static_metadata import TOARVariable, global_cf_attributes +from toargridding.toarstats_constants import ALLOWED_SAMPLING_VALUES, STATISTICS_LIST -import importlib.metadata logger = logging.getLogger(__name__) date_created = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ") -#date_created = datetime.now(datetime.UTC).strftime("%Y-%m-dT%H:%M:%SZ") # fix as utcnow will be removed in the future +# date_created = datetime.now(datetime.UTC).strftime("%Y-%m-dT%H:%M:%SZ") # fix as utcnow will be removed in the future COORDINATE_VARIABLES = ["latitude", "longitude", "time"] DATA_VARIABLES = ["mean", "std", "n"] @@ -34,7 +31,7 @@ class TimeSample: """Sampling in time provides conversion into different formats - + Attributes: start: start time point @@ -47,7 +44,7 @@ class TimeSample: start: datetime end: datetime sampling: str - + @property def sampling(self) -> str: """sampling for data request @@ -56,18 +53,18 @@ class TimeSample: Allows only a limited number of supported sampling durations see toargridding.toarstats_constants.ALLOWED_SAMPLING_VALUES """ return self._sampling - + @sampling.setter - def sampling(self, sampling : str): + def sampling(self, sampling: str): if sampling not in ALLOWED_SAMPLING_VALUES: - raise ValueError(f"sampling: {sampling} is not in 
the list of supported samplings for toargridding: {ALLOWED_SAMPLING_VALUES}") + msg = f"sampling: {sampling} is not in the list of supported samplings for toargridding: {ALLOWED_SAMPLING_VALUES}" + raise ValueError(msg) self._sampling = sampling def as_datetime_index(self) -> pd.DatetimeIndex: - """Conversion to array with all sampled time points - """ - #print(self.start) - #print(self.end) + """Conversion to array with all sampled time points""" + # print(self.start) + # print(self.end) return pd.period_range(self.start, self.end, freq=self.frequency).to_timestamp() @property @@ -81,25 +78,24 @@ class TimeSample: @property def frequency(self) -> str: - """Converts sampling argument from TOAR to Pandas - """ + """Converts sampling argument from TOAR to Pandas""" return TIME_FREQUENCIES[self.sampling] - def as_cf_index(self, times : pd.Series = None) -> np.array: + def as_cf_index(self, times: pd.Series = None) -> np.array: """conversion to netCDF Climate and Forecast (CF) Metadata Conventions - + Calculates the duration in days relative to start time point. """ - + if times is None: times = self.as_datetime_index() - return np.array([ (t - self.start).days for t in times]) + return np.array([(t - self.start).days for t in times]) @dataclass class Metadata: - """Metadata of a request. - + """Metadata of a request. + Attributes: ---------- variable: @@ -111,15 +107,16 @@ class Metadata: moreOptions: collection of additional query options for the REST API. """ + variable: TOARVariable time: TimeSample statistic: str - moreOptions : Dict = field(default_factory=lambda: {}) + moreOptions: dict = field(default_factory=dict) @staticmethod - def construct(standard_name: str, time: TimeSample, stat: str, moreOptions : Dict = {}): - """constructor - + def construct(standard_name: str, time: TimeSample, stat: str, moreOptions: dict = {}): + """constructor + Parameters: ---------- standard_name: @@ -134,20 +131,21 @@ class Metadata: variable = TOARVariable.get(standard_name) return Metadata(variable, time, stat, moreOptions) - + @property - def statistic(self) -> str: # TODO make better + def statistic(self) -> str: # TODO make better """statistical property for being extracted from the TOAR database This can for example be the mean, max, min, median. For a full list see toargridding.toarstat_constants.STATISTICS_LIST """ return self._statistic - + @statistic.setter - def statistic(self, stat : str): + def statistic(self, stat: str): if stat not in STATISTICS_LIST: - raise ValueError(f"invalid statistic: {stat}") + msg = f"invalid statistic: {stat}" + raise ValueError(msg) self._statistic = stat def get_id(self) -> str: @@ -156,17 +154,27 @@ class Metadata: For example, used for saving link to results of a request in the cache. 
""" addition = "_".join(f"{key}-{val}" for key, val in sorted(self.moreOptions.items())) - addition = addition.replace("/","%2F") - return "_".join(str(i) for i in [self.variable.name, self.statistic, self.time.daterange_option, self.time.frequency, addition, "at", datetime.now().date().isoformat()]) + addition = addition.replace("/", "%2F") + return "_".join( + str(i) + for i in [ + self.variable.name, + self.statistic, + self.time.daterange_option, + self.time.frequency, + addition, + "at", + datetime.now().date().isoformat(), + ] + ) def get_title(self) -> str: - """creation of a title for metadata of a xarray according to the CF convention - """ + """creation of a title for metadata of a xarray according to the CF convention""" return f"{self.time.sampling} {self.statistic} statistic for {self.variable.standard_name} from {self.time.start} to {self.time.end} aggregated on global grid" def get_summary(self) -> str: """creation of a descriptive string - + At the moment same as title. """ return f"{self.time.sampling} {self.statistic} statistic for {self.variable.standard_name} from {self.time.start} to {self.time.end} aggregated on global grid" @@ -191,9 +199,9 @@ class AnalysisRequestResult: metadata: Metadata -def get_global_attributes(metadata: Metadata) -> Dict: +def get_global_attributes(metadata: Metadata) -> dict: """combination of global metadata with request specific values. - Also adds all additional options passed to the request as meta data. + Also adds all additional options passed to the request as meta data. Throws an exception if moreOptions contains an key already in use by the metadata. """ dynamic_cf_attributes = { @@ -214,22 +222,23 @@ def get_global_attributes(metadata: Metadata) -> Dict: "product_version": f"version of toargridding {importlib.metadata.version( __package__ or __name__ )}", } for key, value in metadata.moreOptions.items(): - if not key in dynamic_cf_attributes: + if key not in dynamic_cf_attributes: dynamic_cf_attributes[key] = value else: - raise ValueError(f"{key} is already has the value {dynamic_cf_attributes[key]}. Prohibited overriding with \"{value}\"!") + msg = f'{key} is already has the value {dynamic_cf_attributes[key]}. Prohibited overriding with "{value}"!' + raise ValueError(msg) cf_attributes = dynamic_cf_attributes | global_cf_attributes return cf_attributes -def get_cf_metadata(variable: Variables, metadata: Metadata | None)-> Dict: +def get_cf_metadata(variable: Variables, metadata: Metadata | None) -> dict: """get CF meta data for coordinates and requested parameter - The resulting meta data follow the CF convention: https://cfconventions.org/cf-conventions/cf-conventions.html. - + The resulting meta data follow the CF convention: https://cfconventions.org/cf-conventions/cf-conventions.html. + The resulting dictionary contains the required values for the netCDF files. 
""" - + match variable.name: case Variables.latitude.name: cf_metadata = { diff --git a/toargridding/setupFunctions.py b/src/toargridding/setupFunctions.py similarity index 81% rename from toargridding/setupFunctions.py rename to src/toargridding/setupFunctions.py index 4a841ff5c4f13acc691e781c6a89cdb832761fb2..3129c8464197c8e786ea90e684c8314d2973c481 100644 --- a/toargridding/setupFunctions.py +++ b/src/toargridding/setupFunctions.py @@ -1,17 +1,19 @@ -import logging +import json +import logging -from toargridding.static_metadata import TOAR_VARIABLES_METADATA_PATH import requests -import json + +from toargridding.static_metadata import TOAR_VARIABLES_METADATA_PATH logger = logging.getLogger(__name__) + def updateTOARVariables(): """Download the most recent list of variables from the TOAR database - This overwrites the current static file. + This overwrites the current static file. """ response = requests.get("https://toar-data.fz-juelich.de/api/v2/variables/?limit=None") - response.raise_for_status()#check, for errors. + response.raise_for_status() # check, for errors. varList = response.json() with open(TOAR_VARIABLES_METADATA_PATH, "w") as f: json.dump(varList, f, indent=2) diff --git a/toargridding/static_metadata.py b/src/toargridding/static_metadata.py similarity index 80% rename from toargridding/static_metadata.py rename to src/toargridding/static_metadata.py index 04ce79d23e00f894739fcd00b2b274c2a977bde8..9ab2717498f2337f5e21ba4c58732dd0285eb417 100644 --- a/toargridding/static_metadata.py +++ b/src/toargridding/static_metadata.py @@ -1,8 +1,7 @@ -import logging - -from pathlib import Path -from dataclasses import dataclass import json +import logging +from dataclasses import dataclass +from pathlib import Path logger = logging.getLogger(__name__) @@ -12,11 +11,11 @@ GLOABAL_CF_ATTRIBUTES_PATH = STATIC_METADATA_PATH / "global_cf_attributes.json" def load_global_cf_attributes(): - """loading of global + """loading of global Loads global metadata like details on the TOAD DB, license, and CF convention. 
""" - with open(GLOABAL_CF_ATTRIBUTES_PATH, "r") as f: + with open(GLOABAL_CF_ATTRIBUTES_PATH) as f: return json.load(f) @@ -30,7 +29,7 @@ class TOARVariable: vars = None """available variables from the TOAR database.""" - + name: str longname: str displayname: str @@ -41,11 +40,11 @@ class TOARVariable: @classmethod def load_toar_variables(cls) -> list["TOARVariable"]: - """load available variables - + """load available variables + Is executed by loading the TOARVariable class """ - with open(TOAR_VARIABLES_METADATA_PATH, "r") as f: + with open(TOAR_VARIABLES_METADATA_PATH) as f: variables = json.load(f) cls.vars = [TOARVariable(**var) for var in variables] @@ -53,7 +52,7 @@ class TOARVariable: @classmethod def get(cls, name: str) -> "TOARVariable": """searches the known variables for the requested variable - + Parameters: ---------- name: @@ -63,19 +62,21 @@ class TOARVariable: return: provides direct access to the meta data of the selected variable """ - #TODO: update README + # TODO: update README try: return cls.get_from_cf_standardname(standard_name=name) - except ValueError as e: + except ValueError: pass try: return cls.get_from_name(name=name) - except ValueError as e: - raise ValueError(f"TOAR Database contains no variable with cf_standardname or name '{name}'") + except ValueError: + msg = f"TOAR Database contains no variable with cf_standardname or name '{name}'" + raise ValueError(msg) + @classmethod def get_from_cf_standardname(cls, standard_name: str) -> "TOARVariable": """searches the known variables for the given cf_standardname - + Parameters: ---------- standard_name: @@ -89,12 +90,13 @@ class TOARVariable: for var in cls.vars: if var.standard_name == standard_name: return var - else: - raise ValueError(f"TOAR Database contains no variable with cf_standardname {standard_name}") + msg = f"TOAR Database contains no variable with cf_standardname {standard_name}" + raise ValueError(msg) + @classmethod def get_from_name(cls, name: str) -> "TOARVariable": """searches the known variables for the given name - + Parameters: ---------- name: @@ -106,27 +108,24 @@ class TOARVariable: if name == "" or name == None: raise ValueError("No name provided for variable.") for var in cls.vars: - if var.name==name: + if var.name == name: return var - else: - raise ValueError(f"TOAR Database contains no variable {name}") + msg = f"TOAR Database contains no variable {name}" + raise ValueError(msg) @property def toar_id(self): - """get toar ID; same as id - """ + """get toar ID; same as id""" return self.id @property def standard_name(self): - """alias to get cf_standardname - """ + """alias to get cf_standardname""" return self.cf_standardname @property def long_name(self): - """alias to get longname - """ + """alias to get longname""" return self.longname diff --git a/toargridding/static_metadata/global_cf_attributes.json b/src/toargridding/static_metadata/global_cf_attributes.json similarity index 100% rename from toargridding/static_metadata/global_cf_attributes.json rename to src/toargridding/static_metadata/global_cf_attributes.json diff --git a/toargridding/static_metadata/toar_variables.json b/src/toargridding/static_metadata/toar_variables.json similarity index 100% rename from toargridding/static_metadata/toar_variables.json rename to src/toargridding/static_metadata/toar_variables.json diff --git a/toargridding/toar_rest_client.py b/src/toargridding/toar_rest_client.py similarity index 75% rename from toargridding/toar_rest_client.py rename to src/toargridding/toar_rest_client.py 
index 65a52fb32bc4542ce135a80e59b5e2d17f15f37d..2b2229438b15a43fcd261db7a436e35e401881f0 100644 --- a/toargridding/toar_rest_client.py +++ b/src/toargridding/toar_rest_client.py @@ -1,20 +1,17 @@ -import logging - -import time -from datetime import datetime import io -from zipfile import ZipFile -from dataclasses import dataclass, asdict, field -from contextlib import contextmanager import json +import logging +import time +from contextlib import contextmanager +from dataclasses import asdict, dataclass, field +from datetime import datetime from pathlib import Path +from zipfile import ZipFile -import requests import pandas as pd +import requests -from typing import Dict - -from toargridding.metadata import Metadata, AnalysisRequestResult, Coordinates +from toargridding.metadata import AnalysisRequestResult, Coordinates, Metadata logger = logging.getLogger(__name__) @@ -28,13 +25,15 @@ class EmptyDataError(ValueError): This might be the case, if there are not statuins, or if the statistical analysis does not yield any data points """ + def __init__(self, message): super().__init__(message) + @dataclass(frozen=True) class QueryOptions: - """Creation of a request to the TOAR database. - + """Creation of a request to the TOAR database. + Attributes: ---------- datarange: @@ -48,13 +47,13 @@ class QueryOptions: min_data_capture: most probably the minimum data to include in the request metadata_schema: - amount of metadata being provided, see Quick Start for TOAR Analysis Service + amount of metadata being provided, see Quick Start for TOAR Analysis Service limit: - limit to amount of extracted data; see Quick Start for TOAR Analysis Service + limit to amount of extracted data; see Quick Start for TOAR Analysis Service format: - output format; see Quick Start for TOAR Analysis Service + output format; see Quick Start for TOAR Analysis Service moreOptions: - dict with additional query options for the request to the TOAR database. + dict with additional query options for the request to the TOAR database. """ daterange: str @@ -65,11 +64,11 @@ class QueryOptions: metadata_scheme: str = "basic" limit: str = "None" format: str = "by_statistic" - moreOptions : Dict = field(default_factory=lambda: {}) #needs to be last element for to dict factory + moreOptions: dict = field(default_factory=dict) # needs to be last element for to dict factory @staticmethod def from_metadata(metadata: Metadata): - """Creation from Metadata object + """Creation from Metadata object Copies datarange, variable_id, statistics and sampling from the metadata object. For the other parameters the default values are used. @@ -79,26 +78,27 @@ class QueryOptions: variable_id=str(metadata.variable.toar_id), statistics=metadata.statistic, sampling=metadata.time.sampling, - moreOptions=metadata.moreOptions + moreOptions=metadata.moreOptions, ) @property def cache_key(self): - """creation to identify the request in the cache of known request. 
- """ - return "".join(f"{key}{val}" for key, val in sorted(asdict(self, dict_factory=quarryToDict).items())) + """creation to identify the request in the cache of known request.""" + return "".join(f"{key}{val}" for key, val in sorted(asdict(self, dict_factory=quarryToDict).items())) -def quarryToDict(data : QueryOptions): - out = { field : value for field, value in data[:-1] } +def quarryToDict(data: QueryOptions): + out = dict(data[:-1]) extraVals = data[-1][1] for field, value in extraVals.items(): - if not field in data: + if field not in data: out[field] = value else: - raise ValueError(f"Providing invalid value for TAOR database: {field} is controlled by Metadata class") + msg = f"Providing invalid value for TAOR database: {field} is controlled by Metadata class" + raise ValueError(msg) return out + class Cache: """cache to store download links for requests to the TOAD database @@ -108,23 +108,25 @@ class Cache: It is created in a dict like way and supports operations like "in". A textfile with the extension "_v2.json" is created in cache_dir. CAVE: The first entry is required for loading the empty file. The cache also saves the creation time of a status endpoint and automatically deletes entries that are to old. The maximum age in days is set via Cache.setMaxDaysInCache(). - + """ - maxDaysInCache = 14 + maxDaysInCache = 14 __timeFormat = "%Y%m%d-%H:%M" - def __init__(self, cache_dir : Path, fn = "status_endpoints" ): + + def __init__(self, cache_dir: Path, fn="status_endpoints"): """constructor - + Throws exception if cache directory does not exists. Parameters ---------- cache_dir: directory for storing cache file. """ - + if not cache_dir.exists(): - raise RuntimeError(f"Given directory for saving cache file does not exists. Path: {cache_dir}") + msg = f"Given directory for saving cache file does not exists. Path: {cache_dir}" + raise RuntimeError(msg) self.cache_file = cache_dir / f"{fn}_v2.json" if not self.cache_file.is_file(): # initialize cache with dummy values @@ -132,32 +134,28 @@ class Cache: json.dump({"foo": ["bar", self.__getTime()]}, cache) def __contains__(self, item: str): - """allows usage of "in" - """ + """allows usage of "in" """ with self.storage_dict() as storage: - return item in storage.keys() + return item in storage def get(self, key: str): - """get an endpoint from the cache. - """ + """get an endpoint from the cache.""" with self.storage_dict() as storage: return storage[key][0] def put(self, key: str, content: str): - """get add key and content as key-value-pair to cache - """ + """get add key and content as key-value-pair to cache""" with self.storage_dict() as storage: storage[key] = [content, self.__getTime()] def remove(self, key: str): - """remove a key and content as key-value-pair to cache - """ + """remove a key and content as key-value-pair to cache""" with self.storage_dict() as storage: del storage[key] @contextmanager def storage_dict(self): - with open(self.cache_file, "r") as cache: # setup + with open(self.cache_file) as cache: # setup storage_dict = json.load(cache) yield storage_dict @@ -166,30 +164,26 @@ class Cache: json.dump(storage_dict, cache, indent=2) def __getTime(self): - """! get now as string for saving in cache - """ + """! 
get now as string for saving in cache""" return datetime.now().strftime(Cache.__timeFormat) - + def clearCache(self): - """!Delete all values from the cache, that are older than the maxDaysInCache value - - """ + """!Delete all values from the cache, that are older than the maxDaysInCache value""" with self.storage_dict() as storage: now = datetime.now() - storage["foo"] = ["bar", self.__getTime()]#preserve first line to ensure, that the file can be read + storage["foo"] = ["bar", self.__getTime()] # preserve first line to ensure, that the file can be read deleteList = [] - for key, [_,creation] in storage.items(): - creationTime = datetime.strptime(creation, Cache.__timeFormat) + for key, [_, creation] in storage.items(): + creationTime = datetime.strptime(creation, Cache.__timeFormat) age = now - creationTime - age = float(age.days) + age.seconds / 24 / 3600. + age = float(age.days) + age.seconds / 24 / 3600.0 if age > Cache.maxDaysInCache: deleteList.append(key) for key in deleteList: del storage[key] - - + @staticmethod - def setMaxDaysInCache(maxDays : float): + def setMaxDaysInCache(maxDays: float): """! set the maximum age of entries in the cache. Also accepts fractions of a day down to seconds. Parameters: @@ -200,7 +194,8 @@ class Cache: if maxDays > 0: Cache.maxDaysInCache = maxDays else: - raise ValueError(f"The maximum age needs to be larger than 0. You provided {maxDays}.") + msg = f"The maximum age needs to be larger than 0. You provided {maxDays}." + raise ValueError(msg) class Connection: @@ -226,10 +221,11 @@ class Connection: self.cache_backup = Cache(cache_dir, "status_endpoints.old") # max wait time is 30min self.wait_seconds = [] + def setRequestTimes(self, interval_min, maxWait_min): """set the intervals and maximum duration to wait, before requests to the analysis service are stopped - The waiting intervals determine how long and often the status endpoint is checked if the results are available. + The waiting intervals determine how long and often the status endpoint is checked if the results are available. As soon as the maximum waiting time is reached, the process is stopped. It can be restarted at any time, as the required endpoint is stored in a cache file Parameters: @@ -239,36 +235,36 @@ class Connection: maxWait_min: maximum duration to wait in minutes. """ - if maxWait_min <=0: - raise RuntimeError(f"The maximum waiting time needs to be larger than 0min.") - elif interval_min <0 or interval_min > maxWait_min: + if maxWait_min <= 0: + msg = "The maximum waiting time needs to be larger than 0min." + raise RuntimeError(msg) + elif interval_min < 0 or interval_min > maxWait_min: self.wait_seconds[0] = maxWait_min else: - self.wait_seconds = [interval_min * 60 for _ in range(interval_min, maxWait_min+1, interval_min)] - + self.wait_seconds = [interval_min * 60 for _ in range(interval_min, maxWait_min + 1, interval_min)] - def get(self, query_options : QueryOptions) -> requests.models.Response: + def get(self, query_options: QueryOptions) -> requests.models.Response: """get results for a request. This is the main function to obtained data from the TOAR DB. It will start requests or lookup if an already started requests is finished. - As soon as a request yields an status code 500 (which most likely indicates, that the worker of the analysis service has crashed), the endpoint is discarded and the request is resubmitted. 
+ As soon as a request yields an status code 500 (which most likely indicates, that the worker of the analysis service has crashed), the endpoint is discarded and the request is resubmitted. Throws an exception, if the results are not available after the waiting time. A restart of the function continues the regular lookup for results. - This function catches possible connection issues and continues to + This function catches possible connection issues and continues to """ status_endpoint = self.get_status_endpoint(query_options) - nRestart=0 + nRestart = 0 waitRestart = 300 for i, wait_time in enumerate(self.wait_seconds): logger.info(f"try: {i+1}, wait_time: {wait_time}") response = self.wait_and_get(status_endpoint, wait_secs=wait_time) - #do error handling i.e. look for connection issues + # do error handling i.e. look for connection issues try: response.raise_for_status() - except requests.exceptions.HTTPError as e: + except requests.exceptions.HTTPError as e: logger.warning(f"\tconnection error ({e.response.status_code}: {e.response.reason}).") self.printExecption(e, response) - #a Status Code 500 seems indicated an aborted request -> restart the request and continue with new status endpoint + # a Status Code 500 seems indicated an aborted request -> restart the request and continue with new status endpoint if e.response.status_code == 500: logger.warning(f"Received internal server error. Restarting request in {waitRestart}s.") time.sleep(waitRestart) @@ -277,14 +273,12 @@ class Connection: else: logger.warning("\tTrying again later.") continue - #are our results ready to obtain? + # are our results ready to obtain? if response.headers["Content-Type"] == "application/zip": logger.info("Results are available for download") return response - else: - raise RuntimeError( - f"No data available after {sum(self.wait_seconds) / 60 + nRestart*waitRestart / 60} minutes. retry later." - ) + msg = f"No data available after {sum(self.wait_seconds) / 60 + nRestart*waitRestart / 60} minutes. retry later." + raise RuntimeError(msg) def get_status_endpoint(self, query_options: QueryOptions) -> str: """get endpoint to results of a request @@ -293,9 +287,9 @@ class Connection: If yes, the know endpoint is returned. If the cache knows the endpoint, but the DB has deleted it, the endpoint is removed from the cache and a new request is started. Otherwise a new new request is started. - + Throws an RuntimeError in case, of a connection error or any other error. In case of an HTTPError, the request is removed from the cache - + Parameters: ---------- Options for the request. @@ -306,38 +300,38 @@ class Connection: try: # test for stale cache response = self.wait_and_get(status_endpoint) response.raise_for_status() - except requests.exceptions.ReadTimeout as e: + except requests.exceptions.ReadTimeout: logger.critical("Caught read timeout.") - raise RuntimeError("Connection to TAORDB timed out (ReadTimeout) while checking cached status point. Please try again later.") + msg = "Connection to TAORDB timed out (ReadTimeout) while checking cached status point. Please try again later." + raise RuntimeError(msg) except requests.exceptions.HTTPError as e: - #TODO add detailed processing: What was the reason for the error? Do we really need to create a new request or is there another problem, that might resolve by simply waiting - logger.warning(f"A HTTP error occurred:") + # TODO add detailed processing: What was the reason for the error? 
Do we really need to create a new request or is there another problem, that might resolve by simply waiting + logger.warning("A HTTP error occurred:") self.printExecption(e, response) logger.debug(f"Status Endpoint: {status_endpoint}") - #use inverse order for saving. the status endpoint should be more unique + # use inverse order for saving. the status endpoint should be more unique self.cache_backup.put(status_endpoint, query_options.cache_key) - #will be overwritten in the next step. + # will be overwritten in the next step. self.cache.remove(query_options.cache_key) logger.warning("Removing status endpoint from cache and submitting new request.") - pass except: - raise RuntimeError(f"An error occurred during accessing a cached request") + msg = "An error occurred during accessing a cached request" + raise RuntimeError(msg) else: logger.info("load status endpoint from cache") return status_endpoint else: logger.info("query not in cache") - status_endpoint = self.query_for_status_endpoint(query_options) - return status_endpoint + return self.query_for_status_endpoint(query_options) def query_for_status_endpoint(self, query_options: QueryOptions) -> str: """create and new request to the TOAR DB. - Adds the status endpoint of the request to the cache. + Adds the status endpoint of the request to the cache. Throws an exception if the TOAR Db returns an error. - + Parameters: ---------- query_options: @@ -352,23 +346,25 @@ class Connection: status_endpoint = response.json()["status"] response.raise_for_status() except requests.exceptions.HTTPError as e: - logger.warning(f"An HTTP error occurred:") + logger.warning("An HTTP error occurred:") self.printExecption(e, response) - raise e + raise except requests.exceptions.ReadTimeout as e: logger.critical("Caught read timeout.") self.printExecption(e, response) - raise RuntimeError("Read timeout while querying for status endpoint") + msg = "Read timeout while querying for status endpoint" + raise RuntimeError(msg) except: - raise RuntimeError(f"Request was not successful. Response by TOAR database: {response.text}") - - #we mage it: let's remember the status endpoint to get our results later:-) + msg = f"Request was not successful. Response by TOAR database: {response.text}" + raise RuntimeError(msg) + + # we mage it: let's remember the status endpoint to get our results later:-) self.cache.put(query_options.cache_key, status_endpoint) return status_endpoint def wait_and_get( - self, endpoint : str, query_options : Dict =None, wait_secs=None, timeout=(3.05, 20) + self, endpoint: str, query_options: dict | None = None, wait_secs=None, timeout=(3.05, 20) ) -> requests.models.Response: """accesses given endpoint @@ -388,9 +384,8 @@ class Connection: return requests.get(endpoint, params=query_options, timeout=timeout) - def printExecption(self, e : requests.exceptions.HTTPError, response : requests.Response): - """!output different infos from an exception and the corresponding response. - """ + def printExecption(self, e: requests.exceptions.HTTPError, response: requests.Response): + """!output different infos from an exception and the corresponding response.""" logger.debug(f"Status Code: {e.response.status_code}") logger.debug(f"Reason: {e.response.reason}") logger.debug(f"Text: {e.response.text}") @@ -422,7 +417,7 @@ class AnalysisService: Handles requesting and loading of data into memory as soon as they are available. In addition the data and coordinates undergo a cleanup. 
- + Parameters: ---------- metadata: @@ -430,15 +425,13 @@ class AnalysisService: return: Requested data and statistics, station coordinates and metadata of the request """ - + timeseries, timeseries_metadata = self.get_timeseries_and_metadata(metadata) coords = self.get_clean_coords(timeseries_metadata) timeseries = self.get_clean_timeseries(timeseries, metadata) return AnalysisRequestResult(timeseries, coords, metadata) - def get_timeseries_and_metadata( - self, metadata: Metadata - ) -> tuple[pd.DataFrame, pd.DataFrame]: + def get_timeseries_and_metadata(self, metadata: Metadata) -> tuple[pd.DataFrame, pd.DataFrame]: """obtain data and metadata from TOAR database return: @@ -450,7 +443,7 @@ class AnalysisService: timeseries, timeseries_metadata = self.load_data(result.content, metadata) return timeseries, timeseries_metadata - def get_clean_coords(self, timeseries_metadata : pd.DataFrame): + def get_clean_coords(self, timeseries_metadata: pd.DataFrame): """remove all stations with invalid coordinates invalid coordinates are NaN, none etc. return: @@ -461,56 +454,54 @@ class AnalysisService: valid_coords = coords.notna().all(axis=1) return coords[valid_coords] - def get_clean_timeseries(self, timeseries : pd.DataFrame, metadata: Metadata): + def get_clean_timeseries(self, timeseries: pd.DataFrame, metadata: Metadata): """replaces all nan in the data with 0 for plotting - + timeseries: extracted time series metadata: metadata belonging ot the timeseries. - + return: timeseries without invalid numbers (none, NaN, etc) """ # TODO maybe use cf-index here already ? ##here we observe some differences in the number of timestamps. # remove data where utc -> sun/local ? time conversion leads to date shift - - #conversion from string to datetime objects for easier access - #TODO maybe check if the conversion is also valid for annual data - timeseries.columns = pd.DatetimeIndex([ datetime.strptime(val, "%Y-%m-%d") for val in timeseries.columns ]) - #now drop columns outside of our requested range: - #this is independent of the other part + # conversion from string to datetime objects for easier access + # TODO maybe check if the conversion is also valid for annual data + timeseries.columns = pd.DatetimeIndex([datetime.strptime(val, "%Y-%m-%d") for val in timeseries.columns]) + + # now drop columns outside of our requested range: + # this is independent of the other part col2Drop = [] for retrievedDate in timeseries.columns: if retrievedDate < metadata.time.start: - col2Drop.append( retrievedDate ) + col2Drop.append(retrievedDate) else: break - for i in range(-1, -len(timeseries.columns) , -1): + for i in range(-1, -len(timeseries.columns), -1): retrievedDate = timeseries.columns[i] if retrievedDate > metadata.time.end: - col2Drop.append( retrievedDate ) + col2Drop.append(retrievedDate) else: break if len(col2Drop): - logger.info(f"Dropping columns ({col2Drop}) from TOAR data to match requested date range [{metadata.time.start}, {metadata.time.end}]") + logger.info( + f"Dropping columns ({col2Drop}) from TOAR data to match requested date range [{metadata.time.start}, {metadata.time.end}]" + ) timeseries.drop(columns=col2Drop, inplace=True) - + all_na = timeseries.isna().all(axis=1) timeseries = timeseries[~all_na] - timeseries = timeseries.fillna(0) - - return timeseries + return timeseries.fillna(0) - def load_data( - self, content: bytes, metadata: Metadata - ) -> tuple[pd.DataFrame, pd.DataFrame]: + def load_data(self, content: bytes, metadata: Metadata) -> tuple[pd.DataFrame, pd.DataFrame]: 
"""convert downloaded byte stream into pandas dataframes - throws an EmptyDataError, if the results file does not contain data. + throws an EmptyDataError, if the results file does not contain data. This is a result if there are not stations contributing to a request or if the restrictions of the analysis exclude all points of a station. Parameters: @@ -522,14 +513,15 @@ class AnalysisService: """ zip_stream = io.BytesIO(content) with ZipFile(zip_stream) as myzip: - if len(myzip.namelist())==1: - raise EmptyDataError("Data file from TOAR analysis service is empty") + if len(myzip.namelist()) == 1: + msg = "Data file from TOAR analysis service is empty" + raise EmptyDataError(msg) timeseries = self.extract_data(myzip, metadata.statistic) timeseries_metadata = self.extract_data(myzip, AnalysisService.METADATA) return timeseries, timeseries_metadata - def extract_data(self, zip_file : ZipFile, data_file : str) -> pd.DataFrame: + def extract_data(self, zip_file: ZipFile, data_file: str) -> pd.DataFrame: """extract a specific csv file from the zip file Parameters: @@ -561,16 +553,15 @@ class AnalysisServiceDownload(AnalysisService): use_downloaded: flag to control if the cache of downloaded requests is checked before extracting data from the TOARDB """ - def __init__( - self, stats_endpoint, cache_dir, sample_dir: Path, use_downloaded=True - ): + + def __init__(self, stats_endpoint, cache_dir, sample_dir: Path, use_downloaded=True): super().__init__(stats_endpoint, cache_dir) self.sample_dir = sample_dir self.use_downloaded = use_downloaded def get_timeseries_and_metadata(self, metadata: Metadata) -> tuple[pd.DataFrame, pd.DataFrame]: """loading of cached data or requesting of data from the TOARDB - + Parameters: ---------- metadata: @@ -606,6 +597,19 @@ class AnalysisServiceDownload(AnalysisService): metadata: metadata for the request. """ - addition = "_".join(f"{key}{val}" for key,val in sorted(metadata.moreOptions.items())) - addition = addition.replace("/","%2F") - return "_".join(str(i) for i in [metadata.statistic, metadata.time.sampling, metadata.variable.cf_standardname, metadata.time.start.date(), metadata.time.end.date(), addition]) + ".zip" + addition = "_".join(f"{key}{val}" for key, val in sorted(metadata.moreOptions.items())) + addition = addition.replace("/", "%2F") + return ( + "_".join( + str(i) + for i in [ + metadata.statistic, + metadata.time.sampling, + metadata.variable.cf_standardname, + metadata.time.start.date(), + metadata.time.end.date(), + addition, + ] + ) + + ".zip" + ) diff --git a/toargridding/metadata_utilities.py b/src/toargridding/toargridding/metadata_utilities.py similarity index 71% rename from toargridding/metadata_utilities.py rename to src/toargridding/toargridding/metadata_utilities.py index 73bfc3eb92db7b001bfed6d1d8cde5f3278f923b..e89c683dc24673afd4fd3548997167c15fb0cdc9 100644 --- a/toargridding/metadata_utilities.py +++ b/src/toargridding/toargridding/metadata_utilities.py @@ -1,11 +1,13 @@ -import logging +import logging +from collections import namedtuple import requests -from collections import namedtuple + logger = logging.getLogger(__name__) ControlVoc = namedtuple("ControlVoc", ["ID", "short", "long"]) + class countryCodes: """! this is a quick and dirty utility class to obtain all valid values of a specific metadata value of the stations It is created with the example of country codes in mind, to split the number of requests to the TOAD database for each specific analysis request. 
@@ -16,39 +18,44 @@ class countryCodes: stationMetaEndpoint: endpoint to access all stations fitting to a specific search pattern. Will be used to validate, that a metadata values provides access to stations. """ - def __init__(self, controlVocEndpoint="https://toar-data.fz-juelich.de/api/v2/controlled_vocabulary/", stationMetaEndpoint="https://toar-data.fz-juelich.de/api/v2/stationmeta/"): + + def __init__( + self, + controlVocEndpoint="https://toar-data.fz-juelich.de/api/v2/controlled_vocabulary/", + stationMetaEndpoint="https://toar-data.fz-juelich.de/api/v2/stationmeta/", + ): self.controlVocEndpoint = controlVocEndpoint if self.controlVocEndpoint[-1] != "/": self.controlVocEndpoint += "/" self.stationMetaEndpoint = stationMetaEndpoint if self.stationMetaEndpoint[-1] != "/": self.stationMetaEndpoint += "/" - def getValidVocabular(self, controlName, varName ): + + def getValidVocabular(self, controlName, varName): """get all valid values for a variable Argument: -------- controlName: name of the parameter in the control vocabulary varName: - name of the parameter in a request to another endpoint. + name of the parameter in a request to another endpoint. return: list with all valid values for the requested varName to be used within request to the TOAR database """ - #get all possible values for the given variable: - response = requests.get( f"{self.controlVocEndpoint}{controlName}") + # get all possible values for the given variable: + response = requests.get(f"{self.controlVocEndpoint}{controlName}") response.raise_for_status() controlVoc = self.convertLists(response) - #check for all valid infos, i.e. is there at least one station for this metadata + # check for all valid infos, i.e. is there at least one station for this metadata validCodes = [] for voc in controlVoc: - params = {varName : voc.short, "limit" : 1} - getNumber = requests.get(self.stationMetaEndpoint, params=params ) + params = {varName: voc.short, "limit": 1} + getNumber = requests.get(self.stationMetaEndpoint, params=params) getNumber.raise_for_status() if len(getNumber.json()) > 0: validCodes.append(voc.short) return validCodes - + def convertLists(self, response) -> list[namedtuple]: - """convert results into an easier to access data type - """ - return [ ControlVoc(id, short, long) for id, short, long in response.json()] + """convert results into an easier to access data type""" + return [ControlVoc(id, short, long) for id, short, long in response.json()] diff --git a/toargridding/toarstats_constants.py b/src/toargridding/toarstats_constants.py similarity index 98% rename from toargridding/toarstats_constants.py rename to src/toargridding/toarstats_constants.py index 9b22c849fda063ebc700c787cb0af53463128efd..3ff9c1c98accceb2e0e155ffa3510afa163a0d45 100644 --- a/toargridding/toarstats_constants.py +++ b/src/toargridding/toarstats_constants.py @@ -1,4 +1,3 @@ -import logging # taken from https://gitlab.jsc.fz-juelich.de/esde/toar-public/toarstats/-/blob/master/toarstats/metrics/constants.py#L12-21 ALLOWED_SAMPLING_VALUES = [ diff --git a/toargridding/variables.py b/src/toargridding/variables.py similarity index 69% rename from toargridding/variables.py rename to src/toargridding/variables.py index 0dc47219579e5bc386b799e5a5c35d22c7da3bb0..c47d448bc5fd7ba82ddf034cca31daf1e476498b 100644 --- a/toargridding/variables.py +++ b/src/toargridding/variables.py @@ -1,24 +1,23 @@ -import logging +import logging from dataclasses import dataclass import numpy as np import xarray as xr -from toargridding.metadata import Variables, 
get_cf_metadata, Metadata - -from typing import Dict +from toargridding.metadata import Metadata, Variables, get_cf_metadata logger = logging.getLogger(__name__) + @dataclass class Variable: - """full variable including data and information according to CF - + """full variable including data and information according to CF + CF: https://cfconventions.org/cf-conventions/cf-conventions.html - + Parameters: ---------- - var: + var: name of the TOAR variable data: array with data @@ -34,16 +33,14 @@ class Variable: encoding: dict[str, str] @classmethod - def from_data(cls, data : np.array, variable: Variables, metadata: Metadata | None, **kwargs): - """construction from analysis results - """ + def from_data(cls, data: np.array, variable: Variables, metadata: Metadata | None, **kwargs): + """construction from analysis results""" cf_metadata = get_cf_metadata(variable, metadata=metadata) - #print(variable.name, cf_metadata) + # print(variable.name, cf_metadata) return cls(variable, data, **cf_metadata, **kwargs) def as_data_array(self, dims) -> xr.DataArray: - """conversion to DataArray - """ + """conversion to DataArray""" da = xr.DataArray( self.data, name=self.name, @@ -55,42 +52,35 @@ class Variable: @property def name(self): - """shortcut to variable name - """ + """shortcut to variable name""" return self.var.name @property def size(self): - """shortcut to length of data array - """ + """shortcut to length of data array""" return self.data.size @property def min(self): - """shortcut to minimum of data array - """ + """shortcut to minimum of data array""" return self.data.min() @property def max(self): - """shortcut to maximum of data array - """ + """shortcut to maximum of data array""" return self.data.max() @dataclass class Coordinate(Variable): - """coordinate axis - """ + """coordinate axis""" step: float | str @classmethod - def from_resolution( - cls, variable: Variables, resolution: float, min: float, max: float, wraps : bool - ): + def from_resolution(cls, variable: Variables, resolution: float, min: float, max: float, wraps: bool): """construction from a data range and resolution - + Creates a coordinate axis between min and amx with a step size close to resolution. Parameters: @@ -108,10 +98,12 @@ class Coordinate(Variable): """ span = max - min - n = int(span / resolution) #TODO: raise error if invalid inputs ? - if n*resolution != span: - logger.warning(f"Resolution {resolution} does not provide an equidistant division of the span [{min},{max}]") - n+=1 + n = int(span / resolution) # TODO: raise error if invalid inputs ? 
+ if n * resolution != span: + logger.warning( + f"Resolution {resolution} does not provide an equidistant division of the span [{min},{max}]" + ) + n += 1 step = span / n logger.warning(f"Adoption resolution {resolution} to {step}") else: @@ -124,12 +116,10 @@ class Coordinate(Variable): return cls.from_data(data, variable, None, step=step) def as_data_array(self): - """conversion to data array - """ + """conversion to data array""" return super().as_data_array(dims=self.name) -def get_encoding(variables: list[Variable]) -> Dict[str,Dict]: - """get dictionary encoding with encoding - """ +def get_encoding(variables: list[Variable]) -> dict[str, dict]: + """get dictionary encoding with encoding""" return {variable.name: variable.encoding for variable in variables} diff --git a/tests/__init__.py b/tests/__init__.py index 024186dd32a67ed010f3748f356f909b697b085d..9caffee885405f534fb5e41440caf988af01ce86 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,4 +1,4 @@ +from toargridding.gridding import get_gridded_toar_data from toargridding.grids import GridDefinition, GridType +from toargridding.metadata import Metadata, TimeSample from toargridding.toar_rest_client import AnalysisService -from toargridding.metadata import TimeSample, Metadata -from toargridding.gridding import get_gridded_toar_data diff --git a/tests/benchmark.py b/tests/benchmark.py index 635859c21b73ca090f9d9ef343eaf2bc61970a5b..6074133751473f003f616cb67ebf8eaa0098a9f5 100644 --- a/tests/benchmark.py +++ b/tests/benchmark.py @@ -1,26 +1,29 @@ from datetime import datetime, timedelta + """ Script for benchmarking the performance of the TOAR Database. -The script relies on the AnalysisServiceDownload for downloading the requested data. +The script relies on the AnalysisServiceDownload for downloading the requested data. For a subsequent request, the already downloaded data are not considered. The script contains two ways to access the databse: - a manual way without downloading the data (not used) - usage of the AnalysisService created for this module -The output of this script are the durations of the requests. Each call is ended with the duration for this interval. +The output of this script are the durations of the requests. Each call is ended with the duration for this interval. Meanwhile, the status outputs of the analysis service are given. 
""" +import contextlib import time -import requests from pathlib import Path +import requests + +from toargridding.metadata import Metadata, TimeSample from toargridding.toar_rest_client import AnalysisServiceDownload -from toargridding.metadata import Metadata, TimeSample, TOARVariable start = datetime(2016, 3, 1) end = datetime(2016, 3, 3) @@ -32,8 +35,7 @@ TEST_ROOT = Path(__file__).parent def get_toar_response(start, end): - """manual request to the TOAR database - """ + """manual request to the TOAR database""" end_with_padding = end + timedelta(1) response = requests.get( @@ -53,51 +55,40 @@ def get_toar_response(start, end): def wait_for_data(response): - """waiting for data of a manual request - """ + """waiting for data of a manual request""" tries = 0 while True: - print(f"n tries: {tries}") if response.headers["Content-Type"] == "application/zip": break else: status_endpoint = response.json()["status"] time.sleep(60) - try: + with contextlib.suppress(ConnectionError): response = requests.get(status_endpoint) - except ConnectionError: - print("temporarly no connection") tries += 1 return tries def time_toar_response(start, end): - """benchmark with manual creation of request to TOAR database - """ - print(f"starting request for {start}-{end}") + """benchmark with manual creation of request to TOAR database""" timer_start = datetime.now() - tries = get_toar_response(start, end) + get_toar_response(start, end) timer_end = datetime.now() - response_time = timer_end - timer_start - - print(f"response time for {start}-{end}: {response_time} ({tries} tries)") + timer_end - timer_start def wait_for_client_response(client, sample): - """waiting for response. get_data throws exception after 30min, if results are not available. - """ - for half_hours in range(20): - try: + """waiting for response. get_data throws exception after 30min, if results are not available.""" + for _half_hours in range(20): + with contextlib.suppress(RuntimeError): client.get_data(sample) - except RuntimeError: - print(f"time out after 30min, try {half_hours+1}") -def time_rest_client_response(start :datetime, end:datetime): +def time_rest_client_response(start: datetime, end: datetime): """benchmark function using the AnalysisService with download of the requested data - + Parameters: ---------- start: @@ -105,12 +96,11 @@ def time_rest_client_response(start :datetime, end:datetime): end: end time point Results: - Prints duration and number of tries, i.e. number of calls of get_data, which times out after 30min. + Prints duration and number of tries, i.e. number of calls of get_data, which times out after 30min. 
""" - print(f"starting request for {start}-{end}") path_cache = TEST_ROOT / "temp_data_cache" - path_data = TEST_ROOT / "temp_data" + path_data = TEST_ROOT / "temp_data" path_cache.mkdir(parents=True, exist_ok=True) path_data.mkdir(parents=True, exist_ok=True) @@ -126,19 +116,16 @@ def time_rest_client_response(start :datetime, end:datetime): timer_start = datetime.now() wait_for_client_response(rest_client, sample) timer_end = datetime.now() - response_time = timer_end - timer_start - - print(f"response time for {start}-{end}: {response_time} (tries)") + timer_end - timer_start if __name__ == "__main__": time_windows = [ - #(datetime(2010, 1, 1), datetime(2010, 1, 2)),#this line seems to cause crashes + # (datetime(2010, 1, 1), datetime(2010, 1, 2)),#this line seems to cause crashes (datetime(2010, 1, 1), datetime(2010, 1, 8)), (datetime(2010, 1, 1), datetime(2010, 1, 31)), (datetime(2010, 1, 1), datetime(2010, 12, 31)), (datetime(2010, 1, 1), datetime(2015, 12, 31)), ] for start, end in time_windows: - print("rest client") time_rest_client_response(start, end) diff --git a/tests/conftest.py b/tests/conftest.py index 4b58914d79be5d71b86d178e98ac46017cca26cb..05ac128ae984e8a0210b611b1a0cdcdd95c35a88 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,11 +1,10 @@ -from itertools import product from datetime import datetime from pathlib import Path import pytest from toargridding.grids import RegularGrid -from toargridding.metadata import TimeSample, Metadata, TOARVariable +from toargridding.metadata import Metadata, TimeSample, TOARVariable from toargridding.toar_rest_client import AnalysisService test_data = list((Path(__file__).parent / "data").iterdir()) diff --git a/tests/contryCode.py b/tests/contryCode.py index ffd1db17306647683f27807d5b19ab7009dfee1e..cd2d9847ff193349308e36bf32a89f73df84f0ac 100644 --- a/tests/contryCode.py +++ b/tests/contryCode.py @@ -1,16 +1,12 @@ +"""test script for obtaining valid country codes.""" -"""test script for obtaining valid country codes. 
-""" -from toargridding.metadata_utilities import countryCodes import requests +from toargridding.metadata_utilities import countryCodes + test = countryCodes() validCodes = test.getValidVocabular(controlName="Country Code", varName="country") -print(validCodes) -print(len(validCodes)) -print("stations per country code") for country in validCodes: - getNumber = requests.get(test.stationMetaEndpoint, params={"country" : country, "limit" : "None"} ) + getNumber = requests.get(test.stationMetaEndpoint, params={"country": country, "limit": "None"}) getNumber.raise_for_status() - print(f"\t{country}: {len(getNumber.json())}") \ No newline at end of file diff --git a/tests/conversionOfTimestamps.py b/tests/conversionOfTimestamps.py index f6a03aaf51f2f95d57bea237f4b759000b402304..a14c4ba226d4ab15f45f88907e20200864191841 100644 --- a/tests/conversionOfTimestamps.py +++ b/tests/conversionOfTimestamps.py @@ -1,46 +1,48 @@ import logging - -from datetime import datetime as dt from collections import namedtuple +from datetime import datetime as dt from pathlib import Path -from toargridding.toar_rest_client import AnalysisServiceDownload, Connection -from toargridding.grids import RegularGrid +##setting up logging +from toargridding.defaultLogging import toargridding_defaultLogging from toargridding.gridding import get_gridded_toar_data +from toargridding.grids import RegularGrid from toargridding.metadata import TimeSample +from toargridding.toar_rest_client import AnalysisServiceDownload -##setting up logging -from toargridding.defaultLogging import toargridding_defaultLogging logger = toargridding_defaultLogging() logger.addShellLogger(logging.DEBUG) logger.logExceptions() -#logger.addRotatingLogFile_scriptName(__file__) -#logger.addSysLogger(logging.DEBUG) +# logger.addRotatingLogFile_scriptName(__file__) +# logger.addSysLogger(logging.DEBUG) -#raise RuntimeError("For testing purposes") +# raise RuntimeError("For testing purposes") -#creation of request. +# creation of request. -Config = namedtuple("Config", ["grid", "time", "variables", "stats","moreOptions"]) +Config = namedtuple("Config", ["grid", "time", "variables", "stats", "moreOptions"]) varName = "country" -grid = RegularGrid( lat_resolution=1.9, lon_resolution=2.5, ) +grid = RegularGrid( + lat_resolution=1.9, + lon_resolution=2.5, +) -configs = dict() -country="AL" +configs = {} +country = "AL" valid_data = Config( grid, - TimeSample( start=dt(2000,1,1), end=dt(2018,12,31), sampling="monthly"),#possibly adopt range:-) - ["mole_fraction_of_ozone_in_air"],#variable name - [ "dma8epa" ], - {varName : country} + TimeSample(start=dt(2000, 1, 1), end=dt(2018, 12, 31), sampling="monthly"), # possibly adopt range:-) + ["mole_fraction_of_ozone_in_air"], # variable name + ["dma8epa"], + {varName: country}, ) configs[f"test_ta{country}"] = valid_data -#CAVE: this cell runs about 45minutes per requested year. therefore we increase the waiting duration to 1h per request. -#the processing is done on the server of the TOAR database. -#a restart of the cell continues the request to the REST API if the requested data are ready for download +# CAVE: this cell runs about 45minutes per requested year. therefore we increase the waiting duration to 1h per request. +# the processing is done on the server of the TOAR database. 
+# a restart of the cell continues the request to the REST API if the requested data are ready for download # The download can also take a few minutes stats_endpoint = "https://toar-data.fz-juelich.de/api/v2/analysis/statistics/" @@ -48,7 +50,9 @@ cache_basepath = Path("cache") result_basepath = Path("results") cache_basepath.mkdir(exist_ok=True) result_basepath.mkdir(exist_ok=True) -analysis_service = AnalysisServiceDownload(stats_endpoint=stats_endpoint, cache_dir=cache_basepath, sample_dir=result_basepath, use_downloaded=True) +analysis_service = AnalysisServiceDownload( + stats_endpoint=stats_endpoint, cache_dir=cache_basepath, sample_dir=result_basepath, use_downloaded=True +) # maybe adopt the interval for requesting the results and the total duration, before the client pauses the requests. # as the requests take about 45min, it is more suitable to wait 60min before timing out the requests than the original 30min. @@ -56,9 +60,7 @@ analysis_service.connection.setRequestTimes(interval_min=5, maxWait_min=60) createdFiles = [] -for person, config in configs.items(): - print(f"\nProcessing {person}:") - print(f"--------------------") +for config in configs.values(): try: datasets, metadatas = get_gridded_toar_data( analysis_service=analysis_service, @@ -66,10 +68,9 @@ for person, config in configs.items(): time=config.time, variables=config.variables, stats=config.stats, - **config.moreOptions + **config.moreOptions, ) - for dset in datasets: - print(dset) - except KeyError as e: - print("failed for ", person) - continue \ No newline at end of file + for _dset in datasets: + pass + except KeyError: + continue diff --git a/tests/coordinates.py b/tests/coordinates.py index 86a3ec6fbb7355da7211c5925d49d4fa96f26f36..f2dfb5054dca76957f3e0d990a4bd5a80b27141c 100644 --- a/tests/coordinates.py +++ b/tests/coordinates.py @@ -1,30 +1,26 @@ #!/bin/python3 -from toargridding.variables import Coordinate -from toargridding.metadata import Coordinates +import matplotlib.pyplot as plt import numpy as np from pandas import Series -from toargridding.grids import RegularGrid -import matplotlib.pyplot as plt -print("Script for visual inspection of coordinate results for longitude and latitude") +from toargridding.grids import RegularGrid +from toargridding.metadata import Coordinates +from toargridding.variables import Coordinate -lon = Coordinate.from_resolution(Coordinates.longitude, 1,-180,180, True) -print(lon) +lon = Coordinate.from_resolution(Coordinates.longitude, 1, -180, 180, True) -print("\n") -lat = Coordinate.from_resolution(Coordinates.latitude, 1.9,-90,90, False) -print(lat) +lat = Coordinate.from_resolution(Coordinates.latitude, 1.9, -90, 90, False) testGrid = RegularGrid(lat_resolution=1.9, lon_resolution=2.5) -testData = [x+0.1 for x in np.arange(-180,181, .5)] -testBins = testGrid.coord_to_index(Series(testData), lon.min, lon.step, len(lon.data) ) +testData = [x + 0.1 for x in np.arange(-180, 181, 0.5)] +testBins = testGrid.coord_to_index(Series(testData), lon.min, lon.step, len(lon.data)) -for val, bin in zip(testData, testBins): - print(val, "->", bin) +for _val, _bin in zip(testData, testBins, strict=False): + pass plt.plot(testData, testBins) -plt.show() \ No newline at end of file +plt.show() diff --git a/tests/test_cache.py b/tests/test_cache.py index 76cc71fd2ea6638f752658a5e40e719450b9884d..11e256c420e9c35065e1d01b604b9413d962c4a8 100644 --- a/tests/test_cache.py +++ b/tests/test_cache.py @@ -1,29 +1,28 @@ -#script to test the cache for the rest client. 
+# script to test the cache for the rest client. -import pytest -from unittest import mock -import datetime -from toargridding.toar_rest_client import Cache from pathlib import Path +from toargridding.toar_rest_client import Cache + + def test_cache(): - myTestCache = Cache( cache_dir=Path("cache"), fn="testingOfCache") - + myTestCache = Cache(cache_dir=Path("cache"), fn="testingOfCache") + myTestCache.put(key="test key 1", content="test content 1") myTestCache.put(key="test key 2", content="test content 1") - - assert "test key 1" in myTestCache - assert "test key 2" in myTestCache + + assert "test key 1" in myTestCache + assert "test key 2" in myTestCache assert myTestCache.get("test key 1") == "test content 1" myTestCache.clearCache() assert "test key 2" in myTestCache - - #edit creation date of key 2: + + # edit creation date of key 2: with myTestCache.storage_dict() as storage: storage["test key 2"] = ["other test content 2", "20240511-09:33"] - - assert "test key 2" in myTestCache + + assert "test key 2" in myTestCache assert myTestCache.get("test key 2") == "other test content 2" myTestCache.clearCache() - assert not "test key 2" in myTestCache + assert "test key 2" not in myTestCache diff --git a/tests/test_gridding.py b/tests/test_gridding.py index b3ea8a7a31fb563eda87b855232677ac67d53399..723ac46f74a0197016fc3498b3e74b47d5282f48 100644 --- a/tests/test_gridding.py +++ b/tests/test_gridding.py @@ -1,10 +1,10 @@ -from toargridding.gridding import get_gridded_toar_data - -from unittest import mock from pathlib import Path +from unittest import mock import xarray as xr -from compliance_checker.runner import ComplianceChecker, CheckSuite +from compliance_checker.runner import CheckSuite, ComplianceChecker + +from toargridding.gridding import get_gridded_toar_data def generate_compliance_report(dataset: xr.Dataset): @@ -37,9 +37,7 @@ def is_cf_compliant(dataset: xr.Dataset): return True -def test_get_gridded_toar_data_cf_compliance( - time, regular_grid, local_analysis_service -): +def test_get_gridded_toar_data_cf_compliance(time, regular_grid, local_analysis_service): datasets, metadatas = get_gridded_toar_data( local_analysis_service, regular_grid, @@ -64,9 +62,4 @@ def test_get_gridded_toar_data(mock_grid, mock_analysis_service, time): mock_analysis_service, mock_grid, time, variables=variables, stats=stats ) - print(datasets) - - print(mock_analysis_service.call_args_list) - print(mock_grid.call_args_list) - - assert False + raise AssertionError diff --git a/tests/test_toar_rest_client.py b/tests/test_toar_rest_client.py index d2eb06162385f11e3a6150c2a4f9fe8649fecb6a..12621e588807368d60b6aea7c49ce05893366456 100644 --- a/tests/test_toar_rest_client.py +++ b/tests/test_toar_rest_client.py @@ -1,8 +1,4 @@ -from unittest import mock - -from pytest import fixture - -from toargridding.toar_rest_client import AnalysisService, QueryOptions +from toargridding.toar_rest_client import QueryOptions def test_query_options_cache_key(metadata_ozone_mean):
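The truncated test above exercises `QueryOptions.cache_key`; together with the `Cache` API changed earlier in this patch (`put`, `get`, `clearCache`, `setMaxDaysInCache`), that key is what lets a resubmitted request find its previously stored status endpoint. Below is a minimal usage sketch of that interplay, not part of the patch itself: the cache file name `example_cache`, the key string, and the endpoint URL are made-up placeholder values.

```python
from pathlib import Path

from toargridding.toar_rest_client import Cache

cache_dir = Path("cache")
cache_dir.mkdir(exist_ok=True)  # the Cache constructor raises if the directory is missing

cache = Cache(cache_dir, fn="example_cache")  # backed by cache/example_cache_v2.json

# hypothetical values: in the client, the key is QueryOptions.cache_key and the
# content is the status endpoint returned by the TOAR analysis service
key = "daterange2010-01-01T00:00,2010-12-31T00:00statisticsmean"
endpoint = "https://example.invalid/status/abc123"

if key not in cache:
    cache.put(key, endpoint)
assert cache.get(key) == endpoint

# entries are dropped by clearCache() once they are older than Cache.maxDaysInCache
# (default 14 days); fractions of a day are accepted, e.g. roughly one hour:
Cache.setMaxDaysInCache(1 / 24)
cache.clearCache()
```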