"""

Script for benchmarking the performance of the TOAR Database.

The script relies on the AnalysisServiceDownload for downloading the requested data. 
For a subsequent request, the already downloaded data are not considered.

The script contains two ways to access the databse:
- a manual way without downloading the data (not used)
- usage of the AnalysisService created for this module

The output of this script are the durations of the requests. Each call is ended with the duration for this interval. 
Meanwhile, the status outputs of the analysis service are given.
"""


import time
from datetime import datetime, timedelta
from pathlib import Path

import requests

from toargridding.toar_rest_client import AnalysisServiceDownload
from toargridding.metadata import Metadata, TimeSample, TOARVariable

start = datetime(2016, 3, 1)
end = datetime(2016, 3, 3)

SAMPLING = "daily"  # FIXME check monthly !!!
STATISTICS_ENDPOINT = "https://toar-data.fz-juelich.de/api/v2/analysis/statistics/"
STATISTIC = "mean"
TEST_ROOT = Path(__file__).parent


def get_toar_response(start, end):
    """Manual request to the TOAR database without the toargridding client."""
    # pad the end of the requested date range by one day
    end_with_padding = end + timedelta(1)

    response = requests.get(
        STATISTICS_ENDPOINT,
        params={
            "daterange": f"{start.isoformat()},{end_with_padding.isoformat()}",  # 1-year
            "variable_id": 5,
            "statistics": STATISTIC,
            "sampling": SAMPLING,
            "min_data_capture": 0,
            "limit": "None",  # get all timeseries
            "format": "by_statistic",
            "metadata_scheme": "basic",
        },
    )
    return wait_for_data(response)


def wait_for_data(response):
    """Poll the status endpoint of a manual request until the data are ready.

    Returns the number of polling iterations.
    """
    tries = 0
    while True:
        print(f"n tries: {tries}")
        if response.headers["Content-Type"] == "application/zip":
            break
        else:
            status_endpoint = response.json()["status"]
            time.sleep(60)
            try:
                response = requests.get(status_endpoint)
            except requests.exceptions.ConnectionError:
                print("temporarily no connection")
        tries += 1

    return tries


def time_toar_response(start, end):
    """Benchmark with a manually created request to the TOAR database."""
    print(f"starting request for {start}-{end}")

    timer_start = datetime.now()
    tries = get_toar_response(start, end)
    timer_end = datetime.now()
    response_time = timer_end - timer_start

    print(f"response time for {start}-{end}: {response_time} ({tries} tries)")


def wait_for_client_response(client, sample):
    """Wait for the response; get_data raises an exception after 30 min if the results are not yet available.

    Returns the number of calls to get_data needed until the data were retrieved.
    """
    for half_hours in range(20):
        try:
            client.get_data(sample)
            return half_hours + 1
        except RuntimeError:
            print(f"time out after 30min, try {half_hours + 1}")
    return 20


def time_rest_client_response(start: datetime, end: datetime):
    """Benchmark function using the AnalysisService with download of the requested data.

    Parameters:
    ----------
    start:
        start time point
    end:
        end time point
    Results:
        Prints the duration and the number of tries, i.e. the number of calls to get_data, each of which times out after 30 min.
    """
    print(f"starting request for {start}-{end}")

    path_cache = TEST_ROOT / "temp_data_cache"
    path_data  = TEST_ROOT / "temp_data"
    path_cache.mkdir(parents=True, exist_ok=True)
    path_data.mkdir(parents=True, exist_ok=True)

    rest_client = AnalysisServiceDownload(
        STATISTICS_ENDPOINT,
        path_cache,
        path_data,
        use_downloaded=False,
    )
    time_window = TimeSample(start, end, SAMPLING)
    sample = Metadata.construct("mole_fraction_of_ozone_in_air", time_window, STATISTIC)

    timer_start = datetime.now()
    tries = wait_for_client_response(rest_client, sample)
    timer_end = datetime.now()
    response_time = timer_end - timer_start

    print(f"response time for {start}-{end}: {response_time} ({tries} tries)")


if __name__ == "__main__":
    time_windows = [
        # (datetime(2010, 1, 1), datetime(2010, 1, 2)),  # this line seems to cause crashes
        (datetime(2010, 1, 1), datetime(2010, 1, 8)),
        (datetime(2010, 1, 1), datetime(2010, 1, 31)),
        (datetime(2010, 1, 1), datetime(2010, 12, 31)),
        (datetime(2010, 1, 1), datetime(2015, 12, 31)),
    ]
    for start, end in time_windows:
        print("rest client")
        time_rest_client_response(start, end)
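        # Sketch of an optional comparison with the manual request path described in the
        # module docstring (assumption: not part of the original benchmark run; uncomment to use it):
        # print("manual request")
        # time_toar_response(start, end)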