diff --git a/tests/test_input_checks.py b/tests/test_input_checks.py
index eb1691aa5135aa985cd12c494902bd8a5e3a9d96..9cad7f733160a3940a5536895d30ae27ca36ffc3 100644
--- a/tests/test_input_checks.py
+++ b/tests/test_input_checks.py
@@ -4,12 +4,25 @@
 import numpy as np
 import pandas as pd
 import pytest
-from toarstats.input_checks import (
-    check_crops, check_data, check_data_capture, check_index,
-    check_input_parameters, check_metadata, check_required_parameters,
-    check_sampling, check_seasons, check_station_climatic_zone,
-    check_station_latitude, check_station_longitude, check_statistics,
-    check_values, from_pandas, is_correct_type, is_empty, is_in_range,
+from toarstats.metrics.input_checks import (
+    check_crops,
+    check_data,
+    check_data_capture,
+    check_index,
+    check_input_parameters,
+    check_metadata,
+    check_required_parameters,
+    check_sampling,
+    check_seasons,
+    check_station_climatic_zone,
+    check_station_latitude,
+    check_station_longitude,
+    check_statistics,
+    check_values,
+    from_pandas,
+    is_correct_type,
+    is_empty,
+    is_in_range,
     to_collection
 )
 
@@ -260,16 +273,14 @@ class TestCheckData:
             start="2000", periods=3, freq="H", tz="Europe/Berlin"
         )
         values = np.array([5., 6.1, 7])
-        expected = pd.DataFrame(
-            {"values": values}, index=index.tz_localize(None)
-        )
-        pd.testing.assert_frame_equal(
+        expected = pd.Series(values, index=index.tz_localize(None))
+        pd.testing.assert_series_equal(
             check_data(pd.DataFrame(values, index=index), None, None), expected
         )
-        pd.testing.assert_frame_equal(
+        pd.testing.assert_series_equal(
             check_data(pd.Series(values, index=index), None, None), expected
         )
-        pd.testing.assert_frame_equal(
+        pd.testing.assert_series_equal(
             check_data(None, index, values), expected
         )
 
@@ -593,9 +604,8 @@ class TestCheckInputParameters:
         assert error_msg == str(excinfo.value)
 
     def test_check_input_parameters_correct_input(self):
-        data = pd.DataFrame(
-            {"values": range(10)},
-            index=pd.date_range(start="2000", periods=10, freq="H")
+        data = pd.Series(
+            range(10), index=pd.date_range(start="2000", periods=10, freq="H")
         )
         expected_required = namedtuple(
             "Required",
@@ -604,7 +614,7 @@ class TestCheckInputParameters:
         result = check_input_parameters("annual", "median", data, *[None]*9)
         assert result.sampling == "annual"
         assert result.statistics == ["median"]
-        pd.testing.assert_frame_equal(result.data, data)
+        pd.testing.assert_series_equal(result.data, data)
         assert result.metadata is None
         assert result.seasons is None
         assert result.crops is None
diff --git a/tests/test_interface.py b/tests/test_interface.py
index d476de3d0246ae820e959c799e6efcd41fd58657..45c1d1ac920e89ed70e209d1ce9bbdff003b50db 100644
--- a/tests/test_interface.py
+++ b/tests/test_interface.py
@@ -1,7 +1,7 @@
 import numpy as np
 import pandas as pd
 
-from toarstats.interface import calculate_statistics
+from toarstats.metrics.interface import calculate_statistics
 
 
 class TestCalculateStatistics:
diff --git a/tests/test_stats.py b/tests/test_stats.py
index 90caa08e1543205bdaa25b6a7dd18b2494a0625e..a1d169d20425a94dfbbb516877d81633ba40b35c 100644
--- a/tests/test_stats.py
+++ b/tests/test_stats.py
@@ -2,16 +2,20 @@
 import numpy as np
 import pandas as pd
 import pytest
-from toarstats.constants import ALLOWED_SAMPLING_VALUES, RSTAGS, SEASON_DICT
-from toarstats.defaults import DEFAULT_DATA_CAPTURE
-from toarstats.interface import calculate_statistics
-from toarstats.stats_utils import create_reference_data_frame
+from toarstats.metrics.constants import (
+    ALLOWED_SAMPLING_VALUES,
+    RSTAGS,
+    SEASON_DICT
+)
+from toarstats.metrics.defaults import DEFAULT_DATA_CAPTURE
+from toarstats.metrics.interface import calculate_statistics
+from toarstats.metrics.stats_utils import create_reference_series
 
 data = pd.read_csv(
     "tests/time_series.csv", header=None, names=[None, "values"],
     index_col=0, parse_dates=True, infer_datetime_format=True
 )
-ref_data = create_reference_data_frame(data.index)
+ref_data = create_reference_series(data.index)
 metadata = {"station_lat": 50.906389, "station_lon": 6.403889,
             "station_climatic_zone": "cool temperate moist"}
 
@@ -52,12 +56,12 @@ def test_data_capture():
             cur_ref.index += pd.Timedelta(182, "days")
             count = cur_data.resample(offset).count().squeeze("columns")
             expected[name] = count.divide(
-                cur_ref.resample(offset).count().squeeze("columns")
+                cur_ref.resample(offset).count()
             ).reindex(count.index)
     else:
         count = data.resample(offset).count().squeeze("columns")
         expected["data_capture"] = count.divide(
-            ref_data.resample(offset).count().squeeze("columns")
+            ref_data.resample(offset).count()
         ).reindex(count.index)
     pd.testing.assert_frame_equal(result, expected)
 
@@ -132,7 +136,7 @@ def test_median():
             cur_ref.index += pd.Timedelta(182, "days")
             data_rs = cur_data.resample(offset)
             data_count = data_rs.count().squeeze("columns")
-            ref_count = cur_ref.resample(offset).count().squeeze("columns")
+            ref_count = cur_ref.resample(offset).count()
             frac = data_count / ref_count
             tmp_res = data_rs.median().reindex(frac.index)
             tmp_res[frac < DEFAULT_DATA_CAPTURE] = np.nan
@@ -140,7 +144,7 @@
     else:
         data_rs = data.resample(offset)
         data_count = data_rs.count().squeeze("columns")
-        ref_count = ref_data.resample(offset).count().squeeze("columns")
+        ref_count = ref_data.resample(offset).count()
        frac = data_count / ref_count
         tmp_res = data_rs.median().reindex(frac.index)
         tmp_res[frac < DEFAULT_DATA_CAPTURE] = np.nan
@@ -164,7 +168,7 @@ def test_maximum():
             cur_ref.index += pd.Timedelta(182, "days")
             data_rs = cur_data.resample(offset)
             data_count = data_rs.count().squeeze("columns")
-            ref_count = cur_ref.resample(offset).count().squeeze("columns")
+            ref_count = cur_ref.resample(offset).count()
             frac = data_count / ref_count
             tmp_res = data_rs.max().reindex(frac.index)
             tmp_res[frac < DEFAULT_DATA_CAPTURE] = np.nan
@@ -172,7 +176,7 @@
     else:
         data_rs = data.resample(offset)
         data_count = data_rs.count().squeeze("columns")
-        ref_count = ref_data.resample(offset).count().squeeze("columns")
+        ref_count = ref_data.resample(offset).count()
         frac = data_count / ref_count
         tmp_res = data_rs.max().reindex(frac.index)
         tmp_res[frac < DEFAULT_DATA_CAPTURE] = np.nan
@@ -196,7 +200,7 @@ def test_minimum():
             cur_ref.index += pd.Timedelta(182, "days")
             data_rs = cur_data.resample(offset)
             data_count = data_rs.count().squeeze("columns")
-            ref_count = cur_ref.resample(offset).count().squeeze("columns")
+            ref_count = cur_ref.resample(offset).count()
             frac = data_count / ref_count
             tmp_res = data_rs.min().reindex(frac.index)
             tmp_res[frac < DEFAULT_DATA_CAPTURE] = np.nan
@@ -204,7 +208,7 @@
     else:
         data_rs = data.resample(offset)
         data_count = data_rs.count().squeeze("columns")
-        ref_count = ref_data.resample(offset).count().squeeze("columns")
+        ref_count = ref_data.resample(offset).count()
         frac = data_count / ref_count
         tmp_res = data_rs.min().reindex(frac.index)
         tmp_res[frac < DEFAULT_DATA_CAPTURE] = np.nan
@@ -228,7 +232,7 @@ def test_perc05():
             cur_ref.index += pd.Timedelta(182, "days")
             data_rs = cur_data.resample(offset)
             data_count = data_rs.count().squeeze("columns")
-            ref_count = cur_ref.resample(offset).count().squeeze("columns")
+            ref_count = cur_ref.resample(offset).count()
             frac = data_count / ref_count
             tmp_res = data_rs.quantile(0.05).reindex(frac.index)
             tmp_res[frac < DEFAULT_DATA_CAPTURE] = np.nan
@@ -236,7 +240,7 @@
     else:
         data_rs = data.resample(offset)
         data_count = data_rs.count().squeeze("columns")
-        ref_count = ref_data.resample(offset).count().squeeze("columns")
+        ref_count = ref_data.resample(offset).count()
         frac = data_count / ref_count
         tmp_res = data_rs.quantile(0.05).reindex(frac.index)
         tmp_res[frac < DEFAULT_DATA_CAPTURE] = np.nan
@@ -260,7 +264,7 @@ def test_perc10():
             cur_ref.index += pd.Timedelta(182, "days")
             data_rs = cur_data.resample(offset)
             data_count = data_rs.count().squeeze("columns")
-            ref_count = cur_ref.resample(offset).count().squeeze("columns")
+            ref_count = cur_ref.resample(offset).count()
             frac = data_count / ref_count
             tmp_res = data_rs.quantile(0.1).reindex(frac.index)
             tmp_res[frac < DEFAULT_DATA_CAPTURE] = np.nan
@@ -268,7 +272,7 @@
     else:
         data_rs = data.resample(offset)
         data_count = data_rs.count().squeeze("columns")
-        ref_count = ref_data.resample(offset).count().squeeze("columns")
+        ref_count = ref_data.resample(offset).count()
         frac = data_count / ref_count
         tmp_res = data_rs.quantile(0.1).reindex(frac.index)
         tmp_res[frac < DEFAULT_DATA_CAPTURE] = np.nan
@@ -292,7 +296,7 @@ def test_perc25():
             cur_ref.index += pd.Timedelta(182, "days")
             data_rs = cur_data.resample(offset)
             data_count = data_rs.count().squeeze("columns")
-            ref_count = cur_ref.resample(offset).count().squeeze("columns")
+            ref_count = cur_ref.resample(offset).count()
             frac = data_count / ref_count
             tmp_res = data_rs.quantile(0.25).reindex(frac.index)
             tmp_res[frac < DEFAULT_DATA_CAPTURE] = np.nan
@@ -300,7 +304,7 @@
     else:
         data_rs = data.resample(offset)
         data_count = data_rs.count().squeeze("columns")
-        ref_count = ref_data.resample(offset).count().squeeze("columns")
+        ref_count = ref_data.resample(offset).count()
         frac = data_count / ref_count
         tmp_res = data_rs.quantile(0.25).reindex(frac.index)
         tmp_res[frac < DEFAULT_DATA_CAPTURE] = np.nan
@@ -324,7 +328,7 @@ def test_perc75():
             cur_ref.index += pd.Timedelta(182, "days")
             data_rs = cur_data.resample(offset)
             data_count = data_rs.count().squeeze("columns")
-            ref_count = cur_ref.resample(offset).count().squeeze("columns")
+            ref_count = cur_ref.resample(offset).count()
             frac = data_count / ref_count
             tmp_res = data_rs.quantile(0.75).reindex(frac.index)
             tmp_res[frac < DEFAULT_DATA_CAPTURE] = np.nan
@@ -332,7 +336,7 @@
     else:
         data_rs = data.resample(offset)
         data_count = data_rs.count().squeeze("columns")
-        ref_count = ref_data.resample(offset).count().squeeze("columns")
+        ref_count = ref_data.resample(offset).count()
         frac = data_count / ref_count
         tmp_res = data_rs.quantile(0.75).reindex(frac.index)
         tmp_res[frac < DEFAULT_DATA_CAPTURE] = np.nan
@@ -356,7 +360,7 @@ def test_perc90():
             cur_ref.index += pd.Timedelta(182, "days")
             data_rs = cur_data.resample(offset)
             data_count = data_rs.count().squeeze("columns")
-            ref_count = cur_ref.resample(offset).count().squeeze("columns")
+            ref_count = cur_ref.resample(offset).count()
             frac = data_count / ref_count
             tmp_res = data_rs.quantile(0.9).reindex(frac.index)
             tmp_res[frac < DEFAULT_DATA_CAPTURE] = np.nan
@@ -364,7 +368,7 @@
     else:
         data_rs = data.resample(offset)
         data_count = data_rs.count().squeeze("columns")
-        ref_count = ref_data.resample(offset).count().squeeze("columns")
+        ref_count = ref_data.resample(offset).count()
         frac = data_count / ref_count
         tmp_res = data_rs.quantile(0.9).reindex(frac.index)
         tmp_res[frac < DEFAULT_DATA_CAPTURE] = np.nan
@@ -388,7 +392,7 @@ def test_perc95():
             cur_ref.index += pd.Timedelta(182, "days")
             data_rs = cur_data.resample(offset)
             data_count = data_rs.count().squeeze("columns")
-            ref_count = cur_ref.resample(offset).count().squeeze("columns")
+            ref_count = cur_ref.resample(offset).count()
             frac = data_count / ref_count
             tmp_res = data_rs.quantile(0.95).reindex(frac.index)
             tmp_res[frac < DEFAULT_DATA_CAPTURE] = np.nan
@@ -396,7 +400,7 @@
     else:
         data_rs = data.resample(offset)
         data_count = data_rs.count().squeeze("columns")
-        ref_count = ref_data.resample(offset).count().squeeze("columns")
+        ref_count = ref_data.resample(offset).count()
         frac = data_count / ref_count
         tmp_res = data_rs.quantile(0.95).reindex(frac.index)
         tmp_res[frac < DEFAULT_DATA_CAPTURE] = np.nan
@@ -420,7 +424,7 @@ def test_perc98():
             cur_ref.index += pd.Timedelta(182, "days")
             data_rs = cur_data.resample(offset)
             data_count = data_rs.count().squeeze("columns")
-            ref_count = cur_ref.resample(offset).count().squeeze("columns")
+            ref_count = cur_ref.resample(offset).count()
             frac = data_count / ref_count
             tmp_res = data_rs.quantile(0.98).reindex(frac.index)
             tmp_res[frac < DEFAULT_DATA_CAPTURE] = np.nan
@@ -428,7 +432,7 @@
     else:
         data_rs = data.resample(offset)
         data_count = data_rs.count().squeeze("columns")
-        ref_count = ref_data.resample(offset).count().squeeze("columns")
+        ref_count = ref_data.resample(offset).count()
         frac = data_count / ref_count
         tmp_res = data_rs.quantile(0.98).reindex(frac.index)
         tmp_res[frac < DEFAULT_DATA_CAPTURE] = np.nan
@@ -452,7 +456,7 @@ def test_perc99():
             cur_ref.index += pd.Timedelta(182, "days")
             data_rs = cur_data.resample(offset)
             data_count = data_rs.count().squeeze("columns")
-            ref_count = cur_ref.resample(offset).count().squeeze("columns")
+            ref_count = cur_ref.resample(offset).count()
             frac = data_count / ref_count
             tmp_res = data_rs.quantile(0.99).reindex(frac.index)
             tmp_res[frac < DEFAULT_DATA_CAPTURE] = np.nan
@@ -460,7 +464,7 @@
     else:
         data_rs = data.resample(offset)
         data_count = data_rs.count().squeeze("columns")
-        ref_count = ref_data.resample(offset).count().squeeze("columns")
+        ref_count = ref_data.resample(offset).count()
         frac = data_count / ref_count
         tmp_res = data_rs.quantile(0.99).reindex(frac.index)
         tmp_res[frac < DEFAULT_DATA_CAPTURE] = np.nan
@@ -484,7 +488,7 @@ def test_percentiles1():
             cur_ref.index += pd.Timedelta(182, "days")
             data_rs = cur_data.resample(offset)
             data_count = data_rs.count().squeeze("columns")
-            ref_count = cur_ref.resample(offset).count().squeeze("columns")
+            ref_count = cur_ref.resample(offset).count()
             frac = data_count / ref_count
             tmp_res = data_rs.quantile(
                 int(name.split("-", 1)[0][1:]) / 100
@@ -494,7 +498,7 @@
     else:
         data_rs = data.resample(offset)
         data_count = data_rs.count().squeeze("columns")
-        ref_count = ref_data.resample(offset).count().squeeze("columns")
+        ref_count = ref_data.resample(offset).count()
         frac = data_count / ref_count
         for name in result.columns.to_list():
             tmp_res = data_rs.quantile(int(name[1:]) / 100).reindex(
@@ -521,7 +525,7 @@ def test_percentiles2():
             cur_ref.index += pd.Timedelta(182, "days")
             data_rs = cur_data.resample(offset)
             data_count = data_rs.count().squeeze("columns")
-            ref_count = cur_ref.resample(offset).count().squeeze("columns")
+            ref_count = cur_ref.resample(offset).count()
             frac = data_count / ref_count
             tmp_res = data_rs.quantile(
                 int(name.split("-", 1)[0][1:]) / 100
@@ -531,7 +535,7 @@
     else:
         data_rs = data.resample(offset)
         data_count = data_rs.count().squeeze("columns")
-        ref_count = ref_data.resample(offset).count().squeeze("columns")
+        ref_count = ref_data.resample(offset).count()
         frac = data_count / ref_count
         for name in result.columns.to_list():
             tmp_res = data_rs.quantile(int(name[1:]) / 100).reindex(
diff --git a/tests/test_toarstats.py b/tests/test_toarstats.py
index ce994d04c79331d8fb8c0799b8d9b39bce87a85c..609346700db3d5dcd512c1f7417909795d6a0d9e 100644
--- a/tests/test_toarstats.py
+++ b/tests/test_toarstats.py
@@ -23,7 +23,8 @@
 import numpy as np
 import pandas as pd
 import pytest
-from toarstats.interface import calculate_statistics
+from tests.create_sample_data_and_reference_results import create_sample_data
+from toarstats.metrics.interface import calculate_statistics
 
 
 def get_all_statistics():
@@ -33,11 +34,11 @@
     """
     statistics = set()
     for file in Path(Path(__file__).resolve().parents[1],
-                     "toarstats").glob("*.py"):
+                     "toarstats/metrics").glob("*.py"):
         for node in ast.parse(file.read_text(), file).body:
             if (isinstance(node, ast.FunctionDef)
                     and [el.arg for el in node.args.args]
-                    == ["df", "dfref", "mtype", "metadata", "seasons",
+                    == ["ser", "ref", "mtype", "metadata", "seasons",
                         "data_capture"]):
                 statistics.add(node.name)
     return statistics
@@ -50,7 +51,7 @@ def get_all_samplings():
     """
     samplings = set()
     for file in Path(Path(__file__).resolve().parents[1],
-                     "toarstats").glob("*.py"):
+                     "toarstats/metrics").glob("*.py"):
         for node in ast.parse(file.read_text(), file).body:
             if (isinstance(node, ast.Assign)
                     and isinstance(node.value, ast.Dict)
@@ -68,8 +69,13 @@ def sample_data():
     """
     :return: A data frame with sample data
     """
+    sample_data_file = Path(
+        Path(__file__).resolve().parent, "sample_data/sample_data.csv"
+    )
+    if not sample_data_file.is_file():
+        create_sample_data(sample_data_file.parent)
     return pd.read_csv(
-        Path(Path(__file__).resolve().parent, "sample_data/sample_data.csv"),
+        sample_data_file,
         header=None, index_col=0, parse_dates=True
     )
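
A minimal sketch of the data-capture masking pattern that the updated expectations in tests/test_stats.py exercise: a statistic computed per resampling period is discarded whenever the fraction of available samples, measured against a gap-free reference series, falls below a capture threshold (DEFAULT_DATA_CAPTURE in the package). The function masked_median and its threshold argument are illustrative names only and do not appear in toarstats.

import numpy as np
import pandas as pd


def masked_median(ser, ref, offset, threshold=0.5):
    """Resampled median, masked where data capture < threshold."""
    data_rs = ser.resample(offset)
    # Fraction of non-missing samples per period, relative to the
    # gap-free reference series.
    frac = data_rs.count() / ref.resample(offset).count()
    res = data_rs.median().reindex(frac.index)
    res[frac < threshold] = np.nan  # insufficient coverage -> no statistic
    return res


index = pd.date_range(start="2000", periods=48, freq="H")
ser = pd.Series(np.arange(48.0), index=index)
ser.iloc[30:] = np.nan  # day two is only 25 % covered
ref = pd.Series(1.0, index=index)
print(masked_median(ser, ref, "D"))  # day one: 11.5, day two: NaN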