diff --git a/requirements.txt b/requirements.txt
index 723a98d138b6a0742f9ff9683f18fe8ae0de7e4d..4ebb321a37d598d71a5360f7439bd939e4962a1f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,3 @@
-numpy==1.24.2
-pandas==1.5.3
-statsmodels==0.13.5
+numpy==1.24.3
+pandas==2.0.1
+statsmodels==0.14.0
diff --git a/tests/test_metrics/create_sample_data_and_reference_results.py b/tests/test_metrics/create_sample_data_and_reference_results.py
deleted file mode 100644
index e81ea32d776b3d98ef56a6a182b375c2b6b3f5a6..0000000000000000000000000000000000000000
--- a/tests/test_metrics/create_sample_data_and_reference_results.py
+++ /dev/null
@@ -1,233 +0,0 @@
-#!/usr/bin/env python3
-
-"""Create sample data and reference results for the toarstats package.
-
-Without any arguments only sample data will be created.
-
-To run this script use:
-create_sample_data_and_reference_results.py [-h] [-t TOARSTATS]
-[-p PYTHON]
-
-optional arguments:
-  -h, --help
-    show help message and exit
-  -t TOARSTATS, --toarstats TOARSTATS
-    paths to the different toarstats versions
-  -p PYTHON, --python PYTHON
-    paths to the python interpreters which should be used for the
-    different toarstats versions
-"""
-
-from argparse import ArgumentParser
-import json
-from pathlib import Path
-import subprocess
-
-import numpy as np
-import pandas as pd
-
-
-GET_STATISTICS_AND_SAMPLINGS = """
-import ast
-import sys
-
-
-source = sys.argv[1]
-filename = sys.argv[2]
-
-statistics = set()
-samplings = set()
-for node in ast.parse(source, filename).body:
-    if (isinstance(node, ast.FunctionDef)
-            and [el.arg for el in node.args.args] == ["df", "dfref", "mtype",
-                                                      "varname", "varunits",
-                                                      "metadata", "seasons",
-                                                      "data_capture"]):
-        statistics.add(node.name)
-    elif (isinstance(node, ast.Assign) and isinstance(node.value, ast.Dict)
-            and node.targets[0].id == "RSTAGS"):
-        samplings.update([sampling.s for sampling in node.value.keys])
-if "seasonal" in samplings and "vegseason" not in samplings:
-    samplings.add("vegseason")
-print([list(statistics), list(samplings)])
-"""
-
-
-CALCULATE_STATISTICS = """
-from collections import namedtuple
-from configparser import ConfigParser
-import json
-import os.path
-import sys
-
-import pandas as pd
-
-from toarstats_version.stats_main import stats_wrapper
-
-
-class DataSlice:
-    def __init__(self, index, values):
-        self.x = index
-        self.y = values
-        self.yattr = {}
-
-
-data_path = sys.argv[1]
-metadata_path = sys.argv[2]
-results_dir = sys.argv[3]
-statistics = json.loads(sys.argv[4].replace("'", '"'))
-samplings = json.loads(sys.argv[5].replace("'", '"'))
-
-failed_combinations = []
-for statistic in statistics:
-    for sampling in samplings:
-        data = pd.read_csv(data_path, header=None, index_col=0, squeeze=True,
-                           parse_dates=True)
-        parser = ConfigParser()
-        parser.read(metadata_path)
-        Metadata = namedtuple("Metadata", ["station_lat", "station_lon",
-                                           "station_climatic_zone"])
-        metadata = Metadata(parser.getfloat("METADATA", "station_lat"),
-                            parser.getfloat("METADATA", "station_lon"),
-                            parser.getint("METADATA", "station_climatic_zone"))
-
-        try:
-            results = stats_wrapper(sampling, [statistic],
-                                    DataSlice(data.index, data.values),
-                                    metadata)
-        except ValueError:
-            failed_combinations.append([statistic, sampling])
-            continue
-        pd.DataFrame(
-            {i: pd.Series(res.y, res.x) for i, res in enumerate(results)}
-        ).to_csv(os.path.join(results_dir, statistic+"-"+sampling+".csv"),
-                 header=False)
-print(failed_combinations)
-"""
-
-
-def get_command_line_arguments():
-    """Parse command line arguments.
-
-    :return: A zip of the paths to ``toarstats`` versions and their
-             respective python interpreters
-    """
-    parser = ArgumentParser(description="Create sample data and reference"
-                                        " results for the toarstats package.")
-    parser.add_argument("-t", "--toarstats", action="append", default=[],
-                        help="paths to the different toarstats versions")
-    parser.add_argument("-p", "--python", action="append", default=[],
-                        help="paths to the python interpreters which should be"
-                             " used for the different toarstats versions")
-    args = parser.parse_args()
-    return zip([Path(el).resolve() for el in args.toarstats], args.python)
-
-
-def create_sample_data(sample_data_dir):
-    """Create sample data.
-
-    :param sample_data_dir: path to the sample data directory
-    """
-    sample_data_dir.mkdir(exist_ok=True)
-    datetime_index = pd.date_range(start="2011-04-17 09:00", periods=100000,
-                                   freq="H")
-    values = np.random.default_rng().uniform(13.4, 61.7, len(datetime_index))
-    values[np.random.default_rng().choice(values.size,
-                                          size=int(0.085*values.size),
-                                          replace=False)] = np.nan
-    pd.Series(values, datetime_index).dropna().to_csv(
-        Path(sample_data_dir, "sample_data.csv"), header=False
-    )
-    Path(sample_data_dir, "sample_metadata.cfg").write_text(
-        "[METADATA]\n"
-        "station_lat: 50.906389\n"
-        "station_lon: 6.403889\n"
-        "station_climatic_zone: 3\n",
-        encoding="utf-8"
-    )
-
-
-def get_statistics_and_samplings(toarstats_version, python_interpreter):
-    """Collect statistics and samplings from older package version.
-
-    :param toarstats_version: path to the old package version
-    :param python_interpreter: path to the interpreter to use
-
-    :return: A list of all statistics and a list of all samplings
-    """
-    all_statistics = set()
-    all_samplings = set()
-    for file in toarstats_version.glob("*.py"):
-        try:
-            content = file.read_text()
-        except UnicodeDecodeError:
-            try:
-                content = file.read_text(encoding="cp1252")
-            except UnicodeError:
-                print(f"WARNING: ignoring {file}; unknown encoding")
-                continue
-        try:
-            statistics, samplings = json.loads(subprocess.run(
-                [python_interpreter, "-c", GET_STATISTICS_AND_SAMPLINGS,
-                 content, file], capture_output=True, check=True, text=True
-            ).stdout.replace("'", '"'))
-        except subprocess.CalledProcessError:
-            statistics, samplings = json.loads(subprocess.run(
-                [python_interpreter, "-c",
-                 GET_STATISTICS_AND_SAMPLINGS.replace("el.arg", "el.id"),
-                 content, file], capture_output=True, check=True, text=True
-            ).stdout.replace("'", '"'))
-        all_statistics.update(statistics)
-        all_samplings.update(samplings)
-    return list(all_statistics), list(all_samplings)
-
-
-def create_reference_results(reference_versions, sample_data_dir):
-    """Create reference results.
-
-    :param reference_versions: zip of old package versions and
-                               interpreters to use
-    :param sample_data_dir: path to the sample data directory
-    """
-    for toarstats_version, python_interpreter in reference_versions:
-        statistics, samplings = get_statistics_and_samplings(
-            toarstats_version, python_interpreter
-        )
-        results_dir = Path(sample_data_dir.parent, "reference_results",
-                           toarstats_version.name)
-        results_dir.mkdir(parents=True, exist_ok=True)
-        cur_script = CALCULATE_STATISTICS.replace("toarstats_version",
-                                                  toarstats_version.name)
-        try:
-            failed_combinations = json.loads(subprocess.run(
-                [python_interpreter, "-c", cur_script,
-                 Path(sample_data_dir, "sample_data.csv"),
-                 Path(sample_data_dir, "sample_metadata.cfg"), results_dir,
-                 str(statistics), str(samplings)], capture_output=True,
-                cwd=toarstats_version.parent, check=True, text=True
-            ).stdout.replace("'", '"'))
-        except subprocess.CalledProcessError:
-            failed_combinations = json.loads(subprocess.run(
-                [python_interpreter, "-c",
-                 cur_script.replace("configparser", "ConfigParser"),
-                 Path(sample_data_dir, "sample_data.csv"),
-                 Path(sample_data_dir, "sample_metadata.cfg"), results_dir,
-                 str(statistics), str(samplings)], capture_output=True,
-                cwd=toarstats_version.parent, check=True, text=True
-            ).stdout.replace("u'", "'").replace("'", '"'))
-        if failed_combinations:
-            print(toarstats_version.name)
-            for combination in failed_combinations:
-                print(*combination)
-
-
-def main():
-    """Main function for the script."""
-    reference_versions = get_command_line_arguments()
-    sample_data_dir = Path(Path(__file__).resolve().parent, "sample_data")
-    create_sample_data(sample_data_dir)
-    create_reference_results(reference_versions, sample_data_dir)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/tests/test_metrics/test_input_checks.py b/tests/test_metrics/test_input_checks.py
index d1d23501f6d10c306e54e48e3a39f5647b96c51f..ab7da5a9e95cbe2a61c2fbe307f8a33cd39d3bb4 100644
--- a/tests/test_metrics/test_input_checks.py
+++ b/tests/test_metrics/test_input_checks.py
@@ -190,7 +190,7 @@ class TestFromPandas:
         np.testing.assert_array_equal([5., 2.], out_value3)
         pd.testing.assert_index_equal(out_index4, index)
         np.testing.assert_array_equal([], out_value4)
-        pd.testing.assert_index_equal(out_index5, pd.Index([]))
+        pd.testing.assert_index_equal(out_index5, pd.RangeIndex(0))
         np.testing.assert_array_equal([], out_value5)
 
 
diff --git a/tests/test_metrics/test_metrics.py b/tests/test_metrics/test_metrics.py
index d5d07d41415c9a9c0732f8538c77639497e52cf5..d7b68939d0b0b28d4aa6687306dfb9d8e6408612 100644
--- a/tests/test_metrics/test_metrics.py
+++ b/tests/test_metrics/test_metrics.py
@@ -1,9 +1,10 @@
-"""Tests for the package as a whole.
+"""Tests for the metrics subpackage as a whole.
 
 This module contains tests to check if everything from older package
 versions is implemented and if the results are still the same.
 
 This module contains the following functions:
+create_sample_data - create sample data
 get_all_statistics - get all implemented statistics
 get_all_samplings - get all implemented samplings
 sample_data - get sample data frame
@@ -23,13 +24,37 @@ import numpy as np
 import pandas as pd
 import pytest
 
-from tests.test_metrics.create_sample_data_and_reference_results import (
-    create_sample_data
-)
 from toarstats.metrics.interface import calculate_statistics
 from toarstats.metrics.stats_utils import STATS_LIST
 
 
+def create_sample_data(sample_data_dir):
+    """Create sample data.
+
+    :param sample_data_dir: path to the sample data directory
+    """
+    sample_data_dir.mkdir(exist_ok=True)
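+    # Hourly index with 100000 timestamps, roughly 11.4 years of data.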
+    datetime_index = pd.date_range(start="2011-04-17 09:00", periods=100000,
+                                   freq="H")
+    values = np.random.default_rng().uniform(13.4, 61.7, len(datetime_index))
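+    # Randomly set 8.5% of the values to NaN to mimic missing data.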
+    values[np.random.default_rng().choice(values.size,
+                                          size=int(0.085*values.size),
+                                          replace=False)] = np.nan
+    pd.Series(values, datetime_index).dropna().to_csv(
+        Path(sample_data_dir, "sample_data.csv"), header=False
+    )
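+    # Companion metadata file with station coordinates and climatic zone.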
+    Path(sample_data_dir, "sample_metadata.cfg").write_text(
+        "[METADATA]\n"
+        "station_lat: 50.906389\n"
+        "station_lon: 6.403889\n"
+        "station_climatic_zone: 3\n",
+        encoding="utf-8"
+    )
+
+
 def get_all_statistics():
     """Get all implemented statistics.
 
diff --git a/tests/test_metrics/test_stats.py b/tests/test_metrics/test_stats.py
index e1e64f54a069cee17d1138d458a766b681ce8776..349cf26993852c8dd4302e75ad2171cf4c4a977d 100644
--- a/tests/test_metrics/test_stats.py
+++ b/tests/test_metrics/test_stats.py
@@ -13,7 +13,7 @@ from toarstats.metrics.stats_utils import create_reference_series
 
 data = pd.read_csv(
     "tests/test_metrics/time_series.csv", header=None, names=[None, "values"],
-    index_col=0, parse_dates=True, infer_datetime_format=True
+    index_col=0, parse_dates=True, date_format="%Y-%m-%d %H:%M:%S"
 )
 ref_data = create_reference_series(data.index)
 metadata = {"station_lat": 50.906389,