# -*- coding: utf-8 -*-
"""populate TOAR2 database via REST API
   with data for testing Data Quality Control tools

   Test-Suite:
   time series that are OK:
   - CapePoint (id: 21848)
   - Denali (id: 26030)
   - Jungfraujoch (id: 25769)
   - Alert (id: 21828)
   - Mt Waliguan (id: 48238)
   - Amundsen-Scott South Pole (id: 21919)
   - Cape Grim (id: 119939)
   - Esrange (id: 25745)
   - American Samoa (id: 21915)
   - Pfälzerwald (id: 18015)   --> already inserted (UBA)
   - Chiyodakukandatsukasachou (id: 35528)
   - Hamburg Sternschanze (id: 17550)   --> already inserted (UBA)
   - Assekrem (id: 21831)

   time series that have issues:
   - Ushuaia GAWUSH354S00 (id: 21931)
   - Booysens RSA024 (id: 28746)
   - Mamelodi RSA005 (id: 28727)
   - Montes Chaos - Sines PT04005 (Airbase) (id: 25444)
   - Pedregal MX_PED (other) (id: 47866)
   - Lisice MK0031A (Airbase) (id: 25188)
   - SCHIVENOGLIA 302072 IT1865A (Airbase) (id: 25037)
   - DUCKWORTH & ORDINANCE 010101 (NAPS) (id: 28707)
   - 47-065-1011 (AQS Soddy Daisy High School) (id: 27893)
   - 06-059-2001 (AQS) (id: 26261)
   - 06-071-1101 (AQS) (id: 26324)

   author:
   s.schroeder@fz-juelich.de

   date: 2021/01/12

   call:
   python create_test_suite.py TOKEN
"""

import sys

import psycopg2
import json
import requests
from urllib3.exceptions import InsecureRequestWarning

TOAR_SERVICE_URL = 'https://join-dev.fz-juelich.de:8443/'

series_ids = [21848,26030,25769,21828,48238,21919,119939,25745,21915,35528,21831,
              21931,28746,28727,25444,47866,25188,25037,28707,27893,26261,26324]

# from old database:
# station_coordinate_status:
# -1 : not checked (default value)
# 0 : verified by google earth or other means
# 1 : verification not possible, but no reason to doubt that the measurement location should be accurate to within 100 metres or so
# 2 : unspecified potential issue with the station coordinates
# 3 : obvious error in station coordinate information
# 4 : severe mismatch between reported station altitude and google elevation at station location (> 100 m) indicating wrong station coordinates
# 5 : no coordinates available – given coordinates are completely invented!

# from new database: coordinate_validation_status:
# 0: 'NotChecked'    ('not checked')  -- default value
# 1: 'Verified'      ('verified')
# 2: 'Plausible'     ('plausible')
# 3: 'Doubtful'      ('doubtful')
# 4: 'Unverifyable'  ('not verifyable')

# ==> old new
#     -1  0
#      0  1
#      1  2
#      2  3
#      3  4
#      4  4
#      5  3
tr_coord_status = { -1 : 'NotChecked',
                     0 : 'Verified',
                     1 : 'Plausible',
                     2 : 'Doubtful',
                     3 : 'Unverifyable',
                     4 : 'Unverifyable',
                     5 : 'Doubtful' }
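
# illustrative sanity check of the translation above: old statuses 3 and 4
# both collapse onto 'Unverifyable', and old 5 ("invented coordinates")
# maps onto 'Doubtful'
assert tr_coord_status[3] == tr_coord_status[4] == 'Unverifyable'
assert tr_coord_status[5] == 'Doubtful'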

tr_climatic_zone = {
    -1 : 'Undefined',
     0 : 'Unclassified',
     1 : 'WarmTemperateMoist',
     2 : 'WarmTemperateDry',
     3 : 'CoolTemperateMoist',
     4 : 'CoolTemperateDry',
     5 : 'PolarMoist',
     6 : 'PolarDry',
     7 : 'BorealMoist',
     8 : 'BorealDry',
     9 : 'TropicalMontane',
    10 : 'TropicalWet',
    11 : 'TropicalMoist',
    12 : 'TropicalDry'  }

tr_type_of_environment = {
    'unknown'           : 'Unknown',
    ''                  : 'Unknown',
    'traffic'           : 'Traffic',
    'industrial'        : 'Industrial',
    'other'             : 'Other',
    'urban'             : 'Other',
    'commercial'        : 'Other',
    'background'        : 'Background',
    'agricultural'      : 'Background',
    'other-marine'      : 'Background',
    'other-agricultural': 'Background' }

tr_type_of_area = {
    'unknown'          : 'Unknown',
    ''                 : 'Unknown',
    'urban'            : 'Urban',
    'suburban'         : 'Suburban',
    'rural'            : 'Rural',
    'rural-remote'     : 'Rural',
    'rural-nearcity'   : 'Rural',
    'rural-regional'   : 'Rural',
    'alpine grasslands': 'Rural',
    'forest'           : 'Rural',
    'remote'           : 'Remote',
    'marine'           : 'Remote' }

tr_sampling_frequency = {
    'hourly'  : 'Hourly',
    'monthly' : 'Monthly',
    'event'   : 'Irregular2' }

tr_old_flags = {
    0 : "OK",
    1 : "inconsistent",
    2 : "doubtful",
    3 : "wrong",
    4 : "not_checked",
    5 : "changed",
    6 : "estimated",
    7 : "missing_value" }

# flagging: toar_v1 --> toar_v2
# WMO_QUALITY_FLAG | preliminary --> new flag
# OK | false --> OK
# OK | true --> OKPreliminary
# changed | false --> OKModified
# changed | true --> OKPreliminaryModified
# inconsistent | false --> Inconsistent
# inconsistent | true --> InconsistentPreliminary
# doubtful | false --> Doubtful
# doubtful | true --> DoubtfulPreliminary
# wrong | false --> Wrong
# wrong | true --> WrongPreliminary
# not_checked | true --> NotCheckedPreliminary
# estimated | true or false --> Estimated
# missing_value | true or false --> MissingValue
#
# not yet assigned:
# DoubtfulPreliminaryModified
# Changed <-> OKModified, OKPreliminaryModified ?!
#
# What about: not_checked | false ?!

# some preliminary code
tr_flagging = {
    ('OK',False): 'OK',
    ('OK',True): 'OKPreliminary',
    ('changed',False): 'OKModified',
    ('changed',True): 'OKPreliminaryModified',
    ('inconsistent',False): 'Inconsistent',
    ('inconsistent',True): 'InconsistentPreliminary',
    ('doubtful',False): 'Doubtful',
    ('doubtful',True): 'DoubtfulPreliminary',
    ('wrong',False): 'Wrong',
    ('wrong',True): 'WrongPreliminary',
    ('not_checked',False): 'OK',         #??????
    ('not_checked',True): 'NotCheckedPreliminary',
    ('estimated',False): 'Estimated',
    ('estimated',True): 'Estimated',
    ('missing_value',False): 'MissingValue',   # just for reasons of completeness
    ('missing_value',True): 'MissingValue' }  # (we did not store missing values in V1)
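
# illustrative example of the two-step flag translation applied below:
# old numeric flag 2 ("doubtful") on a preliminary value becomes
# 'DoubtfulPreliminary' in toardb_v2
assert tr_flagging[(tr_old_flags[2], True)] == 'DoubtfulPreliminary'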

station_column_names = [
    'numid',
    'network_name',
    'station_id',
    'station_local_id',
    'station_type',
    'station_type_of_area',
    'station_category',
    'station_name',
    'station_country',
    'station_state',
    'station_lon',
    'station_lat',
    'station_alt',
    'station_timezone',
    'station_nightlight_5km',
    'station_climatic_zone',
    'station_wheat_production',
    'station_rice_production',
    'station_nox_emissions',
    'station_omi_no2_column',
    'station_toar_category',
    'station_htap_region',
    'station_reported_alt',
    'station_alt_flag',
    'station_coordinate_status',
    'station_google_alt',
    'station_etopo_alt',
    'station_etopo_min_alt_5km',
    'station_etopo_relative_alt',
    'station_dominant_landcover',
    'station_landcover_description',
    'station_max_nightlight_25km',
    'station_max_population_density_25km',
    'station_nightlight_1km',
    'station_population_density',
    'google_resolution',
    'station_comments',
    'station_max_population_density_5km' ]

parameter_series_column_names = [
    'id',
    'station_numid',
    'parameter_label',
    'parameter_name',
    'parameter_attribute',
    'parameter_sampling_type',
    'parameter_measurement_method',
    'parameter_original_units',
    'parameter_calibration',
    'parameter_contributor_shortname',
    'parameter_contributor',
    'parameter_contributor_country',
    'parameter_dataset_type',
    'parameter_status',
    'comments',
    'creation_date',
    'modification_date',
    'data_start_date',
    'data_end_date',
    'parameter_pi',
    'parameter_pi_email',
    'parameter_instrument_manufacturer',
    'parameter_instrument_model' ]

data_column_names = [
    'id',
    'datetime',
    'value',
    'flag',
    'preliminary' ]


if __name__ == "__main__":

    if len(sys.argv) != 2:
        sys.exit(f"usage: python {sys.argv[0]} TOKEN")
    access_token = sys.argv[1]

    headers = { 'Content-Type': 'application/json',
                'accept':       'application/json',
                'Authorization': 'Token ' + access_token }
#   verify_ssl=True
    verify_ssl=False   # False disables SSL certificate verification

# Suppress only the single InsecureRequestWarning from urllib3.
    requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning)


# connect to database (use .pgpass for not having explicit password here!)
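# a matching ~/.pgpass entry would look like this (default port 5432 assumed):
# zam10131.zam.kfa-juelich.de:5432:surface_observations_toar:s.schroeder:<password>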
    with psycopg2.connect(host="zam10131.zam.kfa-juelich.de", dbname='surface_observations_toar', user='s.schroeder') as db:
        cursor = db.cursor()
        for pid in series_ids:

            # 1. transfer given stations from old database to toardb_v2

            query = f"SELECT s.* from parameter_series p, stations s where p.station_numid=s.numid AND id={pid}"
            cursor.execute(query)
            result = cursor.fetchall()
            result_dict=dict(zip(station_column_names, result[0]))

            # create JSON structure for POST request

            station_code = result_dict['station_id'].strip()

            # what about escaping special characters? (e.g. apostrophe)
            # do we need that?
            # station_name = result_dict['station_name'].strip().replace('&','%26')
            station_name = result_dict['station_name'].strip()
            stationmeta = {"codes": [station_code],
                           "name": station_name,
                           "coordinates": {"lat": result_dict['station_lat'],"lng": result_dict['station_lon'],"alt": result_dict['station_alt']},
                           "coordinate_validation_status": tr_coord_status[result_dict['station_coordinate_status']],
                           "country": result_dict['station_country'].strip(),
                           "state": result_dict['station_state'].strip(),
                           "type_of_environment": tr_type_of_environment[result_dict['station_type'].strip()],
                           "type_of_area": tr_type_of_area[result_dict['station_type_of_area'].strip()],
                           "timezone": result_dict['station_timezone'].strip(),
                           #to be done --> collect all data from old database!
                           "additional_metadata": "{}",
                           #to be done!
                           "roles": [],
                           "globalmeta": {"climatic_zone": tr_climatic_zone[result_dict['station_climatic_zone']]}
                          }
            data = {"stationmeta": stationmeta}
 
            r = requests.post(TOAR_SERVICE_URL + 'stationmeta/',
                              data=json.dumps(data),
                              headers=headers, verify=verify_ssl)
            # to do (German stations are not critical!):
            # if code already exists, is it really the same station?
            msg = r.text
            print(f"{msg}\n")
 
            # 2. transfer given timeseries from old database to toardb_v2

            # a. get metadata from old database

            query = f"SELECT p.* from parameter_series p, stations s where p.station_numid=s.numid AND id={pid}"
            cursor.execute(query)
            result = cursor.fetchall()
            result_dict=dict(zip(parameter_series_column_names, result[0]))
 
            # b. get id of station
            r = requests.get(TOAR_SERVICE_URL + f'stationmeta/{station_code}', verify=verify_ssl)
            station_id = r.json()['id']

            # c. get variable_id
            parameter = result_dict['parameter_name'].strip()
            r = requests.get(TOAR_SERVICE_URL + f'variables/{parameter}', verify=verify_ssl)
            variable_id=r.json()['id']

            # d. create timeseries dictionary
            # at the moment! (preliminary!!!)
            parameter_label = ''
            entry = {}
            entry['label'] = parameter_label
            entry['order'] = 1
            entry['access_rights'] = "ByAttribution"
            sampling_frequency = result_dict['parameter_dataset_type'].strip()
            entry['sampling_frequency'] = tr_sampling_frequency[sampling_frequency]
            entry['aggregation'] = "Mean"
            entry['source'] = "Measurement"
            entry['sampling_height'] = 2
            entry['measurement_method'] = 'UnknownInstrument'
            entry['data_start_date'] = result_dict['data_start_date'].strftime("%Y-%m-%d %H:%M:%S+00")
            entry['data_end_date'] = result_dict['data_end_date'].strftime("%Y-%m-%d %H:%M:%S+00")
            entry['date_added'] = result_dict['creation_date'].strftime("%Y-%m-%d %H:%M:%S+00")
            entry['date_modified'] = result_dict['modification_date'].strftime("%Y-%m-%d %H:%M:%S+00")
            entry['station_id'] = station_id
            entry['variable_id'] = variable_id
 
            # e. get/create roles (TBD!!!)
            # r = requests.get(TOAR_SERVICE_URL + f'contacts/orga_name/{parameter_contributor}',verify=insecure_ssl)
            # data = r.json()
            # contact_id_contributor=data['id']
            # entry['roles'] = [{"role": "Contributor", "contact_id": contact_id_contributor, "status": "active"}]
            entry['additional_metadata'] = "{}"
            timeseries_data = {}
            timeseries_data['timeseries'] = entry
            r = requests.post(TOAR_SERVICE_URL + 'timeseries/',
                              data=json.dumps(timeseries_data),
                              headers=headers, verify=verify_ssl)
            print(f"data added for old time series {pid}: ", r.json())

            # 3. now add data itself

            r = requests.get(TOAR_SERVICE_URL + f'timeseries/unique/?station_id={station_id}&variable_id={variable_id}&resource_provider=&label=', verify=verify_ssl)
            series_id = r.json()['id']
            print(f'toar_v1: {pid} --> toar_v2: {series_id}')
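            # note: toar_v1 keeps the raw records in per-variable tables named
            # <parameter>_<sampling_frequency> (e.g. an 'o3_hourly' table)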

            query = f"SELECT * from {parameter}_{sampling_frequency} WHERE id={pid}"
            cursor.execute(query)
            result = cursor.fetchall()
            for row in result:
                result_dict=dict(zip(data_column_names, row))
                db_datetime=result_dict['datetime']
                db_val=result_dict['value']
                old_flag = tr_old_flags[result_dict['flag']]
                db_flag=tr_flagging[(old_flag,result_dict['preliminary'])]
                r = requests.post(TOAR_SERVICE_URL + f'data/record/?series_id={series_id}&datetime={db_datetime}&value={db_val}&flag={db_flag}', verify=verify_ssl)
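                # minimal response check (a sketch; the exact success status of
                # the data/record endpoint is not verified here):
                if not r.ok:
                    print(f"could not insert record at {db_datetime}: {r.text}")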