Skip to content
Snippets Groups Projects
get_timeseries_meta.sh 18.73 KiB
#!/bin/bash
# example script for transfering ozone timeseries from UBA stations
# (country='Germany' and network_name='UBA')
# station_numid (now: station_id) has changed --> see log file!
# also id will change for timeseries --> log mapping: old_id <-> new_id
#
# TBD: 
#
# matching fields:
# old                                 | new                                  | new table
# ====================================|======================================|==========================|
# station_numid                       | station_id                           | timeseries               |
# parameter_label                     | - (to be newly defined: label)       | timeseries               |
# parameter_name                      | variable_id                          | timeseries               |
# parameter_attribute                 | additional_metadata                  | timeseries               |???
#                                     | - should some of it also go to label?|                          |???
#                                     | - REA should go to source=modeled!   |                          |???
# parameter_sampling_type             | additional_metadata                  | timeseries               |???
# parameter_measurement_method        | measurement_method                   | timeseries               |
# parameter_original_units            | additional_metadata                  | timeseries               |???
# parameter_calibration               | pointing to annotations              | timeseries_annotations   |???
# parameter_contributor_shortname     | pointing to role (contrib,org)       | contacts/timeseries_roles|
# parameter_contributor               | pointing to role (contrib,org)       | contacts/timeseries_roles|
# parameter_contributor_country       | pointing to role (contrib,org)       | contacts/timeseries_roles|
# parameter_dataset_type              | sampling_frequency                   | timeseries               |
# parameter_status                    | additional_metadata                  | timeseries               |???
# comments                            | pointing to annotations              | timeseries_annotations   |???
# creation_date                       | date_added                           | timeseries               |
# modification_date                   | date_modified                        | timeseries               |
# data_start_date                     | data_start_date                      | timeseries               |
# data_end_date                       | data_end_date                        | timeseries               |
# parameter_pi                        | pointing to role (PI, person)        | contacts/timeseries_roles|
# parameter_pi_email                  | pointing to role (PI, person)        | contacts/timeseries_roles|
# parameter_instrument_manufacturer   | additional_metadata                  | timeseries               |???
# parameter_instrument_model          | additional_metadata                  | timeseries               |???
#
# converting from timestamp without timezone to timestamp with timezone
# converting sampling_frequency to controlled vocabulary (SF_vocabulary) -- UBA: 0 (Hourly)
# converting source to controlled vocabulary (DS_vocabulary) -- UBA (see below)
# converting access_rights to controlled vocabulary (DA_vocabulary) -- UBA: 0 (ByAttribution)
# converting aggregation to controlled vocabulary (AT_vocabulary) -- UBA: 1 (Mean1Of2)
# converting measurement_method to controlled vocabulary (MM_vocabulary)
# converting kind to controlled vocabulary (OK_vocabulary) -- UBA: 1 (Government)
# converting role to controlled vocabulary (RC_vocabulary) -- UBA: 1 (PrincipalInvestigator) and 3 (Contributor)
#
# new (not available in old DB):
# - order (now set to 1)
# - access_rights (now set to 0 (ByAttribution) -- DA_vocabulary)
# - aggregation (UBA: 0 (Mean), if label == REA, 1 (Mean1Of2) else -- AT_vocabulary)
# - sampling height (now set to 2 (m))
# - programme_id (now set to 0 (None-programme))
# - source (UBA: 0 (Model), if label == REA, 1 (Measurement) else -- MM_vocabulary) 
#   It was double-checked that all data with parameter_contributor_shortname='MIUB' have label='REA'!
# - kind (UBA: 1 (Government) -- OK_vocabulary)
# - role (UBA: 1 (PrincipalInvestigator) and 3 (Contributor) -- RC_vocabulary)
# - status (UBA: 2 (unknown for PI) and 0 (active for Contributor) -- RS_vocabulary)
#
# for UBA: use parameter_attribute for LABEL information
# --> measurements: LABEL=''
# --> models: LABEL='REA'
# ==> they have to be distinguishable (AND unique constraint on (station_id,variable_id,label)
#
# author: s.schroeder@fz-juelich.de
# date:   2020-07-05

PARAMETER_NAME=temp
SAMPLING=hourly

ORDERED_COLUMNS="id,station_numid,parameter_label,parameter_name,parameter_attribute,parameter_sampling_type,\
                 parameter_measurement_method,parameter_original_units,parameter_calibration,parameter_contributor_shortname,\
                 parameter_contributor,parameter_contributor_country,parameter_dataset_type,parameter_status,comments,\
                 creation_date,modification_date,data_start_date,data_end_date,parameter_pi,parameter_pi_email,\
                 parameter_instrument_manufacturer,parameter_instrument_model"
#just for now (see above)
ORDER=1
ACCESS_RIGHTS=0
SAMPLING_HEIGHT=2
SAMPLING_FREQUENCY=0
PROGRAMME_ID=0
KIND=1
PI_ROLE=1
PI_STATUS=2
CONTRIBUTOR_ROLE=3
CONTRIBUTOR_STATUS=0
AGGREGATION=1

res=`psql -h zam10116.zam.kfa-juelich.de -d toardb_v2 -U s.schroeder -c "SELECT id FROM variables\
	                WHERE name='$PARAMETER_NAME';"`
VARIABLE_ID=$(echo $res | cut -f2 | cut -f3 -d' ')

while read -a line
do
    OLD_STATIONID=${line[0]}
    NEW_STATIONID=${line[1]}

    COUNTER=0
    while IFS='|' read -a row
    do
        if [ $COUNTER -eq 2 ]
        then
            ID=${row[0]}
            echo "ID: $ID"
            STATION_NUMID=${row[1]}
            echo "STATION_NUMID: $STATION_NUMID"
            PARAMETER_LABEL=${row[2]}
            echo "PARAMETER_LABEL: $PARAMETER_LABEL"
# We already set PARAMETER_NAME manually!
# The following two lines can be used when debugging...
# (and we do not want to overwrite PARAMETER_NAME with untrimmed value)
#           PARAMETER_NAME=${row[3]}
#           echo "PARAMETER_NAME: $PARAMETER_NAME"
            PARAMETER_ATTRIBUTE=${row[4]}
            echo "PARAMETER_ATTRIBUTE: $PARAMETER_ATTRIBUTE"
            PARAMETER_SAMPLING_TYPE=${row[5]}
            echo "PARAMETER_SAMPLING_TYPE: $PARAMETER_SAMPLING_TYPE"
            PARAMETER_MEASUREMENT_METHOD=${row[6]}
            echo "PARAMETER_MEASUREMENT_METHOD: $PARAMETER_MEASUREMENT_METHOD"
            PARAMETER_ORIGINAL_UNITS=${row[7]}
            echo "PARAMETER_ORIGINAL_UNITS: $PARAMETER_ORIGINAL_UNITS"
            PARAMETER_CALIBRATION=${row[8]}
            echo "PARAMETER_CALIBRATION: $PARAMETER_CALIBRATION"
            PARAMETER_CONTRIBUTOR_SHORTNAME=${row[9]}
            echo "PARAMETER_CONTRIBUTOR_SHORTNAME: $PARAMETER_CONTRIBUTOR_SHORTNAME"
            PARAMETER_CONTRIBUTOR=${row[10]}
            echo "PARAMETER_CONTRIBUTOR: $PARAMETER_CONTRIBUTOR"
            PARAMETER_CONTRIBUTOR_COUNTRY=${row[11]}
            echo "PARAMETER_CONTRIBUTOR_COUNTRY: $PARAMETER_CONTRIBUTOR_COUNTRY"
            PARAMETER_DATASET_TYPE=${row[12]}
            echo "PARAMETER_DATASET_TYPE: $PARAMETER_DATASET_TYPE"
            PARAMETER_STATUS=${row[13]}
            echo "PARAMETER_STATUS: $PARAMETER_STATUS"
            COMMENTS=${row[14]}
            echo "COMMENTS: $COMMENTS"
            CREATION_DATE=${row[15]}
            echo "CREATION_DATE: $CREATION_DATE"
            MODIFICATION_DATE=${row[16]}
            echo "MODIFICATION_DATE: $MODIFICATION_DATE"
            DATA_START_DATE=${row[17]}
            echo "DATA_START_DATE: $DATA_START_DATE"
            DATA_END_DATE=${row[18]}
            echo "DATA_END_DATE: $DATA_END_DATE"
            PARAMETER_PI=${row[19]}
            echo "PARAMETER_PI: $PARAMETER_PI"
            PARAMETER_PI_EMAIL=${row[20]}
            echo "PARAMETER_PI_EMAIL: $PARAMETER_PI_EMAIL"
            PARAMETER_INSTRUMENT_MANUFACTURER=${row[21]}
            echo "PARAMETER_INSTRUMENT_MANUFACTURER: $PARAMETER_INSTRUMENT_MANUFACTURER"
            PARAMETER_INSTRUMENT_MODEL=${row[22]}
            echo "PARAMETER_INSTRUMENT_MODEL: $PARAMETER_INSTRUMENT_MODEL"
        fi
        let COUNTER=COUNTER+1
    done < <(echo "SELECT ${ORDERED_COLUMNS} FROM parameter_series WHERE station_numid=${OLD_STATIONID} AND parameter_name='${PARAMETER_NAME}' AND parameter_dataset_type='${SAMPLING}'" | psql -h zam10131.zam.kfa-juelich.de -d surface_observations_toar -U s.schroeder)

    if [ "$ID" != "(0 rows)" ]
    then

        # 1. contacts/timeseries_roles
        # ============================

        # 1. a) persons
        # =============

        EMAIL=`echo -n "${PARAMETER_PI_EMAIL//[[:space:]]/}"`
        NAME=`echo -e $PARAMETER_PI | awk '{$1=$1};1'`
        PHONE=''
        ISPRIVATE=True

	# at least for UBA data: email='unknown', name='unknown' given for undefined
	# ==> to be checked for other providers!!!
	PERSON_ID=-1
	if [ "$EMAIL" != "unknown" ] || [ "$NAME" != "unknown" ]
	then
            # check, whether person exists already in database
#           res=`psql -h localhost -d toardb -U toaradmin -c "SELECT id FROM persons\
            res=`psql -h zam10116.zam.kfa-juelich.de -d toardb_v2 -U s.schroeder -c "SELECT id FROM persons\
                WHERE name='$NAME' AND EMAIL='$EMAIL';"`
            PERS_EXISTED=1
            PERSON_ID=$(echo $res | cut -f2 | cut -f3 -d' ')
            if [ "$PERSON_ID" == "(0" ]
            then
                PERS_EXISTED=0
                res=`psql -h zam10116.zam.kfa-juelich.de -d toardb_v2 -U s.schroeder -c "INSERT INTO persons\
                    (name,email,phone,isprivate) VALUES
                    ('$NAME','$EMAIL','$PHONE',$ISPRIVATE)\
                    RETURNING id;"`
                PERSON_ID=$(echo $res | cut -f2 | cut -f3 -d' ')
	    fi
	fi

        # 1. b) organisations
        # ===================

        NAME=`echo -n "${PARAMETER_CONTRIBUTOR_SHORTNAME//[[:space:]]/}"`
        LONGNAME=`echo -e $PARAMETER_CONTRIBUTOR | awk '{$1=$1};1'`
        COUNTRY=`echo -e $PARAMETER_CONTRIBUTOR_COUNTRY | awk '{$1=$1};1'`
        CITY=''
        POSTCODE=''
        STREET_ADDRESS=''
        HOMEPAGE=''

	ORGANISATION_ID=-1
	if [ "$NAME" != "unknown" ] || [ "$LONGNAME" != "unknown" ]
	then
            # check, whether organisation exists already in database
#           res=`psql -h localhost -d toardb -U toaradmin -c "SELECT id FROM organisations\
            res=`psql -h zam10116.zam.kfa-juelich.de -d toardb_v2 -U s.schroeder -c "SELECT id FROM organisations\
                WHERE name='$NAME' AND longname='$LONGNAME';"`
            ORG_EXISTED=1
            ORGANISATION_ID=$(echo $res | cut -f2 | cut -f3 -d' ')
            if [ "$ORGANISATION_ID" == "(0" ]
            then
                ORG_EXISTED=0
#               res=`psql -h localhost -d toardb -U toaradmin -c "INSERT INTO organisations\
                res=`psql -h zam10116.zam.kfa-juelich.de -d toardb_v2 -U s.schroeder -c "INSERT INTO organisations\
                    (name,longname,kind,city,postcode,street_address,country,homepage) VALUES \
       	            ('$NAME','$LONGNAME',$KIND,'$CITY','$POSTCODE','$STREET_ADDRESS','$COUNTRY','$HOMEPAGE') \
       	            RETURNING id;"`
                ORGANISATION_ID=$(echo $res | cut -f2 | cut -f3 -d' ')
	    fi
        fi

        # 1. c) contacts
        # ==============

        if [ $PERSON_ID -ne -1 ]
        then
	    if [ $PERS_EXISTED -eq 1 ]
	    then
#               res=`psql -h localhost -d toardb -U toaradmin -c "SELECT id FROM contacts\
                res=`psql -h zam10116.zam.kfa-juelich.de -d toardb_v2 -U s.schroeder -c "SELECT id FROM contacts\
        	    WHERE person_id=$PERSON_ID;"`
            else
#               res=`psql -h localhost -d toardb -U toaradmin -c "INSERT INTO contacts\
                res=`psql -h zam10116.zam.kfa-juelich.de -d toardb_v2 -U s.schroeder -c "INSERT INTO contacts\
        	    (person_id,organisation_id) VALUES \
        	    ($PERSON_ID,0) \
        	    RETURNING id;"`
	    fi
            PI_ID=$(echo $res | cut -f2 | cut -f3 -d' ')
        fi

        if [ $ORGANISATION_ID -ne -1 ]
        then
	    if [ $ORG_EXISTED -eq 1 ]
	    then
#               res=`psql -h localhost -d toardb -U toaradmin -c "SELECT id FROM contacts\
                res=`psql -h zam10116.zam.kfa-juelich.de -d toardb_v2 -U s.schroeder -c "SELECT id FROM contacts\
        	    WHERE organisation_id=$ORGANISATION_ID;"`
	    else
#               res=`psql -h localhost -d toardb -U toaradmin -c "INSERT INTO contacts\
                res=`psql -h zam10116.zam.kfa-juelich.de -d toardb_v2 -U s.schroeder -c "INSERT INTO contacts\
	            (person_id,organisation_id) VALUES \
               	    (0,$ORGANISATION_ID) \
        	    RETURNING id;"`
	    fi
            CONTRIBUTOR_ID=$(echo $res | cut -f2 | cut -f3 -d' ')
        fi

        # 1. d) timeseries_roles
        # ======================

        if [ $PERSON_ID -ne -1 ]
        then
            # check, whether timeseries_role already exists
#           res=`psql -h localhost -d toardb -U toaradmin -c "SELECT id FROM timeseries_roles\
            res=`psql -h zam10116.zam.kfa-juelich.de -d toardb_v2 -U s.schroeder -c "SELECT id FROM timeseries_roles\
                WHERE role=$PI_ROLE AND status=$PI_STATUS AND contact_id=$PI_ID;"`
            ROLE1=$(echo $res | cut -f2 | cut -f3 -d' ')
            if [ "$ROLE1" == "(0" ]
            then
#               res=`psql -h localhost -d toardb -U toaradmin -c "INSERT INTO timeseries_roles\
                res=`psql -h zam10116.zam.kfa-juelich.de -d toardb_v2 -U s.schroeder -c "INSERT INTO timeseries_roles\
        		(role,status,contact_id) VALUES \
        		($PI_ROLE,$PI_STATUS,$PI_ID)\
        		RETURNING id;"`
                ROLE1=$(echo $res | cut -f2 | cut -f3 -d' ')
            fi
        fi
        if [ $ORGANISATION_ID -ne -1 ]
        then
            # check, whether timeseries_role already exists
#           res=`psql -h localhost -d toardb -U toaradmin -c "SELECT id FROM timeseries_roles\
            res=`psql -h zam10116.zam.kfa-juelich.de -d toardb_v2 -U s.schroeder -c "SELECT id FROM timeseries_roles\
                WHERE role=$CONTRIBUTOR_ROLE AND status=$CONTRIBUTOR_STATUS AND contact_id=$CONTRIBUTOR_ID;"`
            ROLE2=$(echo $res | cut -f2 | cut -f3 -d' ')
            if [ "$ROLE2" == "(0" ]
            then
#               res=`psql -h localhost -d toardb -U toaradmin -c "INSERT INTO timeseries_roles\
                res=`psql -h zam10116.zam.kfa-juelich.de -d toardb_v2 -U s.schroeder -c "INSERT INTO timeseries_roles\
        		(role,status,contact_id) VALUES \
        		($CONTRIBUTOR_ROLE,$CONTRIBUTOR_STATUS,$CONTRIBUTOR_ID) \
        		RETURNING id;"`
                ROLE2=$(echo $res | cut -f2 | cut -f3 -d' ')
            fi
        fi

        # 2. timeseries_annotations
        # =========================

        # parameter_calibration: none given for UBA measured ozone
        # comments: none given for UBA measured ozone

        # 3. timeseries
        # =============

        # "order" (column name of table timeseries) is a reserved word of PostgreSQL!!!

        # fill additional_metadata
        # accessing special fields within PostgreSQL is then possible via (f. ex.)
        # select additional_metadata->>'original_units' from timeseries;
        TRIMMED_PARAMETER_ATTRIBUTE=`echo -n "${PARAMETER_ATTRIBUTE//[[:space:]]/}"`
        TRIMMED_PARAMETER_SAMPLING_TYPE=`echo -n "${PARAMETER_SAMPLING_TYPE//[[:space:]]/}"`
        TRIMMED_PARAMETER_ORIGINAL_UNITS=`echo -e $PARAMETER_ORIGINAL_UNITS | awk '{$1=$1};1'`
        TRIMMED_PARAMETER_STATUS=`echo -n "${PARAMETER_STATUS//[[:space:]]/}"`
        TRIMMED_PARAMETER_INSTRUMENT_MANUFACTURER=`echo -e $PARAMETER_INSTRUMENT_MANUFACTURER | awk '{$1=$1};1'`
        TRIMMED_PARAMETER_INSTRUMENT_MODEL=`echo -e $PARAMETER_INSTRUMENT_MODEL | awk '{$1=$1};1'`
        ADDITIONAL_METADATA="{\"parameter_attribute\":\"$TRIMMED_PARAMETER_ATTRIBUTE\",\
                              \"parameter_sampling_type\":\"$TRIMMED_PARAMETER_SAMPLING_TYPE\",\
                              \"parameter_original_units\":\"$TRIMMED_PARAMETER_ORIGINAL_UNITS\",\
                              \"parameter_status\":\"$TRIMMED_PARAMETER_STATUS\",\
                              \"parameter_instrument_manufacturer\":\"$TRIMMED_PARAMETER_INSTRUMENT_MANUFACTURER\",\
                              \"parameter_instrument_model\":\"$TRIMMED_PARAMETER_INSTRUMENT_MODEL\"}"
        DATE_ADDED=$CREATION_DATE
        DATE_MODIFIED=$MODIFICATION_DATE

	# what about lower_case?????

        if [[ $PARAMETER_MEASUREMENT_METHOD == *"UV"* ]]
        then
            MEASUREMENT_METHOD=0
        else
            MEASUREMENT_METHOD=1
        fi
        SOURCE=1
	if [ "$TRIMMED_PARAMETER_ATTRIBUTE" == "REA" ]
	then
	    SOURCE=0
	fi
	LABEL=$TRIMMED_PARAMETER_ATTRIBUTE
#       res=`psql -h localhost -d toardb -U toaradmin -c "INSERT INTO timeseries \
        res=`psql -h zam10116.zam.kfa-juelich.de -d toardb_v2 -U s.schroeder -c "INSERT INTO timeseries\
         (label,\"order\",access_rights,sampling_frequency,aggregation,data_start_date,data_end_date,sampling_height,\
         additional_metadata,date_added,date_modified,station_id,variable_id,source,measurement_method,programme_id) VALUES \
         ('$LABEL',$ORDER,$ACCESS_RIGHTS,$SAMPLING_FREQUENCY,$AGGREGATION,'$DATA_START_DATE','$DATA_END_DATE',$SAMPLING_HEIGHT,\
         '$ADDITIONAL_METADATA','$DATE_ADDED','$DATE_MODIFIED',$NEW_STATIONID,$VARIABLE_ID,$SOURCE,$MEASUREMENT_METHOD,$PROGRAMME_ID) \
          RETURNING id;"`
        # log new id
        NEW_ID=$(echo $res | cut -f2 | cut -f3 -d' ')
        echo "$ID $NEW_ID $PARAMETER_NAME $SAMPLING" >>"timeseries_oldID_newID_${PARAMETER_NAME}_${SAMPLING}.txt"

        # 4. many-to-many relations
        # =========================

        # 4. a) timeseries_timeseries_roles
        # =================================

        if [ $PERSON_ID -ne -1 ]
	then
#           psql -h localhost -d toardb -U toaradmin -c "INSERT INTO timeseries_timeseries_roles\
            psql -h zam10116.zam.kfa-juelich.de -d toardb_v2 -U s.schroeder -c "INSERT INTO timeseries_timeseries_roles\
                    (timeseries_id,role_id) VALUES \
                    ($NEW_ID,$ROLE1);"
	fi
        if [ $ORGANISATION_ID -ne -1 ]
	then
#           psql -h localhost -d toardb -U toaradmin -c "INSERT INTO timeseries_timeseries_roles\
            psql -h zam10116.zam.kfa-juelich.de -d toardb_v2 -U s.schroeder -c "INSERT INTO timeseries_timeseries_roles\
                    (timeseries_id,role_id) VALUES \
                    ($NEW_ID,$ROLE2);"
        fi


        # 4. b) timeseries_timeseries_annotations
        # =======================================

        # for UBA: none (see above)

    fi

done < stations_oldID_newID.txt