diff --git a/prepare.sh b/prepare.sh index 79d250cfb400a771d4ae9acc9ccef3b9b5bf59fd..79b9776c7aa1c98ae01953a9bcb3db1aa9367882 100755 --- a/prepare.sh +++ b/prepare.sh @@ -5,7 +5,7 @@ # venv is created in this directory. # check if we are really in the ozone-mapping directory -S="ozone-mapping" +S="aq-bench" if [[ $(pwd) == *$S ]] then echo "Prepare..." diff --git a/source/dataset_retrieval.py b/source/dataset_retrieval.py index b1dc6e037f9f7d1020acf90a73f4a0157a50fd09..9b0501ed9430457e8b0b04f1967df6d6e98dec0f 100644 --- a/source/dataset_retrieval.py +++ b/source/dataset_retrieval.py @@ -193,7 +193,8 @@ class AQbench(): save_data_to_file(self.data, self.data_dir+'AQbench.csv') logging.warning("""Do not forget to improve the population density of id 4589 and throw out station id 4587 - because of very high ozone values reported there""") + because of very high ozone values reported there. + Also, drop station without metadata.""") logging.info('AQbench complete') @@ -418,6 +419,43 @@ class MetricsRow(): + ' ' + print_metric + ': ' + str(self.row[print_metric])) +def data_capture(): + """ + A simple function to look at the data capture of our metrics. 
+    """
+    import ast, pdb
+    # df = pd.read_csv(resources_dir+'yearly_metrics.csv')
+    # df['hourly_samples'] = [0] * len(df)
+    # df['capture'] = [0] * len(df)
+    df = pd.read_csv(resources_dir+'intermediate_at_cap5500.csv')
+    for idx, row in df.iterrows():
+        if idx > 5500:
+            id_tuple = row['o3_series_id']
+            id_string_list = [str(id_) for id_ in ast.literal_eval(id_tuple)]
+            query = f"""
+            SELECT
+            datetime, value
+            FROM o3_hourly
+            WHERE id IN ({','.join(id_string_list)})
+            AND datetime between '2010-01-01 00:00:00'
+            AND '2014-12-31 23:59:59';
+            """
+            result = query_db(query)
+            result.drop_duplicates(subset='datetime', inplace=True,
+                                   ignore_index=True)
+            count = len(result)
+            df.loc[idx, 'hourly_samples'] = count
+            df.loc[idx, 'capture'] = count / 43824
+            print(count, count/43824)
+            if (idx > 1) and (idx % 500 == 0):
+                df.to_csv(resources_dir+f'intermediate_at_cap{idx}.csv',
+                          index=False)
+
+    df.to_csv(resources_dir+'yearly_metrics_cap.csv',
+              index=False)
+
+    pdb.set_trace()
+
 def full_aqbench():
     """
     start one retrieval.
@@ -457,4 +495,5 @@ if __name__ == '__main__':
                              logging.StreamHandler()])
 
     # start retrieval
-    full_aqbench()
+    # full_aqbench()
+    data_capture()