diff --git a/all_functions.py b/all_functions.py index 346863f36b1bf619cf191ec93bcc41c9993150f8..9f4a337d339c9c6c44cb71df76ff7bb86cc73968 100644 --- a/all_functions.py +++ b/all_functions.py @@ -8,6 +8,7 @@ from datetime import datetime import math + # from osgeo import gdal # from pyproj import Proj # from pystac_client import Client @@ -363,30 +364,34 @@ def acquisition_download_links_to_csv(tile_id, output_file_name, acq_idx = -1 for acquisition in acquisitions: acq_idx += 1 - for i in range(0,len(acquisition['links']),2): - if "https://data" in acquisition['links'][i]['href'] and "tif" in acquisition['links'][i]['href']: - download_links = download_links + [acquisition['links'][i]['href']] + link_list = [acquisition['links'][i]['href'] for i in range(len(acquisition['links']))] + df_current = pd.DataFrame(link_list, columns=["download"]) + df_current = df_current.loc[df_current['download'].str.contains("https://data")] + df_current["product"] = [acquisition['producer_granule_id'][0:7]] * len(df_current) + df_current = df_current[df_current.apply(lambda row: filter_bands(row), axis=1)] + num_links = len(df_current) +# df_current["product"] = [acquisition['producer_granule_id'][0:7]] * num_links + # acq_idx += 1 + # for i in range(0,len(acquisition['links']),2): + # if "https://data" in acquisition['links'][i]['href'] and "tif" in acquisition['links'][i]['href']: + # download_links = download_links + [acquisition['links'][i]['href']] # download_links = download_links[0:2] - num_links = len(download_links) - df_current = pd.DataFrame(download_links, index=None, columns=["download"]) df_current["cloud"] = [acquisition['cloud_cover']] * num_links df_current["id"] = [acquisition['producer_granule_id']] * num_links df_current["date"] = [acquisition['time_start'][0:10]] * num_links df_current["time"] = [acquisition['time_start'][11:-5]] * num_links - df_current["product"] = [acquisition['producer_granule_id'][0:7]] * num_links df_current["tile"] = [tile_id] * num_links df_current["acq_coords"] = "" * num_links -# print(len(acquisitions_coords), acq_idx) -# print(acquisitions_coords[acq_idx]) df_current["acq_coords"] = [acquisitions_coords[acq_idx]] * num_links df_current["tile_coords"] =[tile_coords] * num_links + df_current.index = range(len(df_current)) if ifComplete and acquisition == acquisitions[-1]: status = "complete" df_current["status"] = [status] * num_links df_current = df_current[df_columns] df = pd.concat([df, df_current], ignore_index=True) - df_unique = df.drop_duplicates(subset=["download"], keep="first") - df = df_unique + # df_unique = df.drop_duplicates(subset=["download"], keep="first") + # df = df_unique if True: if os.path.isfile(output_file_name): df.to_csv(output_file_name, mode = "a", header = False, index = False) @@ -436,53 +441,59 @@ def tile_completeness_check_with_all_acquisitions(df, tile_id= [], if_printout = if not if_complete: print(status_message) return if_complete, ids, date_from, date_to, cloud_coverage_max # plot_union_polygon(tile_id, union_polygon, tile_polygon) -def tile_completeness_check_with_two_acquisitions(df, tile_id= [], if_printout = True): - df_fmask = df.loc[df['download'].str.contains("FMask", case=False, na=False)] - df_fmask.index = range(0,len(df_fmask.index)) - df_acq_coords = df_fmask['acq_coords'] - tile_coords = ast.literal_eval(df_fmask['tile_coords'].loc[0]) - tile_polygon = Polygon(tile_coords) - cloud_coverage_max = 0 - completeness_check = "incomplete" - if_complete = False - ids = "" - for first_idx in range(0, len(df_fmask)-1): - ids = [df_fmask['id'].loc[first_idx]] - print("here: ", ids) - union_polygon = Polygon([]) - if_complete = False - cloud_coverage_first = df_fmask['cloud'].loc[first_idx] - for second_idx in range(first_idx,len(df_fmask)): - cloud_coverage_second = df_fmask['cloud'].loc[second_idx] - acq_coords = ast.literal_eval(df_fmask['acq_coords'].loc[second_idx]) - # coord_tmp = [[float(coord_tmp[i+1]),float(coord_tmp[i])] for i in range(0,int(len(coord_tmp)),2)] - acquisition_polygon = Polygon(acq_coords) - union_polygon = union_polygon.union(acquisition_polygon) - union_polygon = tile_polygon.intersection(union_polygon) - polygon_surface_relative_diff_percent = (tile_polygon.area - union_polygon.area) / tile_polygon.area*100 - if polygon_surface_relative_diff_percent < 1e-1: - if_complete = True - ids.append(df_fmask['id'].loc[second_idx]) - cloud_coverage_max = max(cloud_coverage_first, cloud_coverage_second) - print(cloud_coverage_max) - break - if if_complete: - print(cloud_coverage_max) - completeness_check = "complete" - break - - print(cloud_coverage_max) + +# def tile_completeness_check_with_two_acquisitions(df, tile_id= [], if_printout = True): +# df_fmask = df.loc[df['download'].str.contains("FMask", case=False, na=False)] +# df_fmask.index = range(0,len(df_fmask.index)) +# df_acq_coords = df_fmask['acq_coords'] +# tile_coords = ast.literal_eval(df_fmask['tile_coords'].loc[0]) +# tile_polygon = Polygon(tile_coords) +# cloud_coverage_max = 0 +# completeness_check = "incomplete" +# if_complete = False +# for first_idx in range(0, len(df_fmask)-1): +# ids = [df_fmask['id'].loc[first_idx]] +# union_polygon = Polygon([]) +# if_complete = False +# cloud_coverage_first = df_fmask['cloud'].loc[first_idx] +# for second_idx in range(first_idx,len(df_fmask)): +# cloud_coverage_second = df_fmask['cloud'].loc[second_idx] +# acq_coords = ast.literal_eval(df_fmask['acq_coords'].loc[second_idx]) +# acquisition_polygon = Polygon(acq_coords) +# union_polygon = union_polygon.union(acquisition_polygon) +# union_polygon = tile_polygon.intersection(union_polygon) +# polygon_surface_relative_diff_percent = (tile_polygon.area - union_polygon.area) / tile_polygon.area*100 +# if polygon_surface_relative_diff_percent < 1e-1: +# if_complete = True +# ids.append(df_fmask['id'].loc[second_idx]) +# cloud_coverage_max = max(cloud_coverage_first, cloud_coverage_second) +# break +# if if_complete: +# completeness_check = "complete" +# break +# # date_to = max(df_fmask['date'].loc[first_idx], df_fmask['date'].loc[second_idx]) # date_from = min(df_fmask['date'].loc[first_idx], df_fmask['date'].loc[second_idx]) - date_from = "2020-01-01" - date_to = "2024-12-31" - status_message = f"{tile_id}, from {date_from} to {date_to}, maximum cloud coverage: {cloud_coverage_max}, {completeness_check}!" - if if_printout: print(status_message, end="\r") - # print(status_message) - return if_complete, ids, date_from, date_to, cloud_coverage_max - # plot_union_polygon(tile_id, union_polygon, tile_polygon) +# status_message = f"{tile_id}, from {date_from} to {date_to}, maximum cloud coverage: {cloud_coverage_max}, {completeness_check}!" +# if if_printout: print(status_message, end="\r") +# return if_complete, ids, date_from, date_to, cloud_coverage_max +# plot_union_polygon(tile_id, union_polygon, tile_polygon) -def filter_bands(row, bands): +def check_if_bands_are_correct(df): + ids = list(df['id'].unique()) + for id in ids: + df_id = df.loc[df['id'] == id] + tile_id = list(df_id['tile'])[0] + tile_date = list(df_id['date'])[0] +# tile_id = df_id['tile'] + if len(df_id) != 7: + print(f"{tile_id}-{tile_date}: bands are not correct!") +# raise ValueError(f"{tile_id}-{tile_date}: bands are not correct!") +def filter_bands(row): + bands = { + 'l30': ["B02", "B03", "B04", "B05", "B06", "B07", "Fmask"], + 's30': ["B02", "B03", "B04", "B8A", "B11", "B12", "Fmask"] + } product_key = row["product"].split('.')[-1].lower() # Extract 's30' or 'l30' if product_key in bands: return any(b in row["download"] for b in bands[product_key]) diff --git a/check_query_lists.py b/check_query_lists.py index 79d9a56ed0785dbebb72119fac316b2f7e702dcb..ee4cae8c4bec604d977b67407f9277e2bf8eef99 100755 --- a/check_query_lists.py +++ b/check_query_lists.py @@ -7,7 +7,9 @@ from all_functions import plot_histogram_of_tiles from all_functions import filter_bands from all_functions import filter_ids from all_functions import tile_completeness_check_with_all_acquisitions as tile_completeness_check +# from all_functions import tile_completeness_check_with_two_acquisitions as tile_completeness_check from all_functions import time_elapsed +from all_functions import check_if_bands_are_correct def analyze_query_list(cloud_coverage_step = 10): input_file = f"amazon-download-links_cloud-coverage-step-{cloud_coverage_step}.csv" @@ -15,25 +17,24 @@ def analyze_query_list(cloud_coverage_step = 10): print(f"There exists no such file as {input_file}") return df = pd.read_csv(input_file) - bands = { - 'l30': ["B02", "B03", "B04", "B05", "B06", "B07", "Fmask"], - 's30': ["B02", "B03", "B04", "B8A", "B11", "B12", "Fmask"] - } - df_band_filtered = df[df.apply(lambda row: filter_bands(row, bands), axis=1)] - df_sorted = df_band_filtered.loc[df_band_filtered['date'].sort_values().index] +# df_band_filtered = df[df.apply(lambda row: filter_bands(row), axis=1)] +# df_sorted = df_band_filtered.loc[df_band_filtered['date'].sort_values().index] tile_id_list = list(df['tile'].unique()) # for tile_id in tile_id_list: print(tile_id) - df_selected = pd.DataFrame(columns=df.columns) - df = df_band_filtered - print(f"Clodud step size:{cloud_coverage_step}, number of files:{len(df)}") +# df_selected = pd.DataFrame(columns=df.columns) +# df = df_band_filtered + df_unique = df.drop_duplicates(subset=['download']) +# df = df_unique + print(f"Cloud step size:{cloud_coverage_step}, number of files:{len(df)}") time_interval_list = [] cloud_coverage_max_list = [] incomplete_tile_list = [] for tile_id in tile_id_list: df_tile = df.loc[df['tile'] == tile_id] + check_if_bands_are_correct(df_tile) if_complete, ids, date_from, date_to, cloud_coverage_max_current = tile_completeness_check(df_tile, tile_id, if_printout = False) - df_tile_selected = df_tile[df_tile.apply(lambda row: filter_ids(row, ids), axis=1)] - df_selected = pd.concat([df_selected, df_tile_selected], ignore_index=True) +# df_tile_selected = df_tile[df_tile.apply(lambda row: filter_ids(row, ids), axis=1)] +# df_selected = pd.concat([df_selected, df_tile_selected], ignore_index=True) if not if_complete: incomplete_tile_list.append(tile_id) months_elapsed, days_elapsed = time_elapsed(date_from, date_to) @@ -44,7 +45,7 @@ def analyze_query_list(cloud_coverage_step = 10): image_output_file = f"histogram-cloud-step-size-{cloud_coverage_step}.png" if os.path.isdir(image_output_dir): image_output_file = image_output_dir+"/"+image_output_file - df_selected.to_csv("final_"+input_file, header=True, index=False) +# df_unique.to_csv("final_"+input_file, header=True, index=False) plot_histogram_of_tiles(time_interval_list, cloud_coverage_max_list, cloud_coverage_step, image_output_file) if len(incomplete_tile_list) > 0: df_incomplete = pd.DataFrame(incomplete_tile_list, columns=['tile']) diff --git a/hls.py b/hls-qurey-tiles.py similarity index 100% rename from hls.py rename to hls-qurey-tiles.py