check the number of bands

62d15ef3 · Ehsan · 26cacaff · 62d15ef3 · 62d15ef3 · 62d15ef3
Commit 62d15ef3 authored 4 months ago by Ehsan
--- a/all_functions.py
+++ b/all_functions.py
@@ -8,6 +8,7 @@
 from datetime import datetime
 import math

+
 # from osgeo import gdal
 # from pyproj import Proj
 # from pystac_client import Client
@@ -363,30 +364,34 @@ def acquisition_download_links_to_csv(tile_id, output_file_name,
    acq_idx = -1
    for acquisition in acquisitions:
        acq_idx += 1
-        for i in range(0,len(acquisition['links']),2):
-            if "https://data" in acquisition['links'][i]['href'] and "tif" in acquisition['links'][i]['href']:
-                download_links = download_links + [acquisition['links'][i]['href']]
+        link_list = [acquisition['links'][i]['href'] for i in range(len(acquisition['links']))]
+        df_current = pd.DataFrame(link_list, columns=["download"])
+        df_current = df_current.loc[df_current['download'].str.contains("https://data")]
+        df_current["product"] = [acquisition['producer_granule_id'][0:7]] * len(df_current)
+        df_current = df_current[df_current.apply(lambda row: filter_bands(row), axis=1)]
+        num_links = len(df_current)
+#         df_current["product"] = [acquisition['producer_granule_id'][0:7]] * num_links
+        # acq_idx += 1
+        # for i in range(0,len(acquisition['links']),2):
+        #     if "https://data" in acquisition['links'][i]['href'] and "tif" in acquisition['links'][i]['href']:
+        #         download_links = download_links + [acquisition['links'][i]['href']]
        # download_links = download_links[0:2]
-        num_links = len(download_links)
-        df_current = pd.DataFrame(download_links, index=None, columns=["download"])
        df_current["cloud"] = [acquisition['cloud_cover']] * num_links
        df_current["id"] = [acquisition['producer_granule_id']] * num_links
        df_current["date"] = [acquisition['time_start'][0:10]] * num_links
        df_current["time"] = [acquisition['time_start'][11:-5]] * num_links
-        df_current["product"] = [acquisition['producer_granule_id'][0:7]] * num_links
        df_current["tile"] = [tile_id] * num_links
        df_current["acq_coords"] = "" * num_links
-#         print(len(acquisitions_coords), acq_idx)
-#         print(acquisitions_coords[acq_idx])
        df_current["acq_coords"] = [acquisitions_coords[acq_idx]] * num_links
        df_current["tile_coords"] =[tile_coords] * num_links
+        df_current.index = range(len(df_current))
        if ifComplete and acquisition == acquisitions[-1]:
            status = "complete"
        df_current["status"] = [status] * num_links
        df_current = df_current[df_columns]
        df = pd.concat([df, df_current], ignore_index=True)
-        df_unique = df.drop_duplicates(subset=["download"], keep="first")
-        df = df_unique
+        # df_unique = df.drop_duplicates(subset=["download"], keep="first")
+        # df = df_unique
    if True:
        if os.path.isfile(output_file_name):
            df.to_csv(output_file_name, mode = "a", header = False, index = False)
@@ -436,53 +441,59 @@ def tile_completeness_check_with_all_acquisitions(df, tile_id= [], if_printout =
    if not if_complete: print(status_message)
    return if_complete, ids, date_from, date_to, cloud_coverage_max
    # plot_union_polygon(tile_id, union_polygon, tile_polygon)
-def tile_completeness_check_with_two_acquisitions(df, tile_id= [], if_printout = True):
-    df_fmask = df.loc[df['download'].str.contains("FMask", case=False, na=False)]
-    df_fmask.index = range(0,len(df_fmask.index))
-    df_acq_coords = df_fmask['acq_coords']
-    tile_coords = ast.literal_eval(df_fmask['tile_coords'].loc[0])
-    tile_polygon = Polygon(tile_coords)
-    cloud_coverage_max = 0
-    completeness_check = "incomplete"
-    if_complete = False
-    ids = ""
-    for first_idx in range(0, len(df_fmask)-1):
-        ids = [df_fmask['id'].loc[first_idx]]
-        print("here:  ", ids)
-        union_polygon = Polygon([])
-        if_complete = False
-        cloud_coverage_first = df_fmask['cloud'].loc[first_idx]
-        for second_idx in range(first_idx,len(df_fmask)):
-            cloud_coverage_second = df_fmask['cloud'].loc[second_idx]
-            acq_coords = ast.literal_eval(df_fmask['acq_coords'].loc[second_idx])
-            # coord_tmp = [[float(coord_tmp[i+1]),float(coord_tmp[i])] for i in range(0,int(len(coord_tmp)),2)]
-            acquisition_polygon = Polygon(acq_coords)
-            union_polygon = union_polygon.union(acquisition_polygon)
-            union_polygon = tile_polygon.intersection(union_polygon)
-            polygon_surface_relative_diff_percent = (tile_polygon.area - union_polygon.area) / tile_polygon.area*100 
-            if polygon_surface_relative_diff_percent < 1e-1:
-                if_complete = True
-                ids.append(df_fmask['id'].loc[second_idx])
-                cloud_coverage_max = max(cloud_coverage_first, cloud_coverage_second)
-                print(cloud_coverage_max)
-                break
-        if if_complete: 
-            print(cloud_coverage_max)
-            completeness_check = "complete"
-            break
    
-    print(cloud_coverage_max)
+# def tile_completeness_check_with_two_acquisitions(df, tile_id= [], if_printout = True):
+#     df_fmask = df.loc[df['download'].str.contains("FMask", case=False, na=False)]
+#     df_fmask.index = range(0,len(df_fmask.index))
+#     df_acq_coords = df_fmask['acq_coords']
+#     tile_coords = ast.literal_eval(df_fmask['tile_coords'].loc[0])
+#     tile_polygon = Polygon(tile_coords)
+#     cloud_coverage_max = 0
+#     completeness_check = "incomplete"
+#     if_complete = False
+#     for first_idx in range(0, len(df_fmask)-1):
+#         ids = [df_fmask['id'].loc[first_idx]]
+#         union_polygon = Polygon([])
+#         if_complete = False
+#         cloud_coverage_first = df_fmask['cloud'].loc[first_idx]
+#         for second_idx in range(first_idx,len(df_fmask)):
+#             cloud_coverage_second = df_fmask['cloud'].loc[second_idx]
+#             acq_coords = ast.literal_eval(df_fmask['acq_coords'].loc[second_idx])
+#             acquisition_polygon = Polygon(acq_coords)
+#             union_polygon = union_polygon.union(acquisition_polygon)
+#             union_polygon = tile_polygon.intersection(union_polygon)
+#             polygon_surface_relative_diff_percent = (tile_polygon.area - union_polygon.area) / tile_polygon.area*100 
+#             if polygon_surface_relative_diff_percent < 1e-1:
+#                 if_complete = True
+#                 ids.append(df_fmask['id'].loc[second_idx])
+#                 cloud_coverage_max = max(cloud_coverage_first, cloud_coverage_second)
+#                 break
+#         if if_complete: 
+#             completeness_check = "complete"
+#             break
+# 
 #     date_to = max(df_fmask['date'].loc[first_idx], df_fmask['date'].loc[second_idx])
 #     date_from = min(df_fmask['date'].loc[first_idx], df_fmask['date'].loc[second_idx])
-    date_from = "2020-01-01"
-    date_to = "2024-12-31"
-    status_message = f"{tile_id}, from {date_from} to {date_to}, maximum cloud coverage: {cloud_coverage_max}, {completeness_check}!"
-    if if_printout: print(status_message, end="\r")
-    # print(status_message)
-    return if_complete, ids, date_from, date_to, cloud_coverage_max
+#     status_message = f"{tile_id}, from {date_from} to {date_to}, maximum cloud coverage: {cloud_coverage_max}, {completeness_check}!"
+#     if if_printout: print(status_message, end="\r")
+#     return if_complete, ids, date_from, date_to, cloud_coverage_max
 # plot_union_polygon(tile_id, union_polygon, tile_polygon)
        
-def filter_bands(row, bands):
+def check_if_bands_are_correct(df):
+    """Print a warning for each acquisition id in df that does not have exactly 7 rows."""
+    # 7 equals the length of each per-product band list declared in filter_bands.
+    for granule_id in df['id'].unique():
+        df_id = df.loc[df['id'] == granule_id]
+        # The first row of the group supplies the tile/date shown in the message.
+        tile_id = df_id['tile'].iloc[0]
+        tile_date = df_id['date'].iloc[0]
+        if len(df_id) != 7:
+            print(f"{tile_id}-{tile_date}: bands are not correct!")
+def filter_bands(row):
+    bands = {
+        'l30': ["B02", "B03", "B04", "B05", "B06", "B07", "Fmask"],
+        's30': ["B02", "B03", "B04", "B8A", "B11", "B12", "Fmask"]
+    }
    product_key = row["product"].split('.')[-1].lower()  # Extract 's30' or 'l30'
    if product_key in bands:
        return any(b in row["download"] for b in bands[product_key])

--- a/check_query_lists.py
+++ b/check_query_lists.py
@@ -7,7 +7,9 @@ from all_functions import plot_histogram_of_tiles
 from all_functions import filter_bands
 from all_functions import filter_ids
 from all_functions import tile_completeness_check_with_all_acquisitions as tile_completeness_check
+# from all_functions import tile_completeness_check_with_two_acquisitions as tile_completeness_check
 from all_functions import time_elapsed
+from all_functions import check_if_bands_are_correct

 def analyze_query_list(cloud_coverage_step = 10):
    input_file = f"amazon-download-links_cloud-coverage-step-{cloud_coverage_step}.csv"
@@ -15,25 +17,24 @@ def analyze_query_list(cloud_coverage_step = 10):
        print(f"There exists no such file as {input_file}")
        return
    df = pd.read_csv(input_file)
-    bands = {
-        'l30': ["B02", "B03", "B04", "B05", "B06", "B07", "Fmask"],
-        's30': ["B02", "B03", "B04", "B8A", "B11", "B12", "Fmask"]
-    }
-    df_band_filtered = df[df.apply(lambda row: filter_bands(row, bands), axis=1)]
-    df_sorted = df_band_filtered.loc[df_band_filtered['date'].sort_values().index]
+#     df_band_filtered = df[df.apply(lambda row: filter_bands(row), axis=1)]
+#     df_sorted = df_band_filtered.loc[df_band_filtered['date'].sort_values().index]
    tile_id_list = list(df['tile'].unique())
    # for tile_id in tile_id_list: print(tile_id)
-    df_selected = pd.DataFrame(columns=df.columns)
-    df = df_band_filtered
-    print(f"Clodud step size:{cloud_coverage_step}, number of files:{len(df)}")
+#     df_selected = pd.DataFrame(columns=df.columns)
+#     df = df_band_filtered
+    df_unique = df.drop_duplicates(subset=['download'])
+#     df = df_unique
+    print(f"Cloud step size:{cloud_coverage_step}, number of files:{len(df)}")
    time_interval_list = []
    cloud_coverage_max_list = []
    incomplete_tile_list = [] 
    for tile_id in tile_id_list: 
        df_tile = df.loc[df['tile'] == tile_id]
+        check_if_bands_are_correct(df_tile)
        if_complete, ids, date_from, date_to, cloud_coverage_max_current = tile_completeness_check(df_tile, tile_id, if_printout = False)
-        df_tile_selected = df_tile[df_tile.apply(lambda row: filter_ids(row, ids), axis=1)]
-        df_selected = pd.concat([df_selected, df_tile_selected], ignore_index=True)
+#         df_tile_selected = df_tile[df_tile.apply(lambda row: filter_ids(row, ids), axis=1)]
+#         df_selected = pd.concat([df_selected, df_tile_selected], ignore_index=True)
        if not if_complete:
            incomplete_tile_list.append(tile_id)
        months_elapsed, days_elapsed = time_elapsed(date_from, date_to)
@@ -44,7 +45,7 @@ def analyze_query_list(cloud_coverage_step = 10):
        image_output_file = f"histogram-cloud-step-size-{cloud_coverage_step}.png"
        if os.path.isdir(image_output_dir):
          image_output_file = image_output_dir+"/"+image_output_file
-    df_selected.to_csv("final_"+input_file, header=True, index=False)
+#     df_unique.to_csv("final_"+input_file, header=True, index=False)
    plot_histogram_of_tiles(time_interval_list, cloud_coverage_max_list, cloud_coverage_step, image_output_file)
    if len(incomplete_tile_list) > 0:
        df_incomplete = pd.DataFrame(incomplete_tile_list, columns=['tile'])

--- a/hls.py
+++ b/hls.py