Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
H
hls-download-pipeline
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Ehsan Zandi
hls-download-pipeline
Commits
62d15ef3
Commit
62d15ef3
authored
4 months ago
by
Ehsan
Browse files
Options
Downloads
Patches
Plain Diff
check the number of bands
parent
26cacaff
Branches
Branches containing commit
No related tags found
No related merge requests found
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
all_functions.py
+65
-54
65 additions, 54 deletions
all_functions.py
check_query_lists.py
+13
-12
13 additions, 12 deletions
check_query_lists.py
hls-qurey-tiles.py
+0
-0
0 additions, 0 deletions
hls-qurey-tiles.py
with
78 additions
and
66 deletions
all_functions.py
+
65
−
54
View file @
62d15ef3
...
...
@@ -8,6 +8,7 @@
from
datetime
import
datetime
import
math
# from osgeo import gdal
# from pyproj import Proj
# from pystac_client import Client
...
...
@@ -363,30 +364,34 @@ def acquisition_download_links_to_csv(tile_id, output_file_name,
acq_idx
=
-
1
for
acquisition
in
acquisitions
:
acq_idx
+=
1
for
i
in
range
(
0
,
len
(
acquisition
[
'
links
'
]),
2
):
if
"
https://data
"
in
acquisition
[
'
links
'
][
i
][
'
href
'
]
and
"
tif
"
in
acquisition
[
'
links
'
][
i
][
'
href
'
]:
download_links
=
download_links
+
[
acquisition
[
'
links
'
][
i
][
'
href
'
]]
link_list
=
[
acquisition
[
'
links
'
][
i
][
'
href
'
]
for
i
in
range
(
len
(
acquisition
[
'
links
'
]))]
df_current
=
pd
.
DataFrame
(
link_list
,
columns
=
[
"
download
"
])
df_current
=
df_current
.
loc
[
df_current
[
'
download
'
].
str
.
contains
(
"
https://data
"
)]
df_current
[
"
product
"
]
=
[
acquisition
[
'
producer_granule_id
'
][
0
:
7
]]
*
len
(
df_current
)
df_current
=
df_current
[
df_current
.
apply
(
lambda
row
:
filter_bands
(
row
),
axis
=
1
)]
num_links
=
len
(
df_current
)
# df_current["product"] = [acquisition['producer_granule_id'][0:7]] * num_links
# acq_idx += 1
# for i in range(0,len(acquisition['links']),2):
# if "https://data" in acquisition['links'][i]['href'] and "tif" in acquisition['links'][i]['href']:
# download_links = download_links + [acquisition['links'][i]['href']]
# download_links = download_links[0:2]
num_links
=
len
(
download_links
)
df_current
=
pd
.
DataFrame
(
download_links
,
index
=
None
,
columns
=
[
"
download
"
])
df_current
[
"
cloud
"
]
=
[
acquisition
[
'
cloud_cover
'
]]
*
num_links
df_current
[
"
id
"
]
=
[
acquisition
[
'
producer_granule_id
'
]]
*
num_links
df_current
[
"
date
"
]
=
[
acquisition
[
'
time_start
'
][
0
:
10
]]
*
num_links
df_current
[
"
time
"
]
=
[
acquisition
[
'
time_start
'
][
11
:
-
5
]]
*
num_links
df_current
[
"
product
"
]
=
[
acquisition
[
'
producer_granule_id
'
][
0
:
7
]]
*
num_links
df_current
[
"
tile
"
]
=
[
tile_id
]
*
num_links
df_current
[
"
acq_coords
"
]
=
""
*
num_links
# print(len(acquisitions_coords), acq_idx)
# print(acquisitions_coords[acq_idx])
df_current
[
"
acq_coords
"
]
=
[
acquisitions_coords
[
acq_idx
]]
*
num_links
df_current
[
"
tile_coords
"
]
=
[
tile_coords
]
*
num_links
df_current
.
index
=
range
(
len
(
df_current
))
if
ifComplete
and
acquisition
==
acquisitions
[
-
1
]:
status
=
"
complete
"
df_current
[
"
status
"
]
=
[
status
]
*
num_links
df_current
=
df_current
[
df_columns
]
df
=
pd
.
concat
([
df
,
df_current
],
ignore_index
=
True
)
df_unique
=
df
.
drop_duplicates
(
subset
=
[
"
download
"
],
keep
=
"
first
"
)
df
=
df_unique
#
df_unique = df.drop_duplicates(subset=["download"], keep="first")
#
df = df_unique
if
True
:
if
os
.
path
.
isfile
(
output_file_name
):
df
.
to_csv
(
output_file_name
,
mode
=
"
a
"
,
header
=
False
,
index
=
False
)
...
...
@@ -436,53 +441,59 @@ def tile_completeness_check_with_all_acquisitions(df, tile_id= [], if_printout =
if
not
if_complete
:
print
(
status_message
)
return
if_complete
,
ids
,
date_from
,
date_to
,
cloud_coverage_max
# plot_union_polygon(tile_id, union_polygon, tile_polygon)
def tile_completeness_check_with_two_acquisitions(df, tile_id=[], if_printout=True):
    """Look for a pair of acquisitions whose footprints together cover the tile.

    Only rows whose ``download`` link contains "FMask" (case-insensitive) are
    considered, i.e. one row per acquisition. Every pair
    ``(first_idx, second_idx)`` with ``second_idx >= first_idx`` is tried in
    order; the search stops at the first pair whose union, clipped to the
    tile polygon, leaves less than 0.1 % of the tile area uncovered.

    Args:
        df: DataFrame with the columns ``download``, ``id``, ``cloud``,
            ``acq_coords`` and ``tile_coords``. The two coordinate columns
            hold Python-literal strings that ``ast.literal_eval`` turns into
            a coordinate sequence accepted by ``Polygon``.
        tile_id: Only used in the status message.
        if_printout: When true, print the status message (terminated by a
            carriage return so consecutive messages overwrite each other).

    Returns:
        A tuple ``(if_complete, ids, date_from, date_to, cloud_coverage_max)``.
        ``ids`` holds the two granule ids of the covering pair, or is an empty
        list when no pair covers the tile (or when ``df`` has no FMask rows).
        ``cloud_coverage_max`` is the larger cloud value of the pair, or 0
        when no pair was found.
    """
    # The reporting window is a pair of fixed strings; this function does not
    # derive it from the acquisitions it selects.
    date_from = "2020-01-01"
    date_to = "2024-12-31"

    cloud_coverage_max = 0
    completeness_check = "incomplete"
    if_complete = False
    ids = []

    is_fmask = df['download'].str.contains("FMask", case=False, na=False)
    fmask_rows = df.loc[is_fmask].reset_index(drop=True)

    # Without any FMask row there is no tile polygon to read, so fall through
    # to the "incomplete" result instead of failing on the first-row lookup.
    if not fmask_rows.empty:
        tile_polygon = Polygon(ast.literal_eval(fmask_rows['tile_coords'].iloc[0]))

        # Parse every footprint once instead of once per inner-loop iteration.
        footprints = [
            Polygon(ast.literal_eval(coords)) for coords in fmask_rows['acq_coords']
        ]
        granule_ids = list(fmask_rows['id'])
        cloud_values = list(fmask_rows['cloud'])
        row_count = len(fmask_rows)

        for first_idx in range(row_count - 1):
            for second_idx in range(first_idx, row_count):
                # Union of exactly the two candidates: the reported ids and
                # cloud value describe this pair only, so footprints of other
                # acquisitions must not contribute to the coverage test.
                pair_footprint = footprints[first_idx].union(footprints[second_idx])
                covered = tile_polygon.intersection(pair_footprint)
                uncovered_percent = (
                    (tile_polygon.area - covered.area) / tile_polygon.area * 100
                )
                if uncovered_percent < 1e-1:
                    if_complete = True
                    ids = [granule_ids[first_idx], granule_ids[second_idx]]
                    cloud_coverage_max = max(
                        cloud_values[first_idx], cloud_values[second_idx]
                    )
                    break
            if if_complete:
                completeness_check = "complete"
                break

    status_message = f"{tile_id}, from {date_from} to {date_to}, maximum cloud coverage: {cloud_coverage_max}, {completeness_check}!"
    if if_printout:
        print(status_message, end="\r")
    return if_complete, ids, date_from, date_to, cloud_coverage_max
# status_message = f"{tile_id}, from {date_from} to {date_to}, maximum cloud coverage: {cloud_coverage_max}, {completeness_check}!"
# if if_printout: print(status_message, end="\r")
# return if_complete, ids, date_from, date_to, cloud_coverage_max
# plot_union_polygon(tile_id, union_polygon, tile_polygon)
def
filter_bands
(
row
,
bands
):
def check_if_bands_are_correct(df, expected_band_count=7):
    """Report granules whose number of rows differs from the expected band count.

    Rows are grouped by the ``id`` column. For every group whose size is not
    ``expected_band_count`` a warning line is printed, built from the first
    ``tile`` and ``date`` values of that group.

    Args:
        df: DataFrame with the columns ``id``, ``tile`` and ``date``.
        expected_band_count: Number of rows each ``id`` should have.
            Defaults to 7.

    Returns:
        A list of ``(tile, date)`` tuples, one per mismatching granule, in
        order of first appearance in ``df``. Empty when every granule has the
        expected number of rows.
    """
    mismatched = []
    # sort=False keeps the groups in first-appearance order.
    for _, granule_rows in df.groupby('id', sort=False):
        if len(granule_rows) == expected_band_count:
            continue
        tile_id = granule_rows['tile'].iloc[0]
        tile_date = granule_rows['date'].iloc[0]
        print(f"{tile_id}-{tile_date}: bands are not correct!")
        mismatched.append((tile_id, tile_date))
    return mismatched
def
filter_bands
(
row
):
bands
=
{
'
l30
'
:
[
"
B02
"
,
"
B03
"
,
"
B04
"
,
"
B05
"
,
"
B06
"
,
"
B07
"
,
"
Fmask
"
],
'
s30
'
:
[
"
B02
"
,
"
B03
"
,
"
B04
"
,
"
B8A
"
,
"
B11
"
,
"
B12
"
,
"
Fmask
"
]
}
product_key
=
row
[
"
product
"
].
split
(
'
.
'
)[
-
1
].
lower
()
# Extract 's30' or 'l30'
if
product_key
in
bands
:
return
any
(
b
in
row
[
"
download
"
]
for
b
in
bands
[
product_key
])
...
...
This diff is collapsed.
Click to expand it.
check_query_lists.py
+
13
−
12
View file @
62d15ef3
...
...
@@ -7,7 +7,9 @@ from all_functions import plot_histogram_of_tiles
from
all_functions
import
filter_bands
from
all_functions
import
filter_ids
from
all_functions
import
tile_completeness_check_with_all_acquisitions
as
tile_completeness_check
# from all_functions import tile_completeness_check_with_two_acquisitions as tile_completeness_check
from
all_functions
import
time_elapsed
from
all_functions
import
check_if_bands_are_correct
def
analyze_query_list
(
cloud_coverage_step
=
10
):
input_file
=
f
"
amazon-download-links_cloud-coverage-step-
{
cloud_coverage_step
}
.csv
"
...
...
@@ -15,25 +17,24 @@ def analyze_query_list(cloud_coverage_step = 10):
print
(
f
"
There exists no such file as
{
input_file
}
"
)
return
df
=
pd
.
read_csv
(
input_file
)
bands
=
{
'
l30
'
:
[
"
B02
"
,
"
B03
"
,
"
B04
"
,
"
B05
"
,
"
B06
"
,
"
B07
"
,
"
Fmask
"
],
'
s30
'
:
[
"
B02
"
,
"
B03
"
,
"
B04
"
,
"
B8A
"
,
"
B11
"
,
"
B12
"
,
"
Fmask
"
]
}
df_band_filtered
=
df
[
df
.
apply
(
lambda
row
:
filter_bands
(
row
,
bands
),
axis
=
1
)]
df_sorted
=
df_band_filtered
.
loc
[
df_band_filtered
[
'
date
'
].
sort_values
().
index
]
# df_band_filtered = df[df.apply(lambda row: filter_bands(row), axis=1)]
# df_sorted = df_band_filtered.loc[df_band_filtered['date'].sort_values().index]
tile_id_list
=
list
(
df
[
'
tile
'
].
unique
())
# for tile_id in tile_id_list: print(tile_id)
df_selected
=
pd
.
DataFrame
(
columns
=
df
.
columns
)
df
=
df_band_filtered
print
(
f
"
Clodud step size:
{
cloud_coverage_step
}
, number of files:
{
len
(
df
)
}
"
)
# df_selected = pd.DataFrame(columns=df.columns)
# df = df_band_filtered
df_unique
=
df
.
drop_duplicates
(
subset
=
[
'
download
'
])
# df = df_unique
print
(
f
"
Cloud step size:
{
cloud_coverage_step
}
, number of files:
{
len
(
df
)
}
"
)
time_interval_list
=
[]
cloud_coverage_max_list
=
[]
incomplete_tile_list
=
[]
for
tile_id
in
tile_id_list
:
df_tile
=
df
.
loc
[
df
[
'
tile
'
]
==
tile_id
]
check_if_bands_are_correct
(
df_tile
)
if_complete
,
ids
,
date_from
,
date_to
,
cloud_coverage_max_current
=
tile_completeness_check
(
df_tile
,
tile_id
,
if_printout
=
False
)
df_tile_selected
=
df_tile
[
df_tile
.
apply
(
lambda
row
:
filter_ids
(
row
,
ids
),
axis
=
1
)]
df_selected
=
pd
.
concat
([
df_selected
,
df_tile_selected
],
ignore_index
=
True
)
#
df_tile_selected = df_tile[df_tile.apply(lambda row: filter_ids(row, ids), axis=1)]
#
df_selected = pd.concat([df_selected, df_tile_selected], ignore_index=True)
if
not
if_complete
:
incomplete_tile_list
.
append
(
tile_id
)
months_elapsed
,
days_elapsed
=
time_elapsed
(
date_from
,
date_to
)
...
...
@@ -44,7 +45,7 @@ def analyze_query_list(cloud_coverage_step = 10):
image_output_file
=
f
"
histogram-cloud-step-size-
{
cloud_coverage_step
}
.png
"
if
os
.
path
.
isdir
(
image_output_dir
):
image_output_file
=
image_output_dir
+
"
/
"
+
image_output_file
df_
selected
.
to_csv
(
"
final_
"
+
input_file
,
header
=
True
,
index
=
False
)
#
df_
unique
.to_csv("final_"+input_file, header=True, index=False)
plot_histogram_of_tiles
(
time_interval_list
,
cloud_coverage_max_list
,
cloud_coverage_step
,
image_output_file
)
if
len
(
incomplete_tile_list
)
>
0
:
df_incomplete
=
pd
.
DataFrame
(
incomplete_tile_list
,
columns
=
[
'
tile
'
])
...
...
This diff is collapsed.
Click to expand it.
hls.py
→
hls
-qurey-tiles
.py
+
0
−
0
View file @
62d15ef3
File moved
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment