Commit a91dfe85 authored by Ehsan

download script added

parent 3d23dad7
#!/usr/bin/env python3
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
from urllib3.util import Retry
from time import sleep
import logging
import gzip
import shutil
import sys
import os
import pandas as pd
import argparse
import glob
# Define function to download a single file.
# Note: download_path is a module-level variable set in the __main__ block below.
def download_file(url):
    # Get file name from URL
    filename = url.split('/')[-1]
    # Preserve the "segments/..." directory structure from the URL when present
    if url.find("segments") != -1:
        filepath = download_path + "/" + url[url.find("segments"):]
    else:
        filepath = download_path + "/" + url.split("/")[-1]
    # Skip files that have already been downloaded
    if os.path.exists(filepath):
        return filename
    filedir = str.join("/", filepath.split('/')[:-1])
    if not os.path.exists(filedir):
        os.makedirs(filedir, exist_ok=True)
    # Ask the server for the file size so the progress bar has a total
    response = requests.head(url)
    file_size = int(response.headers["content-length"])
    with tqdm(unit="B", unit_scale=True, total=file_size, leave=False, desc=filename, dynamic_ncols=True) as pbar:
        retry_count = 1000
        while retry_count:
            try:
                with requests.get(url, stream=True) as r:
                    r.raise_for_status()
                    # Write file to disk in 1 MiB chunks
                    with open(filepath, 'wb') as f:
                        for chunk in r.iter_content(chunk_size=1048576):
                            f.write(chunk)
                            pbar.update(len(chunk))
                retry_count = 0
            except Exception:
                # Drop the partial file (if any) and retry after a short pause
                if os.path.exists(filepath):
                    os.remove(filepath)
                retry_count -= 1
                sleep(1)
    return filename
# Define function to download files in parallel with a specified number of workers
def download_files(urls, num_workers, number_of_existing_files=0):
    # Use ThreadPoolExecutor to download files in parallel
    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        # Map download_file function to URLs
        futures = {executor.submit(download_file, url): url for url in urls}
        # Monitor progress of each download (files already on disk are excluded from the total)
        for future in tqdm(as_completed(futures), total=len(futures) - number_of_existing_files, desc="Total", dynamic_ncols=True):
            url = futures[future]
            try:
                filename = future.result()
                # print(f'Downloaded {filename}')
            except Exception as e:
                print(f'Failed to download {url}: {e}')
# Call download_files function with the desired number of workers
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Download the HLS granules listed in the query output CSV.")
    parser.add_argument("-c", "--cloud-step", type=int, default=10, help="The step size for the cloud coverage threshold (default: 10)")
    parser.add_argument("-n", "--number-workers", type=int, default=24, help="The number of download workers (default: 24)")
    parser.add_argument("-p", "--download-path", type=str, default='downloaded-tiles-amazon', help="The path to download images (default: downloaded-tiles-amazon)")
    # parser.add_argument("-r", "--remove-old-output", action="store_true", help="If given, remove the existing output CSV file.")
    args = parser.parse_args()
    # query_the_area(kml_file_path=args.kml_file, date_from="2020-01-01", date_to="2024-12-31")
    logger = logging.getLogger(__name__)
    logging.basicConfig(level=logging.INFO)
    download_path = args.download_path
    cloud_coverage_step = args.cloud_step
    num_workers = args.number_workers
    os.makedirs(download_path, exist_ok=True)
    input_file = f"amazon-download-links_cloud-coverage-step-{cloud_coverage_step}.csv"
    if not os.path.isfile(input_file):
        print(f"There is no such file as {input_file}")
        sys.exit(1)
    df = pd.read_csv(input_file)
    # df = df.loc[df['tile'] == "T17LQL"]
    urls = df['download']
    number_of_existing_files = len(glob.glob(download_path + "/HLS*"))
    download_files(urls, num_workers, number_of_existing_files)
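
Example invocation (a sketch, not part of the commit): assuming the script is saved under a hypothetical name such as download_tiles.py, and that the query step has already produced amazon-download-links_cloud-coverage-step-10.csv containing a 'download' column of granule URLs, it can be run as:

    python3 download_tiles.py --cloud-step 10 --number-workers 24 --download-path downloaded-tiles-amazon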