From a91dfe85468f70a234aaecbfea65d839ba1b0fc1 Mon Sep 17 00:00:00 2001
From: Ehsan <e.zandi@gmail.com>
Date: Tue, 8 Apr 2025 14:23:56 +0200
Subject: [PATCH] download script added

---
 download-hls-tiles.py | 89 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 89 insertions(+)
 create mode 100755 download-hls-tiles.py

diff --git a/download-hls-tiles.py b/download-hls-tiles.py
new file mode 100755
index 0000000..758181c
--- /dev/null
+++ b/download-hls-tiles.py
@@ -0,0 +1,89 @@
+#!/usr/bin/env python3
+import requests
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from tqdm import tqdm
+from time import sleep
+import logging
+import sys
+import os
+import pandas as pd
+import argparse
+import glob
+
+# Download a single file; download_path is a module-level global set in __main__
+def download_file(url):
+
+    # Get the file name from the URL
+    filename = url.split('/')[-1]
+    if url.find("segments") != -1:
+        filepath = download_path + "/" + url[url.find("segments"):]
+    else:
+        filepath = download_path + "/" + url.split("/")[-1]
+    if os.path.exists(filepath):
+        return filename
+    filedir = os.path.dirname(filepath)
+    if not os.path.exists(filedir):
+        os.makedirs(filedir, exist_ok=True)
+
+    response = requests.head(url)
+    file_size = int(response.headers.get("content-length", 0))
+
+    with tqdm(unit="B", unit_scale=True, total=file_size, leave=False, desc=filename, dynamic_ncols=True) as pbar:
+        retry_count = 1000
+        while retry_count:
+            try:
+                with requests.get(url, stream=True) as r:
+                    r.raise_for_status()
+                    # Write the file to disk in 1 MiB chunks
+                    with open(filepath, 'wb') as f:
+                        for chunk in r.iter_content(chunk_size=1048576):
+                            f.write(chunk)
+                            pbar.update(len(chunk))
+                retry_count = 0
+            except Exception:
+                # Remove any partial download before retrying
+                if os.path.exists(filepath):
+                    os.remove(filepath)
+                retry_count -= 1
+                sleep(1)
+    return filename
+# Download files in parallel with a specified number of workers
+def download_files(urls, num_workers, number_of_existing_files=0):
+    # Use ThreadPoolExecutor to download files in parallel
+    with ThreadPoolExecutor(max_workers=num_workers) as executor:
+        # Map the download_file function to the URLs
+        futures = {executor.submit(download_file, url): url for url in urls}
+        # Monitor the progress of each download
+        for future in tqdm(as_completed(futures), total=len(futures) - number_of_existing_files, desc="Total", dynamic_ncols=True):
+            url = futures[future]
+            try:
+                filename = future.result()
+                #print(f'Downloaded {filename}')
+            except Exception as e:
+                print(f'Failed to download {url}: {e}')
+# Call download_files with the desired number of workers
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Download HLS granules listed in a CSV of download links.")
+    parser.add_argument("-c", "--cloud-step", type=int, default=10, help="The step size for the cloud coverage threshold (default: 10)")
+    parser.add_argument("-n", "--number-workers", type=int, default=24, help="The number of download workers (default: 24)")
+    parser.add_argument("-p", "--download-path", type=str, default='downloaded-tiles-amazon', help="The path to download images (default: downloaded-tiles-amazon)")
+#    parser.add_argument("-r", "--remove-old-output", action="store_true", help="If given, remove the existing output csv file.")
+    args = parser.parse_args()
+#    query_the_area(kml_file_path=args.kml_file, date_from="2020-01-01", date_to="2024-12-31")
+    logger = logging.getLogger(__name__)
+    logging.basicConfig(level=logging.INFO)
+
+    download_path = args.download_path
+    cloud_coverage_step = args.cloud_step
+    num_workers = args.number_workers
+    os.makedirs(download_path, exist_ok=True)
+    input_file = f"amazon-download-links_cloud-coverage-step-{cloud_coverage_step}.csv"
+    if not os.path.isfile(input_file):
+        print(f"There is no such file as {input_file}")
+        sys.exit(1)
+    df = pd.read_csv(input_file)
+#    df = df.loc[df['tile'] == "T17LQL"]
+    urls = df['download']
+    number_of_existing_files = len(glob.glob(download_path + "/HLS*"))
+    download_files(urls, num_workers, number_of_existing_files)
+
--
GitLab
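For reference, the script's input is amazon-download-links_cloud-coverage-step-<N>.csv, read with pandas and consumed through its "download" column of HLS granule URLs. The sketch below shows one way such a file could be produced; the column name and file-name pattern follow the patch, but the URL is only a placeholder, not a real HLS asset.

    # Minimal sketch of a compatible input CSV (assumption: a single
    # "download" column of direct file URLs; the file name must match the
    # pattern built from the --cloud-step value, here the default of 10).
    import pandas as pd

    links = pd.DataFrame({
        "download": [
            "https://example.com/HLS.S30.T17LQL.2020001T000000.v2.0.B04.tif",  # placeholder
        ]
    })
    links.to_csv("amazon-download-links_cloud-coverage-step-10.csv", index=False)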