Commit a91dfe85 authored by Ehsan

download script added

parent 3d23dad7
#!/usr/bin/env python3
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
from urllib3.util import Retry
from time import sleep
import logging
import gzip
import shutil
import sys
import os
import pandas as pd
import argparse
import glob
# Define function to download a single file.
# Note: download_path is a module-level variable set in the __main__ block below.
def download_file(url):
    # Get file name from URL
    filename = url.split('/')[-1]
    # Preserve the "segments/..." directory structure from the URL when present
    if url.find("segments") != -1:
        filepath = download_path + "/" + url[url.find("segments"):]
    else:
        filepath = download_path + "/" + url.split("/")[-1]
    # Skip files that have already been downloaded
    if os.path.exists(filepath):
        return filename
    filedir = str.join("/", filepath.split('/')[:-1])
    if not os.path.exists(filedir):
        os.makedirs(filedir, exist_ok=True)
    # Ask the server for the file size so the progress bar has a total
    response = requests.head(url)
    file_size = int(response.headers["content-length"])
    with tqdm(unit="B", unit_scale=True, total=file_size, leave=False, desc=filename, dynamic_ncols=True) as pbar:
        retry_count = 1000
        while retry_count:
            try:
                with requests.get(url, stream=True) as r:
                    r.raise_for_status()
                    # Write file to disk in 1 MiB chunks
                    with open(filepath, 'wb') as f:
                        for chunk in r.iter_content(chunk_size=1048576):
                            f.write(chunk)
                            pbar.update(len(chunk))
                retry_count = 0
            except Exception:
                # Drop the partial file (if any) and retry after a short pause
                if os.path.exists(filepath):
                    os.remove(filepath)
                retry_count -= 1
                sleep(1)
    return filename
# Define function to download files in parallel with a specified number of workers
def download_files(urls, num_workers, number_of_existing_files=0):
    # Use ThreadPoolExecutor to download files in parallel
    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        # Map download_file function to URLs
        futures = {executor.submit(download_file, url): url for url in urls}
        # Monitor progress of each download (files already on disk are excluded from the total)
        for future in tqdm(as_completed(futures), total=len(futures) - number_of_existing_files, desc="Total", dynamic_ncols=True):
            url = futures[future]
            try:
                filename = future.result()
                # print(f'Downloaded {filename}')
            except Exception as e:
                print(f'Failed to download {url}: {e}')
# Call download_files function with the desired number of workers
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Download the HLS granules listed in the query output CSV.")
    parser.add_argument("-c", "--cloud-step", type=int, default=10, help="The step size for the cloud coverage threshold (default: 10)")
    parser.add_argument("-n", "--number-workers", type=int, default=24, help="The number of download workers (default: 24)")
    parser.add_argument("-p", "--download-path", type=str, default='downloaded-tiles-amazon', help="The path to download images (default: downloaded-tiles-amazon)")
    # parser.add_argument("-r", "--remove-old-output", action="store_true", help="If given, remove the existing output CSV file.")
    args = parser.parse_args()
    # query_the_area(kml_file_path=args.kml_file, date_from="2020-01-01", date_to="2024-12-31")
    logger = logging.getLogger(__name__)
    logging.basicConfig(level=logging.INFO)
    download_path = args.download_path
    cloud_coverage_step = args.cloud_step
    num_workers = args.number_workers
    os.makedirs(download_path, exist_ok=True)
    input_file = f"amazon-download-links_cloud-coverage-step-{cloud_coverage_step}.csv"
    if not os.path.isfile(input_file):
        print(f"There is no such file as {input_file}")
        sys.exit(1)
    df = pd.read_csv(input_file)
    # df = df.loc[df['tile'] == "T17LQL"]
    urls = df['download']
    number_of_existing_files = len(glob.glob(download_path + "/HLS*"))
    download_files(urls, num_workers, number_of_existing_files)
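
Example invocation (a sketch, not part of the commit): assuming the script is saved under a hypothetical name such as download_tiles.py, and that the query step has already produced amazon-download-links_cloud-coverage-step-10.csv containing a 'download' column of granule URLs, it can be run as:

    python3 download_tiles.py --cloud-step 10 --number-workers 24 --download-path downloaded-tiles-amazon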