From 8e394a2bde3b707846792664c9bf96769c8781aa Mon Sep 17 00:00:00 2001 From: mwinkens <m.winkens@fz-juelich.de> Date: Fri, 12 May 2023 12:36:44 +0200 Subject: [PATCH] add a fully working jupyter notebook for HPC with b2drop --- .gitignore | 4 + b2drop-webdav.ipynb | 276 ++++++++++++++++++++++++++++++++++++++++ data/processed/.gitkeep | 0 data/raw/.gitkeep | 0 4 files changed, 280 insertions(+) create mode 100644 data/processed/.gitkeep create mode 100644 data/raw/.gitkeep diff --git a/.gitignore b/.gitignore index 751bf49..dbd8500 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,7 @@ fingerhuete_*/ __pycache__ .idea .eudat-key +data/raw/* +data/processed/* +!data/raw/.gitkeep +!data/processed/.gitkeep diff --git a/b2drop-webdav.ipynb b/b2drop-webdav.ipynb index e69de29..2554a95 100644 --- a/b2drop-webdav.ipynb +++ b/b2drop-webdav.ipynb @@ -0,0 +1,276 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Connecting data management services to HPC systems\n", + "\n", + "3 parts:\n", + " Download\n", + " Process\n", + " Upload\n", + "\n", + "Example for B2DROP: \n", + "- B2DROP uses the webdav protocol in order to pull/push data with sync-clients\n", + "- webdavclient3 package" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting git+https://github.com/ezhov-evgeny/webdav-client-python-3.git@develop\n", + " Cloning https://github.com/ezhov-evgeny/webdav-client-python-3.git (to revision develop) to c:\\users\\marvin\\appdata\\local\\temp\\pip-req-build-1i7baoxo\n", + " Resolved https://github.com/ezhov-evgeny/webdav-client-python-3.git to commit 98c23d1abd15efc3db9cfc756429f00041578bc2\n", + " Preparing metadata (setup.py): started\n", + " Preparing metadata (setup.py): finished with status 'done'\n", + "Requirement already satisfied: requests in 
c:\\users\\marvin\\.conda\\envs\\b2drop-webdav-python\\lib\\site-packages (from webdavclient3==3.14.6) (2.30.0)\n", + "Requirement already satisfied: lxml in c:\\users\\marvin\\.conda\\envs\\b2drop-webdav-python\\lib\\site-packages (from webdavclient3==3.14.6) (4.9.2)\n", + "Requirement already satisfied: python-dateutil in c:\\users\\marvin\\.conda\\envs\\b2drop-webdav-python\\lib\\site-packages (from webdavclient3==3.14.6) (2.8.2)\n", + "Requirement already satisfied: six>=1.5 in c:\\users\\marvin\\.conda\\envs\\b2drop-webdav-python\\lib\\site-packages (from python-dateutil->webdavclient3==3.14.6) (1.16.0)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in c:\\users\\marvin\\.conda\\envs\\b2drop-webdav-python\\lib\\site-packages (from requests->webdavclient3==3.14.6) (2.0.2)\n", + "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\marvin\\.conda\\envs\\b2drop-webdav-python\\lib\\site-packages (from requests->webdavclient3==3.14.6) (2023.5.7)\n", + "Requirement already satisfied: idna<4,>=2.5 in c:\\users\\marvin\\.conda\\envs\\b2drop-webdav-python\\lib\\site-packages (from requests->webdavclient3==3.14.6) (3.4)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in c:\\users\\marvin\\.conda\\envs\\b2drop-webdav-python\\lib\\site-packages (from requests->webdavclient3==3.14.6) (3.1.0)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " Running command git clone --filter=blob:none --quiet https://github.com/ezhov-evgeny/webdav-client-python-3.git 'C:\\Users\\Marvin\\AppData\\Local\\Temp\\pip-req-build-1i7baoxo'\n" + ] + } + ], + "source": [ + "!pip install git+https://github.com/ezhov-evgeny/webdav-client-python-3.git@develop\n", + "import os" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Add some helper functions:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from webdav3.client import 
Client\n", + "\n", + "client: Client\n", + "\n", + "\n", + "def connect_webdav_b2drop(username, webdav_pw):\n", + " global client\n", + "\n", + " options = {\n", + " 'webdav_hostname': f\"https://b2drop.eudat.eu/remote.php/dav/files/{username}\",\n", + " 'webdav_login': username,\n", + " 'webdav_password': webdav_pw,\n", + " 'verbose': True,\n", + " 'webdav_timeout': 60 * 4, # 4 minutes\n", + " }\n", + " client = Client(options)\n", + "\n", + "\n", + "def list_b2drop():\n", + " return client.list(get_info=True)\n", + "\n", + "\n", + "def get_info(cloud_path: str):\n", + " return client.info(cloud_path)\n", + "\n", + "\n", + "def pull(cloud_path, local_path):\n", + " return client.pull(cloud_path, local_path)\n", + "\n", + "\n", + "def push(cloud_path, local_path):\n", + " return client.push(cloud_path, local_path)\n", + "\n", + "\n", + "def upload(cloud_path, local_path):\n", + " return client.upload(cloud_path, local_path)\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Connect to webdav:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "username = \"1686b184-4d7a-45ba-8cba-4c8aef619886\"\n", + "try:\n", + " f = open(\".eudat-key\", \"r\")\n", + "except OSError as e:\n", + " print(e)\n", + " print(\"Make sure the file .eudat-key does exist and contains your B2DROP key\")\n", + "webdav_pw = f.read()\n", + "f.close()\n", + "\n", + "connect_webdav_b2drop(username, webdav_pw)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Download, Process, Upload data" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "cloud_path_in = \"b2drop_test_raw\" # \"/cloudpath/b2drop/raw\"\n", + "local_path_in = \"data/raw\" # \"/local/path/raw\"\n", + "local_path_out = \"data/processed\" # \"/local/path/processed\"\n", + "cloud_path_out = \"b2drop_test_processed\" # 
\"/cloudpath/b2drop/processed\"" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "TODO: Nice image\n", + "\n", + "DATA_ON_CLOUD_RAW \n", + "\n", + "---- download ----> \n", + "\n", + "LOCAL_DATA_RAW \n", + "\n", + "---- process ----> \n", + "\n", + "LOCAL_DATA_PROCESSED \n", + "\n", + "---- upload ----> \n", + "\n", + "DATA_ON_CLOUD_PROCESSED" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Download from b2drop\n", + "pull(cloud_path_in, local_path_in)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "def process_file(filename):\n", + " # your file processing\n", + " # example just copying file over\n", + " import shutil\n", + " shutil.copyfile(f\"{local_path_in}/{filename}\", f\"{local_path_out}/copy_{filename}\")\n", + "\n", + "files_to_process = os.listdir(local_path_in)\n", + "for filename in files_to_process:\n", + " process_file(filename)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Making Directory b2drop_test_processed\n", + "Made Directory b2drop_test_processed\n" + ] + } + ], + "source": [ + "# Upload to b2drop\n", + "upload(cloud_path_out, local_path_out)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Future Prospects:\n", + "\n", + "- Track processed files \n", + " - by name\n", + " - by hash\n", + "- Automatically pull, then process and upload\n", + " - some pipeline management\n", + " - DVC - Data Version Control?\n", + " - https://dvc.org/doc/user-guide/data-management/remote-storage/webdav#webdav\n", + "- multithreading\n", + " - process dependent\n", + " - upload and download 
limited by webdav protocol" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "b2drop-webdav-python", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.16" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/data/processed/.gitkeep b/data/processed/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/data/raw/.gitkeep b/data/raw/.gitkeep new file mode 100644 index 0000000..e69de29 -- GitLab