From 8e394a2bde3b707846792664c9bf96769c8781aa Mon Sep 17 00:00:00 2001
From: mwinkens <m.winkens@fz-juelich.de>
Date: Fri, 12 May 2023 12:36:44 +0200
Subject: [PATCH] add a fully working jupyter notebook for HPC with b2drop

---
 .gitignore              |   4 +
 b2drop-webdav.ipynb     | 276 ++++++++++++++++++++++++++++++++++++++++
 data/processed/.gitkeep |   0
 data/raw/.gitkeep       |   0
 4 files changed, 280 insertions(+)
 create mode 100644 data/processed/.gitkeep
 create mode 100644 data/raw/.gitkeep

diff --git a/.gitignore b/.gitignore
index 751bf49..dbd8500 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,7 @@ fingerhuete_*/
 __pycache__
 .idea
 .eudat-key
+data/raw/*
+data/processed/*
+!data/raw/.gitkeep
+!data/processed/.gitkeep
diff --git a/b2drop-webdav.ipynb b/b2drop-webdav.ipynb
index e69de29..2554a95 100644
--- a/b2drop-webdav.ipynb
+++ b/b2drop-webdav.ipynb
@@ -0,0 +1,276 @@
+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Connecting data management services to HPC systems\n",
+    "\n",
+    "The workflow has 3 parts:\n",
+    "1. Download\n",
+    "2. Process\n",
+    "3. Upload\n",
+    "\n",
+    "Example for B2DROP: \n",
+    "- B2DROP speaks the WebDAV protocol, which sync clients use to pull/push data\n",
+    "- This notebook talks to B2DROP with the `webdavclient3` Python package"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Collecting git+https://github.com/ezhov-evgeny/webdav-client-python-3.git@develop\n",
+      "  Cloning https://github.com/ezhov-evgeny/webdav-client-python-3.git (to revision develop) to c:\\users\\marvin\\appdata\\local\\temp\\pip-req-build-1i7baoxo\n",
+      "  Resolved https://github.com/ezhov-evgeny/webdav-client-python-3.git to commit 98c23d1abd15efc3db9cfc756429f00041578bc2\n",
+      "  Preparing metadata (setup.py): started\n",
+      "  Preparing metadata (setup.py): finished with status 'done'\n",
+      "Requirement already satisfied: requests in c:\\users\\marvin\\.conda\\envs\\b2drop-webdav-python\\lib\\site-packages (from webdavclient3==3.14.6) (2.30.0)\n",
+      "Requirement already satisfied: lxml in c:\\users\\marvin\\.conda\\envs\\b2drop-webdav-python\\lib\\site-packages (from webdavclient3==3.14.6) (4.9.2)\n",
+      "Requirement already satisfied: python-dateutil in c:\\users\\marvin\\.conda\\envs\\b2drop-webdav-python\\lib\\site-packages (from webdavclient3==3.14.6) (2.8.2)\n",
+      "Requirement already satisfied: six>=1.5 in c:\\users\\marvin\\.conda\\envs\\b2drop-webdav-python\\lib\\site-packages (from python-dateutil->webdavclient3==3.14.6) (1.16.0)\n",
+      "Requirement already satisfied: urllib3<3,>=1.21.1 in c:\\users\\marvin\\.conda\\envs\\b2drop-webdav-python\\lib\\site-packages (from requests->webdavclient3==3.14.6) (2.0.2)\n",
+      "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\marvin\\.conda\\envs\\b2drop-webdav-python\\lib\\site-packages (from requests->webdavclient3==3.14.6) (2023.5.7)\n",
+      "Requirement already satisfied: idna<4,>=2.5 in c:\\users\\marvin\\.conda\\envs\\b2drop-webdav-python\\lib\\site-packages (from requests->webdavclient3==3.14.6) (3.4)\n",
+      "Requirement already satisfied: charset-normalizer<4,>=2 in c:\\users\\marvin\\.conda\\envs\\b2drop-webdav-python\\lib\\site-packages (from requests->webdavclient3==3.14.6) (3.1.0)\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "  Running command git clone --filter=blob:none --quiet https://github.com/ezhov-evgeny/webdav-client-python-3.git 'C:\\Users\\Marvin\\AppData\\Local\\Temp\\pip-req-build-1i7baoxo'\n"
+     ]
+    }
+   ],
+   "source": [
+    "%pip install git+https://github.com/ezhov-evgeny/webdav-client-python-3.git@98c23d1abd15efc3db9cfc756429f00041578bc2\n",
+    "import os  # used further down to list and check the downloaded files"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Add some helper functions:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from webdav3.client import Client\n",
+    "\n",
+    "client: Client\n",
+    "\n",
+    "\n",
+    "def connect_webdav_b2drop(username, webdav_pw):  # build the WebDAV client for this user's B2DROP files\n",
+    "    global client  # rebinds the module-level client that the helper functions in this cell use\n",
+    "\n",
+    "    options = {\n",
+    "        'webdav_hostname': f\"https://b2drop.eudat.eu/remote.php/dav/files/{username}\",\n",
+    "        'webdav_login': username,\n",
+    "        'webdav_password': webdav_pw,\n",
+    "        'verbose': True,\n",
+    "        'webdav_timeout': 60 * 4,  # 240, i.e. 4 minutes if the unit is seconds (TODO confirm; this note used to say 10)\n",
+    "    }\n",
+    "    client = Client(options)\n",
+    "\n",
+    "\n",
+    "def list_b2drop():  # list entries at the client's default remote path, with details (get_info=True)\n",
+    "    return client.list(get_info=True)\n",
+    "\n",
+    "\n",
+    "def get_info(cloud_path: str):  # return whatever client.info reports for one remote path\n",
+    "    return client.info(cloud_path)\n",
+    "\n",
+    "\n",
+    "def pull(cloud_path, local_path):  # used below to download; remote path first, local path second\n",
+    "    return client.pull(cloud_path, local_path)\n",
+    "\n",
+    "\n",
+    "def push(cloud_path, local_path):  # counterpart of pull (presumably local -> remote; confirm in the webdav3 docs)\n",
+    "    return client.push(cloud_path, local_path)\n",
+    "\n",
+    "\n",
+    "def upload(cloud_path, local_path):  # used below to upload local_path to cloud_path on B2DROP\n",
+    "    return client.upload(cloud_path, local_path)\n"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Connect to webdav:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "username = \"1686b184-4d7a-45ba-8cba-4c8aef619886\"\n",
+    "try:\n",
+    "    with open(\".eudat-key\", \"r\") as key_file:  # closed automatically, even if reading fails\n",
+    "        webdav_pw = key_file.read().strip()  # drop the trailing newline most editors append\n",
+    "except OSError as e:\n",
+    "    print(e)\n",
+    "    print(\"Make sure the file .eudat-key does exist and contains your B2DROP key\")\n",
+    "    raise  # abort the cell here; carrying on would only fail later because no password was read\n",
+    "\n",
+    "connect_webdav_b2drop(username, webdav_pw)"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Download, Process, Upload data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "cloud_path_in = \"b2drop_test_raw\" # remote B2DROP folder to download from, e.g. \"/cloudpath/b2drop/raw\"\n",
+    "local_path_in = \"data/raw\" # local folder the download lands in, e.g. \"/local/path/raw\"\n",
+    "local_path_out = \"data/processed\" # local folder the processing step writes to, e.g. \"/local/path/processed\"\n",
+    "cloud_path_out = \"b2drop_test_processed\" # remote B2DROP folder to upload to, e.g. \"/cloudpath/b2drop/processed\""
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "TODO: Nice image\n",
+    "\n",
+    "DATA_ON_CLOUD_RAW \n",
+    "\n",
+    "---- download ----> \n",
+    "\n",
+    "LOCAL_DATA_RAW \n",
+    "\n",
+    "---- process ----> \n",
+    "\n",
+    "LOCAL_DATA_PROCESSED \n",
+    "\n",
+    "---- upload ----> \n",
+    "\n",
+    "DATA_ON_CLOUD_PROCESSED"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "False"
+      ]
+     },
+     "execution_count": 17,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Download the remote folder cloud_path_in from B2DROP into local_path_in\n",
+    "pull(cloud_path_in, local_path_in)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def process_file(filename):\n",
+    "    \"\"\"Replace with your own processing; this example copies the raw file to the output folder as copy_<filename>.\"\"\"\n",
+    "    import shutil\n",
+    "    shutil.copyfile(f\"{local_path_in}/{filename}\", f\"{local_path_out}/copy_{filename}\")\n",
+    "\n",
+    "files_to_process = [name for name in os.listdir(local_path_in) if name != \".gitkeep\"]  # skip the git placeholder\n",
+    "for filename in files_to_process:\n",
+    "    if os.path.isfile(f\"{local_path_in}/{filename}\"):  # shutil.copyfile cannot copy sub-directories\n",
+    "        process_file(filename)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Making Directory b2drop_test_processed\n",
+      "Made Directory b2drop_test_processed\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Upload the processed files in local_path_out to cloud_path_out on B2DROP\n",
+    "upload(cloud_path_out, local_path_out)"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Future Prospects:\n",
+    "\n",
+    "- Track processed files \n",
+    "    - by name\n",
+    "    - by hash\n",
+    "- Automatically pull, then process and upload\n",
+    "    - some pipeline management\n",
+    "    - DVC - Data Version Control?\n",
+    "        - https://dvc.org/doc/user-guide/data-management/remote-storage/webdav#webdav\n",
+    "- multithreading\n",
+    "    - whether it helps depends on the processing step\n",
+    "    - upload and download are limited by the WebDAV protocol"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "b2drop-webdav-python",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.16"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/data/processed/.gitkeep b/data/processed/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/data/raw/.gitkeep b/data/raw/.gitkeep
new file mode 100644
index 0000000..e69de29
-- 
GitLab