Skip to content
Snippets Groups Projects
Commit 8e394a2b authored by Marvin Winkens's avatar Marvin Winkens
Browse files

add a fully working jupyter notebook for HPC with b2drop

parent 40906944
Branches master
No related tags found
No related merge requests found
...@@ -2,3 +2,7 @@ fingerhuete_*/ ...@@ -2,3 +2,7 @@ fingerhuete_*/
__pycache__ __pycache__
.idea .idea
.eudat-key .eudat-key
data/raw/*
data/processed/*
!data/raw/.gitkeep
!data/processed/.gitkeep
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Connecting data management services to HPC systems\n",
"\n",
"3 parts:\n",
" Download\n",
" Process\n",
" Upload\n",
"\n",
"Example for B2DROP: \n",
"- B2DROP uses the WebDAV protocol to pull/push data with sync clients\n",
"- This notebook talks to it through the Python `webdavclient3` package"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting git+https://github.com/ezhov-evgeny/webdav-client-python-3.git@develop\n",
" Cloning https://github.com/ezhov-evgeny/webdav-client-python-3.git (to revision develop) to c:\\users\\marvin\\appdata\\local\\temp\\pip-req-build-1i7baoxo\n",
" Resolved https://github.com/ezhov-evgeny/webdav-client-python-3.git to commit 98c23d1abd15efc3db9cfc756429f00041578bc2\n",
" Preparing metadata (setup.py): started\n",
" Preparing metadata (setup.py): finished with status 'done'\n",
"Requirement already satisfied: requests in c:\\users\\marvin\\.conda\\envs\\b2drop-webdav-python\\lib\\site-packages (from webdavclient3==3.14.6) (2.30.0)\n",
"Requirement already satisfied: lxml in c:\\users\\marvin\\.conda\\envs\\b2drop-webdav-python\\lib\\site-packages (from webdavclient3==3.14.6) (4.9.2)\n",
"Requirement already satisfied: python-dateutil in c:\\users\\marvin\\.conda\\envs\\b2drop-webdav-python\\lib\\site-packages (from webdavclient3==3.14.6) (2.8.2)\n",
"Requirement already satisfied: six>=1.5 in c:\\users\\marvin\\.conda\\envs\\b2drop-webdav-python\\lib\\site-packages (from python-dateutil->webdavclient3==3.14.6) (1.16.0)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in c:\\users\\marvin\\.conda\\envs\\b2drop-webdav-python\\lib\\site-packages (from requests->webdavclient3==3.14.6) (2.0.2)\n",
"Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\marvin\\.conda\\envs\\b2drop-webdav-python\\lib\\site-packages (from requests->webdavclient3==3.14.6) (2023.5.7)\n",
"Requirement already satisfied: idna<4,>=2.5 in c:\\users\\marvin\\.conda\\envs\\b2drop-webdav-python\\lib\\site-packages (from requests->webdavclient3==3.14.6) (3.4)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in c:\\users\\marvin\\.conda\\envs\\b2drop-webdav-python\\lib\\site-packages (from requests->webdavclient3==3.14.6) (3.1.0)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" Running command git clone --filter=blob:none --quiet https://github.com/ezhov-evgeny/webdav-client-python-3.git 'C:\\Users\\Marvin\\AppData\\Local\\Temp\\pip-req-build-1i7baoxo'\n"
]
}
],
"source": [
"!pip install git+https://github.com/ezhov-evgeny/webdav-client-python-3.git@develop\n",
"import os"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Add some helper functions:"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from webdav3.client import Client\n",
"\n",
"client: Client\n",
"\n",
"\n",
"def connect_webdav_b2drop(username, webdav_pw):\n",
"    \"\"\"Create the module-level WebDAV ``client`` for a B2DROP account.\n",
"\n",
"    Args:\n",
"        username: B2DROP login; also used to build the per-user WebDAV URL.\n",
"        webdav_pw: Password (key) passed to the client as ``webdav_password``.\n",
"\n",
"    The new client is stored in the global ``client`` that the other helper\n",
"    functions in this notebook use; nothing is returned.\n",
"    \"\"\"\n",
"    global client\n",
"\n",
"    # 60 * 4 = 240, i.e. 4 minutes (the earlier comment claimed 10 minutes).\n",
"    timeout_seconds = 60 * 4\n",
"\n",
"    options = {\n",
"        'webdav_hostname': f\"https://b2drop.eudat.eu/remote.php/dav/files/{username}\",\n",
"        'webdav_login': username,\n",
"        'webdav_password': webdav_pw,\n",
"        'verbose': True,\n",
"        'webdav_timeout': timeout_seconds,\n",
"    }\n",
"    client = Client(options)\n",
"\n",
"\n",
"def list_b2drop():\n",
"    \"\"\"Return ``client.list(get_info=True)`` for the connected client's default location.\"\"\"\n",
"    listing = client.list(get_info=True)\n",
"    return listing\n",
"\n",
"\n",
"def get_info(cloud_path: str):\n",
"    \"\"\"Return what ``client.info`` reports for the remote ``cloud_path``.\"\"\"\n",
"    resource_info = client.info(cloud_path)\n",
"    return resource_info\n",
"\n",
"\n",
"def pull(cloud_path, local_path):\n",
"    \"\"\"Download: hand ``cloud_path`` and ``local_path`` to ``client.pull`` and return its result.\"\"\"\n",
"    pull_result = client.pull(cloud_path, local_path)\n",
"    return pull_result\n",
"\n",
"\n",
"def push(cloud_path, local_path):\n",
"    \"\"\"Hand ``cloud_path`` and ``local_path`` to ``client.push`` and return its result.\n",
"\n",
"    NOTE(review): presumably the local-to-remote counterpart of ``pull`` - confirm\n",
"    against the webdavclient3 documentation.\n",
"    \"\"\"\n",
"    push_result = client.push(cloud_path, local_path)\n",
"    return push_result\n",
"\n",
"\n",
"def upload(cloud_path, local_path):\n",
"    \"\"\"Upload: hand ``cloud_path`` and ``local_path`` to ``client.upload`` and return its result.\"\"\"\n",
"    upload_result = client.upload(cloud_path, local_path)\n",
"    return upload_result\n"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Connect to webdav:"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"username = \"1686b184-4d7a-45ba-8cba-4c8aef619886\"\n",
"\n",
"# Read the B2DROP key from a local file so it never has to be typed into the notebook.\n",
"try:\n",
"    with open(\".eudat-key\", \"r\") as key_file:\n",
"        # Strip surrounding whitespace: a trailing newline in the key file would\n",
"        # otherwise become part of the password.\n",
"        webdav_pw = key_file.read().strip()\n",
"except OSError as e:\n",
"    print(e)\n",
"    print(\"Make sure the file .eudat-key does exist and contains your B2DROP key\")\n",
"    # Stop here instead of failing later with an unrelated NameError:\n",
"    # without the key there is nothing to connect with.\n",
"    raise\n",
"\n",
"connect_webdav_b2drop(username, webdav_pw)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Download, Process, Upload data"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# Where the raw input lives on B2DROP and where it is placed locally.\n",
"cloud_path_in = \"b2drop_test_raw\"  # e.g. \"/cloudpath/b2drop/raw\"\n",
"local_path_in = \"data/raw\"  # e.g. \"/local/path/raw\"\n",
"\n",
"# Where processed results are written locally and where they go on B2DROP.\n",
"local_path_out = \"data/processed\"  # e.g. \"/local/path/processed\"\n",
"cloud_path_out = \"b2drop_test_processed\"  # e.g. \"/cloudpath/b2drop/processed\""
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"TODO: Nice image\n",
"\n",
"DATA_ON_CLOUD_RAW \n",
"\n",
"---- download ----> \n",
"\n",
"LOCAL_DATA_RAW \n",
"\n",
"---- process ----> \n",
"\n",
"LOCAL_DATA_PROCESSED \n",
"\n",
"---- upload ----> \n",
"\n",
"DATA_ON_CLOUD_PROCESSED"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"False"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Download step: fetch the raw input from B2DROP into the local raw-data directory\n",
"pull(cloud_path=cloud_path_in, local_path=local_path_in)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"import shutil\n",
"\n",
"\n",
"def process_file(filename):\n",
"    \"\"\"Process one raw file and write the result into ``local_path_out``.\n",
"\n",
"    Replace the body with your own processing. This example only copies\n",
"    ``<local_path_in>/<filename>`` to ``<local_path_out>/copy_<filename>``.\n",
"    \"\"\"\n",
"    source_path = os.path.join(local_path_in, filename)\n",
"    target_path = os.path.join(local_path_out, f\"copy_{filename}\")\n",
"    shutil.copyfile(source_path, target_path)\n",
"\n",
"\n",
"# Make sure the output directory exists before anything is written to it.\n",
"os.makedirs(local_path_out, exist_ok=True)\n",
"\n",
"# Only regular, non-hidden files are processed: sub-directories would make\n",
"# copyfile fail, and placeholders such as .gitkeep are not data.\n",
"files_to_process = [\n",
"    entry\n",
"    for entry in sorted(os.listdir(local_path_in))\n",
"    if not entry.startswith(\".\") and os.path.isfile(os.path.join(local_path_in, entry))\n",
"]\n",
"for filename in files_to_process:\n",
"    process_file(filename)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Making Directory b2drop_test_processed\n",
"Made Directory b2drop_test_processed\n"
]
}
],
"source": [
"# Upload step: send the locally processed results back to B2DROP\n",
"upload(cloud_path=cloud_path_out, local_path=local_path_out)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Future Prospects:\n",
"\n",
"- Track processed files \n",
" - by name\n",
" - by hash\n",
"- Automatically pull, then process and upload\n",
" - some pipeline management\n",
" - DVC - Data Version Control?\n",
" - https://dvc.org/doc/user-guide/data-management/remote-storage/webdav#webdav\n",
"- multithreading\n",
" - process dependent\n",
" - upload and download limited by WebDAV protocol"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "b2drop-webdav-python",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.16"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment