eFlows4HPC WP2 / Data Logistics Service

Commit 0bdb4cd7, authored 2 years ago by Maria Petrova-El Sayed

Added integration with the datacat for stage in and stage out. Docker cmd revised

Parent: a9251696
No related merge requests found
Pipeline #112667: passed (stages: test, build, deploy), 2 years ago
Changes: 2, Pipelines: 1
Showing 2 changed files with 180 additions and 126 deletions:
  dags/docker_cmd.py: 9 additions, 12 deletions
  dags/docker_in_worker.py: 171 additions, 114 deletions
dags/docker_cmd.py (+9, -12)

 import os

 WORKER_DATA_LOCATION = '/wf_pipeline_data/userdata'

 def get_dockercmd(params: dict, location):
@@ -5,9 +7,8 @@ def get_dockercmd(params:dict, location):
     Args:
         image(str): contianer image
         stagein_args (list): a list of files necesarry for the executeion
         stageout_args (list): a list of files which are results from the execution
-        string_args (str): a string of further arguments which might be needed for the task execution
+        job_args (str): a string of further arguments which might be needed for the task execution
         entrypoint (str): specify or overwrite the docker entrypoint
         command(str): you can specify or override the command to be executed
         args_to_dockerrun(str): docker options
@@ -15,9 +16,8 @@ def get_dockercmd(params:dict, location):
     """
     image = params.get('image')  # {"image": 'ghcr.io/helmholtz-analytics/heat:1.1.1-alpha'}
     stagein_args = params.get('stagein_args', [])  # {"stagein_args": ["demo_knn.py", "iris.h5"]}
     stageout_args = params.get('stageout_args', [])  # {"stageout_args": ["result.out"]}
-    string_args = params.get('string_args', '')
+    job_args = params.get('job_args', '')
     entrypoint = params.get('entrypoint', '')  # {"entrypoint": "/bin/bash"}
     command = params.get('command', '')  # {"command": "python"}
     args_to_dockerrun = params.get('args_to_docker', '')
@@ -26,13 +26,10 @@ def get_dockercmd(params:dict, location):
     entrypoint_part = f"--entrypoint={entrypoint}" if entrypoint else ''
     working_dir = "/data"
-    file_args = stagein_args + stageout_args
-    args = " "
-    args = args.join(file_args)
-    args = args + string_args
-    cmd_part = f"-c \"{command} {args}\"" if command else args
-    volumes = f"-v {location}:{working_dir} -w={working_dir}" if file_args else ''
-    cmd = f'userid=$({user_id}) ; docker run {args_to_dockerrun} --user=$userid --rm --name="test" {volumes} {entrypoint_part} {image} {cmd_part}'
+    cmd_part = f"-c \"{command}\"" if command else ''
+    volumes = f"-v {location}:{working_dir} -w={working_dir}"
+    cmd = f'userid=$({user_id}) ; docker run {args_to_dockerrun} --user=$userid --rm --name="test" {volumes} {entrypoint_part} {image} {cmd_part} {job_args} > {location}/stdout.txt'
     return cmd
\ No newline at end of file
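
For orientation only, a minimal sketch (not part of the commit) of how the revised get_dockercmd could be exercised, assuming dags/ is on the Python path and that docker_cmd.py defines the user_id shell snippet elided from this diff:

    # Hypothetical usage sketch; everything except get_dockercmd and
    # WORKER_DATA_LOCATION is an illustrative assumption.
    from docker_cmd import get_dockercmd, WORKER_DATA_LOCATION

    example_params = {
        "image": "ghcr.io/helmholtz-analytics/heat:1.1.1-alpha",
        "entrypoint": "/bin/bash",
        "command": "python demo_knn.py iris.h5 calc_res.txt",
        "job_args": "",  # appended after the command inside the container shell
    }

    # In the DAG, the second argument is a per-run directory on the docker worker;
    # WORKER_DATA_LOCATION stands in for it here.
    cmd = get_dockercmd(example_params, WORKER_DATA_LOCATION)
    print(cmd)  # userid=$(...) ; docker run ... > /wf_pipeline_data/userdata/stdout.txt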
dags/docker_in_worker.py (+171, -114)

@@ -4,71 +4,87 @@ from airflow.utils.dates import days_ago
 from airflow.models.connection import Connection
 from airflow.models import Variable
 from airflow.operators.python import get_current_context
+from datacat_integration.hooks import DataCatalogHook
+from datacat_integration.connection import DataCatalogEntry

 from b2shareoperator import (download_file, get_file_list, get_object_md, get_objects,
                              get_record_template, create_draft_record, add_file, submit_draft)
 from decors import get_connection
 import docker_cmd as doc
 from docker_cmd import WORKER_DATA_LOCATION
 import os
+import uuid
+import tempfile

 """
 This piplines is a test case for starting a clusterting algorithm with HeAT, running in a Docker environment.
 A test set of parameters with a HeAT example:
 {"oid": "b143bf73efd24d149bba4c081964b459", "image": "ghcr.io/helmholtz-analytics/heat:1.1.1-alpha", "stagein_args": ["demo_knn.py", "iris.h5"], "stageout_args": ["result.out"], "entrypoint": "/bin/bash", "command": "python"}
+Data Catalog Integration example: {"oid": "e13bcab6-3664-4090-bebb-defdb58483e0", "image": "ghcr.io/helmholtz-analytics/heat:1.1.1-alpha", "entrypoint": "/bin/bash", "command": "python demo_knn.py iris.h5 calc_res.txt", "register": "True"}
+Data Catalog Integration example: {"oid": "e13bcab6-3664-4090-bebb-defdb58483e0", "image": "hello-world", "register": "True"}
 Params:
-    oid (str): oid of the data
+    oid (str): oid of the data (e.g, from data catalog)
     image (str): a docker contianer image
     stagein_args (list): a list of stage in files necesarry for the executeion
    stageout_args (list): a list of stage out files which are results from the execution
-    string_args (str): a string of further arguments which might be needed for the task execution
-    entrypoint (str): you can specify or overwrite the docker entrypoint
-    command (str): you can specify or override the command to be executed
-    args_to_dockerrun (str): docker run additional options
+    job_args (str):
+        Optional: a string of further arguments which might be needed for the task execution
+    entrypoint (str):
+        Optional: you can specify or overwrite the docker entrypoint
+    command (str):
+        Optional: you can specify or override the command to be executed
+    args_to_dockerrun (str):
+        Optional: docker run additional arguments
+    register (True, False):
+        Optional, default is False: register the resulsts in the data catalog
 """

 default_args = {
     'owner': 'airflow',
 }

-@dag(default_args=default_args, schedule_interval=None, start_date=days_ago(2), tags=['example', 'docker'])
+@dag(default_args=default_args, schedule_interval=None, start_date=days_ago(2), tags=['example', 'docker', 'datacat'])
 def docker_in_worker():
     DW_CONNECTION_ID = "docker_worker"

-    @task(multiple_outputs=True)
-    def extract(**kwargs):
-        """
-        #### Extract task
-        A simple Extract task to get data ready for the rest of the data
-        pipeline. In this case, getting data is simulated by reading from a
-        b2share connection.
-        :param oid: ID of the file to be extracted
-        """
-        connection = Connection.get_connection_from_secrets('default_b2share')
-        server = connection.get_uri()
-        print(f"Rereiving data from {server}")
+    @task()
+    def stagein(**kwargs):
+        """ stage in task
+        This task gets the 'datacat_oid' or 'oid' from the DAG params to retreive a connection from it (b2share for now).
+        It then downloads all data from the b2share entry to the local disk, and returns a mapping of these files to the local download location,
+        which can be used by the following tasks.
+        """
         params = kwargs['params']
+        datacat_hook = DataCatalogHook()
         if 'oid' not in params:  # {"oid": "b143bf73efd24d149bba4c081964b459"}
+            if 'datacat_oid' not in params:
                 print("Missing object id in pipeline parameters")
                 lst = get_objects(server=server)
                 flist = {o['id']: [f['key'] for f in o['files']] for o in lst}
                 print(f"Objects on server: {flist}")
                 return -1  # non zero exit code is a task failure
+            else:
+                params['oid'] = params['datacat_oid']
+        oid_split = params['oid'].split("/")
+        type = 'dataset'
+        oid = 'placeholder_text'
+        if len(oid_split) is 2:
+            type = oid_split[0]
+            oid = oid_split[1]
+        elif len(oid_split) is 1:
+            oid = oid_split[0]
+        else:
+            print("Malformed oid passed as parameter.")
+            return -1

-        oid = params['oid']
+        entry = DataCatalogEntry.from_json(datacat_hook.get_entry(type, oid))
+        print(f"using entry: {entry}")
+        b2share_server_uri = entry.url
+        # TODO general stage in based on type metadata
+        # using only b2share for now
+        b2share_oid = entry.metadata['b2share_oid']

-        obj = get_object_md(server=server, oid=oid)
+        obj = get_object_md(server=b2share_server_uri, oid=b2share_oid)
         print(f"Retrieved object {oid}: {obj}")
         flist = get_file_list(obj)
-        return flist

-    @task(multiple_outputs=True)
-    def transform(flist: dict):
         """
         #### Transform task
         A Transform task which takes in the collection of data, retrieved from the connection, downloads the files
         and returns a map of the filename with the corresponding filepath.
         """
         name_mappings = {}
         tmp_dir = Variable.get("working_dir", default_var='/tmp/')
         print(f"Local working dir is: {tmp_dir}")
@@ -81,44 +97,52 @@ def docker_in_worker():
         return name_mappings

     @task()
-    def load(files: dict, **kwargs):
-        """This task copies the data to a location,
+    def move_to_docker_host(files: dict, **kwargs):
+        """This task copies the data onto the remote docker worker,
         which will enable the following tasks an access to the data

         Args:
-            files (dict): the files that will be stored on another system
+            files (dict): the files that will be stored on the docker worker
         Returns:
-            list: the locations of the newly loaded files
+            target_dir: the location of the files on the docker worker
         """
         print(f"Using {DW_CONNECTION_ID} connection")
         ssh_hook = get_connection(conn_id=DW_CONNECTION_ID)
+        user_dir_name = str(uuid.uuid4())
+        target_dir = os.path.join(WORKER_DATA_LOCATION, user_dir_name)

         with ssh_hook.get_conn() as ssh_client:
             sftp_client = ssh_client.open_sftp()
+            sftp_client.mkdir(target_dir, mode=0o755)
             for [truename, local] in files.items():
-                print(f"Copying {local} --> {DW_CONNECTION_ID}:{os.path.join(WORKER_DATA_LOCATION, truename)}")
-                sftp_client.put(local, os.path.join(WORKER_DATA_LOCATION, truename))
+                print(f"Copying {local} --> {DW_CONNECTION_ID}:{os.path.join(target_dir, truename)}")
+                sftp_client.put(local, os.path.join(target_dir, truename))
                 # or separate cleanup task?
                 os.unlink(local)

-        # loaded_files = []
-        # for [truename, local_path] in files.items():
-        #     destination = shutil.copy(local_path, os.path.join(DATA_LOCATION, truename))
-        #     print(f"Copying {local_path} --> copying to: {destination};")
-        #     loaded_files.append(destination)
-        #     os.unlink(local_path)
-        # return loaded_files
+        return target_dir

     @task
-    def run_container(data_locations, **kwargs):
+    def run_container(data_location, **kwargs):
+        """A task which runs in the docker worker and spins up a docker container with the an image and giver parameters.

+        Args:
+            image (str): a docker contianer image
+            job_args (str):
+                Optional: a string of further arguments which might be needed for the task execution
+            entrypoint (str):
+                Optional: you can specify or overwrite the docker entrypoint
+            command (str):
+                Optional: you can specify or override the command to be executed
+            args_to_dockerrun (str):
+                Optional: docker run additional arguments
+        """
         params = kwargs['params']
         stageout_fnames = params.get('stageout_args', [])

-        cmd = doc.get_dockercmd(params, WORKER_DATA_LOCATION)
+        cmd = doc.get_dockercmd(params, data_location)
         print(f"Executing docker command {cmd}")
         print(f"Using {DW_CONNECTION_ID} connection")
@@ -133,15 +157,15 @@ def docker_in_worker():
         context = get_current_context()
         task_calculate.execute(context)

-        return stageout_fnames
+        return data_location

     @task
-    def ls_results(output_files: list):
-        if not output_files:
+    def ls_results(output_dir):
+        if not output_dir:
             return "No output to stage out. Nothing more to do."
         hook = get_connection(conn_id=DW_CONNECTION_ID)
-        sp = " "
-        cmd = f"cd {WORKER_DATA_LOCATION}; ls -al {sp.join(output_files)}"
+        cmd = f"ls -al {output_dir}"
         process = SSHOperator(task_id="print_results", ssh_hook=hook,
@@ -151,54 +175,66 @@ def docker_in_worker():
         process.execute(context)

     @task()
-    def retrieve_res(fnames: list, **kwargs):
+    def retrieve_res(output_dir: str, input_files: dict, **kwargs):
         """This task copies the data from the remote docker worker back to airflow workspace

         Args:
-            fnames (list): the files to be retrieved from the docker worker
+            output_dir (str): the folder containing all the user files for the executed task, located on the docker worker
         Returns:
             local_fpath (list): the path of the files copied back to the airflow host
         """
-        local_tmp_dir = Variable.get("working_dir", default_var='/tmp/')
-        local_fpath = []
+        working_dir = Variable.get("working_dir", default_var='/tmp/')
+        name_mappings = {}
         print(f"Using {DW_CONNECTION_ID} connection")
         ssh_hook = get_connection(conn_id=DW_CONNECTION_ID)
         with ssh_hook.get_conn() as ssh_client:
             sftp_client = ssh_client.open_sftp()
-            for name in fnames:
-                l = os.path.join(local_tmp_dir, name)
-                print(f"Copying {os.path.join(WORKER_DATA_LOCATION, name)} to {l}")
-                sftp_client.get(os.path.join(WORKER_DATA_LOCATION, name), l)
-                local_fpath.append(l)
-        return local_fpath
+            for fname in sftp_client.listdir(output_dir):
+                if fname not in input_files.keys():
+                    tmpname = tempfile.mktemp(dir=working_dir)
+                    local = os.path.join(working_dir, tmpname)
+                    print(f"Copying {os.path.join(output_dir, fname)} to {local}")
+                    sftp_client.get(os.path.join(output_dir, fname), local)
+                    name_mappings[fname] = local
+        return name_mappings

     @task()
-    def cleanup_doc_worker(files, **kwargs):
+    def cleanup_doc_worker(res_fpaths_local, data_on_worker, **kwargs):
         """This task deletes all the files from the docker worker
-        # Args:
-        #     fnames (list): the result files to be deleted on the docker worker
+        Args:
+            res_fpaths_local: used only to define the order of tasks within the DAG, i.e. wait for previos task to complete before cleaning the worker space
+            data_on_worker (str): delete the folder with the user data from the docker worker
         """
-        params = kwargs['params']
-        stagein_fnames = params.get('stagein_args', [])
-        stageout_fnames = params.get('stageout_args', [])
-        all_fnames = stagein_fnames + stageout_fnames
         print(f"Using {DW_CONNECTION_ID} connection")
         ssh_hook = get_connection(conn_id=DW_CONNECTION_ID)
         with ssh_hook.get_conn() as ssh_client:
             sftp_client = ssh_client.open_sftp()
-            for file in all_fnames:
-                print(f"Deleting file {DW_CONNECTION_ID}:{os.path.join(WORKER_DATA_LOCATION, file)}")
-                sftp_client.remove(os.path.join(WORKER_DATA_LOCATION, file))
+            d = os.path.join(WORKER_DATA_LOCATION, data_on_worker)
+            for f in sftp_client.listdir(d):
+                print(f"Deleting file {f}")
+                sftp_client.remove(os.path.join(d, f))
+            print(f"Deleting directory {DW_CONNECTION_ID}:{d}")
+            sftp_client.rmdir(d)

     @task
-    def stageout_results(output_files: list):
-        if not output_files:
+    def stageout_results(output_mappings: dict):
+        """This task transfers the output files to b2share

+        Args:
+            output_mappings (dict): {true_filename, local_path} a dictionary of the output files to be submitted to the remote storage, e.g., b2share
+        Returns:
+            a b2share record
+        """
+        if not output_mappings:
             print("No output to stage out. Nothing more to do.")
             return -1
         connection = Connection.get_connection_from_secrets('default_b2share')
@@ -218,44 +254,65 @@ def docker_in_worker():
             print('Something went wrong with registration', r, r.text)
             return -1

-        for f in output_files:
-            print(f"Uploading {f}")
-            _ = add_file(record=r, fname=f, token=token, remote=f)
+        for [truename, local] in output_mappings.items():
+            print(f"Uploading {truename}")
+            _ = add_file(record=r, fname=local, token=token, remote=truename)
             # delete local
-            # os.unlink(local)
+            os.unlink(local)

         print("Submitting record for pubication")
         submitted = submit_draft(record=r, token=token)
         print(f"Record created {submitted}")

         return submitted['links']['publication']

         # context = get_current_context()
         # process.execute(context)

-    #TODO a cleanup job
-    @task
-    def cleanup_local(errcode, res_fpaths):
-        if type(errcode) == int:
-            print("The data could not be staged out in the repository. Cleaning up")
-            for f in res_fpaths:
-                print(f"Deleting file: {f}")
-                os.remove(f)
-            #delete local copies of file
+    @task()
+    def register(object_url, additional_metadata={}, **kwargs):
+        """This task registers the b2share record into the data catalog
+        Args:
+            object_url: from b2share
+            additional_metadata
+        """
+        params = kwargs['params']
+        reg = params.get('register', False)
+        if not reg:
+            print("Skipping registration as 'register' parameter is not set")
+            return 0
+
+        hook = DataCatalogHook()
+        print("Connected to datacat via hook")
+
+        if not additional_metadata.get('author', False):
+            additional_metadata['author'] = "DLS on behalft of eFlows"
+
+        if not additional_metadata.get('access', False):
+            additional_metadata['access'] = "hook-based"
+
+        entry = DataCatalogEntry(name=f"DLS results {kwargs['run_id']}", url=object_url, metadata=additional_metadata)
+        try:
+            r = hook.create_entry(datacat_type='dataset', entry=entry)
+            print("Hook registration returned: ", r)
+            return f"{hook.base_url}/dataset/{r}"
+        except ConnectionError as e:
+            print('Registration failed', e)
+            return -1

-    data = extract()
-    files = transform(data)
-    data_locations = load(files)
-    output_fnames = run_container(data_locations)
-    ls_results(output_fnames)
-    res_fpaths = retrieve_res(output_fnames)
-    cleanup_doc_worker(res_fpaths)
-    errcode = stageout_results(res_fpaths)
-    cleanup_local(errcode, res_fpaths)
+    input_files = stagein()
+    data_location = move_to_docker_host(input_files)
+    data_on_worker = run_container(data_location)
+    ls_results(data_on_worker)
+    res_fpaths = retrieve_res(data_on_worker, input_files)
+    cleanup_doc_worker(res_fpaths, data_on_worker)
+    url_or_errcode = stageout_results(res_fpaths)
+    register(url_or_errcode)

-    # data >> files >> data_locations >> output_fnames >> ls_results(output_fnames) >> files >> stageout_results(files) >> cleanup()
+    # files >> data_locations >> output_fnames >> ls_results(output_fnames) >> files >> stageout_results(files) >> cleanup()

 dag = docker_in_worker()
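
For context only, a minimal sketch (not part of the commit) of a trigger configuration matching the Data Catalog Integration examples in the module docstring; the DAG id docker_in_worker comes from the decorated function above, and the oid value is purely illustrative:

    # Hypothetical trigger sketch; assumes an Airflow deployment where this DAG
    # and the datacat_integration provider are installed.
    import json

    conf = {
        "oid": "e13bcab6-3664-4090-bebb-defdb58483e0",  # datacat entry id, optionally "dataset/<oid>"
        "image": "ghcr.io/helmholtz-analytics/heat:1.1.1-alpha",
        "entrypoint": "/bin/bash",
        "command": "python demo_knn.py iris.h5 calc_res.txt",
        "register": "True",  # stage-out result gets registered in the data catalog
    }

    # e.g. pass to the CLI: airflow dags trigger docker_in_worker --conf '<json>'
    print(json.dumps(conf))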