esde / machine-learning / ozone-imputation · Commits

Commit 35771cb2
Authored 1 year ago by Clara Betancourt

    Merge branch 'clara_issue_39_try_new_base_models' into devel

Parents: f2000f51, 45950de4
Branches containing commit: devel
Showing 3 changed files with 282 additions and 53 deletions:

  source/models/simple.py                    +262  −36
  source/postprocessing/evaluation_tools.py   +11   −8
  source/visualizations/plots_for_paper.py     +9   −9
source/models/simple.py (+262 −36)

@@ -20,7 +20,8 @@ import torch
 # sklearn
 from sklearn.preprocessing import StandardScaler
-from sklearn.ensemble import RandomForestRegressor
 from sklearn.linear_model import LinearRegression
+from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
+from sklearn.metrics import mean_squared_error, r2_score
 from sklearn.neighbors import NearestNeighbors
@@ -40,7 +41,7 @@ class SpatiotemporalMean:
     """
     A model that just returns the mean of all labels
     """
-    def __init__(self, use_val=False):
+    def __init__(self, use_val=False, cv_seed=None):
         """
         Initializing the class
@@ -104,7 +105,7 @@ class SpatialMean:
     If there is no measurement at all, return the mean of
     the CAMS reanalyses of that time step.
     """
-    def __init__(self, use_val=False):
+    def __init__(self, use_val=False, cv_seed=None):
         """
         Initializing the class
@@ -185,11 +186,100 @@ class SpatialMean:
         self.y_hat = torch.load(self.prediction_path)
 
 
+class Linear:
+    """
+    Training and using linear regression.
+    """
+    def __init__(self, use_val=False, cv_seed=None):
+        """
+        Initializing the class
+        use_val = True means the validation set is used for fitting
+        """
+        self.use_val = use_val
+        self.cv_seed = cv_seed
+        if use_val:
+            prediction_file = 'linear_predictions_useval.pt'
+        else:
+            prediction_file = 'linear_predictions.pt'
+        if cv_seed is not None:
+            directory = settings.resources_dir + 'cross_validation/' + \
+                        str(self.cv_seed) + '/' + 'models/'
+        else:
+            directory = settings.resources_dir + 'models/'
+        self.prediction_path = directory + prediction_file
+
+    def __str__(self):
+        """
+        The name of the simple model.
+        """
+        return 'Linear'
+
+    def read_predictions(self):
+        """
+        Predictions were saved for future use
+        """
+        if not os.path.exists(self.prediction_path):
+            self.save_predictions()
+        self.y_hat = torch.load(self.prediction_path)
+
+    def save_predictions(self, features=None, print_=True):
+        """
+        Train the linear model, then save model and predictions
+        features can be a list of features to use
+        print_ indicates whether to print info or not
+        """
+        if print_:
+            print('getting linear predictions...')
+
+        # read in data
+        ug = UBAGraph()
+        x_df = pd.read_csv(ug.x_path, index_col=0)
+        y_df = pd.read_csv(ug.y_path, index_col=0)
+        if self.cv_seed is not None:
+            p1 = settings.resources_dir
+            p2 = settings.resources_dir + f'cross_validation/{self.cv_seed}/'
+            ug.mask_path = ug.mask_path.replace(p1, p2)
+        mask_df = pd.read_csv(ug.mask_path, index_col=0)
+
+        # use standard features from feature selection if no features given
+        if not features:
+            features = ['cams_o3', 'nightlight', 'day_of_year', 'alt',
+                        'hour_of_day', 'pbl_height', 'temperature',
+                        'day_of_week', 'relative_alt', 'cams_no2', 'type',
+                        'relative_humidity', 'wind_v', 'cams_nox_emissions',
+                        'population_density', 'type_of_area', 'cloud_cover',
+                        'wind_u']
+
+        # prepare data
+        x_df = x_df[features]
+        x_df = get_one_hot(x_df)
+        x = x_df.values
+        if self.use_val:
+            mask = mask_df.train_mask | mask_df.val_mask
+        else:
+            mask = mask_df.train_mask
+        x_train = x_df[mask].values
+        y_train = y_df[mask].values.reshape(-1)
+
+        # fit model
+        model = LinearRegression()
+        model.fit(x_train, y_train)
+
+        # save predictions for all rows
+        y_hat = model.predict(x)
+        y_hat_pytorch = torch.tensor(y_hat.astype(np.float32)).view(-1, 1)
+        torch.save(y_hat_pytorch, self.prediction_path)
+        if print_:
+            print(f'written to {self.prediction_path}')
+
+
 class Cams:
     """
     Use the modeled ozone values from CAMS
     """
-    def __init__(self, use_val=False):
+    def __init__(self, use_val=False, cv_seed=None):
         """
         Initializing the class
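Both new base models reuse the same pattern as the existing simple models: fit on the rows selected by the training mask (plus the validation rows when use_val=True), predict for every row, and cache the predictions as an (n, 1) float32 tensor. A minimal runnable sketch of that pattern on synthetic data (the DataFrame, mask columns, and output path below are illustrative stand-ins for the project's UBAGraph inputs, not the real files):

    # Sketch of the fit-on-mask / predict-everywhere pattern, on synthetic data.
    # 'train_mask' and 'val_mask' mimic the project's mask_df columns.
    import numpy as np
    import pandas as pd
    import torch
    from sklearn.linear_model import LinearRegression

    rng = np.random.default_rng(0)
    x_df = pd.DataFrame(rng.normal(size=(200, 3)), columns=['f1', 'f2', 'f3'])
    y_df = pd.DataFrame({'y': x_df.f1 - 2 * x_df.f2 + rng.normal(scale=.1, size=200)})
    mask_df = pd.DataFrame({'train_mask': rng.random(200) < .6,
                            'val_mask': rng.random(200) < .2})

    use_val = True
    if use_val:
        mask = mask_df.train_mask | mask_df.val_mask
    else:
        mask = mask_df.train_mask

    model = LinearRegression()
    model.fit(x_df[mask].values, y_df[mask].values.reshape(-1))

    # predict for all rows (gaps included) and cache as an (n, 1) float32 tensor
    y_hat = torch.tensor(model.predict(x_df.values).astype(np.float32)).view(-1, 1)
    torch.save(y_hat, 'linear_predictions.pt')  # illustrative path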
@@ -459,6 +549,99 @@ class RandomForest:
         if print_:
             print(f'written to {self.prediction_path}')
 
 
+class GradientBoostingTrees:
+    """
+    Training and using gradient boosting trees.
+    """
+    def __init__(self, use_val=False, cv_seed=None):
+        """
+        Initializing the class
+        use_val = True means the validation set is used for fitting
+        """
+        self.use_val = use_val
+        self.cv_seed = cv_seed
+        if use_val:
+            prediction_file = 'gradientboostingtrees_predictions_useval.pt'
+        else:
+            prediction_file = 'gradientboostingtrees_predictions.pt'
+        if cv_seed is not None:
+            directory = settings.resources_dir + 'cross_validation/' + \
+                        str(self.cv_seed) + '/' + 'models/'
+        else:
+            directory = settings.resources_dir + 'models/'
+        self.prediction_path = directory + prediction_file
+
+    def __str__(self):
+        """
+        The name of the simple model.
+        """
+        return 'GradientBoostingTrees'
+
+    def read_predictions(self):
+        """
+        Predictions were saved for future use
+        """
+        if not os.path.exists(self.prediction_path):
+            self.save_predictions()
+        self.y_hat = torch.load(self.prediction_path)
+
+    def save_predictions(self, print_=True, n_estimators=300,
+                         learning_rate=0.1, max_depth=3):
+        """
+        Train the gradient boosting model, then save predictions
+        print_ indicates whether to print info or not
+        """
+        if print_:
+            print('getting gradient boosted predictions...')
+
+        # read in data
+        ug = UBAGraph()
+        x_df = pd.read_csv(ug.x_path, index_col=0)
+        y_df = pd.read_csv(ug.y_path, index_col=0)
+        if self.cv_seed is not None:
+            p1 = settings.resources_dir
+            p2 = settings.resources_dir + f'cross_validation/{self.cv_seed}/'
+            ug.mask_path = ug.mask_path.replace(p1, p2)
+        mask_df = pd.read_csv(ug.mask_path, index_col=0)
+
+        # use standard features from feature selection
+        features = ['cams_o3', 'nightlight', 'day_of_year', 'alt',
+                    'hour_of_day', 'pbl_height', 'temperature',
+                    'day_of_week', 'relative_alt', 'cams_no2', 'type',
+                    'relative_humidity', 'wind_v', 'cams_nox_emissions',
+                    'population_density', 'type_of_area', 'cloud_cover',
+                    'wind_u']
+
+        # prepare data
+        x_df = x_df[features]
+        x_df = get_one_hot(x_df)
+        x = x_df.values
+        if self.use_val:
+            mask = mask_df.train_mask | mask_df.val_mask
+        else:
+            mask = mask_df.train_mask
+        x_train = x_df[mask].values
+        y_train = y_df[mask].values.reshape(-1)
+
+        # fit model
+        model = GradientBoostingRegressor(n_estimators=n_estimators,
+                                          learning_rate=learning_rate,
+                                          max_depth=max_depth,
+                                          random_state=settings.random_seed)
+        model.fit(x_train, y_train)
+
+        # save predictions for all rows
+        y_hat = model.predict(x)
+        y_hat_pytorch = torch.tensor(y_hat.astype(np.float32)).view(-1, 1)
+        torch.save(y_hat_pytorch, self.prediction_path)
+        if print_:
+            print(f'written to {self.prediction_path}')
+
+
 def tune_nnh():
     """
     tuning the samples parameter of NearestNeighborHybrid
@@ -484,6 +667,37 @@ def tune_nnh():
     print('\n\n\n')
 
 
+def tune_gradient_boosted_trees():
+    """
+    Tuning the hyperparameters of the gradient boosted tree model
+    """
+    print('tune gradient boosted tree model...')
+
+    # UBA Graph data
+    ug = UBAGraph()
+    ug.get_dataset()
+    mask_df = pd.read_csv(ug.mask_path, index_col=0)
+    y = ug[0].y.numpy().reshape(-1)[mask_df['val_mask']]
+
+    # for n_estimators in [100, 200, 300, 400, 500]:
+    #     print('\nn_estimators=', n_estimators)
+    # for max_depth in [4, 6, 8, 10]:
+    #     print('\nmax_depth=', max_depth)
+    for learning_rate in [0.2, 0.25, 0.05, 0.1, 0.15]:
+        print('\nlearning_rate=', learning_rate)
+        gbt = GradientBoostingTrees()
+        # gbt.save_predictions(n_estimators=n_estimators)
+        # gbt.save_predictions(max_depth=max_depth)
+        gbt.save_predictions(learning_rate=learning_rate)
+        gbt.read_predictions()
+        y_hat = gbt.y_hat.numpy().reshape(-1)[mask_df['val_mask']]
+        rmse = (mean_squared_error(y, y_hat)) ** .5
+        r2 = r2_score(y, y_hat)
+        print(f'val RMSE: {rmse:.3f}, val R2: {r2:.3f}')
+        print('\n')
 
 
 def select_rf_features():
     """
     Choose features for the simple model.
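The sweep above scores each learning rate by RMSE and R2 on the validation mask. A self-contained sketch of the same loop on synthetic data (the fixed n_estimators=300 and max_depth=3 mirror the defaults above; the data and the train_test_split are illustrative, not the project's UBA graph):

    # Standalone sketch of the learning-rate sweep, on synthetic data.
    import numpy as np
    from sklearn.ensemble import GradientBoostingRegressor
    from sklearn.metrics import mean_squared_error, r2_score
    from sklearn.model_selection import train_test_split

    rng = np.random.default_rng(0)
    x = rng.normal(size=(1000, 5))
    y = np.sin(x[:, 0]) + x[:, 1] ** 2 + rng.normal(scale=.1, size=1000)
    x_train, x_val, y_train, y_val = train_test_split(x, y, random_state=0)

    for learning_rate in [0.05, 0.1, 0.15, 0.2, 0.25]:
        gbt = GradientBoostingRegressor(n_estimators=300,
                                        learning_rate=learning_rate,
                                        max_depth=3, random_state=0)
        gbt.fit(x_train, y_train)
        y_hat = gbt.predict(x_val)
        rmse = mean_squared_error(y_val, y_hat) ** .5
        print(f'lr={learning_rate}: val RMSE {rmse:.3f}, '
              f'val R2 {r2_score(y_val, y_hat):.3f}')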
@@ -587,8 +801,9 @@ def evaluate_models(cv_seed=None):
         models = [NearestNeighborHybrid, RandomForest]
         use_vals = [True]
     else:
-        models = [SpatiotemporalMean, SpatialMean, Cams,
-                  NearestNeighborHybrid, RandomForest]
+        models = [SpatiotemporalMean, SpatialMean, Linear, Cams,
+                  NearestNeighborHybrid, RandomForest, GradientBoostingTrees]
         use_vals = [False, True]
     specs = [(model, use_val) for model in models for use_val in use_vals]
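The specs comprehension enumerates every (model, use_val) combination, so adding Linear and GradientBoostingTrees grows the evaluation grid from 10 to 14 runs. A quick illustration (strings stand in for the real model classes):

    # The evaluation grid built by the specs comprehension (illustrative).
    models = ['SpatiotemporalMean', 'SpatialMean', 'Linear', 'Cams',
              'NearestNeighborHybrid', 'RandomForest', 'GradientBoostingTrees']
    use_vals = [False, True]
    specs = [(model, use_val) for model in models for use_val in use_vals]
    print(len(specs))  # 14: 7 models x 2 mask settings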
@@ -633,34 +848,45 @@ def tune_and_save_models():
     Also save the predictions where the validation set is used
     for training.
     """
-    model = SpatiotemporalMean()
-    model.save_predictions()
-    model = SpatiotemporalMean(use_val=True)
-    model.save_predictions()
+    # model = SpatiotemporalMean()
+    # model.save_predictions()
+    # model = SpatiotemporalMean(use_val=True)
+    # model.save_predictions()
+    # model = SpatialMean()
+    # model.save_predictions()
+    # model = SpatialMean(use_val=True)
+    # model.save_predictions()
+    # model = Linear()
+    # model.save_predictions()
+    # model = Linear(use_val=True)
+    # model.save_predictions()
-    model = SpatialMean()
-    model.save_predictions()
-    model = SpatialMean(use_val=True)
-    model.save_predictions()
-    # model = Cams()
-    # model.save_predictions()
-    # model = Cams(use_val=True)
-    # model.save_predictions()
+    model = Cams()
+    model.save_predictions()
+    model = Cams(use_val=True)
+    model.save_predictions()
-    # tune_nnh()
-    # model = NearestNeighborHybrid()
-    # model.save_predictions()
-    # model = NearestNeighborHybrid(use_val=True)
-    # model.save_predictions()
+    tune_nnh()
+    model = NearestNeighborHybrid()
+    model.save_predictions()
+    model = NearestNeighborHybrid(use_val=True)
+    model.save_predictions()
-    # select_rf_features()
-    # model = RandomForest()
-    # model.save_predictions()
-    # model = RandomForest(use_val=True)
-    # model.save_predictions()
+    select_rf_features()
+    model = RandomForest()
+    model.save_predictions()
+    model = RandomForest(use_val=True)
+    model.save_predictions()
+    tune_gradient_boosted_trees()
+    # model = GradientBoostingTrees()
+    # model.save_predictions()
+    # model = GradientBoostingTrees(use_val=True)
+    # model.save_predictions()
-    evaluate_models()
+    # evaluate_models()
 
 
 def save_models_for_cross_validation():
@@ -683,5 +909,5 @@ def save_models_for_cross_validation():
 if __name__ == '__main__':
-    # tune_and_save_models()
-    save_models_for_cross_validation()
+    tune_and_save_models()
+    # save_models_for_cross_validation()
source/postprocessing/evaluation_tools.py (+11 −8)

@@ -364,9 +364,11 @@ def get_characteristics_df(how='read', cv_seed=None):
     lin_file = 'nearestneighborhybrid_predictions_useval.pt'
     stm_cs_file = 'SpatiotemporalMean_cs_predictions_useval.pyt'
     sm_cs_file = 'SpatialMean_cs_predictions_useval.pyt'
+    cams_cs_file = 'Cams_cs_predictions_useval.pyt'
     nnh_cs_file = 'NearestNeighborHybrid_cs_predictions_useval.pyt'
     rf_cs_file = 'RandomForest_cs_predictions_useval.pyt'
     lin_imp = torch.load(model_path + lin_file).numpy().reshape(-1)
+    cams_cs_imp = torch.load(model_path + cams_cs_file).numpy().reshape(-1)
     nnh_cs_imp = torch.load(model_path + nnh_cs_file).numpy().reshape(-1)
     rf_cs_imp = torch.load(model_path + rf_cs_file).numpy().reshape(-1)
     if cv_seed is None:
@@ -374,8 +376,8 @@ def get_characteristics_df(how='read', cv_seed=None):
     sm_cs_imp = torch.load(model_path + sm_cs_file).numpy().reshape(-1)
 
     # set up df
-    columns = ['y_true', 'y_imputed', 'y_stm_cs', 'y_sm_cs', 'y_nnh_cs', 'y_rf_cs',
+    columns = ['y_true', 'y_imputed', 'y_stm_cs', 'y_sm_cs', 'y_cams_cs', 'y_nnh_cs', 'y_rf_cs',
                'gap_len', 'correlated', 'station_id', 'n_neighbors', 'test_mask']
     df = pd.DataFrame(columns=columns, index=y_true_df.index)
@@ -383,6 +385,7 @@ def get_characteristics_df(how='read', cv_seed=None):
     # columns that we can take from other data frames
     df.y_true = y_true_df.y
     df.y_nnh_cs = nnh_cs_imp
+    df.y_cams_cs = cams_cs_imp
     df.y_rf_cs = rf_cs_imp
     df.station_id = reg_df.station_id
     df.test_mask = mask_df.test_mask
@@ -403,8 +406,8 @@ def get_characteristics_df(how='read', cv_seed=None):
     for gap_idx, gap_row in gap_df_filtered.iterrows():
         if gap_idx % 5000 == 0:
             print(f'{gap_idx/len(gap_df_filtered)*100:.2f} %')
-            df.to_csv(save_path)
-            print(f'intermediate save to {save_path}')
+            # df.to_csv(save_path)
+            # print(f'intermediate save to {save_path}')
         start_idx = gap_row.start_idx
         end_idx = gap_row.start_idx + gap_row.len - 1
         len_ = gap_row.len
@@ -500,8 +503,8 @@ if __name__ == '__main__':
     test_bootstrap_ = False
     count_exceedances_ = False
     number_of_neighbors_ = False
-    get_characteristics_df_ = False
-    evaluate_cross_validation_ = True
+    get_characteristics_df_ = True
+    evaluate_cross_validation_ = False
 
     if index_of_agreement_:
         # test index of agreement should be .959
@@ -546,8 +549,8 @@ if __name__ == '__main__':
     if get_characteristics_df_:
         # df = get_characteristics_df(how='read')
         df = get_characteristics_df(how='prepare')
-        for cv_seed in settings.cv_seeds:
-            get_characteristics_df(how='prepare', cv_seed=cv_seed)
+        # for cv_seed in settings.cv_seeds:
+        #     get_characteristics_df(how='prepare', cv_seed=cv_seed)
     if evaluate_cross_validation_:
         evaluate_cross_validation()
source/visualizations/plots_for_paper.py (+9 −9)

@@ -690,8 +690,8 @@ def n_neighbors_vs_r2():
     df = get_characteristics_df()
     n_list = np.unique(df.n_neighbors).tolist()
     r2_n_dict = {}
-    models = ['y_stm_cs', 'y_sm_cs', 'y_nnh_cs', 'y_rf_cs']
-    colors = ['rosybrown', 'lightcoral', 'firebrick', 'blue']
+    models = ['y_stm_cs', 'y_sm_cs', 'y_cams_cs', 'y_nnh_cs', 'y_rf_cs']
+    colors = ['rosybrown', 'lightcoral', 'red', 'firebrick', 'blue']
     for model in models:
         r2_n_list = []
         for n in n_list:
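With the y_cams_cs column now in the characteristics frame, this plot groups predictions by neighbor count and scores each model with R2. A small self-contained sketch of that grouping and plotting logic (the frame below is synthetic; the real df comes from get_characteristics_df):

    # Sketch of the R2-vs-#neighbors curves, on a synthetic characteristics frame.
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    from sklearn.metrics import r2_score

    rng = np.random.default_rng(0)
    df = pd.DataFrame({'n_neighbors': rng.integers(1, 6, size=500),
                       'y_true': rng.normal(size=500)})
    for model, noise in [('y_sm_cs', .9), ('y_cams_cs', .5), ('y_rf_cs', .2)]:
        df[model] = df.y_true + rng.normal(scale=noise, size=500)

    fig, ax = plt.subplots()
    n_list = sorted(df.n_neighbors.unique())
    for model, color in [('y_sm_cs', 'lightcoral'), ('y_cams_cs', 'red'),
                         ('y_rf_cs', 'blue')]:
        r2s = [r2_score(df.y_true[df.n_neighbors == n],
                        df[model][df.n_neighbors == n])
               for n in n_list]
        ax.plot(n_list, r2s, color=color, label=model)
    ax.set_xlabel('# neighbors')
    ax.set_ylabel('R2 score')
    fig.savefig('n_neigh_vs_r2_sketch.png')  # illustrative output path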
@@ -722,7 +722,7 @@ def n_neighbors_vs_r2():
     ax.set_ylim(-.1, 1.1)
     ax.set_xlabel('# neighbors')
     ax.set_ylabel('R2 score')
-    ax.legend()
+    # ax.legend()
     # save
     n_neigh_vs_r2_path = settings.output_dir + 'n_neigh_vs_r2.png'
@@ -741,8 +741,8 @@ def gap_len_vs_r2():
     df = get_characteristics_df()
     len_list = list(range(1, 51))
     r2_len_dict = {}
-    models = ['y_stm_cs', 'y_sm_cs', 'y_nnh_cs', 'y_rf_cs']
-    colors = ['rosybrown', 'lightcoral', 'firebrick', 'blue']
+    models = ['y_stm_cs', 'y_sm_cs', 'y_cams_cs', 'y_nnh_cs', 'y_rf_cs']
+    colors = ['rosybrown', 'lightcoral', 'red', 'firebrick', 'blue']
     for model in models:
         r2_len_list = []
         for len_ in len_list:
@@ -776,7 +776,7 @@ def gap_len_vs_r2():
                linestyle='--')
     plt.xlim(-.9, 30)
     plt.ylim(-.3, 1.1)
-    plt.legend()
+    # plt.legend()
     plt.xlabel('gap length')
     plt.ylabel('R2 score')
@@ -899,12 +899,12 @@ if __name__ == '__main__':
     station_ids_on_map_ = False
     o3_value_matrix_ = False
     mask_matrix_ = False
-    time_series_ = True
+    time_series_ = False
     basic_statistics_and_histogram_ = False
     exceedances_ = False
     true_vs_imputed_ = False
-    n_neighbors_vs_r2_ = False
-    gap_len_vs_r2_ = False
+    n_neighbors_vs_r2_ = True
+    gap_len_vs_r2_ = True
     imputed_dataset_matrix_ = False
     modeled_time_series_ = False