diff --git a/Code/JSTARS_code.py.py b/Code/JSTARS_code.py.py index c1c4f136b56fadbd6d79e545716047bdeea71ed5..5dd418deadd12c2a8d6fa63049f826b525cac670 100644 --- a/Code/JSTARS_code.py.py +++ b/Code/JSTARS_code.py.py @@ -20,19 +20,14 @@ from sklearn.model_selection import KFold from sklearn.metrics import make_scorer from sklearn.model_selection import GridSearchCV from sklearn.kernel_ridge import KernelRidge - - import embedding as emb import library as lib - #%% - ##### IMPORTANT ##### change the dataset path with the location where the dataset is stored ##### in the code the dataset is tored int he arrays X and y that are numpy ndarrays - dataset_path='insert datset path here' print('dataset_path:') print(dataset_path) @@ -40,9 +35,7 @@ print(dataset_path) X=np.load(dataset_path+'X.npy') y=np.load(dataset_path+'y.npy') - #%% - sample_dimension=200 sample_size=sample_dimension n_train_samples=sample_dimension @@ -50,25 +43,15 @@ n_test_samples=sample_dimension initial_seed=0 final_seed=10 - selected_seeds=list(range(initial_seed,final_seed)) - -#%% - A_mu=0 A_sigma=0.01 #0.5 - b_min=0 b_max=1 #3 - n_reads=1000 solver_parameters={'num_reads':n_reads} - -#%% b_choice=False #whether or not to add the term b in the quantum encoding -#%% - r2_storing={} mse_storing={} predictions_storing={} @@ -77,9 +60,7 @@ domain=['log','original'] #results_table=np.zeros((len(seeds),6)) models=['SVR','Kernel Ridge', 'GPR'] - hp_storing={m:{} for m in models} - metric_scorer=make_scorer(r2_score) #%% @@ -97,7 +78,7 @@ total_E=samples_per_run*E #if such an embedding is not found, a new one is calculated and saved #IMPORTANT: the same embedding might not work for different Advantage systems (Advantage4.1, Advantage6.1, Advantage5.2) -embedding_saving_path=emb.embedding_ind_path #modify with the path where ypu want to store the embeddings +embedding_saving_path=emb.embedding_ind_path #embedding_ind_path shpuld contain the location in which the embeddings are saved path=embedding_saving_path+'N_{}_q_{}\\'.format(E*samples_per_run,q) print('problem shape: {} {}'.format(q,total_E)) @@ -112,27 +93,21 @@ else: os.mkdir(path) emb.save_embedding(embedding, path) -#%% if num_samples % samples_per_run==0: print('number of samples can be divided by the number of runs') n_iter=int(num_samples/samples_per_run) print('number of iterations: ',n_iter) -#%% - nodes_table=np.zeros((total_E,q)) for i in range(total_E): for j in range(q): nodes_table[i,j]= i*q +j - - + test_d={} for it in range(samples_per_run): test_d[it]=nodes_table[E*it:E*(it+1)] - - #%% generates the list of E A matrixes of size qxP #P is the dimensionality of the feature vector @@ -147,8 +122,6 @@ for i in range(E): b_list.append(b_gen.uniform(low=b_min, high=b_max, size=(q,1))) #%% - - seeds=selected_seeds for random_seed in selected_seeds: @@ -167,8 +140,6 @@ for random_seed in selected_seeds: y_train_dict={} y_test_dict={} - #%% - X_train_mean=np.mean(X_train, axis=0) X_train_std=np.std(X_train, axis=0) @@ -186,14 +157,11 @@ for random_seed in selected_seeds: y_train_initial=y_train.copy() y_test_initial=y_test.copy() - #%% - X_train_original=np.exp(X_train_initial) X_test_original=np.exp(X_test_initial) y_train_original=np.exp(y_train_initial) y_test_original=np.exp(y_test_initial) - X_train_dict={'log':X_train_initial, 'original':np.exp(X_train_initial)} @@ -206,19 +174,14 @@ for random_seed in selected_seeds: y_test_dict={'log':y_test_initial, 'original':np.exp(y_test_initial)} - - - #%% - + X_train=X_train_initial.copy() y_train=y_train_initial.copy() X_test=X_test_initial.copy() y_test=y_test_initial.copy() - #%% tr_matrix=np.zeros((N,E*q)) test_matrix=np.zeros((X_test.shape[0],E*q)) - #%% print('Number of training samples',X_train.shape[0]) @@ -252,7 +215,7 @@ for random_seed in selected_seeds: record=sampleset.record sample=record.sample n_occ=record.num_occurrences - + #sample_mean=np.average(sample, axis=0) sample_mean=np.average(sample, axis=0, weights=n_occ) @@ -262,7 +225,6 @@ for random_seed in selected_seeds: tr_matrix[samples_per_run*it:(it+1)*samples_per_run]=it_matrix.copy() - #%% test_phase print('Number of test samples',X_test.shape[0]) @@ -296,38 +258,27 @@ for random_seed in selected_seeds: record=sampleset.record sample=record.sample n_occ=record.num_occurrences - #sample_mean=np.average(sample, axis=0) sample_mean=np.average(sample, axis=0, weights=n_occ) - it_matrix=np.zeros((samples_per_run,E*q)) for s in range(samples_per_run): it_matrix[s,:]=sample_mean[s*(E*q):(s+1)*(E*q)] test_matrix[samples_per_run*it:(it+1)*samples_per_run]=it_matrix.copy() - #%% - n_e_check=True if n_e_check==True: - print('normalizing vectors') - print('@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@') + print('normalizing vectors') tr_matrix=tr_matrix*(1/np.sqrt(E)) test_matrix=test_matrix*(1/np.sqrt(E)) - #%% - print('Running hyperparameter optimization for SVR') - kf=KFold(n_splits=5) - C_values=np.arange(-8,8) C_values=np.power(2.0,C_values) - - epsilon_values=[0.1,0.01,0.001] #epsilon_values=[0.1] @@ -340,105 +291,69 @@ for random_seed in selected_seeds: n_config=len(C_values)*len(epsilon_values)*len(tol_values) - - svr_hp=LinearSVR(max_iter=10000) - svr_hp_search=GridSearchCV(svr_hp, hyperparameters, scoring=metric_scorer, cv=kf) svr_hp_search.fit(tr_matrix, y_train) - - #%% svr_hp_results=svr_hp_search.cv_results_ svr_best_hp=svr_hp_search.best_params_ svr_best_r2=svr_hp_results['mean_test_score'][np.argmin(svr_hp_results['rank_test_score'])] - hp_storing['SVR'][random_seed]=svr_best_hp - #%% print('check') lin_svr=LinearSVR(max_iter=10000,**svr_best_hp) lin_svr.fit(tr_matrix, y_train) - svr_pred_log=lin_svr.predict(test_matrix) svr_pred_original=np.exp(svr_pred_log) - svr_pred_dict={'log':svr_pred_log, 'original':svr_pred_original} predictions_dict['SVR']=svr_pred_dict - r2_dict['SVR']={d: r2_score(y_test_dict[d], svr_pred_dict[d]) for d in domain} mse_dict['SVR']={d: mse(y_test_dict[d], svr_pred_dict[d]) for d in domain} - - #%% Kernel Ridge - + ################################################################################### -################################################################################### - - +################################################################################### print('Running hyperparameter optimization for Kernel Ridge') - kf=KFold(n_splits=5) - - C_values=np.arange(-7,8) C_values=np.power(2.0,C_values) - kr_alpha_values=1/(2*C_values) - hyperparameters={'alpha':kr_alpha_values, } - - - - kr_hp=KernelRidge(kernel='linear') - + + kr_hp=KernelRidge(kernel='linear') kr_hp_search=GridSearchCV(kr_hp, hyperparameters, scoring=metric_scorer, cv=kf) kr_hp_search.fit(tr_matrix, y_train) - #%% kr_hp_results=kr_hp_search.cv_results_ kr_best_hp=kr_hp_search.best_params_ kr_best_r2=kr_hp_results['mean_test_score'][np.argmin(kr_hp_results['rank_test_score'])] - hp_storing['Kernel Ridge'][random_seed]=kr_best_hp - #%% lin_kr=KernelRidge(kernel='linear', **kr_best_hp) lin_kr.fit(tr_matrix, y_train) - kr_pred_log=lin_kr.predict(test_matrix) kr_pred_original=np.exp(kr_pred_log) - kr_pred_dict={ 'log':kr_pred_log, 'original':kr_pred_original} predictions_dict['Kernel Ridge']=kr_pred_dict - r2_dict['Kernel Ridge']={d: r2_score(y_test_dict[d], kr_pred_dict[d]) for d in domain} mse_dict['Kernel Ridge']={d: mse(y_test_dict[d], kr_pred_dict[d]) for d in domain} - ############################################################################# ############################################################################# - - #%% GPR print('Running hyperparameter optimization for GPR') - kf=KFold(n_splits=5) - gpr_alpha_values=[-10,-9,-8,-7,-6,-5,-4,-3,-2] gpr_alpha_values=np.power(10.0,gpr_alpha_values) - hyperparameters={'alpha': gpr_alpha_values} - - + #gpr_hp=GPR(kernel=DotProduct()) gpr_hp=GPR(kernel=DotProduct(sigma_0=0.0, sigma_0_bounds='fixed')) - gpr_hp_search=GridSearchCV(gpr_hp, hyperparameters, scoring=metric_scorer, cv=kf) gpr_hp_search.fit(tr_matrix, y_train) @@ -446,45 +361,33 @@ for random_seed in selected_seeds: gpr_hp_results=gpr_hp_search.cv_results_ gpr_best_hp=gpr_hp_search.best_params_ gpr_best_r2=gpr_hp_results['mean_test_score'][np.argmin(gpr_hp_results['rank_test_score'])] - hp_storing['GPR'][random_seed]=gpr_best_hp - #%% #lin_gpr=GPR(kernel=DotProduct(sigma_0=0.0), **gpr_best_hp) lin_gpr=GPR(kernel=DotProduct(sigma_0=0.0, sigma_0_bounds='fixed'), **gpr_best_hp) - lin_gpr.fit(tr_matrix, y_train) - gpr_pred_log=lin_gpr.predict(test_matrix) gpr_pred_original=np.exp(gpr_pred_log) - gpr_pred_dict={'log':gpr_pred_log, 'original':gpr_pred_original} predictions_dict['GPR']=gpr_pred_dict - r2_dict['GPR']={d: r2_score(y_test_dict[d], gpr_pred_dict[d]) for d in domain} mse_dict['GPR']={d: mse(y_test_dict[d], gpr_pred_dict[d]) for d in domain} - r2_storing[random_seed]=r2_dict mse_storing[random_seed]=mse_dict predictions_storing[random_seed]=predictions_dict - -#%% + del tr_matrix del test_matrix -#%% - for k1 in r2_storing.keys(): for k2 in r2_storing[k1].keys(): for k3 in r2_storing[k1][k2].keys(): r2_storing[k1][k2][k3]=np.around(r2_storing[k1][k2][k3],4) #%% - seeds=selected_seeds - col_labels=[m+' '+d for m in models for d in domain] r2_table=np.zeros((len(seeds),6)) @@ -493,29 +396,20 @@ for i in range(len(seeds)): for k,m in enumerate(models): for l,d in enumerate(domain): r2_table[i,2*k+l]=r2_storing[current_seed][m][d] - -#%% - + if not type(seeds)==list: seeds=[seeds] df_r2=pd.DataFrame(r2_table, index=seeds, columns=col_labels) - - mse_table=np.zeros((len(seeds),6)) - for i in range(len(seeds)): current_seed=seeds[i] for k,m in enumerate(models): for l,d in enumerate(domain): mse_table[i,2*k+l]=mse_storing[current_seed][m][d] - - + df_mse=pd.DataFrame(mse_table, index=seeds, columns=col_labels) - - - #%% def r2_dict_to_df(res_dict): mod=['SVR', 'Kernel Ridge', 'GPR'] @@ -543,6 +437,7 @@ def pred_dict_to_df(p_dict): return res #%% +## The following code part is used to save the results for the experiments saving_path='insert path destination in which the results will be saved' if not os.path.exists(saving_path): @@ -550,10 +445,7 @@ if not os.path.exists(saving_path): now=datetime.now() date_time=str(now) - - date_time=date_time[0:19] - date_array=[] for i in range(len(date_time)): @@ -570,8 +462,6 @@ results_path=saving_path+date_str+'__{}__{}'.format(initial_seed, final_seed) #%% os.mkdir(results_path) - - f=open(results_path+'\\info.txt','w') f.write(dataset_path) f.write('\n') @@ -613,16 +503,8 @@ matrix_path=results_path+'\\'+'random_matrices\\' os.mkdir(matrix_path) A_array_saving=lib.list_to_array(A_list) - b_list_modified=[s.reshape((1,-1)) for s in b_list] b_array_saving=lib.list_to_array(b_list_modified) - np.save(matrix_path+'A_matrix_list.npy', A_array_saving) np.save(matrix_path+'b_vector_list.npy', b_array_saving) - - - - - -