From 5d2b51ee721e818724123b937dfa9c5305c49324 Mon Sep 17 00:00:00 2001 From: Sandipan Mohanty <s.mohanty@fz-juelich.de> Date: Tue, 29 Apr 2025 14:45:47 +0200 Subject: [PATCH] Add an MC minimization stage in regularize for bad crds When there are missing atoms in a PDB file, we end up with many undetermined coordinates in the population. RMSD minimization can not help us get good values for these. When a ProFASi simulation starts with such a structure however, we have to fill in all atoms. The ones originally missing in the PDB will have their XYZ coordinates calculated based on nominal default values for those coordinates, which may put them in overlapping positions with other atoms. This leads to very high energies when we start our energy minimization rounds (in billions of k_B T). When the energy is that high, any MC update which resolves the clash, no matter how detrimental to other energy terms and the RMSD, will be accepted. In those cases, "regularize" fails to produce any reasonable structure. Here we try one round of MC based energy minimization using only the bad, or indeterminate, coordinates. If the clash can be resolved like this, we will have a much better starting position for energy minimization at the same RMSD value as obtained in the first stage of regularization: the pure RMSD minimization. As it turns out, the result is a bit mixed. In some structures it does wonders, in others, it has no effect while consuming some more time. --- app/regularize.cc | 184 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 144 insertions(+), 40 deletions(-) diff --git a/app/regularize.cc b/app/regularize.cc index 9abd64e3..ed3fe881 100644 --- a/app/regularize.cc +++ b/app/regularize.cc @@ -46,7 +46,8 @@ void register_internal_plugins() register_molmc_plugins(); } -auto minimize_rmsd(Population* p, const SelectionFromFile& ifilesel, double goodenoughrmsd = 0.) -> double +auto minimize_rmsd(Population* p, const SelectionFromFile& ifilesel, + const std::vector<CoordinateURL>& badcrds, double goodenoughrmsd = 0.) -> double { if (p == nullptr) throw prf::Exception { "Null population given to minimize_rmsd", __func__ }; @@ -55,6 +56,9 @@ auto minimize_rmsd(Population* p, const SelectionFromFile& ifilesel, double good PDBReader pdbinput { ifilesel.filename() }; if (pdbinput.read_matrix() == 0) throw prf::Exception { FMT("Read error while parsing PDB file '{}'", ifilesel.filename()), __func__ }; + + prf::fix_coordinates(*p, badcrds); + for (auto ich = 0UL; ich < pc->NumberOfChains(); ++ich) { StateProperties sp; auto populationside = FMT("$::{:c}", 'A' + ich); @@ -64,8 +68,7 @@ auto minimize_rmsd(Population* p, const SelectionFromFile& ifilesel, double good rmsd.set_struc2(populationside); rmsd.filter("+HV+@H"); rmsd.init(*p, sp); - std::cout << "Starting RMSD (all atoms, chain " << ich << ") = " - << rmsd(*p, sp) << "\n"; + std::cout << FMT("Starting RMSD (all atoms, chain {}) = {}\n", ich, rmsd(*p, sp)); auto&& crds = p->state().generalizedCoordinates(); std::vector<prf::CoordinateURL> provisionally_fixed; for (prf::gc::Storage::size_type idof {}; idof != crds.size(); ++idof) { @@ -124,10 +127,13 @@ auto minimize_rmsd(Population* p, const SelectionFromFile& ifilesel, double good prf::enforce_PBC(*p); p->reconstruct(); auto alignment = rmsd.get_alignment(p->state()); - std::cout << "Alignment translation = " << alignment.transformation.translation() << "\n"; + auto trv = alignment.transformation.translation(); + std::cout << FMT("chain {}, alignment translation = ({}, {}, {}\n", + ich, trv[0], trv[1], trv[2]); p->transform(alignment.transformation); for (auto& url : provisionally_fixed) { - crds.unfix(url); + if (not prf::contains(badcrds, url)) + crds.unfix(url); } crds.rehash(); ans = mcg.value; @@ -184,14 +190,63 @@ auto minimize_rmsd(Population* p, const SelectionFromFile& ifilesel, double good prf::enforce_PBC(*p); p->reconstruct(); auto alignment = rmsd.get_alignment(p->state()); - std::cout << "Alignment translation = " << alignment.transformation.translation() << "\n"; + // std::cout << "Alignment translation = " << alignment.transformation.translation() << "\n"; p->transform(alignment.transformation); p->reconstruct(); ans = mcg.value; } + prf::unfix_coordinates(*p, badcrds); + p->reconstruct(); return ans; } +template <class F, class Delta> +auto MC_minimize(Population* p, StateProperties& sp, + F&& f, Delta&& delta, Temperature T, std::size_t ncyc, + std::span<const CoordinateURL> keepfixed) +{ + prf::fix_coordinates(*p, keepfixed); + MolMCUpdater upd; + // upd.parseCommands(cmds); + upd.init(*p); + upd.set_n_temperatures(1UL); + upd.set_T_index(0UL); + size_t ndof = upd.sweep_length(); + auto dofs = p->get_dof(); + auto mincoords = dofs; + auto gen = prf::RNGCache(prf::RandomNumberHandler::generator()); + prf::MH::MCSweep mc(T.beta(), ndof, gen, 1UL); + p->state().xyz().initcells(); + auto ecur = f(*p, sp); + auto emin = ecur; + std::cout << "MC minimization: " << ncyc << " cycles over " << ndof << " degrees of freedom\n"; + for (int icyc = 0; icyc < ncyc; ++icyc) { + + mc.advance( + *p, sp, upd, delta, f, + [&](prf::ConformationChange& cc, prf::StateProperties& sp1, prf::StateProperties& sp2) { + f.apply_move(cc, sp1, sp2); + }, + [&](prf::ConformationChange& cc, prf::StateProperties& sp1, prf::StateProperties& sp2) { + f.undo_move(cc, sp1, sp2); + }, + { ecur }); + enforce_PBC(*p); + p->reconstruct(); + auto newe = f(*p, sp); + + if (newe < emin) { + p->get_dof(mincoords); + emin = newe; + std::cout << "New minimum energy " << emin << "\n"; + } + std::cout << "Finished round " << icyc << " of MC with energy " << newe << "\n"; + } + p->set_dof(mincoords); + p->reconstruct(); + prf::unfix_coordinates(*p, keepfixed); + return emin; +} auto main(int argc, char* argv[]) -> int { using namespace std::string_literals; @@ -217,6 +272,8 @@ auto main(int argc, char* argv[]) -> int prog->registerOption({ "settings_file", "sf", 1, "(Use a settings file to adjust parameters.)" }); prog->registerOption({ "initial_structure", "start", 1, "(where to start, if different from the one being regularized)" }); + prog->registerOption({ "save_preminimization_structure", "pre", 1, + "(File name to save pre-minimization structure after reading PDB coordinates)" }); prog->registerOption({ "num_e_minim_cycles", "nec", 1, "(number of minimization cycles of E+k*RMSD)" }); prog->registerOption({ "initial_RMSD_factor", "f0", 1, "(initial factor k in E+k*RMSD)" }); @@ -232,6 +289,7 @@ auto main(int argc, char* argv[]) -> int "(Store a snapshot \"snapshot_xyz.xml\" when a new minimum energy is recorded.)" }); prog->registerOption({ "threads_per_task", "threads", 1, "(Number of threads to use for energy (value, gradient) evaluations)" }); + prog->registerSwitch({ "verbose_output", "v", false, "(Verbose logs on standard output)" }); InstructionList cmds; cmds += InstructionString("Pivot kappa 0"); cmds += InstructionString("Rot kappa 20"); @@ -246,6 +304,8 @@ auto main(int argc, char* argv[]) -> int cmds += settings.instructions(); } prf::Logger::setVerbosity(cmds.get_or("log_level", 20UL)); + auto vbose = cmds.get_or("verbose_output", false); + Logger blog(10); auto nstages = cmds.get_or(prog->longName("nec"), 9); auto rmsfact0 = cmds.get_or(prog->longName("f0"), 256.0); @@ -273,17 +333,25 @@ auto main(int argc, char* argv[]) -> int tbb::global_control tbbctl(tbb::global_control::max_allowed_parallelism, nthr); prf::parseCommands(ph, populationsetup); auto&& p = ph.population(); + auto&& ps = p->state(); + auto&& crds = ps.generalizedCoordinates(); - p->state().generalizedCoordinates().rehash(); + crds.rehash(); p->reconstruct(); - { - std::ofstream initstruct { "initstruct.xml" }; + auto badcrds = p->undetermined_after_import(); + if (not badcrds.empty()) { + std::cout << "The following coordinates could not be determined while reading the PDB structure.\n"; + for (int counter = 0; const auto& url : badcrds) { + std::cout << FMT("{: >25}", url.str()) << ((++counter) % 3 == 0 ? "\n" : " "); + } + std::cout << "\n"; + } + + if (prog->optionGiven("save_preminimization_structure")) { + std::ofstream initstruct { cmds.get_or("save_preminimization_structure", + "initstruct.xml") }; p->Write_XML(initstruct); } - upd.parseCommands(cmds); - upd.init(*p); - upd.set_n_temperatures(1UL); - upd.set_T_index(0UL); StateProperties sp; ProteinRMSD rmsd; @@ -295,7 +363,7 @@ auto main(int argc, char* argv[]) -> int auto t0 = std::chrono::steady_clock::now(); - auto rmsdmin = minimize_rmsd(p, ifilesels, goodenoughrmsd.value_or(0.)); + auto rmsdmin = minimize_rmsd(p, ifilesels, badcrds, goodenoughrmsd.value_or(0.)); std::mutex rmx, grmx; JacobianCarTor J(*p); @@ -320,12 +388,17 @@ auto main(int argc, char* argv[]) -> int if (prog->switchGiven("g")) return 0; + + upd.parseCommands(cmds); + upd.init(*p); + upd.set_n_temperatures(1UL); + upd.set_T_index(0UL); blog << "Initialising force field...\n"; if (auto&& oip = ph.interaction_potential(); oip.has_value()) { auto&& ip = *oip; ip.init(*ph.population(), sp); auto ffmaxcutoff = ip.cutoff(); - auto&& xyz = ph.population()->state().xyz(); + auto&& xyz = ps.xyz(); if (ffmaxcutoff) { std::cout << "Forcefield defines a maximum cutoff of " << *ffmaxcutoff << " Angstroms\n"; @@ -338,8 +411,11 @@ auto main(int argc, char* argv[]) -> int } auto fhname = ip.name(); std::cout << "Force field name : " << fhname << "\n"; + } else { + std::cerr << "Invalid or non-existent forcefield setup.\n"; + return 1; } - ph.population()->state().xyz().initcells(); + ps.xyz().initcells(); auto&& ff = ph.interaction_potential(); double rmsstep = 0.5, rmsfactor = rmsfact0; if (nstages != 1) { @@ -391,10 +467,30 @@ auto main(int argc, char* argv[]) -> int sr::copy(g, out.begin()); }; auto t2 = std::chrono::steady_clock::now(); + + if (not badcrds.empty()) { + auto allcrds = crds.urls(); + decltype(badcrds) goodcrds; + for (auto&& url : allcrds) { + auto idx = crds.index_of(url); + auto crd = crds.props(idx); + if (crd.mutability() == gc::Mutability::free and (not prf::contains(badcrds, url))) + goodcrds.push_back(url); + } + + p->reconstruct(); + auto cyccount = mccycperminim; + auto emin = MC_minimize(p, sp, *ff, *ff, T, + cyccount, goodcrds); + std::cout << "Minimum energy after " << cyccount << " MC cycles = " << emin << "\n"; + } + { - std::cout << "Preliminary energy minimization with CG...\n"; + std::cout << "Preliminary energy minimization with CG... all coordinates.\n"; + dof = p->get_dof(); auto m = prf::minimizers::cg(F, GF, dof); p->set_dof(m.location); + p->reconstruct(); dof = m.location; } @@ -421,10 +517,10 @@ auto main(int argc, char* argv[]) -> int std::vector mincoords(dof.size(), 0.); std::copy(dof.begin(), dof.end(), mincoords.begin()); - auto emin = stageff(*p, sp); + auto ecur = stageff(*p, sp); + auto emin = ecur; std::cout << "For Stage " << i << ", starting value of pseudo-energy, E_physical + " << rmsfactor << " * RMSD = " << emin << "\n"; - auto ecur = emin; if (mccycperminim != 0) std::cout << "Running " << mccycperminim << " MC cycles looking for a lower " << "pseudo-energy than the starting value " << emin << "...\n"; @@ -434,7 +530,8 @@ auto main(int argc, char* argv[]) -> int auto gen = prf::RNGCache(prf::RandomNumberHandler::generator()); prf::MH::MCSweep mc(T.beta(), ndof, gen, 1UL); p->state().xyz().initcells(); - mc.advance( + + /*auto [p_after, e_after] =*/mc.advance( *p, sp, upd, stagedelta, stageff, [&](prf::ConformationChange& cc, prf::StateProperties& sp1, prf::StateProperties& sp2) { ff->apply_move(cc, sp1, sp2); @@ -446,15 +543,27 @@ auto main(int argc, char* argv[]) -> int }, { ecur }); enforce_PBC(*p); + + // std::cout << "Energy returned from advance() = " << e_after << "\n"; + // std::cout << "Energy using the returned population = " << (*ff)(p_after, sp) << "\n"; + // copy_state(p_after, *p); + // std::cout << "Energy after copying to p = " << (*ff)(*p, sp) << "\n"; p->reconstruct(); - auto newe = stageff(*p, sp); + ecur = stageff(*p, sp); + // std::cout << "Reevaluated energy after call to enforce_PBC and reconstruct = " << ecur << "\n"; - if (newe < emin) { + if (ecur < emin) { p->get_dof(mincoords); - emin = newe; + emin = ecur; blog(31) << "New minimum energy " << emin << "\n"; } - std::cout << "Finished round " << icyc << " of MC with energy " << newe << "\n"; + if (vbose) { + auto newr = ormsd(*p, sp); + std::cout << "Finished MC cycle " << icyc << " out of " << mccycperminim + << " with energy = " + << ecur << ", rmsd = " << newr << "\n"; + // std::getchar(); + } } if (mccycperminim != 0) { @@ -463,7 +572,7 @@ auto main(int argc, char* argv[]) -> int std::copy(mincoords.begin(), mincoords.end(), dof.begin()); p->set_dof(dof); p->reconstruct(); - p->state().xyz().initcells(); + ps.xyz().initcells(); emin = stageff(*p, sp); std::cout << "Re-evaluated pseudo-energy for chosen structure = " << emin << "\n"; } @@ -472,26 +581,24 @@ auto main(int argc, char* argv[]) -> int p->set_dof(m.location); p->reconstruct(); - p->state().xyz().initcells(); + ps.xyz().initcells(); dof = m.location; auto ephys = (*ff)(*p, sp); auto rm = ormsd(*p, sp); - std::cout << "After CG minimization, E_physical = " << ephys << ", RMSD = " << rm - << ", pseudo-energy E_physical + " << rmsfactor << " * RMSD = " << m.value << "\n"; + std::cout << FMT("After CG minimization, E_physical = {}, RMSD = {}, pseudo-energy E_physical + {} * RMSD = {}", + ephys, rm, rmsfactor, m.value); if (prog->switchGiven("store_intermediate_snapshots")) { auto alignment = rmsd.get_alignment(p->state()); p->transform(alignment.transformation); p->reconstruct(); - p->state().xyz().initcells(); + ps.xyz().initcells(); std::cout << "Writing intermediate output \"" << erminfile << "\n"; - write_xml_structure(p, erminfile, "Intermediate from "s + pdbinput, { ephys }); - write_xml_structure(p, "snapshot_"s + std::to_string(i) + ".xml"s, "Intermediate snapshot from "s + pdbinput, { ephys }); + write_xml_structure(p, erminfile, FMT("Intermediate from {}", pdbinput), { ephys }); + write_xml_structure(p, FMT("snapshot_{}.xml", i), FMT("Intermediate snapshot from {}", pdbinput), { ephys }); if (prog->optionGiven("pdb")) { - std::cout << "Saving PDB of intermediate structure as " - << "snapshot_"s + std::to_string(i) + ".pdb"s - << "\n"; - write_pdb_structure(p, "snapshot_"s + std::to_string(i) + ".pdb"s, { ephys }); + std::cout << FMT("Saving PDB of intermediate structure as snapshot_{}.pdb\n", i); + write_pdb_structure(p, FMT("snapshot_{}.pdb", i), { ephys }); } } @@ -507,7 +614,7 @@ auto main(int argc, char* argv[]) -> int ormsd(*p, sp); std::cout << "Writing regularized output \"" << erminfile << "\"\n"; - write_xml_structure(p, erminfile, "Regularized from "s + pdbinput, { ephys }); + write_xml_structure(p, erminfile, FMT("Regularized from {}", pdbinput), { ephys }); if (prog->optionGiven("pdb")) { std::cout << "Saving PDB of regularized structure as " << prog->option("pdb") << "\n"; write_pdb_structure(p, prog->option("pdb"), { ephys }); @@ -517,10 +624,7 @@ auto main(int argc, char* argv[]) -> int std::cout << "\nTime taken for energy minimization rounds: " << prf::to_string(dt) << ".\n"; std::cout << "Properties of the regularized state ...\n"; for (prf::StateProperties::ObsIndexType i {}; i.get() < sp.n_observables(); ++i) { - std::cout << sp.name(i) - << " = " << std::setw(16) - << std::fixed << std::setprecision(6) - << sp[i] << "\n"; + std::cout << FMT("{} = {: <16.6}\n", sp.name(i), sp[i]); } prf::flush(); } -- GitLab