From 5d2b51ee721e818724123b937dfa9c5305c49324 Mon Sep 17 00:00:00 2001
From: Sandipan Mohanty <s.mohanty@fz-juelich.de>
Date: Tue, 29 Apr 2025 14:45:47 +0200
Subject: [PATCH] Add an MC minimization stage in regularize for bad crds

When there are missing atoms in a PDB file, we end up with many
undetermined coordinates in the population. RMSD minimization
can not help us get good values for these. When a ProFASi simulation
starts with such a structure however, we have to fill in all atoms.
The ones originally missing in the PDB will have their XYZ coordinates
calculated based on nominal default values for those coordinates,
which may put them in overlapping positions with other atoms. This
leads to very high energies when we start our energy minimization
rounds (in billions of k_B T). When the energy is that high, any
MC update which resolves the clash, no matter how detrimental to
other energy terms and the RMSD, will be accepted. In those cases,
"regularize" fails to produce any reasonable structure. Here we
try one round of MC based energy minimization using only the bad,
or indeterminate, coordinates. If the clash can be resolved like
this, we will have a much better starting position for energy
minimization at the same RMSD value as obtained in the first stage
of regularization: the pure RMSD minimization. As it turns out,
the result is a bit mixed. In some structures it does wonders, in
others, it has no effect while consuming some more time.
---
 app/regularize.cc | 184 ++++++++++++++++++++++++++++++++++++----------
 1 file changed, 144 insertions(+), 40 deletions(-)

diff --git a/app/regularize.cc b/app/regularize.cc
index 9abd64e3..ed3fe881 100644
--- a/app/regularize.cc
+++ b/app/regularize.cc
@@ -46,7 +46,8 @@ void register_internal_plugins()
     register_molmc_plugins();
 }
 
-auto minimize_rmsd(Population* p, const SelectionFromFile& ifilesel, double goodenoughrmsd = 0.) -> double
+auto minimize_rmsd(Population* p, const SelectionFromFile& ifilesel,
+    const std::vector<CoordinateURL>& badcrds, double goodenoughrmsd = 0.) -> double
 {
     if (p == nullptr)
         throw prf::Exception { "Null population given to minimize_rmsd", __func__ };
@@ -55,6 +56,9 @@ auto minimize_rmsd(Population* p, const SelectionFromFile& ifilesel, double good
     PDBReader pdbinput { ifilesel.filename() };
     if (pdbinput.read_matrix() == 0)
         throw prf::Exception { FMT("Read error while parsing PDB file '{}'", ifilesel.filename()), __func__ };
+
+    prf::fix_coordinates(*p, badcrds);
+
     for (auto ich = 0UL; ich < pc->NumberOfChains(); ++ich) {
         StateProperties sp;
         auto populationside = FMT("$::{:c}", 'A' + ich);
@@ -64,8 +68,7 @@ auto minimize_rmsd(Population* p, const SelectionFromFile& ifilesel, double good
         rmsd.set_struc2(populationside);
         rmsd.filter("+HV+@H");
         rmsd.init(*p, sp);
-        std::cout << "Starting RMSD (all atoms, chain " << ich << ") = "
-                  << rmsd(*p, sp) << "\n";
+        std::cout << FMT("Starting RMSD (all atoms, chain {}) = {}\n", ich, rmsd(*p, sp));
         auto&& crds = p->state().generalizedCoordinates();
         std::vector<prf::CoordinateURL> provisionally_fixed;
         for (prf::gc::Storage::size_type idof {}; idof != crds.size(); ++idof) {
@@ -124,10 +127,13 @@ auto minimize_rmsd(Population* p, const SelectionFromFile& ifilesel, double good
         prf::enforce_PBC(*p);
         p->reconstruct();
         auto alignment = rmsd.get_alignment(p->state());
-        std::cout << "Alignment translation = " << alignment.transformation.translation() << "\n";
+        auto trv = alignment.transformation.translation();
+        std::cout << FMT("chain {}, alignment translation = ({}, {}, {}\n",
+            ich, trv[0], trv[1], trv[2]);
         p->transform(alignment.transformation);
         for (auto& url : provisionally_fixed) {
-            crds.unfix(url);
+            if (not prf::contains(badcrds, url))
+                crds.unfix(url);
         }
         crds.rehash();
         ans = mcg.value;
@@ -184,14 +190,63 @@ auto minimize_rmsd(Population* p, const SelectionFromFile& ifilesel, double good
         prf::enforce_PBC(*p);
         p->reconstruct();
         auto alignment = rmsd.get_alignment(p->state());
-        std::cout << "Alignment translation = " << alignment.transformation.translation() << "\n";
+        // std::cout << "Alignment translation = " << alignment.transformation.translation() << "\n";
         p->transform(alignment.transformation);
         p->reconstruct();
         ans = mcg.value;
     }
+    prf::unfix_coordinates(*p, badcrds);
+    p->reconstruct();
     return ans;
 }
 
+template <class F, class Delta>
+auto MC_minimize(Population* p, StateProperties& sp,
+    F&& f, Delta&& delta, Temperature T, std::size_t ncyc,
+    std::span<const CoordinateURL> keepfixed)
+{
+    prf::fix_coordinates(*p, keepfixed);
+    MolMCUpdater upd;
+    // upd.parseCommands(cmds);
+    upd.init(*p);
+    upd.set_n_temperatures(1UL);
+    upd.set_T_index(0UL);
+    size_t ndof = upd.sweep_length();
+    auto dofs = p->get_dof();
+    auto mincoords = dofs;
+    auto gen = prf::RNGCache(prf::RandomNumberHandler::generator());
+    prf::MH::MCSweep mc(T.beta(), ndof, gen, 1UL);
+    p->state().xyz().initcells();
+    auto ecur = f(*p, sp);
+    auto emin = ecur;
+    std::cout << "MC minimization: " << ncyc << " cycles over " << ndof << " degrees of freedom\n";
+    for (int icyc = 0; icyc < ncyc; ++icyc) {
+
+        mc.advance(
+            *p, sp, upd, delta, f,
+            [&](prf::ConformationChange& cc, prf::StateProperties& sp1, prf::StateProperties& sp2) {
+                f.apply_move(cc, sp1, sp2);
+            },
+            [&](prf::ConformationChange& cc, prf::StateProperties& sp1, prf::StateProperties& sp2) {
+                f.undo_move(cc, sp1, sp2);
+            },
+            { ecur });
+        enforce_PBC(*p);
+        p->reconstruct();
+        auto newe = f(*p, sp);
+
+        if (newe < emin) {
+            p->get_dof(mincoords);
+            emin = newe;
+            std::cout << "New minimum energy " << emin << "\n";
+        }
+        std::cout << "Finished round " << icyc << " of MC with energy " << newe << "\n";
+    }
+    p->set_dof(mincoords);
+    p->reconstruct();
+    prf::unfix_coordinates(*p, keepfixed);
+    return emin;
+}
 auto main(int argc, char* argv[]) -> int
 {
     using namespace std::string_literals;
@@ -217,6 +272,8 @@ auto main(int argc, char* argv[]) -> int
     prog->registerOption({ "settings_file", "sf", 1, "(Use a settings file to adjust parameters.)" });
     prog->registerOption({ "initial_structure", "start", 1,
         "(where to start, if different from the one being regularized)" });
+    prog->registerOption({ "save_preminimization_structure", "pre", 1,
+        "(File name to save pre-minimization structure after reading PDB coordinates)" });
     prog->registerOption({ "num_e_minim_cycles", "nec", 1,
         "(number of minimization cycles of E+k*RMSD)" });
     prog->registerOption({ "initial_RMSD_factor", "f0", 1, "(initial factor k in E+k*RMSD)" });
@@ -232,6 +289,7 @@ auto main(int argc, char* argv[]) -> int
         "(Store a snapshot \"snapshot_xyz.xml\" when a new minimum energy is recorded.)" });
     prog->registerOption({ "threads_per_task", "threads", 1,
         "(Number of threads to use for energy (value, gradient) evaluations)" });
+    prog->registerSwitch({ "verbose_output", "v", false, "(Verbose logs on standard output)" });
     InstructionList cmds;
     cmds += InstructionString("Pivot kappa 0");
     cmds += InstructionString("Rot kappa 20");
@@ -246,6 +304,8 @@ auto main(int argc, char* argv[]) -> int
         cmds += settings.instructions();
     }
     prf::Logger::setVerbosity(cmds.get_or("log_level", 20UL));
+    auto vbose = cmds.get_or("verbose_output", false);
+
     Logger blog(10);
     auto nstages = cmds.get_or(prog->longName("nec"), 9);
     auto rmsfact0 = cmds.get_or(prog->longName("f0"), 256.0);
@@ -273,17 +333,25 @@ auto main(int argc, char* argv[]) -> int
     tbb::global_control tbbctl(tbb::global_control::max_allowed_parallelism, nthr);
     prf::parseCommands(ph, populationsetup);
     auto&& p = ph.population();
+    auto&& ps = p->state();
+    auto&& crds = ps.generalizedCoordinates();
 
-    p->state().generalizedCoordinates().rehash();
+    crds.rehash();
     p->reconstruct();
-    {
-        std::ofstream initstruct { "initstruct.xml" };
+    auto badcrds = p->undetermined_after_import();
+    if (not badcrds.empty()) {
+        std::cout << "The following coordinates could not be determined while reading the PDB structure.\n";
+        for (int counter = 0; const auto& url : badcrds) {
+            std::cout << FMT("{: >25}", url.str()) << ((++counter) % 3 == 0 ? "\n" : " ");
+        }
+        std::cout << "\n";
+    }
+
+    if (prog->optionGiven("save_preminimization_structure")) {
+        std::ofstream initstruct { cmds.get_or("save_preminimization_structure",
+            "initstruct.xml") };
         p->Write_XML(initstruct);
     }
-    upd.parseCommands(cmds);
-    upd.init(*p);
-    upd.set_n_temperatures(1UL);
-    upd.set_T_index(0UL);
 
     StateProperties sp;
     ProteinRMSD rmsd;
@@ -295,7 +363,7 @@ auto main(int argc, char* argv[]) -> int
 
     auto t0 = std::chrono::steady_clock::now();
 
-    auto rmsdmin = minimize_rmsd(p, ifilesels, goodenoughrmsd.value_or(0.));
+    auto rmsdmin = minimize_rmsd(p, ifilesels, badcrds, goodenoughrmsd.value_or(0.));
     std::mutex rmx, grmx;
     JacobianCarTor J(*p);
 
@@ -320,12 +388,17 @@ auto main(int argc, char* argv[]) -> int
 
     if (prog->switchGiven("g"))
         return 0;
+
+    upd.parseCommands(cmds);
+    upd.init(*p);
+    upd.set_n_temperatures(1UL);
+    upd.set_T_index(0UL);
     blog << "Initialising force field...\n";
     if (auto&& oip = ph.interaction_potential(); oip.has_value()) {
         auto&& ip = *oip;
         ip.init(*ph.population(), sp);
         auto ffmaxcutoff = ip.cutoff();
-        auto&& xyz = ph.population()->state().xyz();
+        auto&& xyz = ps.xyz();
         if (ffmaxcutoff) {
             std::cout << "Forcefield defines a maximum cutoff of "
                       << *ffmaxcutoff << " Angstroms\n";
@@ -338,8 +411,11 @@ auto main(int argc, char* argv[]) -> int
         }
         auto fhname = ip.name();
         std::cout << "Force field name : " << fhname << "\n";
+    } else {
+        std::cerr << "Invalid or non-existent forcefield setup.\n";
+        return 1;
     }
-    ph.population()->state().xyz().initcells();
+    ps.xyz().initcells();
     auto&& ff = ph.interaction_potential();
     double rmsstep = 0.5, rmsfactor = rmsfact0;
     if (nstages != 1) {
@@ -391,10 +467,30 @@ auto main(int argc, char* argv[]) -> int
         sr::copy(g, out.begin());
     };
     auto t2 = std::chrono::steady_clock::now();
+
+    if (not badcrds.empty()) {
+        auto allcrds = crds.urls();
+        decltype(badcrds) goodcrds;
+        for (auto&& url : allcrds) {
+            auto idx = crds.index_of(url);
+            auto crd = crds.props(idx);
+            if (crd.mutability() == gc::Mutability::free and (not prf::contains(badcrds, url)))
+                goodcrds.push_back(url);
+        }
+
+        p->reconstruct();
+        auto cyccount = mccycperminim;
+        auto emin = MC_minimize(p, sp, *ff, *ff, T,
+            cyccount, goodcrds);
+        std::cout << "Minimum energy after " << cyccount << " MC cycles = " << emin << "\n";
+    }
+
     {
-        std::cout << "Preliminary energy minimization with CG...\n";
+        std::cout << "Preliminary energy minimization with CG... all coordinates.\n";
+        dof = p->get_dof();
         auto m = prf::minimizers::cg(F, GF, dof);
         p->set_dof(m.location);
+        p->reconstruct();
         dof = m.location;
     }
 
@@ -421,10 +517,10 @@ auto main(int argc, char* argv[]) -> int
 
         std::vector mincoords(dof.size(), 0.);
         std::copy(dof.begin(), dof.end(), mincoords.begin());
-        auto emin = stageff(*p, sp);
+        auto ecur = stageff(*p, sp);
+        auto emin = ecur;
         std::cout << "For Stage " << i << ", starting value of pseudo-energy, E_physical + "
                   << rmsfactor << " * RMSD = " << emin << "\n";
-        auto ecur = emin;
         if (mccycperminim != 0)
             std::cout << "Running " << mccycperminim << " MC cycles looking for a lower "
                       << "pseudo-energy than the starting value " << emin << "...\n";
@@ -434,7 +530,8 @@ auto main(int argc, char* argv[]) -> int
             auto gen = prf::RNGCache(prf::RandomNumberHandler::generator());
             prf::MH::MCSweep mc(T.beta(), ndof, gen, 1UL);
             p->state().xyz().initcells();
-            mc.advance(
+
+            /*auto [p_after, e_after] =*/mc.advance(
                 *p, sp, upd, stagedelta, stageff,
                 [&](prf::ConformationChange& cc, prf::StateProperties& sp1, prf::StateProperties& sp2) {
                     ff->apply_move(cc, sp1, sp2);
@@ -446,15 +543,27 @@ auto main(int argc, char* argv[]) -> int
                 },
                 { ecur });
             enforce_PBC(*p);
+
+            // std::cout << "Energy returned from advance() = " << e_after << "\n";
+            // std::cout << "Energy using the returned population = " << (*ff)(p_after, sp) << "\n";
+            // copy_state(p_after, *p);
+            // std::cout << "Energy after copying to p = " << (*ff)(*p, sp) << "\n";
             p->reconstruct();
-            auto newe = stageff(*p, sp);
+            ecur = stageff(*p, sp);
+            // std::cout << "Reevaluated energy after call to enforce_PBC and reconstruct = " << ecur << "\n";
 
-            if (newe < emin) {
+            if (ecur < emin) {
                 p->get_dof(mincoords);
-                emin = newe;
+                emin = ecur;
                 blog(31) << "New minimum energy " << emin << "\n";
             }
-            std::cout << "Finished round " << icyc << " of MC with energy " << newe << "\n";
+            if (vbose) {
+                auto newr = ormsd(*p, sp);
+                std::cout << "Finished MC cycle " << icyc << " out of " << mccycperminim
+                          << " with energy = "
+                          << ecur << ", rmsd = " << newr << "\n";
+                // std::getchar();
+            }
         }
 
         if (mccycperminim != 0) {
@@ -463,7 +572,7 @@ auto main(int argc, char* argv[]) -> int
             std::copy(mincoords.begin(), mincoords.end(), dof.begin());
             p->set_dof(dof);
             p->reconstruct();
-            p->state().xyz().initcells();
+            ps.xyz().initcells();
             emin = stageff(*p, sp);
             std::cout << "Re-evaluated pseudo-energy for chosen structure =  " << emin << "\n";
         }
@@ -472,26 +581,24 @@ auto main(int argc, char* argv[]) -> int
 
         p->set_dof(m.location);
         p->reconstruct();
-        p->state().xyz().initcells();
+        ps.xyz().initcells();
         dof = m.location;
         auto ephys = (*ff)(*p, sp);
         auto rm = ormsd(*p, sp);
-        std::cout << "After CG minimization, E_physical = " << ephys << ", RMSD = " << rm
-                  << ", pseudo-energy E_physical + " << rmsfactor << " * RMSD = " << m.value << "\n";
+        std::cout << FMT("After CG minimization, E_physical = {}, RMSD = {}, pseudo-energy E_physical + {} * RMSD = {}",
+            ephys, rm, rmsfactor, m.value);
 
         if (prog->switchGiven("store_intermediate_snapshots")) {
             auto alignment = rmsd.get_alignment(p->state());
             p->transform(alignment.transformation);
             p->reconstruct();
-            p->state().xyz().initcells();
+            ps.xyz().initcells();
             std::cout << "Writing intermediate output \"" << erminfile << "\n";
-            write_xml_structure(p, erminfile, "Intermediate from "s + pdbinput, { ephys });
-            write_xml_structure(p, "snapshot_"s + std::to_string(i) + ".xml"s, "Intermediate snapshot from "s + pdbinput, { ephys });
+            write_xml_structure(p, erminfile, FMT("Intermediate from {}", pdbinput), { ephys });
+            write_xml_structure(p, FMT("snapshot_{}.xml", i), FMT("Intermediate snapshot from {}", pdbinput), { ephys });
             if (prog->optionGiven("pdb")) {
-                std::cout << "Saving PDB of intermediate structure as "
-                          << "snapshot_"s + std::to_string(i) + ".pdb"s
-                          << "\n";
-                write_pdb_structure(p, "snapshot_"s + std::to_string(i) + ".pdb"s, { ephys });
+                std::cout << FMT("Saving PDB of intermediate structure as snapshot_{}.pdb\n", i);
+                write_pdb_structure(p, FMT("snapshot_{}.pdb", i), { ephys });
             }
         }
 
@@ -507,7 +614,7 @@ auto main(int argc, char* argv[]) -> int
     ormsd(*p, sp);
 
     std::cout << "Writing regularized output \"" << erminfile << "\"\n";
-    write_xml_structure(p, erminfile, "Regularized from "s + pdbinput, { ephys });
+    write_xml_structure(p, erminfile, FMT("Regularized from {}", pdbinput), { ephys });
     if (prog->optionGiven("pdb")) {
         std::cout << "Saving PDB of regularized structure as " << prog->option("pdb") << "\n";
         write_pdb_structure(p, prog->option("pdb"), { ephys });
@@ -517,10 +624,7 @@ auto main(int argc, char* argv[]) -> int
     std::cout << "\nTime taken for energy minimization rounds: " << prf::to_string(dt) << ".\n";
     std::cout << "Properties of the regularized state ...\n";
     for (prf::StateProperties::ObsIndexType i {}; i.get() < sp.n_observables(); ++i) {
-        std::cout << sp.name(i)
-                  << " = " << std::setw(16)
-                  << std::fixed << std::setprecision(6)
-                  << sp[i] << "\n";
+        std::cout << FMT("{} = {: <16.6}\n", sp.name(i), sp[i]);
     }
     prf::flush();
 }
-- 
GitLab