Ignore:
Timestamp:
Jun 29, 2012, 8:01:38 AM (13 years ago)
Author:
Frederik Heber <heber@…>
Branches:
Action_Thermostats, Add_AtomRandomPerturbation, Add_FitFragmentPartialChargesAction, Add_RotateAroundBondAction, Add_SelectAtomByNameAction, Added_ParseSaveFragmentResults, AddingActions_SaveParseParticleParameters, Adding_Graph_to_ChangeBondActions, Adding_MD_integration_tests, Adding_ParticleName_to_Atom, Adding_StructOpt_integration_tests, AtomFragments, Automaking_mpqc_open, AutomationFragmentation_failures, Candidate_v1.5.4, Candidate_v1.6.0, Candidate_v1.6.1, ChangeBugEmailaddress, ChangingTestPorts, ChemicalSpaceEvaluator, CombiningParticlePotentialParsing, Combining_Subpackages, Debian_Package_split, Debian_package_split_molecuildergui_only, Disabling_MemDebug, Docu_Python_wait, EmpiricalPotential_contain_HomologyGraph, EmpiricalPotential_contain_HomologyGraph_documentation, Enable_parallel_make_install, Enhance_userguide, Enhanced_StructuralOptimization, Enhanced_StructuralOptimization_continued, Example_ManyWaysToTranslateAtom, Exclude_Hydrogens_annealWithBondGraph, FitPartialCharges_GlobalError, Fix_BoundInBox_CenterInBox_MoleculeActions, Fix_ChargeSampling_PBC, Fix_ChronosMutex, Fix_FitPartialCharges, Fix_FitPotential_needs_atomicnumbers, Fix_ForceAnnealing, Fix_IndependentFragmentGrids, Fix_ParseParticles, Fix_ParseParticles_split_forward_backward_Actions, Fix_PopActions, Fix_QtFragmentList_sorted_selection, Fix_Restrictedkeyset_FragmentMolecule, Fix_StatusMsg, Fix_StepWorldTime_single_argument, Fix_Verbose_Codepatterns, Fix_fitting_potentials, Fixes, ForceAnnealing_goodresults, ForceAnnealing_oldresults, ForceAnnealing_tocheck, ForceAnnealing_with_BondGraph, ForceAnnealing_with_BondGraph_continued, ForceAnnealing_with_BondGraph_continued_betteresults, ForceAnnealing_with_BondGraph_contraction-expansion, FragmentAction_writes_AtomFragments, FragmentMolecule_checks_bonddegrees, GeometryObjects, Gui_Fixes, Gui_displays_atomic_force_velocity, ImplicitCharges, IndependentFragmentGrids, IndependentFragmentGrids_IndividualZeroInstances, IndependentFragmentGrids_IntegrationTest, IndependentFragmentGrids_Sole_NN_Calculation, JobMarket_RobustOnKillsSegFaults, JobMarket_StableWorkerPool, JobMarket_unresolvable_hostname_fix, MoreRobust_FragmentAutomation, ODR_violation_mpqc_open, PartialCharges_OrthogonalSummation, PdbParser_setsAtomName, PythonUI_with_named_parameters, QtGui_reactivate_TimeChanged_changes, Recreated_GuiChecks, Rewrite_FitPartialCharges, RotateToPrincipalAxisSystem_UndoRedo, SaturateAtoms_findBestMatching, SaturateAtoms_singleDegree, StoppableMakroAction, Subpackage_CodePatterns, Subpackage_JobMarket, Subpackage_LinearAlgebra, Subpackage_levmar, Subpackage_mpqc_open, Subpackage_vmg, Switchable_LogView, ThirdParty_MPQC_rebuilt_buildsystem, TrajectoryDependenant_MaxOrder, TremoloParser_IncreasedPrecision, TremoloParser_MultipleTimesteps, TremoloParser_setsAtomName, Ubuntu_1604_changes, stable
Children:
f98c8e
Parents:
559385
git-author:
Frederik Heber <heber@…> (03/04/12 16:18:32)
git-committer:
Frederik Heber <heber@…> (06/29/12 08:01:38)
Message:

FragmentQueue can now resubmit jobs.

  • we keep an internal list of jobs currently being worked on. If the worker returns with a failure, they can be resubmit to the queue.
  • there are Max_Attempts (currently set to 1) tries to resubmit.
  • added unit test function on this.
  • added regression test Fragmentation/Automation resubmitjobs.
File:
1 edited

Legend:

Unmodified
Added
Removed
  • src/Fragmentation/Automation/FragmentQueue.cpp

    r559385 rfe95b7  
    2323
    2424#include "CodePatterns/Assert.hpp"
     25#include "CodePatterns/Log.hpp"
    2526
    2627FragmentResult::ptr FragmentQueue::NoResult( new FragmentResult(-1) );
    2728FragmentResult::ptr FragmentQueue::NoResultQueued( new FragmentResult(-2) );
    2829FragmentResult::ptr FragmentQueue::ResultDelivered( new FragmentResult(-3) );
     30size_t FragmentQueue::Max_Attempts = (size_t)1;
    2931
    3032/** Constructor for class FragmentQueue.
     
    9395      "FragmentQueue::popJob() - there are no jobs on the queue.");
    9496  FragmentJob::ptr job = jobs.front();
     97#ifndef NDEBUG
     98  std::pair< BackupMap::iterator, bool> inserter =
     99#endif
     100  backup.insert( std::make_pair( job->getId(), job ));
     101  ASSERT (inserter.second,
     102      "FragmentQueue::popJob() - job "+toString(job->getId())+
     103      " is already in the backup.");
    95104  ResultMap::iterator iter = results.find(job->getId());
    96105  ASSERT(iter != results.end(),
     
    197206void FragmentQueue::pushResult(FragmentResult::ptr &_result)
    198207{
     208  const JobId_t id = _result->getId();
    199209  /// check for presence
    200   ResultMap::iterator iter = results.find(_result->getId());
     210  ResultMap::iterator iter = results.find(id);
    201211  ASSERT(iter != results.end(),
    202       "FragmentQueue::pushResult() - job "+toString(_result->getId())+" is not known to us.");
     212      "FragmentQueue::pushResult() - job "+toString(id)+" is not known to us.");
    203213  ASSERT(*iter->second == *NoResultQueued,
    204       "FragmentQueue::pushResult() - is not waiting for the result of job "+toString(_result->getId())+".");
    205   /// and overwrite NoResult in found entry
    206   iter->second = _result;
    207 }
    208 
     214      "FragmentQueue::pushResult() - is not waiting for the result of job "+toString(id)+".");
     215  // check whether this is a resubmitted job
     216  AttemptsMap::iterator attemptiter = attempts.find(id);
     217  // check whether succeeded or (finally) failed
     218  if ((_result->exitflag == 0) || ((attemptiter != attempts.end()) && (attemptiter->second >= Max_Attempts))) {
     219    // give notice if it is resubmitted job
     220    if (attemptiter != attempts.end()) {
     221      if (attemptiter->second >= Max_Attempts)
     222        ELOG(1, "Job #" << id << " failed on " << Max_Attempts << "th attempt for the last time.");
     223      else
     224        LOG(1, "INFO: Job #" << id << " succeeded on " << attemptiter->second << "th attempt.");
     225    }
     226    // remove in attempts
     227    if (attemptiter != attempts.end())
     228      attempts.erase(attemptiter);
     229    // remove in backup map
     230    BackupMap::iterator backupiter = backup.find(id);
     231    ASSERT( backupiter != backup.end(),
     232        "FragmentQueue::pushResult() - cannot find job "+toString(id)
     233        +" in backup.");
     234    backup.erase(backupiter);
     235    /// and overwrite NoResult in found entry
     236    iter->second = _result;
     237  } else {
     238    LOG(1, "Job " << id << " failed, resubmitting.");
     239    // increase attempts
     240    if (attemptiter != attempts.end())
     241      ++(attemptiter->second);
     242    else
     243      attempts.insert( std::make_pair(id, (size_t)1) );
     244    // resubmit job
     245    resubmitJob(id);
     246  }
     247}
     248
     249/** Resubmit a job which a worker failed to calculate.
     250 *
     251 * @param jobid id of the failed job
     252 */
     253void FragmentQueue::resubmitJob(const JobId_t jobid)
     254{
     255  BackupMap::iterator iter = backup.find(jobid);
     256  ASSERT( iter != backup.end(),
     257      "FragmentQueue::resubmitJob() - job id "+toString(jobid)
     258      +" not stored in backup.");
     259  if (iter != backup.end()) {
     260    // remove result
     261    ResultMap::iterator resiter = results.find(jobid);
     262    ASSERT( resiter != results.end(),
     263        "FragmentQueue::resubmitJob() - resubmitting job "+toString(jobid)
     264        +" for which no result is present.");
     265    results.erase(resiter);
     266    pushJob(iter->second);
     267    backup.erase(iter);
     268  }
     269}
     270
Note: See TracChangeset for help on using the changeset viewer.