ATLAS Offline Software
SlurmDriver.cxx
Go to the documentation of this file.
1 /*
2  Copyright (C) 2002-2019 CERN for the benefit of the ATLAS collaboration
3 */
4 
6 
7 
8 //
9 // includes
10 //
11 
12 #include <EventLoop/SlurmDriver.h>
13 
15 #include <EventLoop/BatchJob.h>
16 #include <EventLoop/Job.h>
17 #include <EventLoop/ManagerData.h>
18 #include <EventLoop/MessageCheck.h>
19 #include <RootCoreUtils/Assert.h>
20 #include <RootCoreUtils/ThrowMsg.h>
21 #include <TSystem.h>
22 #include <fstream>
23 #include <memory>
24 #include <sstream>
25 
26 //
27 // method implementations
28 //
29 
31 
32 namespace EL
33 {
34  //****************************************************
35  void SlurmDriver :: testInvariant () const
36  {
37  RCU_INVARIANT (this != 0);
38  }
39  //****************************************************
41  {
42  m_b_job_name = false;
43  m_b_account = false;
44  m_b_run_time = false;
45 
46  RCU_NEW_INVARIANT (this);
47  }
48  //****************************************************
50  doManagerStep (Detail::ManagerData& data) const
51  {
52  RCU_READ_INVARIANT (this);
53  using namespace msgEventLoop;
55  switch (data.step)
56  {
58  {
59  data.batchInit = "export PATH LD_LIBRARY_PATH PYTHONPATH";
60  }
61  break;
62 
65  {
66  auto all_set = m_b_job_name && m_b_account && m_b_run_time;
67  if (!all_set)
68  {
69  ANA_MSG_INFO ("Job Name" << m_job_name);
70  ANA_MSG_INFO ("Account " << m_account);
71  ANA_MSG_INFO ("Run Time " << m_run_time);
72 
73  ANA_MSG_ERROR("All parameters need to be set before job can be submitted");
74  return ::StatusCode::FAILURE;
75  }
76 
77  RCU_READ_INVARIANT (this);
78 
79  if (data.resubmit)
80  RCU_THROW_MSG ("resubmission not supported for this driver");
81 
82  assert (!data.batchJobIndices.empty());
83  assert (data.batchJobIndices.back() + 1 == data.batchJobIndices.size());
84  const std::size_t njob = data.batchJobIndices.size();
85 
86  if(!data.options.castBool(Job::optBatchSharedFileSystem,true))
87  {
88  int status=gSystem->CopyFile("RootCore.par",(data.submitDir+"/submit/RootCore.par").c_str());
89  if(status != 0)
90  RCU_THROW_MSG ("failed to copy RootCore.par");
91  }
92 
93  {
94  std::ofstream file ((data.submitDir + "/submit/submit").c_str());
95 
96  file << "#!/bin/bash \n";
97  file << "\n";
98  file << "#SBATCH --job-name=" << m_job_name << "\n";
99  file << "#SBATCH --output=slurm-%j.out\n";
100  file << "#SBATCH --error=slurm-%j.err\n";
101  file << "#SBATCH --account=" << m_account << "\n";
102  if(!m_partition .empty()) file << "#SBATCH --partition=" << m_partition << "\n";
103  file << "#SBATCH --time=" << m_run_time << "\n";
104  if(!m_memory .empty()) file << "#SBATCH --mem=" << m_memory << "\n";
105  if(!m_constraint.empty()) file << "#SBATCH --constraint=" << m_constraint << "\n";
106  file << "\n";
107  file << data.options.castString(Job::optBatchSlurmExtraConfigLines) << "\n";
108  file << "\n";
109  //note: no "\n" at the of this string since this goes as pre-command to the execution of the next line
110  file << data.options.castString(Job::optBatchSlurmWrapperExec);
111  file << "./run ${SLURM_ARRAY_TASK_ID}\n";
112  }
113 
114  {
115  std::ostringstream cmd;
116  cmd << "cd " << data.submitDir << "/submit && sbatch --array=0-" << njob-1 << " " << data.options.castString (Job::optSubmitFlags) << " submit";
117  if (gSystem->Exec (cmd.str().c_str()) != 0)
118  RCU_THROW_MSG (("failed to execute: " + cmd.str()).c_str());
119  }
120  data.submitted = true;
121  }
122  break;
123 
124  default:
125  break;
126  }
127  return ::StatusCode::SUCCESS;
128  }
129 
130  //****************************************************
131  void SlurmDriver :: SetJobName(std::string job_name)
132  {
133  m_b_job_name = true;
134  m_job_name = job_name;
135  }
136  void SlurmDriver :: SetAccount(std::string account)
137  {
138  m_b_account = true;
139  m_account = account;
140  }
141  void SlurmDriver :: SetPartition(std::string partition)
142  {
144  }
145  void SlurmDriver :: SetRunTime(std::string run_time)
146  {
147  m_b_run_time = true;
148  m_run_time = run_time;
149  }
150  void SlurmDriver :: SetMemory(std::string memory)
151  {
152  m_memory = memory;
153  }
154  void SlurmDriver :: SetConstrain(std::string constraint)
155  {
156  m_constraint = constraint;
157  }
158  //****************************************************
159 }
EL::SlurmDriver::m_b_account
bool m_b_account
Definition: SlurmDriver.h:56
data
char data[hepevt_bytes_allocation_ATLAS]
Definition: HepEvt.cxx:11
EL::SlurmDriver::m_run_time
std::string m_run_time
Definition: SlurmDriver.h:51
SlurmDriver.h
EL::Job::optBatchSharedFileSystem
static const std::string optBatchSharedFileSystem
description: batch-specific options rationale: these options are for configuring batch drivers
Definition: Job.h:513
EL::SlurmDriver::m_account
std::string m_account
Definition: SlurmDriver.h:49
EL::Detail::ManagerStep::batchScriptVar
@ batchScriptVar
create the variables needed for the batch-run script
EL::BatchDriver::doManagerStep
virtual ::StatusCode doManagerStep(Detail::ManagerData &data) const override
rerun_display.cmd
string cmd
Definition: rerun_display.py:67
Job.h
EL::SlurmDriver::SetJobName
void SetJobName(std::string job_name)
ANA_MSG_ERROR
#define ANA_MSG_ERROR(xmsg)
Macro printing error messages.
Definition: Control/AthToolSupport/AsgMessaging/AsgMessaging/MessageCheck.h:294
EL::SlurmDriver::m_b_job_name
bool m_b_job_name
Definition: SlurmDriver.h:55
ANA_CHECK
#define ANA_CHECK(EXP)
check whether the given expression was successful
Definition: Control/AthToolSupport/AsgMessaging/AsgMessaging/MessageCheck.h:324
Assert.h
MessageCheck.h
EL::SlurmDriver::doManagerStep
virtual ::StatusCode doManagerStep(Detail::ManagerData &data) const override
empty
bool empty(TH1 *h)
Definition: computils.cxx:295
EL::SlurmDriver::m_b_run_time
bool m_b_run_time
Definition: SlurmDriver.h:57
EL::SlurmDriver::m_partition
std::string m_partition
Definition: SlurmDriver.h:50
EL::Detail::ManagerStep::doResubmit
@ doResubmit
call the actual doResubmit method
python.utils.AtlRunQueryMemUtil.memory
def memory(since=0.0)
Definition: AtlRunQueryMemUtil.py:30
EL::SlurmDriver::m_constraint
std::string m_constraint
Definition: SlurmDriver.h:53
EL::SlurmDriver::m_job_name
std::string m_job_name
Definition: SlurmDriver.h:48
EL::StatusCode
::StatusCode StatusCode
StatusCode definition for legacy code.
Definition: PhysicsAnalysis/D3PDTools/EventLoop/EventLoop/StatusCode.h:22
ManagerData.h
EL::Job::optBatchSlurmWrapperExec
static const std::string optBatchSlurmWrapperExec
Append a command before the main executable is called. This is useful if you want to execute the comma...
Definition: Job.h:519
ANA_MSG_INFO
#define ANA_MSG_INFO(xmsg)
Macro printing info messages.
Definition: Control/AthToolSupport/AsgMessaging/AsgMessaging/MessageCheck.h:290
EL::SlurmDriver::SlurmDriver
SlurmDriver()
effects: standard default constructor guarantee: strong failures: low level errors I
file
TFile * file
Definition: tile_monitor.h:29
EL
This module defines the arguments passed from the BATCH driver to the BATCH worker.
Definition: AlgorithmWorkerData.h:24
EL::Job::optSubmitFlags
static const std::string optSubmitFlags
description: the name of the option for supplying extra submit parameters to batch systems rationale:...
Definition: Job.h:293
RCU_INVARIANT
#define RCU_INVARIANT(x)
Definition: Assert.h:201
EL::Job::optBatchSlurmExtraConfigLines
static const std::string optBatchSlurmExtraConfigLines
The content of this string will be executed in the job script on the worker node before the main exec...
Definition: Job.h:516
StatusCode.h
EL::SlurmDriver
a Driver for running on SLURM batch systems
Definition: SlurmDriver.h:23
ThrowMsg.h
BatchJob.h
EL::SlurmDriver::SetAccount
void SetAccount(std::string account)
EL::SlurmDriver::m_memory
std::string m_memory
Definition: SlurmDriver.h:52
EL::SlurmDriver::SetConstrain
void SetConstrain(std::string constraint)
EL::SlurmDriver::testInvariant
void testInvariant() const
effects: test the invariant of this object guarantee: no-fail
StateLessPT_NewConfig.partition
partition
Definition: StateLessPT_NewConfig.py:49
EL::SlurmDriver::SetPartition
void SetPartition(std::string partition)
EL::SlurmDriver::SetMemory
void SetMemory(std::string memory)
merge.status
status
Definition: merge.py:17
RCU_THROW_MSG
#define RCU_THROW_MSG(message)
Definition: PrintMsg.h:58
EL::SlurmDriver::SetRunTime
void SetRunTime(std::string run_time)
RCU_READ_INVARIANT
#define RCU_READ_INVARIANT(x)
Definition: Assert.h:229
ClassImp
ClassImp(EL::SlurmDriver) namespace EL
Definition: SlurmDriver.cxx:30
EL::Detail::ManagerStep::submitJob
@ submitJob
do the actual job submission
RCU_NEW_INVARIANT
#define RCU_NEW_INVARIANT(x)
Definition: Assert.h:233