ATLAS Offline Software
Step.py
Go to the documentation of this file.
1 #
2 # Copyright (C) 2002-2022 CERN for the benefit of the ATLAS collaboration
3 #
4 
5 '''
6 Base classes for steps in Trigger ART tests
7 '''
8 
9 import os
10 import sys
11 import signal
12 import subprocess
13 import time
14 import re
15 from enum import Enum
16 from threading import Timer
17 from TrigValTools.TrigValSteering.Common import get_logger, art_result, running_in_CI
18 from TestTools.logfiles import grep_with_context
19 from TrigValTools.TrigARTUtils import remember_cwd
20 
21 
22 class Step(object):
23  '''Base class for a step of a Trigger ART test'''
24 
25  class OutputStream(Enum):
26  NO_PRINT = 0
27  FILE_ONLY = 1
28  STDOUT_ONLY = 2
29  FILE_AND_STDOUT = 3
30 
31  def __init__(self, name=None):
32  self.log = get_logger()
33  self.name = name
34  self.type = None
35  self.executable = None
36  self.args = ''
37  self.cmd_prefix = None
38  self.cmd_suffix = None
39  self.output_stream = self.OutputStream.FILE_ONLY
40  self.log_file_name = None
41  self.result = None
42  self.auto_report_result = False
43  self.required = False
44  self.depends_on_previous = False # skip if previous step in the current step list failed
45  self.depends_on_exec = False # (only for CheckSteps) skip if ExecSteps failed
46  self.timeout = None
47  self.prmon = False
48  self.prmon_interval = 5 # monitoring interval in seconds
49  self.workdir = None
50 
51  def get_log_file_name(self):
52  return self.log_file_name or self.name+'.log'
53 
54  def configure(self, test=None):
55  '''
56  Can be implemented by derived classes.
57  Base class implementation only prints the configuration to debug log.
58  '''
59  self.log.debug(
60  'Step configuration:\n-- %s\n',
61  '\n-- '.join(['{}: {}'.format(k, v) for k, v in self.__dict__.items()]))
62 
63  def report_result(self, result=None, name=None):
64  if result is None:
65  if self.result is not None:
66  result = self.result
67  else:
68  self.misconfig_abort('report_result was called but result is None')
69 
70  if name is None:
71  if self.name is not None:
72  name = self.name
73  else:
74  name = 'UnnamedStep'
75 
76  art_result(result, name)
77 
78  def misconfig_abort(self, error_msg, *args, **kwargs):
79  '''
80  Print an error message (arguments passed to logging.error),
81  report non-zero art-result and exit the process with non-zero code
82  '''
83  self.log.error('Misconfiguration in %s: '+error_msg, self.name, *args, **kwargs)
84  self.report_result(1, 'TestConfig')
85  sys.exit(1)
86 
87  def __trace_and_kill(self, pid, signal, backtrace_list):
88  '''
89  Produce a backtrace for a process and its children, then call
90  os.killpg on the process. The last argument is a list of strings
91  where the first is filled with the backtrace by this function
92  (it has to be a list to be mutable).
93  '''
94  try:
95  # Produce backtrace for the parent and all children
96  import psutil
97  parent = psutil.Process(pid)
98  backtrace = ''
99  for proc in [parent] + parent.children(recursive=True):
100  backtrace += '\nTraceback for {} PID {}:\n'.format(proc.name(), proc.pid)
101  backtrace += subprocess.check_output('$ROOTSYS/etc/gdb-backtrace.sh {}'.format(proc.pid),
102  stderr=subprocess.STDOUT, shell=True).decode('utf-8')
103  backtrace_list[0] = backtrace
104 
105  # Kill the process
106  os.killpg(pid, signal)
107 
108  except Exception as e:
109  # This may happen e.g. if one of the processes finishes before we generate backtrace
110  msg = 'Caught exception while generating backtrace: ' + str(e)
111  backtrace_list[0] = msg
112  self.log.error(msg)
113 
114  def __execute_with_timeout(self, cmd, timeout_sec):
115  '''
116  Execute a shell process and kill it if it doesn't finish
117  before timeout_sec seconds pass. The implementation is based on
118  https://stackoverflow.com/a/10012262 and https://stackoverflow.com/a/4791612
119  In addition, a backtrace is produced for the timed out process and its children.
120  '''
121  proc = subprocess.Popen(cmd, shell=True, preexec_fn=os.setsid)
122  backtrace_list = ['']
123  timer = Timer(timeout_sec, self.__trace_and_kill,
124  [os.getpgid(proc.pid), signal.SIGKILL, backtrace_list])
125  try:
126  timer.start()
127  proc.communicate()
128  finally:
129  timer.cancel()
130 
131  if proc.returncode == -signal.SIGKILL:
132  self.log.error('%s killed on timeout of %s seconds',
133  self.name, self.timeout)
134  if (self.output_stream == self.OutputStream.FILE_ONLY or
135  self.output_stream == self.OutputStream.FILE_AND_STDOUT):
136  with open(self.get_log_file_name(), 'a') as log_file:
137  log_file.write(
138  'ERROR process killed on timeout '
139  'of {} seconds, command was {}\n'.format(
140  self.timeout, cmd))
141  log_file.write(backtrace_list[0])
142  return signal.SIGKILL # return 9 instead of -9
143 
144  return proc.returncode
145 
146  def __start_prmon(self):
147  self.log.debug('Starting prmon for pid %d', os.getpid())
148  prmon_cmd = 'prmon --pid {:d} --interval {:d}'.format(os.getpid(), self.prmon_interval)
149  prmon_cmd +=' --filename prmon.{name:s}.txt --json-summary prmon.summary.{name:s}.json'.format(name=self.name)
150  prmon_cmd +=' --log-filename prmon.{name:s}.log'.format(name=self.name)
151  return subprocess.Popen(prmon_cmd, shell=True)
152 
153  def __stop_prmon(self, prmon_proc):
154  self.log.debug('Stopping prmon')
155  try:
156  prmon_proc.send_signal(signal.SIGUSR1)
157  countWait = 0
158  while (not prmon_proc.poll()) and countWait < 10:
159  time.sleep(0.1)
160  countWait += 1
161  except OSError as err:
162  self.log.warning('Error while stopping prmon: %s', err)
163  pass
164 
165  def run(self, dry_run=False):
166  cmd = '{} {}'.format(self.executable, self.args)
167  if self.cmd_prefix:
168  cmd = self.cmd_prefix + cmd
169  if self.cmd_suffix:
170  cmd += self.cmd_suffix
171  if self.output_stream == self.OutputStream.NO_PRINT:
172  cmd += ' >/dev/null 2>&1'
173  elif self.output_stream == self.OutputStream.FILE_ONLY:
174  cmd += ' >{} 2>&1'.format(self.get_log_file_name())
175  elif self.output_stream == self.OutputStream.FILE_AND_STDOUT:
176  cmd += ' 2>&1 | tee {}; exit ${{PIPESTATUS[0]}}'.format(self.get_log_file_name())
177  elif self.output_stream == self.OutputStream.STDOUT_ONLY:
178  cmd += ' 2>&1'
179 
180  self.log.info('Running %s step using command:\n%s', self.name, cmd)
181  if dry_run:
182  self.result = 0
183  else:
184  with remember_cwd():
185  if self.workdir:
186  assert '..' not in self.workdir, "Illegal path for workdir -- must be a subdirectory of CWD"
187  assert not self.workdir.startswith('/'), "Illegal path for workdir -- no absolute paths!"
188  os.makedirs(self.workdir,exist_ok=True)
189  os.chdir(self.workdir)
190 
191  if self.prmon:
192  prmon_proc = self.__start_prmon()
193  if self.timeout:
194  self.result = self.__execute_with_timeout(cmd, self.timeout)
195  else:
196  self.result = subprocess.call(cmd, shell=True)
197  if self.prmon:
198  self.__stop_prmon(prmon_proc)
199 
200  if self.auto_report_result:
201  self.report_result()
202 
203  # Print full log to stdout for failed steps if running in CI
204  if self.required \
205  and self.result != 0 \
206  and running_in_CI() \
207  and self.output_stream==self.OutputStream.FILE_ONLY:
208  self.log.error('Step failure while running in CI. Printing partial log %s', self.get_log_file_name())
209 
210  # Common error patterns (might be good to use all patterns from check_log.py)
211  error_patterns = '^ERROR| ERROR | FATAL |[Tt]raceback'
212  grep_with_context(open(self.get_log_file_name()), error_patterns, lines=100)
213 
214  # Print also sub-step logs for transforms
215  if self.executable.endswith('_tf.py'):
216  log = open(self.get_log_file_name()).read()
217  step_matches = re.findall('Logs for (.*) are in (.*)', log)
218  if not step_matches:
219  self.log.warning('Failed to determine sub-step log names, cannot print the full sub-step logs')
220  else:
221  step_log_names = [m[1] for m in step_matches]
222  for step_log_name in step_log_names:
223  if os.path.isfile(step_log_name):
224  self.log.info('Printing partial sub-step log file %s', step_log_name)
225  grep_with_context(open(step_log_name), error_patterns, lines=100)
226 
227  return self.result, cmd
228 
229 
230 def get_step_from_list(step_name, step_list):
231  '''
232  Retrieve the first test matching the name from the list. Returns None if
233  no match is found.
234  '''
235  for step in step_list:
236  if step.name is not None and step_name in step.name:
237  return step
238  return None
239 
240 
241 def get_step_type_from_list(step_type, step_list):
242  '''
243  Retrieve the first test matching the type from the list. Returns None if
244  no match is found.
245  '''
246  for step in step_list:
247  if isinstance(step, step_type):
248  return step
249  return None
grepfile.info
info
Definition: grepfile.py:38
read
IovVectorMap_t read(const Folder &theFolder, const SelectionCriterion &choice, const unsigned int limit=10)
Definition: openCoraCool.cxx:569
AtlasMcWeight::decode
double decode(number_type binnedWeight)
Convert weight from unsigned to double.
Definition: AtlasMcWeight.cxx:32
python.TrigValSteering.Step.Step.get_log_file_name
def get_log_file_name(self)
Definition: Step.py:51
vtune_athena.format
format
Definition: vtune_athena.py:14
python.TrigValSteering.Step.Step.output_stream
output_stream
Definition: Step.py:39
python.TrigValSteering.Step.Step.timeout
timeout
Definition: Step.py:46
python.TrigValSteering.Step.Step.args
args
Definition: Step.py:36
python.TrigValSteering.Step.Step.__start_prmon
def __start_prmon(self)
Definition: Step.py:146
python.TrigValSteering.Step.Step
Definition: Step.py:22
python.TrigValSteering.Step.get_step_from_list
def get_step_from_list(step_name, step_list)
Definition: Step.py:230
python.TrigValSteering.Common.running_in_CI
def running_in_CI()
Definition: Common.py:78
python.TrigValSteering.Step.Step.auto_report_result
auto_report_result
Definition: Step.py:42
python.TrigValSteering.Step.get_step_type_from_list
def get_step_type_from_list(step_type, step_list)
Definition: Step.py:241
python.TrigValSteering.Step.Step.prmon
prmon
Definition: Step.py:47
python.TrigValSteering.Step.Step.__stop_prmon
def __stop_prmon(self, prmon_proc)
Definition: Step.py:153
python.TrigValSteering.Step.Step.prmon_interval
prmon_interval
Definition: Step.py:48
python.logfiles.grep_with_context
def grep_with_context(f, pattern, lines=100, print_head=True, print_tail=True)
Definition: logfiles.py:10
python.TrigValSteering.Step.Step.configure
def configure(self, test=None)
Definition: Step.py:54
python.TrigValSteering.Step.Step.OutputStream
Definition: Step.py:25
python.TrigValSteering.Step.Step.__execute_with_timeout
def __execute_with_timeout(self, cmd, timeout_sec)
Definition: Step.py:114
python.TrigValSteering.Step.Step.log_file_name
log_file_name
Definition: Step.py:40
python.TrigValSteering.Step.Step.workdir
workdir
Definition: Step.py:49
python.TrigValSteering.Step.Step.run
def run(self, dry_run=False)
Definition: Step.py:165
TCS::join
std::string join(const std::vector< std::string > &v, const char c=',')
Definition: Trigger/TrigT1/L1Topo/L1TopoCommon/Root/StringUtils.cxx:10
debug
const bool debug
Definition: MakeUncertaintyPlots.cxx:53
TrigJetMonitorAlgorithm.items
items
Definition: TrigJetMonitorAlgorithm.py:79
python.TrigValSteering.Step.Step.required
required
Definition: Step.py:43
python.TrigValSteering.Step.Step.depends_on_previous
depends_on_previous
Definition: Step.py:44
Trk::open
@ open
Definition: BinningType.h:40
python.TrigValSteering.Step.Step.result
result
Definition: Step.py:41
python.TrigValSteering.Step.Step.log
log
Definition: Step.py:32
python.TrigARTUtils.remember_cwd
def remember_cwd()
Definition: TrigARTUtils.py:50
python.TrigValSteering.Step.Step.name
name
Definition: Step.py:33
python.TrigValSteering.Common.get_logger
def get_logger()
Definition: Common.py:33
python.TrigValSteering.Step.Step.misconfig_abort
def misconfig_abort(self, error_msg, *args, **kwargs)
Definition: Step.py:78
python.TrigValSteering.Step.Step.executable
executable
Definition: Step.py:35
python.TrigValSteering.Step.Step.__trace_and_kill
def __trace_and_kill(self, pid, signal, backtrace_list)
Definition: Step.py:87
python.TrigValSteering.Step.Step.report_result
def report_result(self, result=None, name=None)
Definition: Step.py:63
python.TrigValSteering.Step.Step.cmd_suffix
cmd_suffix
Definition: Step.py:38
pickleTool.object
object
Definition: pickleTool.py:30
str
Definition: BTagTrackIpAccessor.cxx:11
python.TrigValSteering.Step.Step.cmd_prefix
cmd_prefix
Definition: Step.py:37
python.TrigValSteering.Common.art_result
def art_result(result, name)
Definition: Common.py:42
error
Definition: IImpactPoint3dEstimator.h:70
python.TrigValSteering.Step.Step.depends_on_exec
depends_on_exec
Definition: Step.py:45
python.TrigValSteering.Step.Step.__init__
def __init__(self, name=None)
Definition: Step.py:31
python.TrigValSteering.Step.Step.type
type
Definition: Step.py:34