ATLAS Offline Software
Loading...
Searching...
No Matches
Step.py
Go to the documentation of this file.
2# Copyright (C) 2002-2022 CERN for the benefit of the ATLAS collaboration
3#
4
5'''
6Base classes for steps in Trigger ART tests
7'''
8
9import os
10import sys
11import signal
12import subprocess
13import time
14import re
15from enum import Enum
16from threading import Timer
17from TrigValTools.TrigValSteering.Common import get_logger, art_result, running_in_CI
18from TestTools.logfiles import grep_with_context
19from TrigValTools.TrigARTUtils import remember_cwd
20
21
22class Step(object):
23 '''Base class for a step of a Trigger ART test'''
24
25 class OutputStream(Enum):
26 NO_PRINT = 0
27 FILE_ONLY = 1
28 STDOUT_ONLY = 2
29 FILE_AND_STDOUT = 3
30
31 def __init__(self, name=None):
32 self.log = get_logger()
33 self.name = name
34 self.type = None
35 self.executable = None
36 self.args = ''
37 self.cmd_prefix = None
38 self.cmd_suffix = None
39 self.output_stream = self.OutputStream.FILE_ONLY
40 self.log_file_name = None
41 self.result = None
42 self.auto_report_result = False
43 self.required = False
44 self.depends_on_previous = False # skip if previous step in the current step list failed
45 self.depends_on_exec = False # (only for CheckSteps) skip if ExecSteps failed
46 self.timeout = None
47 self.prmon = False
48 self.prmon_interval = 5 # monitoring interval in seconds
49 self.workdir = None
50
52 return self.log_file_name or self.name+'.log'
53
54 def configure(self, test=None):
55 '''
56 Can be implemented by derived classes.
57 Base class implementation only prints the configuration to debug log.
58 '''
59 self.log.debug(
60 'Step configuration:\n-- %s\n',
61 '\n-- '.join(['{}: {}'.format(k, v) for k, v in self.__dict__.items()]))
62
63 def report_result(self, result=None, name=None):
64 if result is None:
65 if self.result is not None:
66 result = self.result
67 else:
68 self.misconfig_abort('report_result was called but result is None')
69
70 if name is None:
71 if self.name is not None:
72 name = self.name
73 else:
74 name = 'UnnamedStep'
75
76 art_result(result, name)
77
78 def misconfig_abort(self, error_msg, *args, **kwargs):
79 '''
80 Print an error message (arguments passed to logging.error),
81 report non-zero art-result and exit the process with non-zero code
82 '''
83 self.log.error('Misconfiguration in %s: '+error_msg, self.name, *args, **kwargs)
84 self.report_result(1, 'TestConfig')
85 sys.exit(1)
86
87 def __trace_and_kill(self, pid, signal, backtrace_list):
88 '''
89 Produce a backtrace for a process and its children, then call
90 os.killpg on the process. The last argument is a list of strings
91 where the first is filled with the backtrace by this function
92 (it has to be a list to be mutable).
93 '''
94 try:
95 # Produce backtrace for the parent and all children
96 import psutil
97 parent = psutil.Process(pid)
98 backtrace = ''
99 for proc in [parent] + parent.children(recursive=True):
100 backtrace += '\nTraceback for {} PID {}:\n'.format(proc.name(), proc.pid)
101 backtrace += subprocess.check_output('$ROOTSYS/etc/gdb-backtrace.sh {}'.format(proc.pid),
102 stderr=subprocess.STDOUT, shell=True).decode('utf-8')
103 backtrace_list[0] = backtrace
104
105 # Kill the process
106 os.killpg(pid, signal)
107
108 except Exception as e:
109 # This may happen e.g. if one of the processes finishes before we generate backtrace
110 msg = 'Caught exception while generating backtrace: ' + str(e)
111 backtrace_list[0] = msg
112 self.log.error(msg)
113
114 def __execute_with_timeout(self, cmd, timeout_sec):
115 '''
116 Execute a shell process and kill it if it doesn't finish
117 before timeout_sec seconds pass. The implementation is based on
118 https://stackoverflow.com/a/10012262 and https://stackoverflow.com/a/4791612
119 In addition, a backtrace is produced for the timed out process and its children.
120 '''
121 proc = subprocess.Popen(cmd, shell=True, preexec_fn=os.setsid)
122 backtrace_list = ['']
123 timer = Timer(timeout_sec, self.__trace_and_kill,
124 [os.getpgid(proc.pid), signal.SIGKILL, backtrace_list])
125 try:
126 timer.start()
127 proc.communicate()
128 finally:
129 timer.cancel()
130
131 if proc.returncode == -signal.SIGKILL:
132 self.log.error('%s killed on timeout of %s seconds',
133 self.name, self.timeout)
134 if (self.output_stream == self.OutputStream.FILE_ONLY or
135 self.output_stream == self.OutputStream.FILE_AND_STDOUT):
136 with open(self.get_log_file_name(), 'a') as log_file:
137 log_file.write(
138 'ERROR process killed on timeout '
139 'of {} seconds, command was {}\n'.format(
140 self.timeout, cmd))
141 log_file.write(backtrace_list[0])
142 return signal.SIGKILL # return 9 instead of -9
143
144 return proc.returncode
145
146 def __start_prmon(self):
147 self.log.debug('Starting prmon for pid %d', os.getpid())
148 prmon_cmd = 'prmon --pid {:d} --interval {:d}'.format(os.getpid(), self.prmon_interval)
149 prmon_cmd +=' --filename prmon.{name:s}.txt --json-summary prmon.summary.{name:s}.json'.format(name=self.name)
150 prmon_cmd +=' --log-filename prmon.{name:s}.log'.format(name=self.name)
151 return subprocess.Popen(prmon_cmd, shell=True)
152
153 def __stop_prmon(self, prmon_proc):
154 self.log.debug('Stopping prmon')
155 try:
156 prmon_proc.send_signal(signal.SIGUSR1)
157 countWait = 0
158 while (not prmon_proc.poll()) and countWait < 10:
159 time.sleep(0.1)
160 countWait += 1
161 except OSError as err:
162 self.log.warning('Error while stopping prmon: %s', err)
163 pass
164
165 def run(self, dry_run=False):
166 cmd = '{} {}'.format(self.executable, self.args)
167 if self.cmd_prefix:
168 cmd = self.cmd_prefix + cmd
169 if self.cmd_suffix:
170 cmd += self.cmd_suffix
171 if self.output_stream == self.OutputStream.NO_PRINT:
172 cmd += ' >/dev/null 2>&1'
173 elif self.output_stream == self.OutputStream.FILE_ONLY:
174 cmd += ' >{} 2>&1'.format(self.get_log_file_name())
175 elif self.output_stream == self.OutputStream.FILE_AND_STDOUT:
176 cmd += ' 2>&1 | tee {}; exit ${{PIPESTATUS[0]}}'.format(self.get_log_file_name())
177 elif self.output_stream == self.OutputStream.STDOUT_ONLY:
178 cmd += ' 2>&1'
179
180 self.log.info('Running %s step using command:\n%s', self.name, cmd)
181 if dry_run:
182 self.result = 0
183 else:
184 with remember_cwd():
185 if self.workdir:
186 assert '..' not in self.workdir, "Illegal path for workdir -- must be a subdirectory of CWD"
187 assert not self.workdir.startswith('/'), "Illegal path for workdir -- no absolute paths!"
188 os.makedirs(self.workdir,exist_ok=True)
189 os.chdir(self.workdir)
190
191 if self.prmon:
192 prmon_proc = self.__start_prmon()
193 if self.timeout:
194 self.result = self.__execute_with_timeout(cmd, self.timeout)
195 else:
196 self.result = subprocess.call(cmd, shell=True)
197 if self.prmon:
198 self.__stop_prmon(prmon_proc)
199
200 if self.auto_report_result:
201 self.report_result()
202
203 # Print full log to stdout for failed steps if running in CI
204 if self.required \
205 and self.result != 0 \
206 and running_in_CI() \
207 and self.output_stream==self.OutputStream.FILE_ONLY:
208 self.log.error('Step failure while running in CI. Printing partial log %s', self.get_log_file_name())
209
210 # Common error patterns (might be good to use all patterns from check_log.py)
211 error_patterns = '^ERROR| ERROR | FATAL |[Tt]raceback'
212 grep_with_context(open(self.get_log_file_name()), error_patterns, lines=100)
213
214 # Print also sub-step logs for transforms
215 if self.executable.endswith('_tf.py'):
216 log = open(self.get_log_file_name()).read()
217 step_matches = re.findall('Logs for (.*) are in (.*)', log)
218 if not step_matches:
219 self.log.warning('Failed to determine sub-step log names, cannot print the full sub-step logs')
220 else:
221 step_log_names = [m[1] for m in step_matches]
222 for step_log_name in step_log_names:
223 if os.path.isfile(step_log_name):
224 self.log.info('Printing partial sub-step log file %s', step_log_name)
225 grep_with_context(open(step_log_name), error_patterns, lines=100)
226
227 return self.result, cmd
228
229
230def get_step_from_list(step_name, step_list):
231 '''
232 Retrieve the first test matching the name from the list. Returns None if
233 no match is found.
234 '''
235 for step in step_list:
236 if step.name is not None and step_name in step.name:
237 return step
238 return None
239
240
241def get_step_type_from_list(step_type, step_list):
242 '''
243 Retrieve the first test matching the type from the list. Returns None if
244 no match is found.
245 '''
246 for step in step_list:
247 if isinstance(step, step_type):
248 return step
249 return None
const bool debug
misconfig_abort(self, error_msg, *args, **kwargs)
Definition Step.py:78
__stop_prmon(self, prmon_proc)
Definition Step.py:153
configure(self, test=None)
Definition Step.py:54
__execute_with_timeout(self, cmd, timeout_sec)
Definition Step.py:114
__init__(self, name=None)
Definition Step.py:31
report_result(self, result=None, name=None)
Definition Step.py:63
__trace_and_kill(self, pid, signal, backtrace_list)
Definition Step.py:87
get_step_from_list(step_name, step_list)
Definition Step.py:230
get_step_type_from_list(step_type, step_list)
Definition Step.py:241
Definition run.py:1
IovVectorMap_t read(const Folder &theFolder, const SelectionCriterion &choice, const unsigned int limit=10)