ATLAS Offline Software
Loading...
Searching...
No Matches
util.convertLGBMToRootTree Namespace Reference

Classes

class  LGBMTextNode

Functions

 lgbm_rawresponse_each_tree (model, my_input)
 list2stdvector (values, dtype="float")
 dump_tree (tree_structure)
 dump2ROOT (model, output_filename, output_treename="lgbm")
 convertLGBMToRootTree (model, output_filename, tree_name="lgbm")
 test (model_file, tree_file, tree_name="lgbm", ntests=10000, test_file=None)
 get_test_data (feature_names, test_file=None, ntests=None)
 test_generic (booster, mvautils_predict, mva_utils, data_input)
 _ff (tree, node_infos)
 test_detail_event (booster, mva_utils, input_values)
 test_regression (booster, mva_utils, ntests=None, test_file=None)
 test_binary (booster, mva_utils, ntests=None, test_file=None)
 test_multiclass (booster, mva_utils, ntests=10000, test_file=None)
 check_file (fn)

Variables

str __author__ = "Ruggero Turra"
 level
 parser = argparse.ArgumentParser(description=__doc__)
 help
 nargs
 default
 action
 type
 int
 args = parser.parse_args()
 output
 output_treename = convertLGBMToRootTree(args.input, args.output, args.tree_name)
 result = test(args.input, args.output, args.tree_name, args.ntests, args.test_file)
 booster = lgb.Booster(model_file=args.input)
 objective = booster.dump_model()["objective"]

Detailed Description

Convert LightGBM model to TTree to be used with MVAUtils. 

Function Documentation

◆ _ff()

util.convertLGBMToRootTree._ff ( tree,
node_infos )
protected

Definition at line 341 of file convertLGBMToRootTree.py.

341def _ff(tree, node_infos):
342 if "left_child" in tree:
343 node_infos.append((tree["split_feature"], tree["threshold"]))
344 _ff(tree["left_child"])
345 _ff(tree["right_child"])
346
347

◆ check_file()

util.convertLGBMToRootTree.check_file ( fn)

Definition at line 531 of file convertLGBMToRootTree.py.

531def check_file(fn):
532 f = ROOT.TFile.Open(fn)
533 keys = f.GetListOfKeys()
534 keys = list(keys)
535 if len(keys) != 1:
536 logging.info("file %s is empty", fn)
537 return False
538 tree = f.Get(keys[0].GetName())
539 if type(tree) is not ROOT.TTree:
540 logging.info("cannot find TTree in file %s", fn)
541 return False
542 if not tree.GetEntries():
543 logging.info("tree is empty")
544 return False
545 return True
546
547

◆ convertLGBMToRootTree()

util.convertLGBMToRootTree.convertLGBMToRootTree ( model,
output_filename,
tree_name = "lgbm" )
Model: - a string, in this case, it is the name of
the input file containing the lgbm model you
can get this model with lgbm with
`boosted.save_model('my_model.txt')
- directly a lgbm booster object

Definition at line 192 of file convertLGBMToRootTree.py.

192def convertLGBMToRootTree(model, output_filename, tree_name="lgbm"):
193 """
194 Model: - a string, in this case, it is the name of
195 the input file containing the lgbm model you
196 can get this model with lgbm with
197 `boosted.save_model('my_model.txt')
198 - directly a lgbm booster object
199 """
200 if type(model) is str:
201 model = lgb.Booster(model_file=model)
202 return dump2ROOT(model, output_filename, tree_name)
203 else:
204 return dump2ROOT(model, output_filename, tree_name)
205
206

◆ dump2ROOT()

util.convertLGBMToRootTree.dump2ROOT ( model,
output_filename,
output_treename = "lgbm" )

Definition at line 141 of file convertLGBMToRootTree.py.

141def dump2ROOT(model, output_filename, output_treename="lgbm"):
142 model = model.dump_model()
143 fout = ROOT.TFile.Open(output_filename, "recreate")
144
145 features_array = ROOT.std.vector("int")()
146 values_array = ROOT.std.vector("float")()
147 default_lefts_array = ROOT.std.vector("bool")()
148
149 simple = True
150 node_type = "node_type=lgbm_simple"
151 for tree in model["tree_info"]:
152 tree_structure = tree["tree_structure"]
153 features, values, default_lefts, simple_tree = dump_tree(tree_structure)
154 if not simple_tree:
155 simple = False
156 node_type = "node_type=lgbm"
157
158 infos = ";".join(["%s=%s" % (k, str(v)) for k, v in model.items() if type(v) is not list])
159 title = ";".join(("creator=lgbm", node_type, infos))
160 root_tree = ROOT.TTree(output_treename, title)
161 root_tree.Branch("vars", "vector<int>", ROOT.AddressOf(features_array))
162 root_tree.Branch("values", "vector<float>", ROOT.AddressOf(values_array))
163
164 if not simple:
165 logging.info("tree support nan: using full implementation (LGBMNode)")
166 root_tree.Branch("default_left", "vector<bool>", ROOT.AddressOf(default_lefts_array))
167 if simple:
168 logging.info("tree do not support nan:" "using simple implementation (LGBMNodeSimple)")
169
170 for tree in model["tree_info"]:
171 tree_structure = tree["tree_structure"]
172 features, values, default_lefts, simple_tree = dump_tree(tree_structure)
173
174 features_array.clear()
175 values_array.clear()
176 default_lefts_array.clear()
177
178 for value in values:
179 values_array.push_back(value)
180 for feature in features:
181 features_array.push_back(feature)
182 for default_left in default_lefts:
183 default_lefts_array.push_back(default_left)
184
185 root_tree.Fill()
186
187 root_tree.Write()
188 fout.Close()
189 return output_treename
190
191

◆ dump_tree()

util.convertLGBMToRootTree.dump_tree ( tree_structure)
dump a single decision tree to arrays to be written into the TTree

Definition at line 101 of file convertLGBMToRootTree.py.

101def dump_tree(tree_structure):
102 """
103 dump a single decision tree to arrays to be written into the TTree
104 """
105
106 split_values = []
107 split_features = []
108 default_left = []
109 top = LGBMTextNode(tree_structure)
110 simple = [True] # python2 lack of nonlocal variables
111
112 def preorder(node):
113 # visit root
114 split_features.append(node.get_split_feature())
115 split_values.append(node.get_value())
116 default_left.append(node.get_default_left())
117
118 if not node.get_default_left():
119 simple[0] = False
120
121 if "decision_type" in node and node["decision_type"] != "<=":
122 raise ValueError(
123 "do not support categorical input BDT (decision_type = %s)" % node["decision_type"]
124 )
125
126 if "missing_type" in node:
127 if node["missing_type"] not in ("NaN", "None"):
128 raise ValueError("do not support missing values different from NaN or None")
129
130 # visit left
131 if node.get_left() is not None:
132 preorder(node.get_left())
133 # visit right
134 if node.get_right() is not None:
135 preorder(node.get_right())
136
137 preorder(top)
138 return split_features, split_values, default_left, simple[0]
139
140

◆ get_test_data()

util.convertLGBMToRootTree.get_test_data ( feature_names,
test_file = None,
ntests = None )

Definition at line 257 of file convertLGBMToRootTree.py.

257def get_test_data(feature_names, test_file=None, ntests=None):
258 nvars = len(feature_names)
259 if test_file is not None:
260 if ".root" in test_file:
261 if ":" not in test_file:
262 raise ValueError("when using ROOT file as test use the syntax filename:treename")
263 fn, tn = test_file.split(":")
264 f = ROOT.TFile.Open(fn)
265 if not f:
266 raise IOError("cannot find ROOT file %s" % fn)
267 tree = f.Get(tn)
268 if not tree:
269 raise IOError("cannot find TTree %s in %s" % (fn, tn))
270 branch_names = [br.GetName() for br in tree.GetListOfBranches()]
271 for feature in feature_names:
272 if feature not in branch_names:
273 raise IOError("required feature %s not in TTree")
274 rdf = ROOT.RDataFrame(tree, feature_names)
275 data_input = rdf.AsNumpy()
276 data_input = np.stack([data_input[k] for k in feature_names]).T
277 if ntests is not None:
278 data_input = data_input[:ntests]
279 logging.info(
280 "using as input %s inputs from TTree %s from ROOT file %s", len(data_input), tn, fn
281 )
282 else:
283 data_input = np.load(test_file)
284 if ntests is not None:
285 data_input = data_input[:ntests]
286 logging.info("using as input %s inputs from pickle file %s", len(data_input), test_file)
287 else:
288 if ntests is None:
289 ntests = 10000
290 logging.info("using as input %s random uniform inputs (-100,100)", ntests)
291 logging.warning(
292 "using random uniform input as test: this is not safe" "provide an input test file"
293 )
294 data_input = np.random.uniform(-100, 100, size=(ntests, nvars))
295
296 # to match what mvautils is doing (using c-float)
297 data_input = data_input.astype(np.float32)
298 return data_input
299
300

◆ lgbm_rawresponse_each_tree()

util.convertLGBMToRootTree.lgbm_rawresponse_each_tree ( model,
my_input )

Definition at line 28 of file convertLGBMToRootTree.py.

28def lgbm_rawresponse_each_tree(model, my_input):
29 nclasses = model.num_model_per_iteration()
30 output_values = np.array(
31 [np.array([[0] * nclasses])]
32 + [
33 model.predict(np.atleast_2d(my_input), raw_score=True, num_iteration=itree)
34 for itree in range(1, (model.num_trees() // nclasses + 1))
35 ]
36 )
37 output_trees = np.diff(output_values, axis=0)
38 return output_trees
39
40

◆ list2stdvector()

util.convertLGBMToRootTree.list2stdvector ( values,
dtype = "float" )

Definition at line 41 of file convertLGBMToRootTree.py.

41def list2stdvector(values, dtype="float"):
42 result = ROOT.std.vector(dtype)()
43 for v in values:
44 result.push_back(v)
45 return result
46
47

◆ test()

util.convertLGBMToRootTree.test ( model_file,
tree_file,
tree_name = "lgbm",
ntests = 10000,
test_file = None )

Definition at line 207 of file convertLGBMToRootTree.py.

207def test(model_file, tree_file, tree_name="lgbm", ntests=10000, test_file=None):
208 booster = lgb.Booster(model_file=model_file)
209 f = ROOT.TFile.Open(tree_file)
210 tree = f.Get(tree_name)
211 try:
212 _ = ROOT.MVAUtils.BDT
213 except Exception:
214 print("cannot import MVAUtils")
215 return None
216
217 mva_utils = ROOT.MVAUtils.BDT(tree)
218
219 objective = booster.dump_model()["objective"]
220
221 # sometimes options are inlined with objective
222 # we don't support non-default options
223 objective = objective.replace("sigmoid:1", "")
224 objective = objective.strip()
225
226 # binary and xentropy are not the exact same thing when training but the output value is the same
227 # same for l1/l2/huber/... regression
228 # (https://lightgbm.readthedocs.io/en/latest/Parameters.html)
229 binary_aliases = ("binary", "cross_entropy", "xentropy")
230 regression_aliases = (
231 (
232 "regression_l2",
233 "l2",
234 "mean_squared_error",
235 "mse",
236 "l2_root",
237 "root_mean_squared_error",
238 "rmse",
239 )
240 + ("regression_l1", "l1", "mean_absolute_error", "mae")
241 + ("huber",)
242 )
243 multiclass_aliases = ("multiclass", "softmax")
244 if objective in multiclass_aliases:
245 logging.info("assuming multiclass, testing")
246 return test_multiclass(booster, mva_utils, ntests, test_file)
247 elif objective in binary_aliases:
248 logging.info("assuming binary classification, testing")
249 return test_binary(booster, mva_utils, ntests, test_file)
250 elif objective in regression_aliases:
251 logging.info("assuming regression, testing")
252 return test_regression(booster, mva_utils, ntests, test_file)
253 else:
254 print("cannot understand objective '%s'" % objective)
255
256
void print(char *figname, TCanvas *c1)

◆ test_binary()

util.convertLGBMToRootTree.test_binary ( booster,
mva_utils,
ntests = None,
test_file = None )

Definition at line 419 of file convertLGBMToRootTree.py.

419def test_binary(booster, mva_utils, ntests=None, test_file=None):
420 data_input = get_test_data(booster.feature_name(), test_file, ntests)
421 return test_generic(booster, mva_utils.GetClassification, mva_utils, data_input)
422
423

◆ test_detail_event()

util.convertLGBMToRootTree.test_detail_event ( booster,
mva_utils,
input_values )

Definition at line 348 of file convertLGBMToRootTree.py.

348def test_detail_event(booster, mva_utils, input_values):
349 logging.info("input values")
350 for ivar, input_value in enumerate(input_values):
351 logging.info("var %d: %.15f", ivar, input_value)
352 logging.info("=" * 50)
353
354 ntrees_mva_utils = mva_utils.GetNTrees()
355 if ntrees_mva_utils != booster.num_trees():
356 logging.info("Number of trees are different mvautils: %s lgbm: %s", ntrees_mva_utils, booster.num_trees())
357 tree_outputs_lgbm = lgbm_rawresponse_each_tree(booster, input_values)
358
359 # loop over the trees
360 is_problem_found = False
361 for itree in range(ntrees_mva_utils):
362 tree_output_mvautils = mva_utils.GetTreeResponse(list2stdvector(input_values), itree)
363 tree_output_lgbm = tree_outputs_lgbm[itree][0]
364 if not np.allclose(tree_output_mvautils, tree_output_lgbm):
365 is_tree_ok = False
366 is_problem_found = True
367 logging.info("tree %d/%d are different", itree, ntrees_mva_utils)
368 logging.info("lgbm: %f", tree_output_lgbm)
369 logging.info("MVAUtils: %f", tree_output_mvautils)
370 logging.info("Tree details from MVAUtils")
371 mva_utils.PrintTree(itree)
372
373 # dump the tree from lightgbm
374 node_infos = []
375 _ff(
376 booster.dump_model()["tree_info"][itree][
377 "tree_structure"
378 ],
379 node_infos
380 )
381
382 # we now which tree is failing, check if this is
383 # due to input values very close to the threshold
384 # the problem is that lgbm is using double,
385 # while mva_utils is using float
386
387 for node_info in node_infos:
388 value = input_values[node_info[0]]
389 threshold = node_info[1]
390 if not np.isnan(value) and (value <= threshold) != (
391 np.float32(value) <= np.float32(threshold)
392 ):
393 logging.info(
394 "the problem could be due to double"
395 "(lgbm) -> float (mvautil) conversion"
396 " for variable %d: %.10f and threshold %.10f",
397 node_info[0],
398 value,
399 threshold,
400 )
401 # we consider this ok
402 is_tree_ok = True
403 break
404 if not is_tree_ok:
405 return False
406
407 if is_problem_found:
408 # if we have found the problem, but we arrive here
409 # it means that we found the problematic tree,
410 # but it is ok
411 return True
412
413

◆ test_generic()

util.convertLGBMToRootTree.test_generic ( booster,
mvautils_predict,
mva_utils,
data_input )

Definition at line 301 of file convertLGBMToRootTree.py.

301def test_generic(booster, mvautils_predict, mva_utils, data_input):
302 start = time.time()
303 results_lgbm = booster.predict(data_input)
304 logging.info("lgbm (vectorized) timing = %d/s", len(data_input) / (time.time() - start))
305
306 input_values_vector = ROOT.std.vector("float")()
307 results_MVAUtils = []
308 start = time.time()
309 for input_values in data_input:
310 input_values_vector.clear()
311 for v in input_values:
312 input_values_vector.push_back(v)
313 output_MVAUtils = mvautils_predict(input_values_vector)
314 results_MVAUtils.append(output_MVAUtils)
315 logging.info(
316 "mvautils (not vectorized+overhead) timing = %d/s", len(data_input) / (time.time() - start)
317 )
318
319 nevents_tested = 0
320 nevents_different = 0
321 for ievent, (input_values, output_lgbm, output_MVAUtils) in enumerate(
322 zip(data_input, results_lgbm, results_MVAUtils), 1
323 ):
324 nevents_tested += 1
325 if not np.allclose(output_lgbm, output_MVAUtils, rtol=1e-4):
326 nevents_different += 1
327 logging.info(
328 "--> output are different on input %d/%d mvautils: %s lgbm: %s",
329 ievent,
330 len(data_input),
331 output_MVAUtils,
332 output_lgbm,
333 )
334 if not test_detail_event(booster, mva_utils, input_values):
335 return False
336 logging.info("number of different events %d/%d", nevents_different, nevents_tested)
337 return True
338
339
340# helper for tree traversal

◆ test_multiclass()

util.convertLGBMToRootTree.test_multiclass ( booster,
mva_utils,
ntests = 10000,
test_file = None )

Definition at line 424 of file convertLGBMToRootTree.py.

424def test_multiclass(booster, mva_utils, ntests=10000, test_file=None):
425 import numpy as np
426
427 nvars = booster.num_feature()
428 nclasses = booster.num_model_per_iteration()
429 logging.info("using %d input features with %d classes", nvars, nclasses)
430
431 data_input = get_test_data(booster.feature_name(), test_file, ntests)
432
433 start = time.time()
434 results_lgbm = booster.predict(data_input)
435 logging.info(
436 "lgbm (vectorized) timing = %s ms/input", (time.time() - start) * 1000 / len(data_input)
437 )
438
439 input_values_vector = ROOT.std.vector("float")()
440 results_MVAUtils = []
441 start = time.time()
442 for input_values in data_input:
443 input_values_vector.clear()
444 for v in input_values:
445 input_values_vector.push_back(v)
446 output_MVAUtils = np.asarray(mva_utils.GetMultiResponse(input_values_vector, nclasses))
447 results_MVAUtils.append(output_MVAUtils)
448 logging.info(
449 "mvautils (not vectorized+overhead) timing = %s ms/input",
450 (time.time() - start) * 1000 / len(data_input),
451 )
452
453 stop_event_loop = False
454 for ievent, (input_values, output_lgbm, output_MVAUtils) in enumerate(
455 zip(data_input, results_lgbm, results_MVAUtils), 1
456 ):
457 if not np.allclose(output_lgbm, output_MVAUtils):
458 stop_event_loop = True
459 logging.info("--> output are different on input %d/%d:\n", ievent, len(data_input))
460 for ivar, input_value in enumerate(input_values):
461 logging.info("var %d: %.15f", ivar, input_value)
462 logging.info("=" * 50)
463 logging.info(" mvautils lgbm")
464 for ioutput, (o1, o2) in enumerate(zip(output_MVAUtils, output_lgbm)):
465 diff_flag = "" if np.allclose(o1, o2) else "<---"
466 logging.info("output %3d %.5e %.5e %s", ioutput, o1, o2, diff_flag)
467 output_trees_lgbm = lgbm_rawresponse_each_tree(booster, [input_values])
468
469 stop_tree_loop = False
470 for itree, output_tree_lgbm in enumerate(output_trees_lgbm):
471 output_tree_mva_utils = [
472 mva_utils.GetTreeResponse(list2stdvector(input_values), itree * nclasses + c)
473 for c in range(nclasses)
474 ]
475 if not np.allclose(output_tree_mva_utils, output_tree_lgbm[0]):
476 stop_tree_loop = True
477 logging.info("first tree/class with different answer (%d)", itree)
478 for isubtree, (ol, om) in enumerate(
479 zip(output_tree_lgbm[0], output_tree_mva_utils)
480 ):
481 if not np.allclose(ol, om):
482 logging.info("different in position %d", isubtree)
483 logging.info("lgbm: %f", ol)
484 logging.info("mvautils: %f", om)
485 logging.info("=" * 50)
486 logging.info(
487 "tree %d (itree) * %d (nclasses)" "+ %d (isubtree) = %d",
488 itree,
489 nclasses,
490 isubtree,
491 itree * nclasses + isubtree,
492 )
493 mva_utils.PrintTree(itree * nclasses + isubtree)
494
495 node_infos = []
496
497 # we now which tree is failing, check if this is
498 # due to input values very close to the threshold
499 # the problem is that lgbm is using double,
500 # while mva_utils is using float
501 _ff(
502 booster.dump_model()["tree_info"][itree * nclasses + isubtree][
503 "tree_structure"
504 ],
505 node_infos
506 )
507 for node_info in node_infos:
508 value = input_values[node_info[0]]
509 threshold = node_info[1]
510 if not np.isnan(value) and (value <= threshold) != (
511 np.float32(value) <= np.float32(threshold)
512 ):
513 logging.info(
514 "the problem could be due to double"
515 "(lgbm) -> float (mvautil) conversion"
516 "for variable %d: %f and threshold %f",
517 node_info[0],
518 value,
519 threshold,
520 )
521 stop_tree_loop = False
522 stop_event_loop = False
523
524 if stop_tree_loop:
525 break
526 if stop_event_loop:
527 return False
528 return True
529
530

◆ test_regression()

util.convertLGBMToRootTree.test_regression ( booster,
mva_utils,
ntests = None,
test_file = None )

Definition at line 414 of file convertLGBMToRootTree.py.

414def test_regression(booster, mva_utils, ntests=None, test_file=None):
415 data_input = get_test_data(booster.feature_name(), test_file, ntests)
416 return test_generic(booster, mva_utils.GetResponse, mva_utils, data_input)
417
418

Variable Documentation

◆ __author__

str util.convertLGBMToRootTree.__author__ = "Ruggero Turra"
private

Definition at line 7 of file convertLGBMToRootTree.py.

◆ action

util.convertLGBMToRootTree.action

Definition at line 555 of file convertLGBMToRootTree.py.

◆ args

util.convertLGBMToRootTree.args = parser.parse_args()

Definition at line 561 of file convertLGBMToRootTree.py.

◆ booster

util.convertLGBMToRootTree.booster = lgb.Booster(model_file=args.input)

Definition at line 588 of file convertLGBMToRootTree.py.

◆ default

util.convertLGBMToRootTree.default

Definition at line 554 of file convertLGBMToRootTree.py.

◆ help

util.convertLGBMToRootTree.help

Definition at line 552 of file convertLGBMToRootTree.py.

◆ int

util.convertLGBMToRootTree.int

Definition at line 556 of file convertLGBMToRootTree.py.

◆ level

util.convertLGBMToRootTree.level

Definition at line 25 of file convertLGBMToRootTree.py.

◆ nargs

util.convertLGBMToRootTree.nargs

Definition at line 553 of file convertLGBMToRootTree.py.

◆ objective

util.convertLGBMToRootTree.objective = booster.dump_model()["objective"]

Definition at line 589 of file convertLGBMToRootTree.py.

◆ output

util.convertLGBMToRootTree.output

Definition at line 566 of file convertLGBMToRootTree.py.

◆ output_treename

util.convertLGBMToRootTree.output_treename = convertLGBMToRootTree(args.input, args.output, args.tree_name)

Definition at line 569 of file convertLGBMToRootTree.py.

◆ parser

util.convertLGBMToRootTree.parser = argparse.ArgumentParser(description=__doc__)

Definition at line 551 of file convertLGBMToRootTree.py.

◆ result

util.convertLGBMToRootTree.result = test(args.input, args.output, args.tree_name, args.ntests, args.test_file)

Definition at line 576 of file convertLGBMToRootTree.py.

◆ type

util.convertLGBMToRootTree.type

Definition at line 556 of file convertLGBMToRootTree.py.