ATLAS Offline Software
Loading...
Searching...
No Matches
convertLGBMToRootTree.py
Go to the documentation of this file.
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3# Copyright (C) 2002-2024 CERN for the benefit of the ATLAS collaboration
4
5""" Convert LightGBM model to TTree to be used with MVAUtils. """
6
7__author__ = "Ruggero Turra"
8
9try:
10 import lightgbm as lgb
11except ImportError:
12 print(
13 """cannot load lightgbm. Try to install it with
14 pip install lightgbm
15or (usually on lxplus)
16 pip install numpy scipy scikit-learn
17 pip install --no-binary :all: lightgbm
18"""
19 )
20import ROOT
21import time
22import logging
23import numpy as np
24
25logging.basicConfig(level=logging.DEBUG)
26
27
28def lgbm_rawresponse_each_tree(model, my_input):
29 nclasses = model.num_model_per_iteration()
30 output_values = np.array(
31 [np.array([[0] * nclasses])]
32 + [
33 model.predict(np.atleast_2d(my_input), raw_score=True, num_iteration=itree)
34 for itree in range(1, (model.num_trees() // nclasses + 1))
35 ]
36 )
37 output_trees = np.diff(output_values, axis=0)
38 return output_trees
39
40
41def list2stdvector(values, dtype="float"):
42 result = ROOT.std.vector(dtype)()
43 for v in values:
44 result.push_back(v)
45 return result
46
47
48class LGBMTextNode(dict):
49 """
50 Adaptor from LGBM dictionary to tree
51 """
52
53 def __init__(self, structure, invert_as_tmva=False):
54 super(LGBMTextNode, self).__init__(structure)
55 self.invert_as_tmva = invert_as_tmva
56
58 if "split_feature" in self:
59 return self["split_feature"]
60 else:
61 return -1
62
63 def get_value(self):
64 if "threshold" in self:
65 return self["threshold"]
66 else:
67 return self["leaf_value"]
68
69 def _get_left(self):
70 if "left_child" not in self:
71 return None
72 if self["decision_type"] == "<=":
73 return LGBMTextNode(self["left_child"])
74 else:
75 return LGBMTextNode(self["right_child"])
76
77 def _get_right(self):
78 if "right_child" not in self:
79 return None
80 if self["decision_type"] == "<=":
81 return LGBMTextNode(self["right_child"])
82 else:
83 return LGBMTextNode(self["left_child"])
84
85 def get_left(self):
86 if not self.invert_as_tmva:
87 return self._get_left()
88 else:
89 return self._get_right()
90
91 def get_right(self):
92 if not self.invert_as_tmva:
93 return self._get_right()
94 else:
95 return self._get_left()
96
98 return self.get("default_left", True)
99
100
101def dump_tree(tree_structure):
102 """
103 dump a single decision tree to arrays to be written into the TTree
104 """
105
106 split_values = []
107 split_features = []
108 default_left = []
109 top = LGBMTextNode(tree_structure)
110 simple = [True] # python2 lack of nonlocal variables
111
112 def preorder(node):
113 # visit root
114 split_features.append(node.get_split_feature())
115 split_values.append(node.get_value())
116 default_left.append(node.get_default_left())
117
118 if not node.get_default_left():
119 simple[0] = False
120
121 if "decision_type" in node and node["decision_type"] != "<=":
122 raise ValueError(
123 "do not support categorical input BDT (decision_type = %s)" % node["decision_type"]
124 )
125
126 if "missing_type" in node:
127 if node["missing_type"] not in ("NaN", "None"):
128 raise ValueError("do not support missing values different from NaN or None")
129
130 # visit left
131 if node.get_left() is not None:
132 preorder(node.get_left())
133 # visit right
134 if node.get_right() is not None:
135 preorder(node.get_right())
136
137 preorder(top)
138 return split_features, split_values, default_left, simple[0]
139
140
141def dump2ROOT(model, output_filename, output_treename="lgbm"):
142 model = model.dump_model()
143 fout = ROOT.TFile.Open(output_filename, "recreate")
144
145 features_array = ROOT.std.vector("int")()
146 values_array = ROOT.std.vector("float")()
147 default_lefts_array = ROOT.std.vector("bool")()
148
149 simple = True
150 node_type = "node_type=lgbm_simple"
151 for tree in model["tree_info"]:
152 tree_structure = tree["tree_structure"]
153 features, values, default_lefts, simple_tree = dump_tree(tree_structure)
154 if not simple_tree:
155 simple = False
156 node_type = "node_type=lgbm"
157
158 infos = ";".join(["%s=%s" % (k, str(v)) for k, v in model.items() if type(v) is not list])
159 title = ";".join(("creator=lgbm", node_type, infos))
160 root_tree = ROOT.TTree(output_treename, title)
161 root_tree.Branch("vars", "vector<int>", ROOT.AddressOf(features_array))
162 root_tree.Branch("values", "vector<float>", ROOT.AddressOf(values_array))
163
164 if not simple:
165 logging.info("tree support nan: using full implementation (LGBMNode)")
166 root_tree.Branch("default_left", "vector<bool>", ROOT.AddressOf(default_lefts_array))
167 if simple:
168 logging.info("tree do not support nan:" "using simple implementation (LGBMNodeSimple)")
169
170 for tree in model["tree_info"]:
171 tree_structure = tree["tree_structure"]
172 features, values, default_lefts, simple_tree = dump_tree(tree_structure)
173
174 features_array.clear()
175 values_array.clear()
176 default_lefts_array.clear()
177
178 for value in values:
179 values_array.push_back(value)
180 for feature in features:
181 features_array.push_back(feature)
182 for default_left in default_lefts:
183 default_lefts_array.push_back(default_left)
184
185 root_tree.Fill()
186
187 root_tree.Write()
188 fout.Close()
189 return output_treename
190
191
192def convertLGBMToRootTree(model, output_filename, tree_name="lgbm"):
193 """
194 Model: - a string, in this case, it is the name of
195 the input file containing the lgbm model you
196 can get this model with lgbm with
197 `boosted.save_model('my_model.txt')
198 - directly a lgbm booster object
199 """
200 if type(model) is str:
201 model = lgb.Booster(model_file=model)
202 return dump2ROOT(model, output_filename, tree_name)
203 else:
204 return dump2ROOT(model, output_filename, tree_name)
205
206
207def test(model_file, tree_file, tree_name="lgbm", ntests=10000, test_file=None):
208 booster = lgb.Booster(model_file=model_file)
209 f = ROOT.TFile.Open(tree_file)
210 tree = f.Get(tree_name)
211 try:
212 _ = ROOT.MVAUtils.BDT
213 except Exception:
214 print("cannot import MVAUtils")
215 return None
216
217 mva_utils = ROOT.MVAUtils.BDT(tree)
218
219 objective = booster.dump_model()["objective"]
220
221 # sometimes options are inlined with objective
222 # we don't support non-default options
223 objective = objective.replace("sigmoid:1", "")
224 objective = objective.strip()
225
226 # binary and xentropy are not the exact same thing when training but the output value is the same
227 # same for l1/l2/huber/... regression
228 # (https://lightgbm.readthedocs.io/en/latest/Parameters.html)
229 binary_aliases = ("binary", "cross_entropy", "xentropy")
230 regression_aliases = (
231 (
232 "regression_l2",
233 "l2",
234 "mean_squared_error",
235 "mse",
236 "l2_root",
237 "root_mean_squared_error",
238 "rmse",
239 )
240 + ("regression_l1", "l1", "mean_absolute_error", "mae")
241 + ("huber",)
242 )
243 multiclass_aliases = ("multiclass", "softmax")
244 if objective in multiclass_aliases:
245 logging.info("assuming multiclass, testing")
246 return test_multiclass(booster, mva_utils, ntests, test_file)
247 elif objective in binary_aliases:
248 logging.info("assuming binary classification, testing")
249 return test_binary(booster, mva_utils, ntests, test_file)
250 elif objective in regression_aliases:
251 logging.info("assuming regression, testing")
252 return test_regression(booster, mva_utils, ntests, test_file)
253 else:
254 print("cannot understand objective '%s'" % objective)
255
256
257def get_test_data(feature_names, test_file=None, ntests=None):
258 nvars = len(feature_names)
259 if test_file is not None:
260 if ".root" in test_file:
261 if ":" not in test_file:
262 raise ValueError("when using ROOT file as test use the syntax filename:treename")
263 fn, tn = test_file.split(":")
264 f = ROOT.TFile.Open(fn)
265 if not f:
266 raise IOError("cannot find ROOT file %s" % fn)
267 tree = f.Get(tn)
268 if not tree:
269 raise IOError("cannot find TTree %s in %s" % (fn, tn))
270 branch_names = [br.GetName() for br in tree.GetListOfBranches()]
271 for feature in feature_names:
272 if feature not in branch_names:
273 raise IOError("required feature %s not in TTree")
274 rdf = ROOT.RDataFrame(tree, feature_names)
275 data_input = rdf.AsNumpy()
276 data_input = np.stack([data_input[k] for k in feature_names]).T
277 if ntests is not None:
278 data_input = data_input[:ntests]
279 logging.info(
280 "using as input %s inputs from TTree %s from ROOT file %s", len(data_input), tn, fn
281 )
282 else:
283 data_input = np.load(test_file)
284 if ntests is not None:
285 data_input = data_input[:ntests]
286 logging.info("using as input %s inputs from pickle file %s", len(data_input), test_file)
287 else:
288 if ntests is None:
289 ntests = 10000
290 logging.info("using as input %s random uniform inputs (-100,100)", ntests)
291 logging.warning(
292 "using random uniform input as test: this is not safe" "provide an input test file"
293 )
294 data_input = np.random.uniform(-100, 100, size=(ntests, nvars))
295
296 # to match what mvautils is doing (using c-float)
297 data_input = data_input.astype(np.float32)
298 return data_input
299
300
301def test_generic(booster, mvautils_predict, mva_utils, data_input):
302 start = time.time()
303 results_lgbm = booster.predict(data_input)
304 logging.info("lgbm (vectorized) timing = %d/s", len(data_input) / (time.time() - start))
305
306 input_values_vector = ROOT.std.vector("float")()
307 results_MVAUtils = []
308 start = time.time()
309 for input_values in data_input:
310 input_values_vector.clear()
311 for v in input_values:
312 input_values_vector.push_back(v)
313 output_MVAUtils = mvautils_predict(input_values_vector)
314 results_MVAUtils.append(output_MVAUtils)
315 logging.info(
316 "mvautils (not vectorized+overhead) timing = %d/s", len(data_input) / (time.time() - start)
317 )
318
319 nevents_tested = 0
320 nevents_different = 0
321 for ievent, (input_values, output_lgbm, output_MVAUtils) in enumerate(
322 zip(data_input, results_lgbm, results_MVAUtils), 1
323 ):
324 nevents_tested += 1
325 if not np.allclose(output_lgbm, output_MVAUtils, rtol=1e-4):
326 nevents_different += 1
327 logging.info(
328 "--> output are different on input %d/%d mvautils: %s lgbm: %s",
329 ievent,
330 len(data_input),
331 output_MVAUtils,
332 output_lgbm,
333 )
334 if not test_detail_event(booster, mva_utils, input_values):
335 return False
336 logging.info("number of different events %d/%d", nevents_different, nevents_tested)
337 return True
338
339
340# helper for tree traversal
341def _ff(tree, node_infos):
342 if "left_child" in tree:
343 node_infos.append((tree["split_feature"], tree["threshold"]))
344 _ff(tree["left_child"])
345 _ff(tree["right_child"])
346
347
348def test_detail_event(booster, mva_utils, input_values):
349 logging.info("input values")
350 for ivar, input_value in enumerate(input_values):
351 logging.info("var %d: %.15f", ivar, input_value)
352 logging.info("=" * 50)
353
354 ntrees_mva_utils = mva_utils.GetNTrees()
355 if ntrees_mva_utils != booster.num_trees():
356 logging.info("Number of trees are different mvautils: %s lgbm: %s", ntrees_mva_utils, booster.num_trees())
357 tree_outputs_lgbm = lgbm_rawresponse_each_tree(booster, input_values)
358
359 # loop over the trees
360 is_problem_found = False
361 for itree in range(ntrees_mva_utils):
362 tree_output_mvautils = mva_utils.GetTreeResponse(list2stdvector(input_values), itree)
363 tree_output_lgbm = tree_outputs_lgbm[itree][0]
364 if not np.allclose(tree_output_mvautils, tree_output_lgbm):
365 is_tree_ok = False
366 is_problem_found = True
367 logging.info("tree %d/%d are different", itree, ntrees_mva_utils)
368 logging.info("lgbm: %f", tree_output_lgbm)
369 logging.info("MVAUtils: %f", tree_output_mvautils)
370 logging.info("Tree details from MVAUtils")
371 mva_utils.PrintTree(itree)
372
373 # dump the tree from lightgbm
374 node_infos = []
375 _ff(
376 booster.dump_model()["tree_info"][itree][
377 "tree_structure"
378 ],
379 node_infos
380 )
381
382 # we now which tree is failing, check if this is
383 # due to input values very close to the threshold
384 # the problem is that lgbm is using double,
385 # while mva_utils is using float
386
387 for node_info in node_infos:
388 value = input_values[node_info[0]]
389 threshold = node_info[1]
390 if not np.isnan(value) and (value <= threshold) != (
391 np.float32(value) <= np.float32(threshold)
392 ):
393 logging.info(
394 "the problem could be due to double"
395 "(lgbm) -> float (mvautil) conversion"
396 " for variable %d: %.10f and threshold %.10f",
397 node_info[0],
398 value,
399 threshold,
400 )
401 # we consider this ok
402 is_tree_ok = True
403 break
404 if not is_tree_ok:
405 return False
406
407 if is_problem_found:
408 # if we have found the problem, but we arrive here
409 # it means that we found the problematic tree,
410 # but it is ok
411 return True
412
413
414def test_regression(booster, mva_utils, ntests=None, test_file=None):
415 data_input = get_test_data(booster.feature_name(), test_file, ntests)
416 return test_generic(booster, mva_utils.GetResponse, mva_utils, data_input)
417
418
419def test_binary(booster, mva_utils, ntests=None, test_file=None):
420 data_input = get_test_data(booster.feature_name(), test_file, ntests)
421 return test_generic(booster, mva_utils.GetClassification, mva_utils, data_input)
422
423
424def test_multiclass(booster, mva_utils, ntests=10000, test_file=None):
425 import numpy as np
426
427 nvars = booster.num_feature()
428 nclasses = booster.num_model_per_iteration()
429 logging.info("using %d input features with %d classes", nvars, nclasses)
430
431 data_input = get_test_data(booster.feature_name(), test_file, ntests)
432
433 start = time.time()
434 results_lgbm = booster.predict(data_input)
435 logging.info(
436 "lgbm (vectorized) timing = %s ms/input", (time.time() - start) * 1000 / len(data_input)
437 )
438
439 input_values_vector = ROOT.std.vector("float")()
440 results_MVAUtils = []
441 start = time.time()
442 for input_values in data_input:
443 input_values_vector.clear()
444 for v in input_values:
445 input_values_vector.push_back(v)
446 output_MVAUtils = np.asarray(mva_utils.GetMultiResponse(input_values_vector, nclasses))
447 results_MVAUtils.append(output_MVAUtils)
448 logging.info(
449 "mvautils (not vectorized+overhead) timing = %s ms/input",
450 (time.time() - start) * 1000 / len(data_input),
451 )
452
453 stop_event_loop = False
454 for ievent, (input_values, output_lgbm, output_MVAUtils) in enumerate(
455 zip(data_input, results_lgbm, results_MVAUtils), 1
456 ):
457 if not np.allclose(output_lgbm, output_MVAUtils):
458 stop_event_loop = True
459 logging.info("--> output are different on input %d/%d:\n", ievent, len(data_input))
460 for ivar, input_value in enumerate(input_values):
461 logging.info("var %d: %.15f", ivar, input_value)
462 logging.info("=" * 50)
463 logging.info(" mvautils lgbm")
464 for ioutput, (o1, o2) in enumerate(zip(output_MVAUtils, output_lgbm)):
465 diff_flag = "" if np.allclose(o1, o2) else "<---"
466 logging.info("output %3d %.5e %.5e %s", ioutput, o1, o2, diff_flag)
467 output_trees_lgbm = lgbm_rawresponse_each_tree(booster, [input_values])
468
469 stop_tree_loop = False
470 for itree, output_tree_lgbm in enumerate(output_trees_lgbm):
471 output_tree_mva_utils = [
472 mva_utils.GetTreeResponse(list2stdvector(input_values), itree * nclasses + c)
473 for c in range(nclasses)
474 ]
475 if not np.allclose(output_tree_mva_utils, output_tree_lgbm[0]):
476 stop_tree_loop = True
477 logging.info("first tree/class with different answer (%d)", itree)
478 for isubtree, (ol, om) in enumerate(
479 zip(output_tree_lgbm[0], output_tree_mva_utils)
480 ):
481 if not np.allclose(ol, om):
482 logging.info("different in position %d", isubtree)
483 logging.info("lgbm: %f", ol)
484 logging.info("mvautils: %f", om)
485 logging.info("=" * 50)
486 logging.info(
487 "tree %d (itree) * %d (nclasses)" "+ %d (isubtree) = %d",
488 itree,
489 nclasses,
490 isubtree,
491 itree * nclasses + isubtree,
492 )
493 mva_utils.PrintTree(itree * nclasses + isubtree)
494
495 node_infos = []
496
497 # we now which tree is failing, check if this is
498 # due to input values very close to the threshold
499 # the problem is that lgbm is using double,
500 # while mva_utils is using float
501 _ff(
502 booster.dump_model()["tree_info"][itree * nclasses + isubtree][
503 "tree_structure"
504 ],
505 node_infos
506 )
507 for node_info in node_infos:
508 value = input_values[node_info[0]]
509 threshold = node_info[1]
510 if not np.isnan(value) and (value <= threshold) != (
511 np.float32(value) <= np.float32(threshold)
512 ):
513 logging.info(
514 "the problem could be due to double"
515 "(lgbm) -> float (mvautil) conversion"
516 "for variable %d: %f and threshold %f",
517 node_info[0],
518 value,
519 threshold,
520 )
521 stop_tree_loop = False
522 stop_event_loop = False
523
524 if stop_tree_loop:
525 break
526 if stop_event_loop:
527 return False
528 return True
529
530
532 f = ROOT.TFile.Open(fn)
533 keys = f.GetListOfKeys()
534 keys = list(keys)
535 if len(keys) != 1:
536 logging.info("file %s is empty", fn)
537 return False
538 tree = f.Get(keys[0].GetName())
539 if type(tree) is not ROOT.TTree:
540 logging.info("cannot find TTree in file %s", fn)
541 return False
542 if not tree.GetEntries():
543 logging.info("tree is empty")
544 return False
545 return True
546
547
548if __name__ == "__main__":
549 import argparse
550
551 parser = argparse.ArgumentParser(description=__doc__)
552 parser.add_argument("input", help="input text file from LGBM")
553 parser.add_argument("output", help="output ROOT filename", nargs="?")
554 parser.add_argument("--tree-name", default="lgbm")
555 parser.add_argument("--no-test", action="store_true", help="don't run test (not suggested)")
556 parser.add_argument("--ntests", type=int, help="number of test, default=1000")
557 parser.add_argument(
558 "--test-file", help="numpy pickle or ROOT file (use filename.root:treename)"
559 )
560
561 args = parser.parse_args()
562
563 if args.output is None:
564 import os
565
566 args.output = os.path.splitext(os.path.split(args.input)[1])[0] + ".root"
567
568 logging.info("converting input file %s to root file %s", args.input, args.output)
569 output_treename = convertLGBMToRootTree(args.input, args.output, args.tree_name)
570 if args.no_test:
571 print("model has not been tested. Do not use it production!")
572 else:
573 logging.info("testing model")
574 if not check_file(args.output):
575 print("problem when checking file")
576 result = test(args.input, args.output, args.tree_name, args.ntests, args.test_file)
577 if not result:
578 print(
579 "some problems during test." " Have you setup athena? Do not use this in production!"
580 )
581 else:
582 try:
583 print(
584 u"::: :) :) :) everything fine:" " LGBM output == MVAUtils output :) :) :) :::"
585 )
586 except UnicodeEncodeError:
587 print(":::==> everything fine:" "LGBM output == MVAUtils output <==:::")
588 booster = lgb.Booster(model_file=args.input)
589 objective = booster.dump_model()["objective"]
590 if "multiclass" in objective:
591 print(
592 """In c++ use your BDT as:
593#include "MVAUtils/BDT.h"
594
595TFile* f = TFile::Open("%s");
596TTree* tree = nullptr;
597f->GetObject("%s", tree);
598MVAUtils::BDT my_bdt(tree);
599// ...
600// std::vector<float> input_values(%d, 0.);
601// fill the vector using the order as in the trainig: %s
602// ...
603std::vector<float> output = my_bdt.GetMultiResponse(input_values, %d);
604 """
605 % (
606 args.output,
607 output_treename,
608 booster.num_feature(),
609 ",".join(booster.feature_name()),
610 booster.num_model_per_iteration(),
611 )
612 )
613 elif "binary" in objective:
614 print(
615 """In c++ use your BDT as:
616#include "MVAUtils/BDT.h"
617
618TFile* f = TFile::Open("%s");
619TTree* tree = nullptr;
620f->GetObject("%s", tree);
621MVAUtils::BDT my_bdt(tree);
622// ...
623// std::vector<float> input_values(%d, 0.);
624// fill the vector using the order as in the trainig: %s
625// ...
626float output = my_bdt.GetClassification(input_values);
627 """
628 % (
629 args.output,
630 output_treename,
631 booster.num_feature(),
632 ",".join(booster.feature_name()),
633 )
634 )
635 elif "regression" in objective:
636 print(
637 """In c++ use your BDT as:
638#include "MVAUtils/BDT.h"
639
640TFile* f = TFile::Open("%s");
641TTree* tree = nullptr;
642f->GetObject("%s", tree);
643MVAUtils::BDT my_bdt(tree);
644// ...
645// std::vector<float> input_values(%d, 0.);
646// fill the vector using the order as in the trainig: %s
647// ...
648float output = my_bdt.Predict(input_values);
649 """
650 % (
651 args.output,
652 output_treename,
653 booster.num_feature(),
654 ",".join(booster.feature_name()),
655 )
656 )
void print(char *figname, TCanvas *c1)
__init__(self, structure, invert_as_tmva=False)
test_multiclass(booster, mva_utils, ntests=10000, test_file=None)
test_generic(booster, mvautils_predict, mva_utils, data_input)
test(model_file, tree_file, tree_name="lgbm", ntests=10000, test_file=None)
dump2ROOT(model, output_filename, output_treename="lgbm")
test_binary(booster, mva_utils, ntests=None, test_file=None)
get_test_data(feature_names, test_file=None, ntests=None)
list2stdvector(values, dtype="float")
test_detail_event(booster, mva_utils, input_values)
test_regression(booster, mva_utils, ntests=None, test_file=None)
lgbm_rawresponse_each_tree(model, my_input)