Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Modify ML part as fastforest is integrated within RBDT #46

Merged
merged 4 commits into from
Jul 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
*.root
*.pdf
*.npz
*.png
Expand Down
29 changes: 12 additions & 17 deletions analyses/cms-open-data-ttbar/analysis.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,22 @@
import argparse
import os
import multiprocessing
from pathlib import Path
from time import time
from typing import Optional, Tuple

from distributed import Client, get_worker, LocalCluster, SSHCluster
import ml
import ROOT
from distributed import Client, LocalCluster, SSHCluster, get_worker
from ml import (
define_features,
infer_output_ml_features,
ml_features_config,
)
from plotting import save_ml_plots, save_plots
import ROOT
from utils import (
AGCInput,
AGCResult,
postprocess_results,
retrieve_inputs,
save_histos,
)

# Using https://atlas-groupdata.web.cern.ch/atlas-groupdata/dev/AnalysisTop/TopDataPreparation/XSection-MC15-13TeV.data

Check failure on line 19 in analyses/cms-open-data-ttbar/analysis.py

View workflow job for this annotation

GitHub Actions / linter

Ruff (I001)

analyses/cms-open-data-ttbar/analysis.py:1:1: I001 Import block is un-sorted or un-formatted
# as a reference. Values are in pb.
XSEC_INFO = {
"ttbar": 396.87 + 332.97, # nonallhad + allhad, keep same x-sec for all
Expand Down Expand Up @@ -79,7 +74,7 @@
help=(
"Number of cores to use. In case of distributed execution this is the amount of cores per node."
),
default=len(os.sched_getaffinity(0)),
default = multiprocessing.cpu_count(),
type=int,
)
p.add_argument(
Expand Down Expand Up @@ -216,7 +211,7 @@
)

# Event selection - the core part of the algorithm applied for both regions
# Selecting events containing at least one lepton and four jets with pT > 25 GeV
# Selecting events containing at least one lepton and four jets with pT > 30 GeV
# Applying the requirement that at least one of them must be a b-tagged jet (see details in the specification)
df = (
df.Define(
Expand Down Expand Up @@ -283,11 +278,11 @@
if not inference:
return (results, ml_results)

df4j2b = define_features(df4j2b)
df4j2b = infer_output_ml_features(df4j2b)
df4j2b = ml.define_features(df4j2b)
df4j2b = ml.infer_output_ml_features(df4j2b)

# Book histograms and, if needed, their systematic variations
for i, feature in enumerate(ml_features_config):
for i, feature in enumerate(ml.ml_features_config):
histo_model = ROOT.RDF.TH1DModel(
name=f"{feature.name}_{process}_{variation}",
title=feature.title,
Expand Down Expand Up @@ -348,7 +343,7 @@
client = None
load_cpp()
if args.inference:
ml.load_cpp("./fastforest")
ml.load_cpp()
martamaja10 marked this conversation as resolved.
Show resolved Hide resolved

run_graphs = ROOT.RDF.RunGraphs
else:
Expand All @@ -357,8 +352,8 @@
if args.inference:
ROOT.RDF.Experimental.Distributed.initialize(load_cpp)
if args.inference:
# TODO: make ml.load_cpp working on distributed
ROOT.RDF.Experimental.Distributed.initialize(ml.load_cpp, "./fastforest")
ROOT.RDF.Experimental.Distributed.initialize(ml.load_cpp)

else:
ROOT.RDF.Experimental.Distributed.initialize(load_cpp)
run_graphs = ROOT.RDF.Experimental.Distributed.RunGraphs
Expand All @@ -379,7 +374,7 @@
ml_results += ml_hist_list

# Select the right VariationsFor function depending on RDF or DistRDF
if "DistRDF" in type(df).__module__:
if type(df).__module__ == "DistRDF.DataFrame":
variationsfor_func = ROOT.RDF.Experimental.Distributed.VariationsFor
else:
variationsfor_func = ROOT.RDF.Experimental.VariationsFor
Expand Down
38 changes: 8 additions & 30 deletions analyses/cms-open-data-ttbar/ml.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import os
from dataclasses import dataclass
from typing import Tuple

Expand Down Expand Up @@ -73,46 +72,28 @@ class MLHistoConf:
for i in range(len(feature_names))
]


def load_cpp(fastforest_path, max_n_jets=6):
# the default value of max_n_jets is the same as in the refererence implementation
def load_cpp(max_n_jets=6):
# the default value of max_n_jets is the same as in the reference implementation
# https://github.com/iris-hep/analysis-grand-challenge

# For compiling ml_helpers.cpp, it is necessary to set paths for FastForest (https://github.com/guitargeek/XGBoost-FastForest) libraries and headers.

# The installed library is supposed to look like this:
# fastforest_path/
# ├── include
# │ └── fastforest.h
# └── lib
# ├── libfastforest.so -> libfastforest.so.1
# ├── libfastforest.so.0.2
# └── libfastforest.so.1 -> libfastforest.so.0.2

include = os.path.join(fastforest_path, "include") # path for headers
lib = os.path.join(fastforest_path, "lib") # path for libraries
ROOT.gSystem.AddIncludePath(f"-I{include}")
ROOT.gSystem.AddLinkedLibs(f"-L{lib} -lfastforest")
ROOT.gSystem.Load(f"{lib}/libfastforest.so.1")
ROOT.gSystem.CompileMacro("ml_helpers.cpp", "kO")

# Initialize FastForest models.
# Our BDT models have 20 input features according to the AGC documentation
# https://agc.readthedocs.io/en/latest/taskbackground.html#machine-learning-component

ROOT.gInterpreter.Declare(
# **Conditional derectives used to avoid redefinition error during distributed computing**
# **Conditional directives used to avoid redefinition error during distributed computing**
# Note:
# * moving everything in `Declare` to `ml_helpers.cpp` removes the need for `ifndef`
# * an upcoming feature is `gInterpreter.Declare` with automatic header guards
# https://indico.fnal.gov/event/23628/contributions/240608/attachments/154873/201557/distributed_RDF_padulano_ROOT_workshop_2022.pdf
"""
#ifndef AGC_MODELS
#define AGC_MODELS

TMVA::Experimental::RBDT feven("feven", "models/bdt_even.root");
TMVA::Experimental::RBDT fodd("fodd", "models/bdt_odd.root");

const std::map<std::string, fastforest::FastForest> fastforest_models = get_fastforests("models/");
const fastforest::FastForest& feven = fastforest_models.at("even");
const fastforest::FastForest& fodd = fastforest_models.at("odd");
""".__add__(
f"""
size_t max_n_jets = {max_n_jets};
Expand All @@ -123,7 +104,6 @@ def load_cpp(fastforest_path, max_n_jets=6):
)
)


def define_features(df: ROOT.RDataFrame) -> ROOT.RDataFrame:
return df.Define(
"features",
Expand All @@ -148,7 +128,6 @@ def define_features(df: ROOT.RDataFrame) -> ROOT.RDataFrame:
""",
)


def predict_proba(df: ROOT.RDataFrame) -> ROOT.RDataFrame:
"""get probability scores for every permutation in event"""

Expand All @@ -160,12 +139,11 @@ def predict_proba(df: ROOT.RDataFrame) -> ROOT.RDataFrame:
"proba",
"""
bool is_even = (event % 2 == 0);
const auto& forest = (is_even) ? fodd : feven;
return inference(features, forest);
const auto& model = (is_even) ? fodd : feven;
return inference(features, model);
""",
)


def infer_output_ml_features(df: ROOT.RDataFrame) -> ROOT.RDataFrame:
"""
Choose for each feature the best candidate with the highest probability score.
Expand Down
34 changes: 6 additions & 28 deletions analyses/cms-open-data-ttbar/ml_helpers.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
#ifndef ML_HELPERS
#define ML_HELPERS

#include "fastforest.h"
#include "helpers.h"

#include <cmath>
Expand All @@ -13,6 +12,7 @@
#include <TError.h>
#include <ROOT/RVec.hxx>
#include <Math/Vector4D.h>
#include <TMVA/RBDT.hxx>

// copying jet_labels because we need to modify it
std::map<std::string, std::vector<int>> get_permutations (std::string jet_labels) {
Expand Down Expand Up @@ -62,7 +62,7 @@ std::map<std::string, std::vector<int>> get_permutations (std::string jet_labels
if (label == "w") label+=std::to_string(++count); // gets "w1" or "w2" labels
permutations[label].push_back(idx); // stores indexes of given permutation
}
count = 0; // needs to be reset after every itaration over labels
count = 0; // needs to be reset after every iteration over labels
} while (std::next_permutation(jet_labels.begin(), jet_labels.end()));


Expand All @@ -84,26 +84,6 @@ std::map<int, std::vector<ROOT::RVecI>> get_permutations_dict (size_t max_n_jets
return permutations_dict;
}


// Load the "even" and "odd" FastForest models from text files in a directory.
//
// path_to_models: directory holding "even.txt" and "odd.txt". It must be non-empty
//                 and end with '/', because the file names are appended verbatim.
// Returns a map with the keys "even" and "odd" holding the corresponding loaded forests.
std::map<std::string, fastforest::FastForest> get_fastforests (const std::string& path_to_models) {

    // std::string::back() is undefined on an empty string, so test for emptiness first.
    R__ASSERT(!path_to_models.empty() && path_to_models.back() == '/');

    // Feature names handed to the loader are "f0", "f1", ..., one per input feature.
    const std::size_t nfeatures = 20;
    std::vector<std::string> feature_names(nfeatures);
    for (std::size_t i = 0; i < nfeatures; ++i) {
        feature_names[i] = "f" + std::to_string(i);
    }

    auto fodd = fastforest::load_txt(path_to_models + "odd.txt", feature_names);
    auto feven = fastforest::load_txt(path_to_models + "even.txt", feature_names);
    return {{"even", feven}, {"odd", fodd}};
}





ROOT::RVec<ROOT::RVecD> eval_features (
const ROOT::RVec<ROOT::RVecI>& permut_indexes,
const ROOT::RVecD& jet_pt,
Expand Down Expand Up @@ -199,20 +179,18 @@ ROOT::RVec<ROOT::RVecD> eval_features (

}


ROOT::RVecF inference(const ROOT::RVec<ROOT::RVecD> &features, const fastforest::FastForest &forest) {
ROOT::RVecF inference(const ROOT::RVec<ROOT::RVecD> &features, const TMVA::Experimental::RBDT &bdt) {

size_t npermutations = features.at(0).size();
size_t nfeatures = features.size();
ROOT::RVecF res(npermutations);
float input[nfeatures];
size_t nfeatures = features.size();
ROOT::RVecF input(nfeatures);

for (std::size_t i = 0; i < npermutations; ++i) {
for (std::size_t j = 0; j < nfeatures; ++j) {
input[j] = features.at(j).at(i);
}
float score = forest(input, 0.0f);
res[i] = 1./(1.+std::exp(-score));
res[i] = bdt.Compute(input)[0];
}

return res;
Expand Down
Binary file not shown.
Binary file added analyses/cms-open-data-ttbar/models/bdt_odd.root
Binary file not shown.
Loading
Loading