Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
[RF] Skip out-of-range events in RDataFrame to RooDataSet Helper
The RDataFrameHelper should be consistent with creating a RooDataSet
from a TTree, meaning out-of-range events should be skipped. This is
implemented in this commit, borrowing the logic from
`RooTreeDataStore::loadValues()`. A unit test is also implemented.

The previous logic of just taking just all values to fill the dataset
was very dangerous, because these values then clipped to the RooRealVar
limits and biased the number of events observed at the boundaries.

Closes #11017.
  • Loading branch information
guitargeek committed Jul 26, 2022
commit b5237a2f2671e235bf07993e57b7ac448269c765
38 changes: 36 additions & 2 deletions roofit/RDataFrameHelpers/inc/RooAbsDataHelper.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <RooArgSet.h>
#include <RooDataSet.h>
#include <RooDataHist.h>
#include <RooMsgService.h>

#include <ROOT/RDataFrame.hxx>
#include <ROOT/RDF/ActionHelpers.hxx>
Expand Down Expand Up @@ -71,6 +72,7 @@ class RooAbsDataHelper : public ROOT::Detail::RDF::RActionImpl<RooAbsDataHelper<
private:
std::shared_ptr<DataSet_t> _dataset;
std::mutex _mutex_dataset;
std::size_t _numInvalid = 0;

std::vector<std::vector<double>> _events; // One vector of values per data-processing slot
const std::size_t _eventSize; // Number of variables in dataset
Expand Down Expand Up @@ -146,6 +148,11 @@ class RooAbsDataHelper : public ROOT::Detail::RDF::RActionImpl<RooAbsDataHelper<
FillDataSet(vector, _eventSize);
vector.clear();
}

if (_numInvalid>0) {
const auto prefix = std::string(_dataset->ClassName()) + "Helper::Finalize(" + _dataset->GetName() + ") ";
oocoutW(static_cast<TObject*>(nullptr), DataHandling) << prefix << "Ignored " << _numInvalid << " out-of-range events\n";
}
}


Expand All @@ -163,10 +170,37 @@ class RooAbsDataHelper : public ROOT::Detail::RDF::RActionImpl<RooAbsDataHelper<
const RooArgSet& argSet = *_dataset->get();

for (std::size_t i = 0; i < events.size(); i += eventSize) {

// Creating a RooDataSet from an RDataFrame should be consistent with the
// creation from a TTree. The construction from a TTree discards entries
// outside the variable definition range, so we have to do that too (see
// also RooTreeDataStore::loadValues).

bool allOK = true;
for (std::size_t j=0; j < eventSize; ++j) {
static_cast<RooAbsRealLValue*>(argSet[j])->setVal(events[i+j]);
auto * destArg = static_cast<RooAbsRealLValue*>(argSet[j]);
double sourceVal = events[i+j];

if (!destArg->inRange(sourceVal, nullptr)) {
_numInvalid++ ;
allOK = false;
const auto prefix = std::string(_dataset->ClassName()) + "Helper::FillDataSet(" + _dataset->GetName() + ") ";
if (_numInvalid < 5) {
// Unlike in the TreeVectorStore case, we don't log the event
// number here because we don't know it anyway, because of
// RDataFrame slots and multithreading.
oocoutI(static_cast<TObject*>(nullptr), DataHandling) << prefix << "Skipping event because " << destArg->GetName()
<< " cannot accommodate the value " << sourceVal << "\n";
} else if (_numInvalid == 5) {
oocoutI(static_cast<TObject*>(nullptr), DataHandling) << prefix << "Skipping ...\n";
}
break ;
}
destArg->setVal(sourceVal);
}
if(allOK) {
_dataset->add(argSet);
}
_dataset->add(argSet);
}
}
};
Expand Down
51 changes: 51 additions & 0 deletions roofit/RDataFrameHelpers/test/testActionHelpers.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,12 @@
/// \author Stephan Hageboeck (CERN)

#include <RooAbsDataHelper.h>
#include <RooGlobalFunc.h>
#include <RooMsgService.h>

#include <TROOT.h>
#include <TRandom.h>
#include <TSystem.h>
#include <Rtypes.h>

#include "gtest/gtest.h"
Expand Down Expand Up @@ -76,3 +80,50 @@ TEST(RooAbsDataHelper, MTConstruction)
EXPECT_NEAR(rooDataHist->moment(y, 2.), 0.25, 1.E-2); // Variance is affected in a binned distribution
}


/// This test verifies that out-of-range events are correctly skipped,
/// consistent with the construction of a RooDataSet from a TTree.
TEST(RooAbsDataHelper, SkipEventsOutOfRange) {

RooMsgService::instance().getStream(0).removeTopic(RooFit::DataHandling);
RooMsgService::instance().getStream(1).removeTopic(RooFit::DataHandling);

std::size_t nEvents = 100;
const char * filename = "testRooAbsDataHelper_SkipEventsOutOfRange_tree.root";
const char * treename = "tree";

{
// Create the ROOT file with the dataset
ROOT::RDataFrame rdf(nEvents);
auto rdf_x = rdf.Define("x", [](){ return gRandom->Gaus(0.0, 1.0); });
rdf_x.Snapshot(treename, filename);
// We can't reuse the same RDataFrame now, because when we rerun the event
// loop it would generate new random values. So this scope ends here and we
// open a new RDF from the file later.
}

// Open dataset with RDataFrame and TTree
std::unique_ptr<TFile> file{TFile::Open(filename, "READ")};
auto tree = file->Get<TTree>(treename);
ROOT::RDataFrame rdf(treename, filename);

// Create RooFit variable
RooRealVar x{"x", "x", 0.0, -2.0, 2.0};

// Create a RooDataset from the TTree, and one from the RDataFrame
RooDataSet dataSetTree{"dataSetTree", "dataSetTree", tree, x};
auto dataSetRDF = rdf.Book<double>(RooDataSetHelper("dataSetRDF", "dataSetRDF", RooArgSet(x)), {"x"});

// Check if in the creation of the datasets, the entries outside the
// variable range were sucessfully discarded.
double nPassing = *rdf.Filter("x >= -2 && x <= 2.0").Count();

EXPECT_EQ(dataSetRDF->numEntries(), nPassing);
EXPECT_EQ(dataSetTree.numEntries(), nPassing);

file.reset(); // Close file
gSystem->Unlink(filename); // delete temporary file

RooMsgService::instance().getStream(0).addTopic(RooFit::DataHandling);
RooMsgService::instance().getStream(1).addTopic(RooFit::DataHandling);
}