Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
[TDF] Improve DS doc and refine interface
e.g. slot-entry order in method signatures. The changes in the tests and in
TDFNodes are just a consequence of the aforementioned improvements.
  • Loading branch information
dpiparo committed Sep 19, 2017
commit bdd4603cd407c1c936f62693096e28eacdbfcf26
53 changes: 41 additions & 12 deletions tree/treeplayer/inc/ROOT/TDataSource.hxx
Original file line number Diff line number Diff line change
Expand Up @@ -21,34 +21,63 @@ namespace ROOT {
namespace Experimental {
namespace TDF {

/**
\class ROOT::Experimental::TDF::TDataSource
\ingroup dataframe
\brief The TDataSource interface dictates what a TDataFrame interface to an arbitrary data format should look like.

A TDataSource makes it possible to seamlessly provide an adaptor for any kind of data set or data format to the TDataFrame.
*/
class TDataSource {
public:
virtual ~TDataSource(){};

/// \brief Returns a reference to the collection of the dataset's column names
virtual const std::vector<std::string> &GetColumnNames() const = 0;

/// \brief Checks if the dataset has a certain column
/// \param[in] columnName The name of the column
virtual bool HasColumn(std::string_view) const = 0;
/// Type of a column as a string, e.g. `GetTypeName("x") == "double"`. Required for jitting e.g. `df.Filter("x>0")`.

/// \brief Type of a column as a string, e.g. `GetTypeName("x") == "double"`. Required for jitting e.g. `df.Filter("x>0")`.
/// \param[in] columnName The name of the column
virtual std::string GetTypeName(std::string_view) const = 0;

/// Called at most once per column by TDF. Return vector of pointers to pointers to column values - one per slot.
/// \tparam T The type of the data stored in the column
/// \param[in] columnName The name of the column
///
/// These pointers are veritable cursors: it is the responsibility of the TDataSource implementation to ensure that they
/// point to the "right" memory region.
template <typename T>
std::vector<T **> GetColumnReaders(std::string_view name, unsigned int nSlots)
std::vector<T **> GetColumnReaders(std::string_view columnName)
{
auto typeErasedVec = GetColumnReadersImpl(name, nSlots, typeid(T));
auto typeErasedVec = GetColumnReadersImpl(columnName, typeid(T));
std::vector<T **> typedVec(typeErasedVec.size());
std::transform(typeErasedVec.begin(), typeErasedVec.end(), typedVec.begin(),
[](void *p) { return static_cast<T **>(p); });
return typedVec;
}
/// Return chunks of entries to distribute to tasks. They are required to be continguous intervals with no entries
/// skipped, starting at 0 and ending at nEntries, e.g. [0-5],[5-10] for 10 entries.

/// \brief Return ranges of entries to distribute to tasks.
/// They are required to be contiguous intervals with no entries skipped. Supposing a dataset with nEntries, the intervals
/// must start at 0 and end at nEntries, e.g. [0-5],[5-10] for 10 entries.
virtual const std::vector<std::pair<ULong64_t, ULong64_t>> &GetEntryRanges() const = 0;
/// Different threads will loop over different ranges and will pass different "slot" values.
virtual void SetEntry(ULong64_t entry, unsigned int slot) = 0;
/// Method to set the number of slots. Some implementations may rely on this
/// information for optimisation purposes.

/// \brief Advance the "cursors" returned by GetColumnReaders to the selected entry for a particular slot.
/// \param[in] slot The data processing slot that needs to be considered
/// \param[in] entry The entry which needs to be pointed to by the reader pointers
/// Slots are adopted to accommodate parallel data processing. Different workers will loop over different ranges and will
/// be labelled by different "slot" values.
virtual void SetEntry(unsigned int slot, ULong64_t entry) = 0;

/// \brief Convenience method to set the number of slots
/// For some implementations it is necessary to know the number of slots in advance for optimisation purposes.
virtual void SetNSlots(unsigned int nSlots) = 0;
/// Convenience method called at the start of each task, before processing a range of entries.
/// DataSources can implement it if needed (does nothing by default).
/// firstEntry is the first entry of the range that the task will process.

/// \brief Convenience method called at the start of the data processing.
/// \param[in] slot The data processing slot which needs to be initialised
/// \param[in] firstEntry The first entry of the range that the task will process.
virtual void InitSlot(unsigned int /*slot*/, ULong64_t /*firstEntry*/) {}

protected:
Expand Down
3 changes: 1 addition & 2 deletions tree/treeplayer/inc/ROOT/TRootDS.hxx
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ private:
std::vector<std::vector<void *>> fBranchAddresses; // first container-> slot, second -> column;
std::vector<std::unique_ptr<TChain>> fChains;

void InitAddresses() {}
std::vector<void *> GetColumnReadersImpl(std::string_view, const std::type_info &);

public:
Expand All @@ -32,7 +31,7 @@ public:
bool HasColumn(std::string_view colName) const;
void InitSlot(unsigned int slot, ULong64_t firstEntry);
const std::vector<std::pair<ULong64_t, ULong64_t>> &GetEntryRanges() const;
void SetEntry(ULong64_t entry, unsigned int slot);
void SetEntry(unsigned int slot, ULong64_t entry);
void SetNSlots(unsigned int nSlots);
};
} // ns TDF
Expand Down
2 changes: 1 addition & 1 deletion tree/treeplayer/inc/ROOT/TTrivialDS.hxx
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ public:
bool HasColumn(std::string_view colName) const;
std::string GetTypeName(std::string_view) const;
const std::vector<std::pair<ULong64_t, ULong64_t>> &GetEntryRanges() const;
void SetEntry(ULong64_t entry, unsigned int slot);
void SetEntry(unsigned int slot, ULong64_t entry);
void SetNSlots(unsigned int nSlots);
};
} // ns TDF
Expand Down
2 changes: 1 addition & 1 deletion tree/treeplayer/src/TDFNodes.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ void TLoopManager::RunDataSource()
// we are running single-thread, so all ranges are squashed together
const auto lastEntry = rangePairs.back().second;
for (ULong64_t i = 0ull; i < lastEntry; ++i) {
fDataSource->SetEntry(i, 0);
fDataSource->SetEntry(0, i);
RunAndCheckFilters(0, i);
}
}
Expand Down
2 changes: 1 addition & 1 deletion tree/treeplayer/src/TRootDS.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ const std::vector<std::pair<ULong64_t, ULong64_t>> &TRootDS::GetEntryRanges() co
return fEntryRanges;
}

void TRootDS::SetEntry(ULong64_t entry, unsigned int slot)
void TRootDS::SetEntry(unsigned int slot, ULong64_t entry)
{
fChains[slot]->GetEntry(entry);
}
Expand Down
2 changes: 1 addition & 1 deletion tree/treeplayer/src/TTrivialDS.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ const std::vector<std::pair<ULong64_t, ULong64_t>> &TTrivialDS::GetEntryRanges()
return fEntryRanges;
}

void TTrivialDS::SetEntry(ULong64_t entry, unsigned int slot)
void TTrivialDS::SetEntry(unsigned int slot, ULong64_t entry)
{
fCounter[slot] = entry;
}
Expand Down
2 changes: 1 addition & 1 deletion tree/treeplayer/test/dataframe/datasource_root.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ TEST(TRootTDS, ColumnReaders)
for (auto &&range : ranges) {
tds.InitSlot(slot, range.first);
for (auto i : ROOT::TSeq<int>(range.first, range.second)) {
tds.SetEntry(i, slot);
tds.SetEntry(slot, i);
auto val = **vals[slot];
EXPECT_EQ(i, val);
}
Expand Down
2 changes: 1 addition & 1 deletion tree/treeplayer/test/dataframe/datasource_trivial.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ TEST(TTrivialDS, ColumnReaders)
auto slot = 0U;
for (auto &&range : ranges) {
for (auto i : ROOT::TSeq<ULong64_t>(range.first, range.second)) {
tds.SetEntry(i, slot);
tds.SetEntry(slot, i);
auto val = **vals[slot];
EXPECT_EQ(i, val);
}
Expand Down