Skip to content

Add an option to edmFileUtil to print information on clusters #48044

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 14, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
126 changes: 126 additions & 0 deletions IOPool/Common/bin/CollUtil.cc
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include "DataFormats/Provenance/interface/FileIndex.h"
#include "DataFormats/Provenance/interface/IndexIntoFile.h"

#include "TBasket.h"
#include "TBranch.h"
#include "TFile.h"
#include "TIterator.h"
Expand Down Expand Up @@ -97,6 +98,131 @@ namespace edm {
}
}

namespace {
class BranchBasketBytes {
public:
BranchBasketBytes(TBranch const *branch)
: basketFirstEntry_(branch->GetBasketEntry()),
basketBytes_(branch->GetBasketBytes()),
branchName_(branch->GetName()),
maxBaskets_(branch->GetMaxBaskets()) {}

bool isAlignedWithClusterBoundaries() const { return isAligned_; }

// Processes "next cluster" for the branch, calculating the
// number of bytes and baskets in the cluster
//
// @param[in] clusterBegin Begin entry number for the cluster
// @param[in] clusterEnd End entry number (exclusive) for the cluster
// @param[out] nonAlignedBranches Branch name is added to the set if the basket boundary
// does not align with cluster boundary
//
// @return Tuple of the number of bytes and baskets in the cluster
std::tuple<Long64_t, unsigned> bytesInNextCluster(Long64_t clusterBegin,
Long64_t clusterEnd,
std::set<std::string_view> &nonAlignedBranches) {
if (basketFirstEntry_[iBasket_] != clusterBegin) {
std::cout << "Branch " << branchName_ << " iBasket " << iBasket_ << " begin entry "
<< basketFirstEntry_[iBasket_] << " does not align with cluster boundary, expected " << clusterBegin
<< std::endl;
exit(1);
}

Long64_t bytes = 0;
unsigned nbaskets = 0;
for (; iBasket_ < maxBaskets_ and basketFirstEntry_[iBasket_] < clusterEnd; ++iBasket_) {
bytes += basketBytes_[iBasket_];
++nbaskets;
}
if (basketFirstEntry_[iBasket_] != clusterEnd) {
nonAlignedBranches.insert(branchName_);
isAligned_ = false;
return std::tuple(0, 0);
}
return std::tuple(bytes, nbaskets);
}

private:
Long64_t const *basketFirstEntry_;
Int_t const *basketBytes_;
std::string_view branchName_;
Int_t maxBaskets_;
Long64_t iBasket_ = 0;
bool isAligned_ = true;
};

std::vector<BranchBasketBytes> makeBranchBasketBytes(TBranch *branch, bool isEventsTree) {
std::vector<BranchBasketBytes> ret;

TObjArray *subBranches = branch->GetListOfBranches();
if (subBranches and subBranches->GetEntries() > 0) {
// process sub-branches if there are any
auto const nbranches = subBranches->GetEntries();
for (Long64_t iBranch = 0; iBranch < nbranches; ++iBranch) {
auto vec = makeBranchBasketBytes(dynamic_cast<TBranch *>(subBranches->At(iBranch)), isEventsTree);
ret.insert(ret.end(), std::make_move_iterator(vec.begin()), std::make_move_iterator(vec.end()));
}
} else {
ret.emplace_back(branch);
}
return ret;
}
} // namespace

void clusterPrint(TTree *tr, bool isEventsTree) {
TTree::TClusterIterator clusterIter = tr->GetClusterIterator(0);
Long64_t const nentries = tr->GetEntries();

// Keep the state of each branch basket index so that we don't
// have to iterate through everything on every cluster
std::vector<BranchBasketBytes> processors;
{
TObjArray *branches = tr->GetListOfBranches();
Long64_t const nbranches = branches->GetEntries();
for (Long64_t iBranch = 0; iBranch < nbranches; ++iBranch) {
auto vec = makeBranchBasketBytes(dynamic_cast<TBranch *>(branches->At(iBranch)), isEventsTree);
processors.insert(processors.end(), std::make_move_iterator(vec.begin()), std::make_move_iterator(vec.end()));
}
}

std::cout << "Printing cluster boundaries in terms of tree entries of the tree " << tr->GetName()
<< ". Note that end boundary is exclusive." << std::endl;
if (isEventsTree) {
std::cout << "For the Events tree the metadata branches are excluded from this calculation, "
"because their basket boundaries do not necessarily align with the cluster boundaries."
<< std::endl;
}
std::cout << std::setw(15) << "Begin" << std::setw(15) << "End" << std::setw(15) << "Entries" << std::setw(15)
<< "Max baskets" << std::setw(15) << "Bytes" << std::endl;
// Record branches whose baskets do not align with cluster boundaires
std::set<std::string_view> nonAlignedBranches;
Long64_t clusterBegin;
while ((clusterBegin = clusterIter()) < nentries) {
Long64_t clusterEnd = clusterIter.GetNextEntry();
Long64_t bytes = 0;
unsigned int maxbaskets = 0;
for (auto &p : processors) {
if (p.isAlignedWithClusterBoundaries()) {
auto const [byt, bas] = p.bytesInNextCluster(clusterBegin, clusterEnd, nonAlignedBranches);
bytes += byt;
maxbaskets = std::max(bas, maxbaskets);
}
}
std::cout << std::setw(15) << clusterBegin << std::setw(15) << clusterEnd << std::setw(15)
<< (clusterEnd - clusterBegin) << std::setw(15) << maxbaskets << std::setw(15) << bytes << std::endl;
}

if (not nonAlignedBranches.empty()) {
std::cout << "\nThe following branches had baskets whose entry boundaries did not align with the cluster "
"boundaries. Their baskets are excluded from the cluster size calculation above starting from the "
"first basket that did not align with a cluster boundary."
<< std::endl;
for (auto &name : nonAlignedBranches) {
std::cout << " " << name << std::endl;
}
}
}

std::string getUuid(TTree *uuidTree) {
FileID fid;
FileID *fidPtr = &fid;
Expand Down
1 change: 1 addition & 0 deletions IOPool/Common/bin/CollUtil.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ namespace edm {
Long64_t numEntries(TFile *hdl, const std::string &trname);
void printBranchNames(TTree *tree);
void longBranchPrint(TTree *tr);
void clusterPrint(TTree *tr, bool isEventsTree);
std::string getUuid(TTree *uuidTree);
void printUuids(TTree *uuidTree);
void printEventLists(TFile *tfl);
Expand Down
17 changes: 16 additions & 1 deletion IOPool/Common/bin/EdmFileUtil.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,10 @@ int main(int argc, char* argv[]) {
"JSON,j", "JSON output format. Any arguments listed below are ignored")("ls,l", "list file content")(
"map,m", "Print TFile::Map(\"extended\"). The output can be HUGE.")("print,P", "Print all")(
"verbose,v", "Verbose printout")("printBranchDetails,b", "Call Print()sc for all branches")(
"tree,t", boost::program_options::value<std::string>(), "Select tree used with -P and -b options")(
"printClusters", "Print detailed information about baskets and clusters for all branches")(
"tree,t",
boost::program_options::value<std::string>(),
"Select tree used with -P, -b, and --printClusters options")(
"events,e",
"Print list of all Events, Runs, and LuminosityBlocks in the file sorted by run number, luminosity block number, "
"and event number. Also prints the entry numbers and whether it is possible to use fast copy with the file.")(
Expand Down Expand Up @@ -110,6 +113,7 @@ int main(int argc, char* argv[]) {
bool tree = more && (vm.count("tree") > 0 ? true : false);
bool print = more && (vm.count("print") > 0 ? true : false);
bool printBranchDetails = more && (vm.count("printBranchDetails") > 0 ? true : false);
bool printClusters = more && (vm.count("printClusters") > 0 ? true : false);
bool onlyDecodeLFN =
decodeLFN && !(uuid || adler32 || allowRecovery || json || events || tree || ls || print || printBranchDetails);
std::string selectedTree = tree ? vm["tree"].as<std::string>() : edm::poolNames::eventTreeName();
Expand Down Expand Up @@ -274,6 +278,17 @@ int main(int argc, char* argv[]) {
edm::longBranchPrint(printTree);
}

if (printClusters) {
bool const isEventsTree = (selectedTree == edm::BranchTypeToProductTreeName(edm::InEvent));
TTree* printTree = (TTree*)tfile->Get(selectedTree.c_str());
if (printTree == nullptr) {
std::cout << "Tree " << selectedTree << " appears to be missing. Could not find it in the file.\n";
std::cout << "Exiting\n";
return 1;
}
edm::clusterPrint(printTree, isEventsTree);
}

// Print out event lists
if (events) {
edm::printEventLists(tfile.get());
Expand Down