Skip to content

Commit 27422f6

Browse files
authored
DPL: make arrow::FileSystem layered (#13655)
This allows distinguishing the navigation of a ROOT file from accessing the actual contents (TTree or eventually, RNTuple).
1 parent 1a84ee5 commit 27422f6

File tree

3 files changed

+126
-42
lines changed

3 files changed

+126
-42
lines changed

Framework/Core/include/Framework/RootArrowFilesystem.h

Lines changed: 35 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#include <arrow/dataset/file_base.h>
1616
#include <arrow/filesystem/type_fwd.h>
1717
#include <arrow/type_fwd.h>
18+
#include <memory>
1819

1920
class TTree;
2021
class TBufferFile;
@@ -32,11 +33,13 @@ class TTreeFileWriteOptions : public arrow::dataset::FileWriteOptions
3233
}
3334
};
3435

35-
// This is a virtual filesystem based on a ttree, where branches with the
36-
// same prefix get grouped into a fragment
37-
class TTreeFileSystem : public arrow::fs::FileSystem
36+
// This is to avoid having to implement a bunch of unimplemented methods
37+
// for all the possible virtual filesystem we can invent on top of ROOT
38+
// data structures.
39+
class VirtualRootFileSystemBase : public arrow::fs::FileSystem
3840
{
3941
public:
42+
// Dummy implementation to avoid
4043
arrow::Result<arrow::fs::FileInfo> GetFileInfo(const std::string& path) override;
4144
arrow::Result<arrow::fs::FileInfoVector> GetFileInfo(const arrow::fs::FileSelector& select) override;
4245

@@ -45,6 +48,8 @@ class TTreeFileSystem : public arrow::fs::FileSystem
4548
return this->type_name() == other.type_name();
4649
}
4750

51+
virtual std::shared_ptr<VirtualRootFileSystemBase> GetSubFilesystem(arrow::dataset::FileSource source) = 0;
52+
4853
arrow::Status CreateDir(const std::string& path, bool recursive) override;
4954

5055
arrow::Status DeleteDir(const std::string& path) override;
@@ -70,8 +75,19 @@ class TTreeFileSystem : public arrow::fs::FileSystem
7075
arrow::Result<std::shared_ptr<arrow::io::OutputStream>> OpenAppendStream(
7176
const std::string& path,
7277
const std::shared_ptr<const arrow::KeyValueMetadata>& metadata) override;
78+
};
79+
80+
// A filesystem which allows me to get a TTree
81+
class TTreeFileSystem : public VirtualRootFileSystemBase
82+
{
83+
public:
84+
~TTreeFileSystem() override;
7385

74-
virtual TTree* GetTree(arrow::dataset::FileSource) = 0;
86+
std::shared_ptr<VirtualRootFileSystemBase> GetSubFilesystem(arrow::dataset::FileSource source) override
87+
{
88+
return std::dynamic_pointer_cast<VirtualRootFileSystemBase>(shared_from_this());
89+
};
90+
virtual TTree* GetTree(arrow::dataset::FileSource source) = 0;
7591
};
7692

7793
class SingleTreeFileSystem : public TTreeFileSystem
@@ -98,17 +114,19 @@ class SingleTreeFileSystem : public TTreeFileSystem
98114
TTree* mTree;
99115
};
100116

101-
class TFileFileSystem : public TTreeFileSystem
117+
class TFileFileSystem : public VirtualRootFileSystemBase
102118
{
103119
public:
120+
arrow::Result<arrow::fs::FileInfo> GetFileInfo(const std::string& path) override;
121+
104122
TFileFileSystem(TDirectoryFile* f, size_t readahead);
105123

106124
std::string type_name() const override
107125
{
108126
return "TDirectoryFile";
109127
}
110128

111-
TTree* GetTree(arrow::dataset::FileSource source) override;
129+
std::shared_ptr<VirtualRootFileSystemBase> GetSubFilesystem(arrow::dataset::FileSource source) override;
112130

113131
// We can go back to the TFile in case this is needed.
114132
TDirectoryFile* GetFile()
@@ -120,24 +138,22 @@ class TFileFileSystem : public TTreeFileSystem
120138
TDirectoryFile* mFile;
121139
};
122140

123-
class TBufferFileFS : public TTreeFileSystem
141+
class TBufferFileFS : public VirtualRootFileSystemBase
124142
{
125143
public:
126144
TBufferFileFS(TBufferFile* f);
127145

146+
arrow::Result<arrow::fs::FileInfo> GetFileInfo(const std::string& path) override;
128147
std::string type_name() const override
129148
{
130149
return "tbufferfile";
131150
}
132151

133-
TTree* GetTree(arrow::dataset::FileSource) override
134-
{
135-
// Simply return the only TTree we have
136-
return mTree;
137-
}
152+
std::shared_ptr<VirtualRootFileSystemBase> GetSubFilesystem(arrow::dataset::FileSource source) override;
138153

139154
private:
140-
TTree* mTree;
155+
TBufferFile* mBuffer;
156+
std::shared_ptr<VirtualRootFileSystemBase> mFilesystem;
141157
};
142158

143159
class TTreeFileFragment : public arrow::dataset::FileFragment
@@ -179,8 +195,12 @@ class TTreeFileFormat : public arrow::dataset::FileFormat
179195

180196
arrow::Result<bool> IsSupported(const arrow::dataset::FileSource& source) const override
181197
{
182-
auto fs = std::dynamic_pointer_cast<TTreeFileSystem>(source.filesystem());
183-
return fs != nullptr;
198+
auto fs = std::dynamic_pointer_cast<VirtualRootFileSystemBase>(source.filesystem());
199+
auto subFs = fs->GetSubFilesystem(source);
200+
if (std::dynamic_pointer_cast<TTreeFileSystem>(subFs)) {
201+
return true;
202+
}
203+
return false;
184204
}
185205

186206
arrow::Result<std::shared_ptr<arrow::Schema>> Inspect(const arrow::dataset::FileSource& source) const override;

Framework/Core/src/RootArrowFilesystem.cxx

Lines changed: 89 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -11,14 +11,19 @@
1111
#include "Framework/RootArrowFilesystem.h"
1212
#include "Framework/Endian.h"
1313
#include "Framework/RuntimeError.h"
14+
#include <Rtypes.h>
15+
#include <arrow/array/array_primitive.h>
1416
#include <arrow/array/builder_nested.h>
1517
#include <arrow/array/builder_primitive.h>
18+
#include <memory>
1619
#include <stdexcept>
1720
#include <TFile.h>
1821
#include <TLeaf.h>
1922
#include <TBufferFile.h>
2023
#include <TTree.h>
2124
#include <TDirectoryFile.h>
25+
#include <arrow/type.h>
26+
#include <arrow/type_fwd.h>
2227

2328
namespace
2429
{
@@ -73,35 +78,52 @@ namespace o2::framework
7378
{
7479

7580
TFileFileSystem::TFileFileSystem(TDirectoryFile* f, size_t readahead)
76-
: TTreeFileSystem(),
81+
: VirtualRootFileSystemBase(),
7782
mFile(f)
7883
{
7984
((TFile*)mFile)->SetReadaheadSize(50 * 1024 * 1024);
8085
}
8186

82-
TTree* TFileFileSystem::GetTree(arrow::dataset::FileSource source)
87+
std::shared_ptr<VirtualRootFileSystemBase> TFileFileSystem::GetSubFilesystem(arrow::dataset::FileSource source)
8388
{
84-
// Simply return the only TTree we have
85-
return (TTree*)mFile->Get(source.path().c_str());
89+
auto tree = (TTree*)mFile->GetObjectChecked(source.path().c_str(), TClass::GetClass<TTree>());
90+
if (tree) {
91+
return std::shared_ptr<VirtualRootFileSystemBase>(new SingleTreeFileSystem(tree));
92+
}
93+
94+
95+
auto directory = (TDirectoryFile*)mFile->GetObjectChecked(source.path().c_str(), TClass::GetClass<TDirectory>());
96+
if (directory) {
97+
return std::shared_ptr<VirtualRootFileSystemBase>(new TFileFileSystem(directory, 50 * 1024 * 1024));
98+
}
99+
throw runtime_error_f("Unsupported file layout");
86100
}
87101

88-
arrow::Result<arrow::fs::FileInfo> TTreeFileSystem::GetFileInfo(const std::string& path)
102+
arrow::Result<arrow::fs::FileInfo> TFileFileSystem::GetFileInfo(const std::string& path)
89103
{
90104
arrow::fs::FileInfo result;
91105
result.set_type(arrow::fs::FileType::NotFound);
92106
result.set_path(path);
93107
arrow::dataset::FileSource source(path, shared_from_this());
94108

95-
for (auto branch : *GetTree(source)->GetListOfBranches()) {
96-
if (strncmp(branch->GetName(), result.path().c_str(), path.size()) == 0) {
97-
result.set_type(arrow::fs::FileType::File);
98-
return result;
99-
}
109+
auto fs = GetSubFilesystem(source);
110+
111+
// For now we only support single trees.
112+
if (std::dynamic_pointer_cast<SingleTreeFileSystem>(fs)) {
113+
result.set_type(arrow::fs::FileType::File);
114+
return result;
100115
}
101116
return result;
102117
}
103118

104-
arrow::Result<arrow::fs::FileInfoVector> TTreeFileSystem::GetFileInfo(const arrow::fs::FileSelector& select)
119+
arrow::Result<arrow::fs::FileInfo> VirtualRootFileSystemBase::GetFileInfo(std::string const&)
120+
{
121+
arrow::fs::FileInfo result;
122+
result.set_type(arrow::fs::FileType::NotFound);
123+
return result;
124+
}
125+
126+
arrow::Result<arrow::fs::FileInfoVector> VirtualRootFileSystemBase::GetFileInfo(const arrow::fs::FileSelector& select)
105127
{
106128
arrow::fs::FileInfoVector results;
107129
auto selected = this->GetFileInfo(select.base_dir);
@@ -111,59 +133,59 @@ arrow::Result<arrow::fs::FileInfoVector> TTreeFileSystem::GetFileInfo(const arro
111133
return results;
112134
}
113135

114-
arrow::Status TTreeFileSystem::CreateDir(const std::string& path, bool recursive)
136+
arrow::Status VirtualRootFileSystemBase::CreateDir(const std::string& path, bool recursive)
115137
{
116138
return arrow::Status::NotImplemented("Read only filesystem");
117139
}
118140

119-
arrow::Status TTreeFileSystem::DeleteDir(const std::string& path)
141+
arrow::Status VirtualRootFileSystemBase::DeleteDir(const std::string& path)
120142
{
121143
return arrow::Status::NotImplemented("Read only filesystem");
122144
}
123145

124-
arrow::Status TTreeFileSystem::CopyFile(const std::string& src, const std::string& dest)
146+
arrow::Status VirtualRootFileSystemBase::CopyFile(const std::string& src, const std::string& dest)
125147
{
126148
return arrow::Status::NotImplemented("Read only filesystem");
127149
}
128150

129-
arrow::Status TTreeFileSystem::Move(const std::string& src, const std::string& dest)
151+
arrow::Status VirtualRootFileSystemBase::Move(const std::string& src, const std::string& dest)
130152
{
131153
return arrow::Status::NotImplemented("Read only filesystem");
132154
}
133155

134-
arrow::Status TTreeFileSystem::DeleteDirContents(const std::string& path, bool missing_dir_ok)
156+
arrow::Status VirtualRootFileSystemBase::DeleteDirContents(const std::string& path, bool missing_dir_ok)
135157
{
136158
return arrow::Status::NotImplemented("Read only filesystem");
137159
}
138160

139-
arrow::Status TTreeFileSystem::DeleteRootDirContents()
161+
arrow::Status VirtualRootFileSystemBase::DeleteRootDirContents()
140162
{
141163
return arrow::Status::NotImplemented("Read only filesystem");
142164
}
143165

144-
arrow::Status TTreeFileSystem::DeleteFile(const std::string& path)
166+
arrow::Status VirtualRootFileSystemBase::DeleteFile(const std::string& path)
145167
{
146168
return arrow::Status::NotImplemented("Read only filesystem");
147169
}
148170

149-
arrow::Result<std::shared_ptr<arrow::io::InputStream>> TTreeFileSystem::OpenInputStream(const std::string& path)
171+
arrow::Result<std::shared_ptr<arrow::io::InputStream>> VirtualRootFileSystemBase::OpenInputStream(const std::string& path)
150172
{
151173
return arrow::Status::NotImplemented("Non streamable filesystem");
152174
}
153175

154-
arrow::Result<std::shared_ptr<arrow::io::RandomAccessFile>> TTreeFileSystem::OpenInputFile(const std::string& path)
176+
arrow::Result<std::shared_ptr<arrow::io::RandomAccessFile>> VirtualRootFileSystemBase::OpenInputFile(const std::string& path)
155177
{
156178
return arrow::Status::NotImplemented("No random access file system");
157179
}
158180

159-
arrow::Result<std::shared_ptr<arrow::io::OutputStream>> TTreeFileSystem::OpenOutputStream(
181+
arrow::Result<std::shared_ptr<arrow::io::OutputStream>> VirtualRootFileSystemBase::OpenOutputStream(
160182
const std::string& path,
161183
const std::shared_ptr<const arrow::KeyValueMetadata>& metadata)
162184
{
163185
return arrow::Status::NotImplemented("Non streamable filesystem");
164186
}
165187

166-
arrow::Result<std::shared_ptr<arrow::io::OutputStream>> TTreeFileSystem::OpenAppendStream(
188+
arrow::Result<std::shared_ptr<arrow::io::OutputStream>> VirtualRootFileSystemBase::OpenAppendStream(
167189
const std::string& path,
168190
const std::shared_ptr<const arrow::KeyValueMetadata>& metadata)
169191
{
@@ -173,8 +195,13 @@ arrow::Result<std::shared_ptr<arrow::io::OutputStream>> TTreeFileSystem::OpenApp
173195
arrow::Result<std::shared_ptr<arrow::Schema>> TTreeFileFormat::Inspect(const arrow::dataset::FileSource& source) const
174196
{
175197
arrow::Schema schema{{}};
176-
auto fs = std::dynamic_pointer_cast<TTreeFileSystem>(source.filesystem());
177-
TTree* tree = fs->GetTree(source);
198+
auto fs = std::dynamic_pointer_cast<VirtualRootFileSystemBase>(source.filesystem());
199+
// Actually get the TTree from the ROOT file.
200+
auto treeFs = std::dynamic_pointer_cast<TTreeFileSystem>(fs->GetSubFilesystem(source));
201+
if (!treeFs.get()) {
202+
throw runtime_error_f("Unknown filesystem %s\n", source.filesystem()->type_name().c_str());
203+
}
204+
TTree* tree = treeFs->GetTree(source);
178205

179206
auto branches = tree->GetListOfBranches();
180207
auto n = branches->GetEntries();
@@ -270,7 +297,9 @@ arrow::Result<arrow::RecordBatchGenerator> TTreeFileFormat::ScanBatchesAsync(
270297
auto physical_schema = *treeFragment->ReadPhysicalSchema();
271298

272299
static TBufferFile buffer{TBuffer::EMode::kWrite, 4 * 1024 * 1024};
273-
auto fs = std::dynamic_pointer_cast<TTreeFileSystem>(treeFragment->source().filesystem());
300+
auto containerFS = std::dynamic_pointer_cast<VirtualRootFileSystemBase>(treeFragment->source().filesystem());
301+
auto fs = std::dynamic_pointer_cast<TTreeFileSystem>(containerFS->GetSubFilesystem(treeFragment->source()));
302+
274303
int64_t rows = -1;
275304
TTree* tree = fs->GetTree(treeFragment->source());
276305
for (auto& field : fields) {
@@ -446,9 +475,42 @@ arrow::Result<arrow::RecordBatchGenerator> TTreeFileFormat::ScanBatchesAsync(
446475
}
447476

448477
TBufferFileFS::TBufferFileFS(TBufferFile* f)
449-
: TTreeFileSystem(),
450-
mTree((TTree*)f->ReadObject(TTree::Class()))
478+
: VirtualRootFileSystemBase(),
479+
mBuffer(f),
480+
mFilesystem(nullptr)
481+
{
482+
}
483+
484+
TTreeFileSystem::~TTreeFileSystem() = default;
485+
486+
487+
arrow::Result<arrow::fs::FileInfo> TBufferFileFS::GetFileInfo(const std::string& path)
488+
{
489+
arrow::fs::FileInfo result;
490+
result.set_type(arrow::fs::FileType::NotFound);
491+
result.set_path(path);
492+
arrow::dataset::FileSource source(path, shared_from_this());
493+
494+
// Only once to avoid rereading the streamed tree.
495+
if (!mFilesystem.get()) {
496+
return result;
497+
}
498+
499+
// For now we only support single trees.
500+
if (std::dynamic_pointer_cast<SingleTreeFileSystem>(mFilesystem)) {
501+
result.set_type(arrow::fs::FileType::File);
502+
return result;
503+
}
504+
return result;
505+
}
506+
507+
std::shared_ptr<VirtualRootFileSystemBase> TBufferFileFS::GetSubFilesystem(arrow::dataset::FileSource source)
451508
{
509+
if (!mFilesystem.get()) {
510+
auto tree = ((TTree*)mBuffer->ReadObject(TTree::Class()));
511+
mFilesystem = std::make_shared<SingleTreeFileSystem>(tree);
512+
}
513+
return mFilesystem;
452514
}
453515

454516
} // namespace o2::framework

Framework/Core/test/test_Root2ArrowTable.cxx

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include <ROOT/RDataFrame.hxx>
2020
#include <ROOT/RArrowDS.hxx>
2121
#include <TBufferFile.h>
22+
#include <TClass.h>
2223
#include <TMemFile.h>
2324
#include <TDirectory.h>
2425
#include <TTree.h>
@@ -231,6 +232,7 @@ TEST_CASE("RootTree2Fragment")
231232
size_t totalSizeUncompressed = 0;
232233
auto format = std::make_shared<TTreeFileFormat>(totalSizeCompressed, totalSizeUncompressed);
233234
auto fs = std::make_shared<TBufferFileFS>(fileRead);
235+
234236
arrow::dataset::FileSource source("p", fs);
235237
REQUIRE(format->IsSupported(source) == true);
236238
auto schemaOpt = format->Inspect(source);

0 commit comments

Comments
 (0)