Skip to content

Commit 3cdf39a

Browse files
fix
1 parent 18bd320 commit 3cdf39a

18 files changed

+528
-139
lines changed

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ option(ICEBERG_BUILD_TESTS "Build tests" ON)
4545
option(ICEBERG_BUILD_BUNDLE "Build the battery included library" ON)
4646
option(ICEBERG_BUILD_REST "Build rest catalog client" ON)
4747
option(ICEBERG_BUILD_REST_INTEGRATION_TESTS "Build rest catalog integration tests" OFF)
48+
option(ICEBERG_S3 "Build with S3 support" ON)
4849
option(ICEBERG_ENABLE_ASAN "Enable Address Sanitizer" OFF)
4950
option(ICEBERG_ENABLE_UBSAN "Enable Undefined Behavior Sanitizer" OFF)
5051

ci/scripts/start_minio.sh

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ set -eux
2121

2222
MINIO_ROOT_USER="${MINIO_ROOT_USER:-minio}"
2323
MINIO_ROOT_PASSWORD="${MINIO_ROOT_PASSWORD:-minio123}"
24-
MINIO_IMAGE="${MINIO_IMAGE:-minio/minio:RELEASE.2024-12-18T00-00-00Z}"
24+
MINIO_IMAGE="${MINIO_IMAGE:-minio/minio:latest}"
2525
MINIO_CONTAINER_NAME="${MINIO_CONTAINER_NAME:-iceberg-minio}"
2626
MINIO_PORT="${MINIO_PORT:-9000}"
2727
MINIO_CONSOLE_PORT="${MINIO_CONSOLE_PORT:-9001}"
@@ -64,7 +64,8 @@ start_minio_macos() {
6464
fi
6565

6666
brew install minio
67-
minio server /tmp/minio --console-address ":${MINIO_CONSOLE_PORT}" &
67+
MINIO_ROOT_USER="${MINIO_ROOT_USER}" MINIO_ROOT_PASSWORD="${MINIO_ROOT_PASSWORD}" \
68+
minio server /tmp/minio --console-address ":${MINIO_CONSOLE_PORT}" &
6869
wait_for_minio
6970
}
7071

@@ -84,7 +85,13 @@ download_mc() {
8485
;;
8586
Darwin*)
8687
MC_BIN="${mc_dir}/mc"
87-
curl -sSL "https://dl.min.io/client/mc/release/darwin-amd64/mc" -o "${MC_BIN}"
88+
local arch
89+
arch="$(uname -m)"
90+
if [ "${arch}" = "arm64" ]; then
91+
curl -sSL "https://dl.min.io/client/mc/release/darwin-arm64/mc" -o "${MC_BIN}"
92+
else
93+
curl -sSL "https://dl.min.io/client/mc/release/darwin-amd64/mc" -o "${MC_BIN}"
94+
fi
8895
chmod +x "${MC_BIN}"
8996
;;
9097
MINGW*|MSYS*|CYGWIN*)
@@ -109,13 +116,27 @@ create_bucket() {
109116
"${MC_BIN}" mb --ignore-existing "local/${MINIO_BUCKET}"
110117
}
111118

119+
start_minio_windows() {
120+
local minio_dir="${RUNNER_TEMP:-/tmp}"
121+
local minio_bin="${minio_dir}/minio.exe"
122+
curl -sSL "https://dl.min.io/server/minio/release/windows-amd64/minio.exe" -o "${minio_bin}"
123+
MINIO_ROOT_USER="${MINIO_ROOT_USER}" MINIO_ROOT_PASSWORD="${MINIO_ROOT_PASSWORD}" \
124+
"${minio_bin}" server "${minio_dir}/minio-data" --console-address ":${MINIO_CONSOLE_PORT}" &
125+
wait_for_minio
126+
}
127+
112128
case "$(uname -s)" in
113129
Darwin*)
114130
if ! start_minio_docker; then
115131
start_minio_macos
116132
fi
117133
;;
118-
Linux*|MINGW*|MSYS*|CYGWIN*)
134+
MINGW*|MSYS*|CYGWIN*)
135+
if ! start_minio_docker; then
136+
start_minio_windows
137+
fi
138+
;;
139+
Linux*)
119140
start_minio_docker
120141
;;
121142
*)

cmake_modules/IcebergThirdpartyToolchain.cmake

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ function(resolve_arrow_dependency)
102102
# Work around undefined symbol: arrow::ipc::ReadSchema(arrow::io::InputStream*, arrow::ipc::DictionaryMemo*)
103103
set(ARROW_IPC ON)
104104
set(ARROW_FILESYSTEM ON)
105-
set(ARROW_S3 ON)
105+
set(ARROW_S3 ${ICEBERG_S3})
106106
set(ARROW_JSON ON)
107107
set(ARROW_PARQUET ON)
108108
set(ARROW_SIMD_LEVEL "NONE")
@@ -165,6 +165,13 @@ function(resolve_arrow_dependency)
165165
install(FILES ${arrow_bundled_dependencies_location}
166166
DESTINATION ${ICEBERG_INSTALL_LIBDIR})
167167
endif()
168+
169+
# Arrow's exported static target interface may reference system libraries
170+
# (e.g. OpenSSL, CURL, ZLIB) that consumers need to find.
171+
list(APPEND ICEBERG_SYSTEM_DEPENDENCIES ZLIB)
172+
if(ARROW_S3)
173+
list(APPEND ICEBERG_SYSTEM_DEPENDENCIES OpenSSL CURL)
174+
endif()
168175
else()
169176
set(ARROW_VENDORED FALSE)
170177
find_package(Arrow CONFIG REQUIRED)

src/iceberg/CMakeLists.txt

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -244,6 +244,15 @@ if(ICEBERG_BUILD_BUNDLE)
244244
OUTPUTS
245245
ICEBERG_BUNDLE_LIBRARIES)
246246

247+
if(ICEBERG_S3)
248+
foreach(target iceberg_bundle_static iceberg_bundle_shared)
249+
if(TARGET ${target})
250+
target_compile_definitions(${target}
251+
PUBLIC "$<BUILD_INTERFACE:ICEBERG_S3_ENABLED=1>")
252+
endif()
253+
endforeach()
254+
endif()
255+
247256
add_subdirectory(arrow)
248257
add_subdirectory(avro)
249258
add_subdirectory(parquet)

src/iceberg/arrow/arrow_fs_file_io.cc

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,13 +25,23 @@
2525
#include "iceberg/arrow/arrow_file_io.h"
2626
#include "iceberg/arrow/arrow_fs_file_io_internal.h"
2727
#include "iceberg/arrow/arrow_status_internal.h"
28+
#include "iceberg/util/macros.h"
2829

2930
namespace iceberg::arrow {
3031

32+
Result<std::string> ArrowFileSystemFileIO::ResolvePath(const std::string& file_location) {
33+
if (file_location.find("://") != std::string::npos) {
34+
ICEBERG_ARROW_ASSIGN_OR_RETURN(auto path, arrow_fs_->PathFromUri(file_location));
35+
return path;
36+
}
37+
return file_location;
38+
}
39+
3140
/// \brief Read the content of the file at the given location.
3241
Result<std::string> ArrowFileSystemFileIO::ReadFile(const std::string& file_location,
3342
std::optional<size_t> length) {
34-
::arrow::fs::FileInfo file_info(file_location);
43+
ICEBERG_ASSIGN_OR_RAISE(auto path, ResolvePath(file_location));
44+
::arrow::fs::FileInfo file_info(path);
3545
if (length.has_value()) {
3646
file_info.set_size(length.value());
3747
}
@@ -47,6 +57,10 @@ Result<std::string> ArrowFileSystemFileIO::ReadFile(const std::string& file_loca
4757
ICEBERG_ARROW_ASSIGN_OR_RETURN(
4858
auto read_bytes,
4959
file->Read(read_length, reinterpret_cast<uint8_t*>(&content[offset])));
60+
if (read_bytes == 0) {
61+
return IOError("Unexpected EOF reading {}: got {} of {} bytes", file_location,
62+
offset, file_size);
63+
}
5064
remain -= read_bytes;
5165
offset += read_bytes;
5266
}
@@ -57,7 +71,8 @@ Result<std::string> ArrowFileSystemFileIO::ReadFile(const std::string& file_loca
5771
/// \brief Write the given content to the file at the given location.
5872
Status ArrowFileSystemFileIO::WriteFile(const std::string& file_location,
5973
std::string_view content) {
60-
ICEBERG_ARROW_ASSIGN_OR_RETURN(auto file, arrow_fs_->OpenOutputStream(file_location));
74+
ICEBERG_ASSIGN_OR_RAISE(auto path, ResolvePath(file_location));
75+
ICEBERG_ARROW_ASSIGN_OR_RETURN(auto file, arrow_fs_->OpenOutputStream(path));
6176
ICEBERG_ARROW_RETURN_NOT_OK(file->Write(content.data(), content.size()));
6277
ICEBERG_ARROW_RETURN_NOT_OK(file->Flush());
6378
ICEBERG_ARROW_RETURN_NOT_OK(file->Close());
@@ -66,7 +81,8 @@ Status ArrowFileSystemFileIO::WriteFile(const std::string& file_location,
6681

6782
/// \brief Delete a file at the given location.
6883
Status ArrowFileSystemFileIO::DeleteFile(const std::string& file_location) {
69-
ICEBERG_ARROW_RETURN_NOT_OK(arrow_fs_->DeleteFile(file_location));
84+
ICEBERG_ASSIGN_OR_RAISE(auto path, ResolvePath(file_location));
85+
ICEBERG_ARROW_RETURN_NOT_OK(arrow_fs_->DeleteFile(path));
7086
return {};
7187
}
7288

src/iceberg/arrow/arrow_fs_file_io_internal.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,9 @@ class ICEBERG_BUNDLE_EXPORT ArrowFileSystemFileIO : public FileIO {
5656
const std::shared_ptr<::arrow::fs::FileSystem>& fs() const { return arrow_fs_; }
5757

5858
private:
59+
/// \brief Resolve a file location to a filesystem path.
60+
Result<std::string> ResolvePath(const std::string& file_location);
61+
5962
std::shared_ptr<::arrow::fs::FileSystem> arrow_fs_;
6063
};
6164

src/iceberg/arrow/arrow_s3_file_io.cc

Lines changed: 38 additions & 108 deletions
Original file line numberDiff line numberDiff line change
@@ -19,15 +19,14 @@
1919

2020
#include <cstdlib>
2121
#include <mutex>
22-
#include <string_view>
22+
#include <stdexcept>
2323

2424
#include <arrow/filesystem/filesystem.h>
25-
#include <arrow/filesystem/localfs.h>
26-
#if __has_include(<arrow/filesystem/s3fs.h>)
27-
#include <arrow/filesystem/s3fs.h>
28-
#define ICEBERG_ARROW_HAS_S3 1
25+
#ifdef ICEBERG_S3_ENABLED
26+
# include <arrow/filesystem/s3fs.h>
27+
# define ICEBERG_ARROW_HAS_S3 1
2928
#else
30-
#define ICEBERG_ARROW_HAS_S3 0
29+
# define ICEBERG_ARROW_HAS_S3 0
3130
#endif
3231

3332
#include "iceberg/arrow/arrow_file_io.h"
@@ -40,23 +39,17 @@ namespace iceberg::arrow {
4039

4140
namespace {
4241

43-
bool IsS3Uri(std::string_view uri) { return uri.rfind("s3://", 0) == 0; }
44-
4542
Status EnsureS3Initialized() {
4643
#if ICEBERG_ARROW_HAS_S3
4744
static std::once_flag init_flag;
4845
static ::arrow::Status init_status = ::arrow::Status::OK();
4946
std::call_once(init_flag, []() {
5047
::arrow::fs::S3GlobalOptions options;
5148
init_status = ::arrow::fs::InitializeS3(options);
52-
if (init_status.ok()) {
53-
std::atexit([]() { (void)::arrow::fs::FinalizeS3(); });
54-
}
5549
});
5650
if (!init_status.ok()) {
57-
return std::unexpected<Error>{
58-
{.kind = ::iceberg::arrow::ToErrorKind(init_status),
59-
.message = init_status.ToString()}};
51+
return std::unexpected(Error{.kind = ::iceberg::arrow::ToErrorKind(init_status),
52+
.message = init_status.ToString()});
6053
}
6154
return {};
6255
#else
@@ -69,7 +62,7 @@ Status EnsureS3Initialized() {
6962
///
7063
/// \param properties The configuration properties map.
7164
/// \return Configured S3Options.
72-
::arrow::fs::S3Options ConfigureS3Options(
65+
Result<::arrow::fs::S3Options> ConfigureS3Options(
7366
const std::unordered_map<std::string, std::string>& properties) {
7467
::arrow::fs::S3Options options;
7568

@@ -100,13 +93,22 @@ ::arrow::fs::S3Options ConfigureS3Options(
10093
auto endpoint_it = properties.find(S3Properties::kEndpoint);
10194
if (endpoint_it != properties.end()) {
10295
options.endpoint_override = endpoint_it->second;
96+
} else {
97+
// Fall back to AWS standard environment variables for endpoint override
98+
const char* s3_endpoint_env = std::getenv("AWS_ENDPOINT_URL_S3");
99+
if (s3_endpoint_env != nullptr) {
100+
options.endpoint_override = s3_endpoint_env;
101+
} else {
102+
const char* endpoint_env = std::getenv("AWS_ENDPOINT_URL");
103+
if (endpoint_env != nullptr) {
104+
options.endpoint_override = endpoint_env;
105+
}
106+
}
103107
}
104108

105-
// Configure path-style access (needed for MinIO)
106109
auto path_style_it = properties.find(S3Properties::kPathStyleAccess);
107-
if (path_style_it != properties.end()) {
108-
// Arrow's S3 path-style is controlled via endpoint scheme
109-
// For path-style access, we need to ensure the endpoint is properly configured
110+
if (path_style_it != properties.end() && path_style_it->second == "true") {
111+
options.force_virtual_addressing = false;
110112
}
111113

112114
// Configure SSL
@@ -118,117 +120,45 @@ ::arrow::fs::S3Options ConfigureS3Options(
118120
// Configure timeouts
119121
auto connect_timeout_it = properties.find(S3Properties::kConnectTimeoutMs);
120122
if (connect_timeout_it != properties.end()) {
121-
options.connect_timeout = std::stod(connect_timeout_it->second) / 1000.0;
123+
try {
124+
options.connect_timeout = std::stod(connect_timeout_it->second) / 1000.0;
125+
} catch (const std::exception& e) {
126+
return InvalidArgument("Invalid {}: '{}' ({})", S3Properties::kConnectTimeoutMs,
127+
connect_timeout_it->second, e.what());
128+
}
122129
}
123130

124131
auto socket_timeout_it = properties.find(S3Properties::kSocketTimeoutMs);
125132
if (socket_timeout_it != properties.end()) {
126-
options.request_timeout = std::stod(socket_timeout_it->second) / 1000.0;
133+
try {
134+
options.request_timeout = std::stod(socket_timeout_it->second) / 1000.0;
135+
} catch (const std::exception& e) {
136+
return InvalidArgument("Invalid {}: '{}' ({})", S3Properties::kSocketTimeoutMs,
137+
socket_timeout_it->second, e.what());
138+
}
127139
}
128140

129141
return options;
130142
}
131-
132-
/// \brief Create an S3 FileSystem with the given options.
133-
///
134-
/// \param options The S3Options to use.
135-
/// \return A shared_ptr to the S3FileSystem, or an error.
136-
Result<std::shared_ptr<::arrow::fs::FileSystem>> MakeS3FileSystem(
137-
const ::arrow::fs::S3Options& options) {
138-
ICEBERG_RETURN_UNEXPECTED(EnsureS3Initialized());
139-
ICEBERG_ARROW_ASSIGN_OR_RETURN(auto fs, ::arrow::fs::S3FileSystem::Make(options));
140-
return fs;
141-
}
142143
#endif
143144

144-
Result<std::shared_ptr<::arrow::fs::FileSystem>> ResolveFileSystemFromUri(
145-
const std::string& uri, std::string* out_path) {
146-
if (IsS3Uri(uri)) {
147-
ICEBERG_RETURN_UNEXPECTED(EnsureS3Initialized());
148-
}
149-
ICEBERG_ARROW_ASSIGN_OR_RETURN(auto fs, ::arrow::fs::FileSystemFromUri(uri, out_path));
150-
return fs;
151-
}
152-
153-
/// \brief ArrowUriFileIO resolves FileSystem from URI for each operation.
154-
///
155-
/// This implementation is thread-safe as it creates a new FileSystem instance
156-
/// for each operation. However, it may be less efficient than caching the
157-
/// FileSystem. S3 initialization is done once per process.
158-
class ArrowUriFileIO : public FileIO {
159-
public:
160-
Result<std::string> ReadFile(const std::string& file_location,
161-
std::optional<size_t> length) override {
162-
std::string path;
163-
ICEBERG_ASSIGN_OR_RAISE(auto fs, ResolveFileSystemFromUri(file_location, &path));
164-
::arrow::fs::FileInfo file_info(path);
165-
if (length.has_value()) {
166-
file_info.set_size(length.value());
167-
}
168-
std::string content;
169-
ICEBERG_ARROW_ASSIGN_OR_RETURN(auto file, fs->OpenInputFile(file_info));
170-
ICEBERG_ARROW_ASSIGN_OR_RETURN(auto file_size, file->GetSize());
171-
172-
content.resize(file_size);
173-
size_t remain = file_size;
174-
size_t offset = 0;
175-
while (remain > 0) {
176-
size_t read_length = std::min(remain, static_cast<size_t>(1024 * 1024));
177-
ICEBERG_ARROW_ASSIGN_OR_RETURN(
178-
auto read_bytes,
179-
file->Read(read_length, reinterpret_cast<uint8_t*>(&content[offset])));
180-
remain -= read_bytes;
181-
offset += read_bytes;
182-
}
183-
184-
return content;
185-
}
186-
187-
Status WriteFile(const std::string& file_location,
188-
std::string_view content) override {
189-
std::string path;
190-
ICEBERG_ASSIGN_OR_RAISE(auto fs, ResolveFileSystemFromUri(file_location, &path));
191-
ICEBERG_ARROW_ASSIGN_OR_RETURN(auto file, fs->OpenOutputStream(path));
192-
ICEBERG_ARROW_RETURN_NOT_OK(file->Write(content.data(), content.size()));
193-
ICEBERG_ARROW_RETURN_NOT_OK(file->Flush());
194-
ICEBERG_ARROW_RETURN_NOT_OK(file->Close());
195-
return {};
196-
}
197-
198-
Status DeleteFile(const std::string& file_location) override {
199-
std::string path;
200-
ICEBERG_ASSIGN_OR_RAISE(auto fs, ResolveFileSystemFromUri(file_location, &path));
201-
ICEBERG_ARROW_RETURN_NOT_OK(fs->DeleteFile(path));
202-
return {};
203-
}
204-
};
205-
206145
} // namespace
207146

208147
Result<std::unique_ptr<FileIO>> MakeS3FileIO(
209148
const std::string& uri,
210149
const std::unordered_map<std::string, std::string>& properties) {
211-
if (!IsS3Uri(uri)) {
150+
if (!uri.starts_with("s3://")) {
212151
return InvalidArgument("S3 URI must start with s3://");
213152
}
214153
#if !ICEBERG_ARROW_HAS_S3
215154
return NotImplemented("Arrow S3 support is not enabled");
216155
#else
217-
// If properties are empty, use the simple URI-based resolution
218-
if (properties.empty()) {
219-
// Validate that S3 can be initialized and the URI is valid
220-
std::string path;
221-
ICEBERG_ASSIGN_OR_RAISE(auto fs, ResolveFileSystemFromUri(uri, &path));
222-
(void)path;
223-
(void)fs;
224-
return std::make_unique<ArrowUriFileIO>();
225-
}
156+
ICEBERG_RETURN_UNEXPECTED(EnsureS3Initialized());
226157

227-
// Create S3FileSystem with explicit configuration
228-
auto options = ConfigureS3Options(properties);
229-
ICEBERG_ASSIGN_OR_RAISE(auto fs, MakeS3FileSystem(options));
158+
// Configure S3 options from properties (uses default credentials if empty)
159+
ICEBERG_ASSIGN_OR_RAISE(auto options, ConfigureS3Options(properties));
160+
ICEBERG_ARROW_ASSIGN_OR_RETURN(auto fs, ::arrow::fs::S3FileSystem::Make(options));
230161

231-
// Return ArrowFileSystemFileIO with the configured S3 filesystem
232162
return std::make_unique<ArrowFileSystemFileIO>(std::move(fs));
233163
#endif
234164
}

0 commit comments

Comments
 (0)