Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 26 additions & 1 deletion cpp/src/arrow/filesystem/azurefs.cc
Original file line number Diff line number Diff line change
Expand Up @@ -313,65 +313,90 @@ std::string AzureOptions::AccountDfsUrl(const std::string& account_name) const {
return BuildBaseUrl(dfs_storage_scheme, dfs_storage_authority, account_name);
}

/// \brief Reset all credential-related state.
///
/// The credential kind goes back to kDefault, both credential objects are
/// dropped and every stored credential string is emptied. The Configure*()
/// methods in this file call this first so no value from a previously
/// configured credential survives.
void AzureOptions::ClearCredentials() {
  credential_kind_ = CredentialKind::kDefault;

  // Release the credential objects.
  storage_shared_key_credential_.reset();
  token_credential_.reset();

  // Forget the raw values the credentials were built from.
  account_key_.clear();
  sas_token_.clear();
  tenant_id_.clear();
  client_id_.clear();
  client_secret_.clear();
}

/// \brief Authenticate with Azure::Identity::DefaultAzureCredential.
///
/// Any previously configured credential is discarded first.
///
/// \return Status::OK() (this method has no failure path of its own).
Status AzureOptions::ConfigureDefaultCredential() {
  using DefaultCredential = Azure::Identity::DefaultAzureCredential;

  ClearCredentials();
  // ClearCredentials() already selects kDefault; it is restated here so the
  // method reads like its siblings.
  credential_kind_ = CredentialKind::kDefault;
  token_credential_ = std::make_shared<DefaultCredential>();
  return Status::OK();
}

/// \brief Select anonymous access.
///
/// Any previously configured credential is discarded; no credential object
/// or secret is stored for this kind.
///
/// \return Status::OK() always.
Status AzureOptions::ConfigureAnonymousCredential() {
  ClearCredentials();
  credential_kind_ = CredentialKind::kAnonymous;
  return Status::OK();
}

/// \brief Authenticate with a storage account shared key.
///
/// Any previously configured credential is discarded first. `account_name`
/// must already be set, because the shared-key credential is constructed
/// from both the account name and the key.
///
/// \param[in] account_key the storage account key; a copy is kept so it can
///            be read back through AccountKey().
/// \return Status::Invalid if `account_name` is empty, Status::OK() otherwise.
Status AzureOptions::ConfigureAccountKeyCredential(const std::string& account_key) {
  ClearCredentials();
  if (account_name.empty()) {
    return Status::Invalid("AzureOptions doesn't contain a valid account name");
  }
  account_key_ = account_key;
  storage_shared_key_credential_ =
      std::make_shared<Storage::StorageSharedKeyCredential>(account_name, account_key);
  // Switch the kind only once the credential exists, so a failed call never
  // leaves kStorageSharedKey paired with a null credential. This matches the
  // ordering used by ConfigureSASCredential().
  credential_kind_ = CredentialKind::kStorageSharedKey;
  return Status::OK();
}

/// \brief Authenticate with a shared access signature (SAS) token.
///
/// Any previously configured credential is discarded first.
///
/// \param[in] sas_token the SAS token; stored as given and readable through
///            SasToken().
/// \return Status::Invalid if `account_name` is empty, Status::OK() otherwise.
Status AzureOptions::ConfigureSASCredential(const std::string& sas_token) {
  // ClearCredentials() resets credential_kind_ to kDefault, so the kind is
  // assigned after it (and after validation), never before.
  ClearCredentials();
  if (account_name.empty()) {
    return Status::Invalid("AzureOptions doesn't contain a valid account name");
  }
  sas_token_ = sas_token;
  credential_kind_ = CredentialKind::kSASToken;
  return Status::OK();
}

/// \brief Authenticate with Azure::Identity::ClientSecretCredential.
///
/// Any previously configured credential is discarded first. The three values
/// are also kept on the options object so they can be read back through
/// TenantId(), ClientId() and ClientSecret().
///
/// \param[in] tenant_id tenant ID passed to the credential.
/// \param[in] client_id client ID passed to the credential.
/// \param[in] client_secret client secret passed to the credential.
/// \return Status::OK() (this method has no failure path of its own).
Status AzureOptions::ConfigureClientSecretCredential(const std::string& tenant_id,
                                                     const std::string& client_id,
                                                     const std::string& client_secret) {
  using ClientSecret = Azure::Identity::ClientSecretCredential;

  ClearCredentials();

  // Remember the raw inputs.
  tenant_id_ = tenant_id;
  client_id_ = client_id;
  client_secret_ = client_secret;

  credential_kind_ = CredentialKind::kClientSecret;
  token_credential_ = std::make_shared<ClientSecret>(tenant_id, client_id, client_secret);
  return Status::OK();
}

/// \brief Authenticate with Azure::Identity::ManagedIdentityCredential.
///
/// Any previously configured credential is discarded first.
///
/// \param[in] client_id client ID forwarded to the credential; also stored
///            so it can be read back through ClientId().
/// \return Status::OK() (this method has no failure path of its own).
Status AzureOptions::ConfigureManagedIdentityCredential(const std::string& client_id) {
  using ManagedIdentity = Azure::Identity::ManagedIdentityCredential;

  ClearCredentials();
  client_id_ = client_id;
  credential_kind_ = CredentialKind::kManagedIdentity;
  token_credential_ = std::make_shared<ManagedIdentity>(client_id);
  return Status::OK();
}

/// \brief Authenticate with Azure::Identity::AzureCliCredential.
///
/// Any previously configured credential is discarded first.
///
/// \return Status::OK() (this method has no failure path of its own).
Status AzureOptions::ConfigureCLICredential() {
  using CliCredential = Azure::Identity::AzureCliCredential;

  ClearCredentials();
  credential_kind_ = CredentialKind::kCLI;
  token_credential_ = std::make_shared<CliCredential>();
  return Status::OK();
}

/// \brief Authenticate with Azure::Identity::WorkloadIdentityCredential.
///
/// Any previously configured credential is discarded first.
///
/// \return Status::OK() (this method has no failure path of its own).
Status AzureOptions::ConfigureWorkloadIdentityCredential() {
  using WorkloadIdentity = Azure::Identity::WorkloadIdentityCredential;

  ClearCredentials();
  credential_kind_ = CredentialKind::kWorkloadIdentity;
  token_credential_ = std::make_shared<WorkloadIdentity>();
  return Status::OK();
}

Status AzureOptions::ConfigureEnvironmentCredential() {
ClearCredentials();
credential_kind_ = CredentialKind::kEnvironment;
token_credential_ = std::make_shared<Azure::Identity::EnvironmentCredential>();
return Status::OK();
Expand Down
15 changes: 11 additions & 4 deletions cpp/src/arrow/filesystem/azurefs.h
Original file line number Diff line number Diff line change
Expand Up @@ -99,10 +99,6 @@ struct ARROW_EXPORT AzureOptions {
/// Default: "https"
std::string dfs_storage_scheme = "https";

// TODO(GH-38598): Add support for more auth methods.
// std::string connection_string;
// std::string sas_token;

/// \brief Default metadata for OpenOutputStream.
///
/// This will be ignored if non-empty metadata is passed to OpenOutputStream.
Expand All @@ -126,7 +122,11 @@ struct ARROW_EXPORT AzureOptions {

std::shared_ptr<Azure::Storage::StorageSharedKeyCredential>
storage_shared_key_credential_;
std::string account_key_;
std::string sas_token_;
std::string tenant_id_;
std::string client_id_;
std::string client_secret_;
mutable std::shared_ptr<Azure::Core::Credentials::TokenCredential> token_credential_;

public:
Expand Down Expand Up @@ -187,6 +187,7 @@ struct ARROW_EXPORT AzureOptions {
static Result<AzureOptions> FromUri(const Uri& uri, std::string* out_path);
static Result<AzureOptions> FromUri(const std::string& uri, std::string* out_path);

void ClearCredentials();
Status ConfigureDefaultCredential();
Status ConfigureAnonymousCredential();
Status ConfigureAccountKeyCredential(const std::string& account_key);
Expand All @@ -204,6 +205,12 @@ struct ARROW_EXPORT AzureOptions {
std::string AccountBlobUrl(const std::string& account_name) const;
std::string AccountDfsUrl(const std::string& account_name) const;

/// \brief Return a copy of the stored account key (empty when none is set).
std::string AccountKey() const { return account_key_; }
/// \brief Return a copy of the stored SAS token (empty when none is set).
std::string SasToken() const { return sas_token_; }
/// \brief Return a copy of the stored tenant ID (empty when none is set).
std::string TenantId() const { return tenant_id_; }
/// \brief Return a copy of the stored client ID (empty when none is set).
std::string ClientId() const { return client_id_; }
/// \brief Return a copy of the stored client secret (empty when none is set).
std::string ClientSecret() const { return client_secret_; }

Result<std::unique_ptr<Azure::Storage::Blobs::BlobServiceClient>>
MakeBlobServiceClient() const;

Expand Down
63 changes: 60 additions & 3 deletions cpp/src/arrow/filesystem/azurefs_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -493,6 +493,11 @@ TEST(AzureFileSystem, InitializeWithDefaultCredential) {
AzureOptions options;
options.account_name = "dummy-account-name";
ARROW_EXPECT_OK(options.ConfigureDefaultCredential());
ASSERT_EQ(options.AccountKey(), "");
ASSERT_EQ(options.SasToken(), "");
ASSERT_EQ(options.TenantId(), "");
ASSERT_EQ(options.ClientId(), "");
ASSERT_EQ(options.ClientSecret(), "");
EXPECT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options));
}

Expand All @@ -509,6 +514,23 @@ TEST(AzureFileSystem, InitializeWithAnonymousCredential) {
AzureOptions options;
options.account_name = "dummy-account-name";
ARROW_EXPECT_OK(options.ConfigureAnonymousCredential());
ASSERT_EQ(options.AccountKey(), "");
ASSERT_EQ(options.SasToken(), "");
ASSERT_EQ(options.TenantId(), "");
ASSERT_EQ(options.ClientId(), "");
ASSERT_EQ(options.ClientSecret(), "");
EXPECT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options));
}

// Configuring an account key must expose only that key through the accessors
// and still allow a filesystem to be constructed.
TEST(AzureFileSystem, InitializeWithAccountKeyCredential) {
AzureOptions options;
options.account_name = "dummy-account-name";
ARROW_EXPECT_OK(options.ConfigureAccountKeyCredential("account_key"));
// The configured key is readable back...
ASSERT_EQ(options.AccountKey(), "account_key");
// ...and every other credential field stays empty.
ASSERT_EQ(options.SasToken(), "");
ASSERT_EQ(options.TenantId(), "");
ASSERT_EQ(options.ClientId(), "");
ASSERT_EQ(options.ClientSecret(), "");
EXPECT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options));
}

Expand All @@ -517,37 +539,67 @@ TEST(AzureFileSystem, InitializeWithClientSecretCredential) {
options.account_name = "dummy-account-name";
ARROW_EXPECT_OK(
options.ConfigureClientSecretCredential("tenant_id", "client_id", "client_secret"));
ASSERT_EQ(options.AccountKey(), "");
ASSERT_EQ(options.SasToken(), "");
ASSERT_EQ(options.TenantId(), "tenant_id");
ASSERT_EQ(options.ClientId(), "client_id");
ASSERT_EQ(options.ClientSecret(), "client_secret");
EXPECT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options));
}

// Managed identity is exercised twice on the same options object: first with
// no client ID argument, then with an explicit one.
TEST(AzureFileSystem, InitializeWithManagedIdentityCredential) {
AzureOptions options;
options.account_name = "dummy-account-name";
// No argument: all credential accessors are expected to be empty.
ARROW_EXPECT_OK(options.ConfigureManagedIdentityCredential());
ASSERT_EQ(options.AccountKey(), "");
ASSERT_EQ(options.SasToken(), "");
ASSERT_EQ(options.TenantId(), "");
ASSERT_EQ(options.ClientId(), "");
ASSERT_EQ(options.ClientSecret(), "");
EXPECT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options));

// Explicit client ID: only ClientId() is expected to be populated.
ARROW_EXPECT_OK(options.ConfigureManagedIdentityCredential("specific-client-id"));
ASSERT_EQ(options.AccountKey(), "");
ASSERT_EQ(options.SasToken(), "");
ASSERT_EQ(options.TenantId(), "");
ASSERT_EQ(options.ClientId(), "specific-client-id");
ASSERT_EQ(options.ClientSecret(), "");
// Reuses the `fs` variable declared above.
EXPECT_OK_AND_ASSIGN(fs, AzureFileSystem::Make(options));
}

// The CLI credential stores no key, token, ID or secret on the options
// object; a filesystem must still be constructible.
TEST(AzureFileSystem, InitializeWithCLICredential) {
AzureOptions options;
options.account_name = "dummy-account-name";
ARROW_EXPECT_OK(options.ConfigureCLICredential());
// All credential accessors are expected to be empty.
ASSERT_EQ(options.AccountKey(), "");
ASSERT_EQ(options.SasToken(), "");
ASSERT_EQ(options.TenantId(), "");
ASSERT_EQ(options.ClientId(), "");
ASSERT_EQ(options.ClientSecret(), "");
EXPECT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options));
}

// The workload identity credential stores no key, token, ID or secret on the
// options object; a filesystem must still be constructible.
TEST(AzureFileSystem, InitializeWithWorkloadIdentityCredential) {
AzureOptions options;
options.account_name = "dummy-account-name";
ARROW_EXPECT_OK(options.ConfigureWorkloadIdentityCredential());
// All credential accessors are expected to be empty.
ASSERT_EQ(options.AccountKey(), "");
ASSERT_EQ(options.SasToken(), "");
ASSERT_EQ(options.TenantId(), "");
ASSERT_EQ(options.ClientId(), "");
ASSERT_EQ(options.ClientSecret(), "");
EXPECT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options));
}

// The environment credential stores no key, token, ID or secret on the
// options object; a filesystem must still be constructible.
TEST(AzureFileSystem, InitializeWithEnvironmentCredential) {
AzureOptions options;
options.account_name = "dummy-account-name";
ARROW_EXPECT_OK(options.ConfigureEnvironmentCredential());
// All credential accessors are expected to be empty.
ASSERT_EQ(options.AccountKey(), "");
ASSERT_EQ(options.SasToken(), "");
ASSERT_EQ(options.TenantId(), "");
ASSERT_EQ(options.ClientId(), "");
ASSERT_EQ(options.ClientSecret(), "");
EXPECT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options));
}

Expand Down Expand Up @@ -1679,9 +1731,14 @@ class TestAzureFileSystem : public ::testing::Test {
env->account_name(), env->account_key())));
// AzureOptions::FromUri will not cut off extra query parameters that it consumes, so
// make sure these don't cause problems.
ARROW_EXPECT_OK(options.ConfigureSASCredential(
"?blob_storage_authority=dummy_value0&" + sas_token.substr(1) +
"&credential_kind=dummy-value1"));
auto polluted_sas_token = "?blob_storage_authority=dummy_value0&" + sas_token.substr(1) +
"&credential_kind=dummy-value1";
ARROW_EXPECT_OK(options.ConfigureSASCredential(polluted_sas_token));
ASSERT_EQ(options.AccountKey(), "");
ASSERT_EQ(options.SasToken(), polluted_sas_token);
ASSERT_EQ(options.TenantId(), "");
ASSERT_EQ(options.ClientId(), "");
ASSERT_EQ(options.ClientSecret(), "");
EXPECT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options));

AssertFileInfo(fs.get(), data.ObjectPath(), FileType::File);
Expand Down
41 changes: 15 additions & 26 deletions python/pyarrow/_azurefs.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -28,33 +28,33 @@ cdef class AzureFileSystem(FileSystem):
Azure Blob Storage backed FileSystem implementation

This implementation supports flat namespace and hierarchical namespace (HNS) a.k.a.
Data Lake Gen2 storage accounts. HNS will be automatically detected and HNS specific
features will be used when they provide a performance advantage. Azurite emulator is
Data Lake Gen2 storage accounts. HNS will be automatically detected and HNS specific
features will be used when they provide a performance advantage. Azurite emulator is
also supported. Note: `/` is the only supported delimiter.

The storage account is considered the root of the filesystem. When enabled, containers
will be created or deleted during relevant directory operations. Obviously, this also
requires authentication with the additional permissions.
The storage account is considered the root of the filesystem. When enabled, containers
will be created or deleted during relevant directory operations. Obviously, this also
requires authentication with the additional permissions.

By default `DefaultAzureCredential <https://github.com/Azure/azure-sdk-for-cpp/blob/main/sdk/identity/azure-identity/README.md#defaultazurecredential>`__
By default `DefaultAzureCredential <https://github.com/Azure/azure-sdk-for-cpp/blob/main/sdk/identity/azure-identity/README.md#defaultazurecredential>`__
is used for authentication. This means it will try several types of authentication
and go with the first one that works. If any authentication parameters are provided when
and go with the first one that works. If any authentication parameters are provided when
initialising the FileSystem, they will be used instead of the default credential.

Parameters
----------
account_name : str
Azure Blob Storage account name. This is the globally unique identifier for the
Azure Blob Storage account name. This is the globally unique identifier for the
storage account.
account_key : str, default None
Account key of the storage account. If sas_token and account_key are None the
Account key of the storage account. If sas_token and account_key are None the
default credential will be used. The parameters account_key and sas_token are
mutually exclusive.
blob_storage_authority : str, default None
hostname[:port] of the Blob Service. Defaults to `.blob.core.windows.net`. Useful
for connecting to a local emulator, like Azurite.
blob_storage_scheme : str, default None
Either `http` or `https`. Defaults to `https`. Useful for connecting to a local
Either `http` or `https`. Defaults to `https`. Useful for connecting to a local
emulator, like Azurite.
client_id : str, default None
The client ID (Application ID) for Azure Active Directory authentication.
Expand Down Expand Up @@ -101,11 +101,6 @@ cdef class AzureFileSystem(FileSystem):
"""
cdef:
CAzureFileSystem* azurefs
c_string account_key
c_string sas_token
c_string tenant_id
c_string client_id
c_string client_secret

def __init__(self, account_name, *, account_key=None, blob_storage_authority=None,
blob_storage_scheme=None, client_id=None, client_secret=None,
Expand Down Expand Up @@ -133,14 +128,10 @@ cdef class AzureFileSystem(FileSystem):
raise ValueError("client_id must be specified")
if not tenant_id and not client_secret:
options.ConfigureManagedIdentityCredential(tobytes(client_id))
self.client_id = tobytes(client_id)
elif tenant_id and client_secret:
options.ConfigureClientSecretCredential(
tobytes(tenant_id), tobytes(client_id), tobytes(client_secret)
)
self.tenant_id = tobytes(tenant_id)
self.client_id = tobytes(client_id)
self.client_secret = tobytes(client_secret)
else:
raise ValueError(
"Invalid Azure credential configuration: "
Expand All @@ -149,10 +140,8 @@ cdef class AzureFileSystem(FileSystem):
)
elif account_key:
options.ConfigureAccountKeyCredential(tobytes(account_key))
self.account_key = tobytes(account_key)
elif sas_token:
options.ConfigureSASCredential(tobytes(sas_token))
self.sas_token = tobytes(sas_token)
else:
options.ConfigureDefaultCredential()

Expand All @@ -176,13 +165,13 @@ cdef class AzureFileSystem(FileSystem):
return (
AzureFileSystem._reconstruct, (dict(
account_name=frombytes(opts.account_name),
account_key=frombytes(self.account_key),
account_key=frombytes(opts.AccountKey()),
blob_storage_authority=frombytes(opts.blob_storage_authority),
blob_storage_scheme=frombytes(opts.blob_storage_scheme),
client_id=frombytes(self.client_id),
client_secret=frombytes(self.client_secret),
client_id=frombytes(opts.ClientId()),
client_secret=frombytes(opts.ClientSecret()),
dfs_storage_authority=frombytes(opts.dfs_storage_authority),
dfs_storage_scheme=frombytes(opts.dfs_storage_scheme),
sas_token=frombytes(self.sas_token),
tenant_id=frombytes(self.tenant_id)
sas_token=frombytes(opts.SasToken()),
tenant_id=frombytes(opts.TenantId())
),))
5 changes: 5 additions & 0 deletions python/pyarrow/includes/libarrow_fs.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,11 @@ cdef extern from "arrow/filesystem/api.h" namespace "arrow::fs" nogil:
CStatus ConfigureClientSecretCredential(c_string tenant_id,
c_string client_id,
c_string client_secret)
c_string SasToken()
c_string AccountKey()
c_string TenantId()
c_string ClientId()
c_string ClientSecret()

cdef cppclass CAzureFileSystem "arrow::fs::AzureFileSystem":
@staticmethod
Expand Down
9 changes: 8 additions & 1 deletion python/pyarrow/tests/test_fs.py
Original file line number Diff line number Diff line change
Expand Up @@ -636,14 +636,21 @@ def test_subtree_filesystem():
' base_fs=<pyarrow._fs.LocalFileSystem')


@pytest.mark.parametrize("wrap_with_in_subtree_fs", [False, True])
def test_filesystem_pickling(wrap_with_in_subtree_fs, fs, pickle_module):
    """Round-trip a filesystem through pickle and check it compares equal.

    Runs once on the bare ``fs`` fixture and once with it wrapped in a
    ``SubTreeFileSystem`` rooted at ``'/'``, so the base filesystem's
    serialization is also exercised through the wrapper.
    """
    if fs.type_name.split('::')[-1] == 'mock':
        pytest.xfail(reason='MockFileSystem is not serializable')

    if wrap_with_in_subtree_fs:
        fs = SubTreeFileSystem('/', fs)

    serialized = pickle_module.dumps(fs)
    restored = pickle_module.loads(serialized)
    assert isinstance(restored, FileSystem)
    assert restored.equals(fs)


def test_filesystem_is_functional_after_pickling(fs, pathfn, pickle_module):
Expand Down
Loading