Skip to content
Draft
2 changes: 2 additions & 0 deletions acceptance/bundle/user_agent/output.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ OK deploy.direct /api/2.0/workspace/get-status engine/direct
OK deploy.direct /api/2.0/workspace/get-status engine/direct
OK deploy.direct /api/2.0/workspace/get-status engine/direct
OK deploy.direct /api/2.0/workspace/get-status engine/direct
OK deploy.direct /api/2.0/workspace/list-repo engine/direct
OK deploy.direct /api/2.0/workspace-files/import-file/Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files/empty.py engine/direct
OK deploy.direct /api/2.0/workspace-files/import-file/Workspace/Users/[USERNAME]/.bundle/test-bundle/default/state/deploy.lock engine/direct
OK deploy.direct /api/2.0/workspace-files/import-file/Workspace/Users/[USERNAME]/.bundle/test-bundle/default/state/deployment.json engine/direct
Expand All @@ -28,6 +29,7 @@ OK deploy.terraform /api/2.0/workspace/get-status engine/terraform
OK deploy.terraform /api/2.0/workspace/get-status engine/terraform
OK deploy.terraform /api/2.0/workspace/get-status engine/terraform
OK deploy.terraform /api/2.0/workspace/get-status engine/terraform
OK deploy.terraform /api/2.0/workspace/list-repo engine/terraform
OK deploy.terraform /api/2.0/workspace-files/import-file/Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files/empty.py engine/terraform
OK deploy.terraform /api/2.0/workspace-files/import-file/Workspace/Users/[USERNAME]/.bundle/test-bundle/default/state/deploy.lock engine/terraform
OK deploy.terraform /api/2.0/workspace-files/import-file/Workspace/Users/[USERNAME]/.bundle/test-bundle/default/state/deployment.json engine/terraform
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,19 @@
"return_export_info": "true"
}
}
{
"headers": {
"User-Agent": [
"cli/[DEV_VERSION] databricks-sdk-go/[SDK_VERSION] go/[GO_VERSION] os/[OS] cmd/bundle_deploy cmd-exec-id/[UUID] interactive/none engine/direct auth/pat"
]
},
"method": "GET",
"path": "/api/2.0/workspace/list-repo",
"q": {
"path": "/Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files",
"return_wsfs_metadata": "true"
}
}
{
"headers": {
"User-Agent": [
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,19 @@
"return_export_info": "true"
}
}
{
"headers": {
"User-Agent": [
"cli/[DEV_VERSION] databricks-sdk-go/[SDK_VERSION] go/[GO_VERSION] os/[OS] cmd/bundle_deploy cmd-exec-id/[UUID] interactive/none engine/terraform auth/pat"
]
},
"method": "GET",
"path": "/api/2.0/workspace/list-repo",
"q": {
"path": "/Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files",
"return_wsfs_metadata": "true"
}
}
{
"headers": {
"User-Agent": [
Expand Down
66 changes: 66 additions & 0 deletions libs/filer/workspace_files_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,72 @@ func (w *WorkspaceFilesClient) Mkdir(ctx context.Context, name string) error {
})
}

// RemoteFileMetadata describes a single workspace object returned by the
// list-repo API when called with return_wsfs_metadata=true.
type RemoteFileMetadata struct {
// Absolute workspace path. Note that for notebooks, the extension is
// stripped (e.g. /Workspace/foo/bar.py -> /Workspace/foo/bar), so this may
// not match the corresponding local file name byte-for-byte.
Path string

// SHA-256 hex digest of the blob, lowercase hex. Populated only for FILE
// and NOTEBOOK objects (not directories) when the workspace returns wsfs
// metadata; empty otherwise.
ContentSHA256Hex string

// Object kind as reported by the API: "FILE", "NOTEBOOK", or "DIRECTORY".
ObjectType string
}

// ListWithSHAs lists all workspace objects under dirPath (recursively) and
// reports their content SHAs from the workspace's wsfs metadata. It calls
// /api/2.0/workspace/list-repo with return_wsfs_metadata=true — a flag not
// yet surfaced by the SDK — which makes the response carry a
// content_sha256_hex field for files and notebooks.
//
// A 404 (path absent) yields (nil, nil): callers should treat that as "no
// remote state to merge" rather than as an error.
func (w *WorkspaceFilesClient) ListWithSHAs(ctx context.Context, dirPath string) ([]RemoteFileMetadata, error) {
	// Wire-format shapes for the list-repo response; kept local because no
	// other code needs them. HasWsfsMetadata is decoded to document the wire
	// format even though we do not act on it.
	type wireObject struct {
		ObjectType       string `json:"object_type"`
		Path             string `json:"path"`
		ContentSHA256Hex string `json:"content_sha256_hex"`
		HasWsfsMetadata  bool   `json:"has_wsfs_metadata"`
	}
	type wireResponse struct {
		Objects []wireObject `json:"objects"`
	}

	query := map[string]any{
		"path":                 dirPath,
		"return_wsfs_metadata": true,
	}

	var body wireResponse
	err := w.apiClient.Do(
		ctx,
		http.MethodGet,
		"/api/2.0/workspace/list-repo",
		w.orgIDHeaders(),
		nil,
		query,
		&body,
	)
	if err != nil {
		var apiErr *apierr.APIError
		if errors.As(err, &apiErr) && apiErr.StatusCode == http.StatusNotFound {
			// Missing directory: nothing remote, not an error.
			return nil, nil
		}
		return nil, err
	}

	result := make([]RemoteFileMetadata, 0, len(body.Objects))
	for _, obj := range body.Objects {
		result = append(result, RemoteFileMetadata{
			Path:             obj.Path,
			ContentSHA256Hex: obj.ContentSHA256Hex,
			ObjectType:       obj.ObjectType,
		})
	}
	return result, nil
}

func (w *WorkspaceFilesClient) Stat(ctx context.Context, name string) (fs.FileInfo, error) {
absPath, err := w.root.Join(name)
if err != nil {
Expand Down
6 changes: 6 additions & 0 deletions libs/fileset/file.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,12 @@ func (f File) Modified() (ts time.Time) {
return info.ModTime()
}

// Open returns a reader over the file's contents, resolved against the
// file's root. The caller is responsible for closing it.
func (f File) Open() (fs.File, error) {
	handle, err := f.root.Open(f.Relative)
	return handle, err
}

func (f *File) IsNotebook() (bool, error) {
if f.fileType != Unknown {
return f.fileType == Notebook, nil
Expand Down
133 changes: 133 additions & 0 deletions libs/sync/remote_filter.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
package sync

import (
"context"
"crypto/sha256"
"encoding/hex"
"io"
"path"

"github.com/databricks/cli/libs/filer"
"github.com/databricks/cli/libs/fileset"
"github.com/databricks/cli/libs/log"
)

// shaLister fetches content SHAs for a workspace directory in one bulk call.
// Defined at the consumer so tests can stub the API call without spinning up
// a fake filer; *filer.WorkspaceFilesClient satisfies it.
type shaLister interface {
ListWithSHAs(ctx context.Context, dirPath string) ([]filer.RemoteFileMetadata, error)
}

// RemoteFilter is the third layer of the sync pipeline. It takes the action
// plan produced by the snapshot diff and drops puts whose remote SHA already
// matches the local SHA — files the workspace already has, byte-for-byte.
//
// The expensive work (one bulk list, one SHA per skipped put candidate) only
// pays off when the snapshot diff has produced false-positive puts at scale.
// The caller decides when to invoke Apply; today that's only on a fresh
// snapshot (no prior local state).
type RemoteFilter struct {
// lister performs the bulk SHA listing; a nil lister disables filtering.
lister shaLister
// remotePath is the workspace directory being synced to; remote SHAs are
// keyed by paths joined under it.
remotePath string
}

// NewRemoteFilter returns a RemoteFilter that matches local files against
// the workspace directory at remotePath using the given lister.
func NewRemoteFilter(lister shaLister, remotePath string) *RemoteFilter {
	rf := &RemoteFilter{
		lister:     lister,
		remotePath: remotePath,
	}
	return rf
}

// Apply returns a copy of d with put entries removed for files whose local
// SHA already matches the remote SHA. Errors fetching or computing SHAs are
// logged and treated as "do not skip" — the worst case is an unnecessary
// upload, which is the existing behavior.
func (f *RemoteFilter) Apply(ctx context.Context, d diff, files []fileset.File, localToRemote map[string]string) diff {
	// Nothing to filter, or filtering is disabled.
	if len(d.put) == 0 || f == nil || f.lister == nil {
		return d
	}

	remoteEntries, err := f.lister.ListWithSHAs(ctx, f.remotePath)
	if err != nil {
		log.Warnf(ctx, "could not fetch remote content SHAs from %s; uploading all candidate files: %s", f.remotePath, err)
		return d
	}
	if len(remoteEntries) == 0 {
		return d
	}

	// Index remote SHAs by absolute workspace path; entries without a SHA
	// (e.g. directories) cannot match anything and are dropped.
	shaByRemotePath := make(map[string]string, len(remoteEntries))
	for _, entry := range remoteEntries {
		if entry.ContentSHA256Hex != "" {
			shaByRemotePath[entry.Path] = entry.ContentSHA256Hex
		}
	}

	// Index local files by their snapshot-relative path.
	fileByRelative := make(map[string]*fileset.File, len(files))
	for i := range files {
		fileByRelative[files[i].Relative] = &files[i]
	}

	kept := make([]string, 0, len(d.put))
	for _, rel := range d.put {
		if f.canSkip(ctx, rel, fileByRelative, localToRemote, shaByRemotePath) {
			continue
		}
		kept = append(kept, rel)
	}

	if skipped := len(d.put) - len(kept); skipped > 0 {
		log.Debugf(ctx, "remote-filter: skipped %d/%d uploads matching workspace SHAs", skipped, len(d.put))
	}

	return diff{
		delete: d.delete,
		rmdir:  d.rmdir,
		mkdir:  d.mkdir,
		put:    kept,
	}
}

// canSkip reports whether the put for relativePath can be safely dropped:
// the workspace already has a file at the corresponding remote path whose
// SHA-256 equals that of the local file. Any lookup miss or hashing error
// means "do not skip".
func (f *RemoteFilter) canSkip(
	ctx context.Context,
	relativePath string,
	localByRelative map[string]*fileset.File,
	localToRemote map[string]string,
	remoteSHAByPath map[string]string,
) bool {
	localFile, haveLocal := localByRelative[relativePath]
	if !haveLocal {
		return false
	}

	remoteName, haveMapping := localToRemote[relativePath]
	if !haveMapping {
		return false
	}

	wantSHA, haveSHA := remoteSHAByPath[path.Join(f.remotePath, remoteName)]
	if !haveSHA {
		return false
	}

	gotSHA, err := computeFileSHA(localFile)
	if err != nil {
		log.Debugf(ctx, "remote-filter: hashing %s failed; will upload: %s", relativePath, err)
		return false
	}
	return gotSHA == wantSHA
}

// computeFileSHA streams the file's contents through SHA-256 and returns
// the digest as a lowercase hex string.
func computeFileSHA(f *fileset.File) (string, error) {
	reader, err := f.Open()
	if err != nil {
		return "", err
	}
	defer reader.Close()

	hasher := sha256.New()
	if _, copyErr := io.Copy(hasher, reader); copyErr != nil {
		return "", copyErr
	}
	return hex.EncodeToString(hasher.Sum(nil)), nil
}
Loading