Skip to content

Commit bdeb5a2

Browse files
author
Max Wang
committed
updates per comments
1 parent dac66bd commit bdeb5a2

5 files changed

Lines changed: 235 additions & 521 deletions

File tree

README.md

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ A minimal Python SDK to use Microsoft Dataverse as a database for Azure AI Found
2020
- Bulk create via `CreateMultiple` (collection-bound) by passing `list[dict]` to `create(entity_set, payloads)`; returns list of created IDs.
2121
- Bulk update via `UpdateMultiple` by calling `update_multiple(entity_set, records)` with primary key attribute present in each record; returns nothing.
2222
- Retrieve multiple with server-driven paging: `get_multiple(...)` yields lists (pages) following `@odata.nextLink`. Control total via `$top` and per-page via `page_size` (Prefer: `odata.maxpagesize`).
23-
- Upload files, using either dv message blocks, a single request (supports file size up to 128 MB), or chunk upload under the hood
23+
- Upload files, using either a single request (supports file size up to 128 MB) or chunk upload under the hood
2424
- Optional pandas integration (`PandasODataClient`) for DataFrame based create / get / query.
2525

2626
Auth:
@@ -178,10 +178,9 @@ client.upload_file('account', record_id, 'sample_filecolumn', 'test.pdf', mode='
178178
```
179179

180180
Notes:
181-
- upload_file picks one of the three methods to use based on file size: if file is less than 128 MB uses upload_file_small, otherwise uses upload_file_chunk. upload_file_block is used when explicitly requested
181+
- upload_file picks one of the two methods to use based on file size: if the file is smaller than 128 MB it uses upload_file_small, otherwise it uses upload_file_chunk
182182
- upload_file_small makes a single Web API call and only supports file size < 128 MB
183-
- upload_file_chunk uses PATCH with Content-Range to upload the file (more aligned with HTTP standard compared to Dataverse messages). It consists of 2 stages 1. PATCH request to get the headers used for actual upload. 2. Actual upload in chunks. It uses x-ms-chunk-size returned in the first stage to determine chunk size (normally 4 MB), and use Content-Range and Content-Length as metadata for the upload. Total number of Web API calls is number of chunks + 1. It's slightly more efficient than the block method because encoding is not needed and the Base64 encoding makes data larger by ~33%.
184-
- upload_file_block uses Dataverse messages and upload the file in Base64 encoded blocks (size limit is 4 MB for the Base64 encoded string), it consists of 3 stages: InitializeFileBlocksUpload, UploadBlock, and CommitFileBlocksUpload. Total number of Web API calls is number of blocks + 2.
183+
- upload_file_chunk uses PATCH with Content-Range to upload the file (more aligned with the HTTP standard than Dataverse messages). It consists of 2 stages: 1. A PATCH request to get the headers used for the actual upload. 2. The actual upload in chunks. It uses the x-ms-chunk-size value returned in the first stage to determine the chunk size (normally 4 MB), and uses Content-Range and Content-Length as metadata for the upload. The total number of Web API calls is the number of chunks + 1.
185184

186185
## Retrieve multiple with paging
187186

examples/quickstart_file_upload.py

Lines changed: 10 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -19,20 +19,18 @@
1919

2020
base_url = entered.rstrip('/')
2121
# Mode selection (numeric):
22-
# 1 = dv message (original in-memory block upload)
23-
# 2 = small (single PATCH)
24-
# 3 = chunk (streaming block upload)
25-
# 4 = all (dv message + small + chunk)
26-
mode_raw = input("Choose mode: 1) dv message 2) small 3) chunk 4) all [default 4]: ").strip()
22+
# 1 = small (single PATCH <128MB)
23+
# 2 = chunk (streaming for any size)
24+
# 3 = all (small + chunk)
25+
mode_raw = input("Choose mode: 1) small 2) chunk 3) all [default 3]: ").strip()
2726
if not mode_raw:
28-
mode_raw = '4'
29-
if mode_raw not in {'1','2','3','4'}:
30-
print({"invalid_mode": mode_raw, "fallback": 4})
31-
mode_raw = '4'
27+
mode_raw = '3'
28+
if mode_raw not in {'1','2','3'}:
29+
print({"invalid_mode": mode_raw, "fallback": 3})
30+
mode_raw = '3'
3231
mode_int = int(mode_raw)
33-
run_original = mode_int in (1,4) # dv message
34-
run_small = mode_int in (2,4)
35-
run_chunk = mode_int in (3,4)
32+
run_small = mode_int in (1,3)
33+
run_chunk = mode_int in (2,3)
3634

3735
delete_table_choice = input("Delete the table at end? (y/N): ").strip() or 'n'
3836
cleanup_table = delete_table_choice.lower() in ("y", "yes", "true", "1")
@@ -176,8 +174,6 @@ def ensure_table():
176174
logical = table_info.get("entity_logical_name") or entity_set.rstrip("s")
177175
attr_prefix = logical.split('_',1)[0] if '_' in logical else logical
178176
name_attr = f"{attr_prefix}_name"
179-
file_attr_schema = f"{attr_prefix}_Document" # SchemaName for file attribute
180-
file_attr_logical = f"{attr_prefix}_document" # expected logical name (lowercase)
181177
small_file_attr_schema = f"{attr_prefix}_SmallDocument" # second file attribute for small single-request demo
182178
small_file_attr_logical = f"{attr_prefix}_smalldocument" # expected logical name (lowercase)
183179
chunk_file_attr_schema = f"{attr_prefix}_ChunkDocument" # attribute for streaming chunk upload demo
@@ -239,8 +235,6 @@ def ensure_file_attribute_generic(schema_name: str, label: str, key_prefix: str)
239235
return False
240236

241237
# Conditionally ensure each attribute only if its mode is selected
242-
if run_original:
243-
ensure_file_attribute_generic(file_attr_schema, "Document", "primary")
244238
if run_small:
245239
ensure_file_attribute_generic(small_file_attr_schema, "Small Document", "small")
246240
if run_chunk:
@@ -262,7 +256,6 @@ def ensure_file_attribute_generic(schema_name: str, label: str, key_prefix: str)
262256
print("No record id; aborting upload.")
263257
sys.exit(1)
264258

265-
size_bytes = None # will set if original mode runs
266259
src_hash_block = None
267260

268261
# --------------------------- Shared dataset helpers ---------------------------
@@ -279,71 +272,6 @@ def get_dataset_info(file_path: Path):
279272
_DATASET_INFO_CACHE[file_path] = info
280273
return info
281274

282-
# --------------------------- dv message block upload demo ---------------------------
283-
if run_original:
284-
try:
285-
# Derive primary key attribute from logical name (table_info earlier)
286-
pk_attr = f"{logical}id" if logical else f"{entity_set.rstrip('s')}id"
287-
# Prepare file info (moved here from earlier section)
288-
print("dv message block upload demo:")
289-
DATASET_FILE, size_bytes, src_hash_block = get_dataset_info(_GENERATED_TEST_FILE)
290-
backoff(lambda: client.upload_file(
291-
entity_set,
292-
record_id,
293-
file_attr_logical,
294-
str(DATASET_FILE),
295-
mode="block",
296-
id_attribute=pk_attr,
297-
))
298-
print({"block_upload_completed": True})
299-
# Immediate download + verify
300-
odata = client._get_odata()
301-
dl_url_block = f"{odata.api}/{entity_set}({record_id})/{file_attr_logical}/$value"
302-
resp_block = odata._request("get", dl_url_block, headers=odata._headers())
303-
resp_block.raise_for_status()
304-
content_block = resp_block.content or b""
305-
import hashlib # noqa: WPS433
306-
dst_hash_block = hashlib.sha256(content_block).hexdigest() if content_block else None
307-
hash_match_block_inline = (dst_hash_block == src_hash_block) if (dst_hash_block and src_hash_block) else None
308-
print({
309-
"block_source_size": size_bytes,
310-
"block_download_size": len(content_block),
311-
"block_size_match": len(content_block) == size_bytes,
312-
"block_source_sha256_prefix": src_hash_block[:16] if src_hash_block else None,
313-
"block_download_sha256_prefix": dst_hash_block[:16] if dst_hash_block else None,
314-
"block_hash_match": hash_match_block_inline,
315-
})
316-
317-
# Now test replacing with an 8MB file
318-
print("dv message block upload demo - REPLACE with 8MB file:")
319-
replacement_file, replace_size, replace_hash = get_dataset_info(_GENERATED_TEST_FILE_8MB)
320-
backoff(lambda: client.upload_file(
321-
entity_set,
322-
record_id,
323-
file_attr_logical,
324-
str(replacement_file),
325-
mode="block",
326-
id_attribute=pk_attr,
327-
))
328-
print({"block_replace_upload_completed": True})
329-
# Download and verify replacement
330-
resp_block_replace = odata._request("get", dl_url_block, headers=odata._headers())
331-
resp_block_replace.raise_for_status()
332-
content_block_replace = resp_block_replace.content or b""
333-
dst_hash_block_replace = hashlib.sha256(content_block_replace).hexdigest() if content_block_replace else None
334-
hash_match_block_replace = (dst_hash_block_replace == replace_hash) if (dst_hash_block_replace and replace_hash) else None
335-
print({
336-
"block_replace_source_size": replace_size,
337-
"block_replace_download_size": len(content_block_replace),
338-
"block_replace_size_match": len(content_block_replace) == replace_size,
339-
"block_replace_source_sha256_prefix": replace_hash[:16] if replace_hash else None,
340-
"block_replace_download_sha256_prefix": dst_hash_block_replace[:16] if dst_hash_block_replace else None,
341-
"block_replace_hash_match": hash_match_block_replace,
342-
})
343-
except Exception as e: # noqa: BLE001
344-
print({"upload_failed": str(e)})
345-
sys.exit(1)
346-
347275
# --------------------------- Small single-request file upload demo ---------------------------
348276
if run_small:
349277
print("Small single-request upload demo:")

src/dataverse_sdk/client.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -266,10 +266,8 @@ def upload_file(
266266
record_id: str,
267267
file_name_attribute: str,
268268
path: str,
269-
*,
270269
mode: Optional[str] = None,
271270
mime_type: Optional[str] = None,
272-
id_attribute: Optional[str] = None,
273271
if_none_match: bool = True,
274272
) -> None:
275273
"""Upload a file to a Dataverse file column with automatic method selection.
@@ -289,15 +287,10 @@ def upload_file(
289287
- "auto": Automatically selects best method based on file size
290288
- "small": Single PATCH request (files <128MB only)
291289
- "chunk": Streaming chunked upload (any size, most efficient for large files)
292-
- "block": Message-based block upload (any size, compatibility fallback)
293290
mime_type : str | None, keyword-only, optional
294291
Explicit MIME type to persist with the file (e.g. "application/pdf"). If omitted the
295292
lower-level client attempts to infer from the filename extension and falls back to
296293
``application/octet-stream``.
297-
id_attribute : str | None, keyword-only, optional
298-
Logical name of the primary key attribute for the record (e.g. ``accountid``).
299-
**Required** when using "block" mode; raises ValueError if omitted.
300-
Not used for "small" or "chunk" modes.
301294
if_none_match : bool, keyword-only, optional
302295
When True (default), sends ``If-None-Match: null`` to only succeed if the column is
303296
currently empty. Set False to always overwrite (uses ``If-Match: *``).
@@ -315,7 +308,6 @@ def upload_file(
315308
path,
316309
mode=mode,
317310
mime_type=mime_type,
318-
id_attribute=id_attribute,
319311
if_none_match=if_none_match,
320312
)
321313
return None

0 commit comments

Comments
 (0)