From 3b6fdc03704aba1e786f4d4c42e7d0f4fee4c0ca Mon Sep 17 00:00:00 2001 From: An Qiuyu Date: Sun, 3 Aug 2025 09:57:03 +0800 Subject: [PATCH 1/5] Implement provider for azure blob storage --- setup.py | 1 + tests/providers/azureblobstorage/__init__.py | 0 tests/providers/azureblobstorage/fixtures.py | 249 ++++++++ .../azureblobstorage/fixtures/auth.json | 6 + .../fixtures/blob_list_response.xml | 29 + .../fixtures/blob_list_with_versions.xml | 31 + .../fixtures/blob_properties_headers.json | 20 + .../fixtures/credentials.json | 3 + .../fixtures/empty_folder_check.xml | 10 + .../fixtures/empty_list_response.xml | 10 + .../fixtures/error_authentication_failed.xml | 6 + .../fixtures/error_authorization_failure.xml | 5 + .../fixtures/error_internal_error.xml | 5 + .../fixtures/error_not_found.xml | 5 + .../fixtures/folder_exists.xml | 18 + .../fixtures/folder_not_found_response.xml | 10 + .../fixtures/folder_placeholder_headers.json | 6 + .../fixtures/folder_validation_response.xml | 26 + .../fixtures/root_list_response.xml | 32 + .../azureblobstorage/fixtures/settings.json | 5 + .../fixtures/special_characters_blobs.xml | 35 ++ .../azureblobstorage/test_metadata.py | 205 +++++++ .../azureblobstorage/test_provider.py | 551 ++++++++++++++++++ waterbutler/core/streams/__init__.py | 2 +- waterbutler/core/streams/base.py | 18 + .../providers/azureblobstorage/__init__.py | 3 + .../providers/azureblobstorage/exceptions.py | 31 + .../providers/azureblobstorage/metadata.py | 124 ++++ .../providers/azureblobstorage/provider.py | 470 +++++++++++++++ .../providers/azureblobstorage/settings.py | 6 + 30 files changed, 1921 insertions(+), 1 deletion(-) create mode 100644 tests/providers/azureblobstorage/__init__.py create mode 100644 tests/providers/azureblobstorage/fixtures.py create mode 100644 tests/providers/azureblobstorage/fixtures/auth.json create mode 100644 tests/providers/azureblobstorage/fixtures/blob_list_response.xml create mode 100644 tests/providers/azureblobstorage/fixtures/blob_list_with_versions.xml create mode 100644 tests/providers/azureblobstorage/fixtures/blob_properties_headers.json create mode 100644 tests/providers/azureblobstorage/fixtures/credentials.json create mode 100644 tests/providers/azureblobstorage/fixtures/empty_folder_check.xml create mode 100644 tests/providers/azureblobstorage/fixtures/empty_list_response.xml create mode 100644 tests/providers/azureblobstorage/fixtures/error_authentication_failed.xml create mode 100644 tests/providers/azureblobstorage/fixtures/error_authorization_failure.xml create mode 100644 tests/providers/azureblobstorage/fixtures/error_internal_error.xml create mode 100644 tests/providers/azureblobstorage/fixtures/error_not_found.xml create mode 100644 tests/providers/azureblobstorage/fixtures/folder_exists.xml create mode 100644 tests/providers/azureblobstorage/fixtures/folder_not_found_response.xml create mode 100644 tests/providers/azureblobstorage/fixtures/folder_placeholder_headers.json create mode 100644 tests/providers/azureblobstorage/fixtures/folder_validation_response.xml create mode 100644 tests/providers/azureblobstorage/fixtures/root_list_response.xml create mode 100644 tests/providers/azureblobstorage/fixtures/settings.json create mode 100644 tests/providers/azureblobstorage/fixtures/special_characters_blobs.xml create mode 100644 tests/providers/azureblobstorage/test_metadata.py create mode 100644 tests/providers/azureblobstorage/test_provider.py create mode 100644 waterbutler/providers/azureblobstorage/__init__.py create mode 100644 
waterbutler/providers/azureblobstorage/exceptions.py create mode 100644 waterbutler/providers/azureblobstorage/metadata.py create mode 100644 waterbutler/providers/azureblobstorage/provider.py create mode 100644 waterbutler/providers/azureblobstorage/settings.py diff --git a/setup.py b/setup.py index 910a6fa39c..3067af4110 100644 --- a/setup.py +++ b/setup.py @@ -49,6 +49,7 @@ 'googledrive = waterbutler.providers.googledrive:GoogleDriveProvider', 'onedrive = waterbutler.providers.onedrive:OneDriveProvider', 'googlecloud = waterbutler.providers.googlecloud:GoogleCloudProvider', + 'azureblobstorage = waterbutler.providers.azureblobstorage:AzureBlobStorageProvider', ], }, ) diff --git a/tests/providers/azureblobstorage/__init__.py b/tests/providers/azureblobstorage/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/providers/azureblobstorage/fixtures.py b/tests/providers/azureblobstorage/fixtures.py new file mode 100644 index 0000000000..5da2803808 --- /dev/null +++ b/tests/providers/azureblobstorage/fixtures.py @@ -0,0 +1,249 @@ +""" +Azure Blob Storage test fixtures that load from JSON and XML files +""" + +import pytest +import io +import json +import os +from pathlib import Path +from waterbutler.core import streams +from waterbutler.providers.azureblobstorage.provider import AzureBlobStorageProvider + + +# Get the fixtures directory path +FIXTURES_DIR = Path(__file__).parent / 'fixtures' + + +def load_fixture(filename): + """Load a fixture file from the fixtures directory""" + filepath = FIXTURES_DIR / filename + if filename.endswith('.xml'): + with open(filepath, 'r', encoding='utf-8') as f: + return f.read() + else: + with open(filepath, 'r', encoding='utf-8') as f: + return json.load(f) + + +# ============== Authentication Fixtures (JSON) ============== + +@pytest.fixture +def auth(): + """Auth information from the user""" + return load_fixture('auth.json') + + +@pytest.fixture +def credentials(): + """OAuth credentials from Azure Entra ID""" + return load_fixture('credentials.json') + + +@pytest.fixture +def settings(): + """Provider settings for Azure Blob Storage""" + return load_fixture('settings.json') + + +# ============== Response Fixtures (Mixed JSON/XML) ============== + +@pytest.fixture +def blob_properties_headers(): + """Standard blob properties headers (JSON format for headers)""" + return load_fixture('blob_properties_headers.json') + + +# XML Response Fixtures - Direct from Azure API format +@pytest.fixture +def blob_list_xml(): + """Standard blob list XML response""" + return load_fixture('blob_list_response.xml') + + +@pytest.fixture +def blob_list_with_versions_response(): + """Blob list with version information XML response""" + return load_fixture('blob_list_with_versions.xml') + + +@pytest.fixture +def empty_list_xml(): + """Empty folder list XML response""" + return load_fixture('empty_list_response.xml') + + +@pytest.fixture +def root_list_xml(): + """Root container list XML response""" + return load_fixture('root_list_response.xml') + + +@pytest.fixture +def special_characters_blobs(): + """Blobs with special characters in names XML response""" + return load_fixture('special_characters_blobs.xml') + + +# Error Response Fixtures +@pytest.fixture +def error_authentication_failed_xml(): + """Authentication failed error XML response""" + return load_fixture('error_authentication_failed.xml') + + +@pytest.fixture +def error_authorization_failure_xml(): + """Authorization failure error XML response""" + return 
load_fixture('error_authorization_failure.xml') + + +@pytest.fixture +def error_not_found_xml(): + """Blob not found error XML response""" + return load_fixture('error_not_found.xml') + + +@pytest.fixture +def error_internal_error_xml(): + """Internal server error XML response""" + return load_fixture('error_internal_error.xml') + + +@pytest.fixture +def error_response_xml(): + """Error response XML generator""" + def _error_xml(error_type): + try: + return load_fixture(f'error_{error_type}.xml') + except FileNotFoundError: + # Default error XML + return ''' + + UnknownError + Unknown error occurred + ''' + + return _error_xml + + +# ============== Provider Fixture ============== + +@pytest.fixture +def provider(auth, credentials, settings): + return AzureBlobStorageProvider(auth, credentials, settings) + + +# ============== Stream Fixtures ============== + +@pytest.fixture +def file_content(): + """Basic file content for testing""" + return b'SLEEP IS FOR THE WEAK GO SERVE STREAMS' + + +@pytest.fixture +def large_file_content(): + """Large file content for multipart upload testing""" + # 10 MB of data + return b'x' * (10 * 1024 * 1024) + + +@pytest.fixture +def file_like(file_content): + """File-like object""" + return io.BytesIO(file_content) + + +@pytest.fixture +def large_file_like(large_file_content): + """Large file-like object""" + return io.BytesIO(large_file_content) + + +@pytest.fixture +def file_stream(file_like): + """File stream for upload testing""" + return streams.FileStreamReader(file_like) + + +@pytest.fixture +def large_file_stream(large_file_like): + """Large file stream for multipart upload testing""" + return streams.FileStreamReader(large_file_like) + + +# ============== Folder Creation Fixtures ============== + +@pytest.fixture +def empty_folder_list_xml(): + """Empty folder list response for checking if folder exists""" + return load_fixture('empty_folder_check.xml') + + +@pytest.fixture +def folder_validation_response_xml(): + """XML response for folder validation (folder exists)""" + return load_fixture('folder_validation_response.xml') + + +@pytest.fixture +def folder_not_found_response_xml(): + """XML response for folder validation (folder does not exist)""" + return load_fixture('folder_not_found_response.xml') + + +@pytest.fixture +def folder_exists_xml(): + """XML response indicating folder already has content""" + return load_fixture('folder_exists.xml') + + +@pytest.fixture +def folder_placeholder_headers(): + """Standard headers for folder placeholder creation""" + return load_fixture('folder_placeholder_headers.json') + + +@pytest.fixture +def create_folder_test_data(): + """Test data for various folder creation scenarios""" + return { + 'simple_folder': { + 'path': '/newfolder/', + 'name': 'newfolder', + 'placeholder': 'newfolder/.osfkeep' + }, + 'nested_folder': { + 'path': '/parent/child/', + 'name': 'child', + 'placeholder': 'parent/child/.osfkeep' + }, + 'special_chars_folder': { + 'path': '/folder with spaces/', + 'name': 'folder with spaces', + 'placeholder': 'folder%20with%20spaces/.osfkeep' + } + } + + +# ============== Helper Functions ============== + +@pytest.fixture +def build_error_response(): + """Build a custom error response for testing""" + def _builder(code, message, status=400, auth_detail=None): + xml_body = f''' + + {code} + {message} + {f"{auth_detail}" if auth_detail else ""} + ''' + + return { + 'status': status, + 'body': xml_body, + 'headers': {'Content-Type': 'application/xml'} + } + + return _builder \ No newline at end of 
file diff --git a/tests/providers/azureblobstorage/fixtures/auth.json b/tests/providers/azureblobstorage/fixtures/auth.json new file mode 100644 index 0000000000..79a7e54ab3 --- /dev/null +++ b/tests/providers/azureblobstorage/fixtures/auth.json @@ -0,0 +1,6 @@ +{ + "name": "Test User", + "email": "test@example.com", + "id": "12345", + "provider": "azureblobstorage" +} \ No newline at end of file diff --git a/tests/providers/azureblobstorage/fixtures/blob_list_response.xml b/tests/providers/azureblobstorage/fixtures/blob_list_response.xml new file mode 100644 index 0000000000..3216cc05d6 --- /dev/null +++ b/tests/providers/azureblobstorage/fixtures/blob_list_response.xml @@ -0,0 +1,29 @@ + + + test/folder/ + + 5000 + / + + + test/folder/file1.txt + + Mon, 15 Jul 2025 07:28:00 GMT + "0x8D1A2B3C4D5E6F7" + 1024 + text/plain + + + sQqNsWTgdUEFt6mb5y4/5Q== + + BlockBlob + unlocked + available + + + + test/folder/subfolder/ + + + + \ No newline at end of file diff --git a/tests/providers/azureblobstorage/fixtures/blob_list_with_versions.xml b/tests/providers/azureblobstorage/fixtures/blob_list_with_versions.xml new file mode 100644 index 0000000000..720bab8346 --- /dev/null +++ b/tests/providers/azureblobstorage/fixtures/blob_list_with_versions.xml @@ -0,0 +1,31 @@ + + + + + test/report.pdf + 2025-07-15T09:45:00.5555555Z + true + + Mon, 15 Jul 2025 09:45:00 GMT + "0x8D1A2B3C4D5E702" + 524288 + application/pdf + BlockBlob + Hot + + + + test/report.pdf + 2025-07-15T08:30:00.4444444Z + false + + Mon, 15 Jul 2025 08:30:00 GMT + "0x8D1A2B3C4D5E701" + 520192 + application/pdf + BlockBlob + Hot + + + + \ No newline at end of file diff --git a/tests/providers/azureblobstorage/fixtures/blob_properties_headers.json b/tests/providers/azureblobstorage/fixtures/blob_properties_headers.json new file mode 100644 index 0000000000..e9db68a9b3 --- /dev/null +++ b/tests/providers/azureblobstorage/fixtures/blob_properties_headers.json @@ -0,0 +1,20 @@ +{ + "Accept-Ranges": "bytes", + "Content-Length": "1048576", + "Content-Type": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "Content-MD5": "Q2h1Y2sgTm9ycmlzIGFwcHJvdmVz", + "Last-Modified": "Mon, 15 Jul 2025 09:45:00 GMT", + "ETag": "\"0x8D1A2B3C4D5E702\"", + "x-ms-blob-type": "BlockBlob", + "x-ms-lease-status": "unlocked", + "x-ms-lease-state": "available", + "x-ms-access-tier": "Hot", + "x-ms-access-tier-inferred": "true", + "x-ms-server-encrypted": "true", + "x-ms-version-id": "2025-07-15T09:45:00.5555555Z", + "x-ms-is-current-version": "true", + "x-ms-creation-time": "Mon, 15 Jul 2025 09:45:00 GMT", + "x-ms-meta-author": "Data Science Team", + "x-ms-meta-project": "Q3-Analysis", + "x-ms-meta-tags": "quarterly,financial,2025" +} diff --git a/tests/providers/azureblobstorage/fixtures/credentials.json b/tests/providers/azureblobstorage/fixtures/credentials.json new file mode 100644 index 0000000000..e4c21bf618 --- /dev/null +++ b/tests/providers/azureblobstorage/fixtures/credentials.json @@ -0,0 +1,3 @@ +{ + "token": "eyJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiIsIng1dCI6Ik1uQ19WWmNBVGZNNXBP..." 
+} diff --git a/tests/providers/azureblobstorage/fixtures/empty_folder_check.xml b/tests/providers/azureblobstorage/fixtures/empty_folder_check.xml new file mode 100644 index 0000000000..7e878abf72 --- /dev/null +++ b/tests/providers/azureblobstorage/fixtures/empty_folder_check.xml @@ -0,0 +1,10 @@ + + + test/newfolder/ + + 5000 + / + + + + \ No newline at end of file diff --git a/tests/providers/azureblobstorage/fixtures/empty_list_response.xml b/tests/providers/azureblobstorage/fixtures/empty_list_response.xml new file mode 100644 index 0000000000..074872eab2 --- /dev/null +++ b/tests/providers/azureblobstorage/fixtures/empty_list_response.xml @@ -0,0 +1,10 @@ + + + + + 5000 + / + + + + \ No newline at end of file diff --git a/tests/providers/azureblobstorage/fixtures/error_authentication_failed.xml b/tests/providers/azureblobstorage/fixtures/error_authentication_failed.xml new file mode 100644 index 0000000000..2260451afb --- /dev/null +++ b/tests/providers/azureblobstorage/fixtures/error_authentication_failed.xml @@ -0,0 +1,6 @@ + + + AuthenticationFailed + Server failed to authenticate the request. Make sure the value of the Authorization header is formed correctly including the signature. + The MAC signature found in the HTTP request is not the same as any computed signature. + \ No newline at end of file diff --git a/tests/providers/azureblobstorage/fixtures/error_authorization_failure.xml b/tests/providers/azureblobstorage/fixtures/error_authorization_failure.xml new file mode 100644 index 0000000000..76b6787f0e --- /dev/null +++ b/tests/providers/azureblobstorage/fixtures/error_authorization_failure.xml @@ -0,0 +1,5 @@ + + + AuthorizationFailure + This request is not authorized to perform this operation. + diff --git a/tests/providers/azureblobstorage/fixtures/error_internal_error.xml b/tests/providers/azureblobstorage/fixtures/error_internal_error.xml new file mode 100644 index 0000000000..1defb804ff --- /dev/null +++ b/tests/providers/azureblobstorage/fixtures/error_internal_error.xml @@ -0,0 +1,5 @@ + + + InternalError + The server encountered an internal error. Please retry the request. + diff --git a/tests/providers/azureblobstorage/fixtures/error_not_found.xml b/tests/providers/azureblobstorage/fixtures/error_not_found.xml new file mode 100644 index 0000000000..8484b1881e --- /dev/null +++ b/tests/providers/azureblobstorage/fixtures/error_not_found.xml @@ -0,0 +1,5 @@ + + + BlobNotFound + The specified blob does not exist. 
+ \ No newline at end of file diff --git a/tests/providers/azureblobstorage/fixtures/folder_exists.xml b/tests/providers/azureblobstorage/fixtures/folder_exists.xml new file mode 100644 index 0000000000..f130e5d12e --- /dev/null +++ b/tests/providers/azureblobstorage/fixtures/folder_exists.xml @@ -0,0 +1,18 @@ + + + test/existingfolder/ + / + + + test/existingfolder/.osfkeep + + 100 + text/plain + Mon, 15 Jul 2025 08:00:00 GMT + "0x8D1A2B3C4D5E6F8" + BlockBlob + + + + + \ No newline at end of file diff --git a/tests/providers/azureblobstorage/fixtures/folder_not_found_response.xml b/tests/providers/azureblobstorage/fixtures/folder_not_found_response.xml new file mode 100644 index 0000000000..546d50aa6e --- /dev/null +++ b/tests/providers/azureblobstorage/fixtures/folder_not_found_response.xml @@ -0,0 +1,10 @@ + + + test/nonexistent-folder/ + + 1 + / + + + + diff --git a/tests/providers/azureblobstorage/fixtures/folder_placeholder_headers.json b/tests/providers/azureblobstorage/fixtures/folder_placeholder_headers.json new file mode 100644 index 0000000000..c8fe28c66f --- /dev/null +++ b/tests/providers/azureblobstorage/fixtures/folder_placeholder_headers.json @@ -0,0 +1,6 @@ +{ + "ETag": "\"0x8D1A2B3C4D5E6FB\"", + "Last-Modified": "Mon, 15 Jul 2025 10:00:00 GMT", + "Content-Length": "0", + "x-ms-blob-type": "BlockBlob" +} diff --git a/tests/providers/azureblobstorage/fixtures/folder_validation_response.xml b/tests/providers/azureblobstorage/fixtures/folder_validation_response.xml new file mode 100644 index 0000000000..4c42d05e35 --- /dev/null +++ b/tests/providers/azureblobstorage/fixtures/folder_validation_response.xml @@ -0,0 +1,26 @@ + + + test-folder/ + + 1 + / + + + test-folder/file1.txt + + Mon, 15 Jul 2025 08:00:00 GMT + "0x8D1A2B3C4D5E6F8" + 100 + text/plain + + + + + BlockBlob + unlocked + available + + + + + diff --git a/tests/providers/azureblobstorage/fixtures/root_list_response.xml b/tests/providers/azureblobstorage/fixtures/root_list_response.xml new file mode 100644 index 0000000000..163a0e313d --- /dev/null +++ b/tests/providers/azureblobstorage/fixtures/root_list_response.xml @@ -0,0 +1,32 @@ + + + test/ + + 5000 + / + + + test/root-file.txt + + Mon, 15 Jul 2025 10:00:00 GMT + "0x8D1A2B3C4D5E703" + 256 + text/plain + + + + + BlockBlob + unlocked + available + + + + test/documents/ + + + test/images/ + + + + \ No newline at end of file diff --git a/tests/providers/azureblobstorage/fixtures/settings.json b/tests/providers/azureblobstorage/fixtures/settings.json new file mode 100644 index 0000000000..06139ebe40 --- /dev/null +++ b/tests/providers/azureblobstorage/fixtures/settings.json @@ -0,0 +1,5 @@ +{ + "container": "test-container", + "account_name": "teststorageaccount", + "base_folder": "test/" +} diff --git a/tests/providers/azureblobstorage/fixtures/special_characters_blobs.xml b/tests/providers/azureblobstorage/fixtures/special_characters_blobs.xml new file mode 100644 index 0000000000..735ed2f609 --- /dev/null +++ b/tests/providers/azureblobstorage/fixtures/special_characters_blobs.xml @@ -0,0 +1,35 @@ + + + + + test/file with spaces.txt + + Mon, 15 Jul 2025 11:00:00 GMT + "0x8D1A2B3C4D5E704" + 512 + text/plain + BlockBlob + + + + test/special@#$%file.txt + + Mon, 15 Jul 2025 11:15:00 GMT + "0x8D1A2B3C4D5E705" + 128 + text/plain + BlockBlob + + + + test/中文文件名.txt + + Mon, 15 Jul 2025 11:30:00 GMT + "0x8D1A2B3C4D5E706" + 64 + text/plain + BlockBlob + + + + \ No newline at end of file diff --git a/tests/providers/azureblobstorage/test_metadata.py 
b/tests/providers/azureblobstorage/test_metadata.py new file mode 100644 index 0000000000..a7307fb34c --- /dev/null +++ b/tests/providers/azureblobstorage/test_metadata.py @@ -0,0 +1,205 @@ +from tests.providers.azureblobstorage.fixtures import ( + blob_list_with_versions_response, blob_properties_headers, blob_list_xml, + special_characters_blobs, empty_list_xml, root_list_xml, + provider, auth, credentials, settings +) +from waterbutler.core.path import WaterButlerPath +from waterbutler.providers.azureblobstorage.metadata import ( + AzureBlobStorageFileMetadata, + AzureBlobStorageFileMetadataHeaders, + AzureBlobStorageFolderMetadata +) + + + +class TestAzureBlobFileMetadata: + """Test file metadata transformation from various sources""" + + def test_file_metadata_from_headers(self, blob_properties_headers): + """Test creating file metadata from HTTP headers""" + path = WaterButlerPath('/test-file.xlsx') + + metadata = AzureBlobStorageFileMetadataHeaders(path.path, blob_properties_headers) + + assert metadata.name == 'test-file.xlsx' + assert metadata.path == '/test-file.xlsx' + assert metadata.size == int(blob_properties_headers['Content-Length']) + assert metadata.content_type == blob_properties_headers['Content-Type'] + assert metadata.etag == blob_properties_headers['ETag'].strip('"') + assert metadata.modified == blob_properties_headers['Last-Modified'] + assert metadata.extra['md5'] == blob_properties_headers.get('Content-MD5', '') + + def test_file_metadata_from_list_response(self, provider, blob_list_xml): + """Test creating file metadata from XML list response""" + parsed = provider._convert_xml_to_blob_list(blob_list_xml) + + # Get the first blob from the parsed result + blobs = parsed.get('Blob', []) + assert len(blobs) > 0, "No blobs found in test fixture" + + blob_data = blobs[0] + metadata = AzureBlobStorageFileMetadata(blob_data) + + assert metadata.name == 'file1.txt' + assert '/file1.txt' in metadata.path + assert metadata.size == 1024 + assert metadata.content_type == 'text/plain' + assert metadata.etag == '0x8D1A2B3C4D5E6F7' + assert metadata.modified == 'Mon, 15 Jul 2025 07:28:00 GMT' + assert metadata.extra['md5'] == 'sQqNsWTgdUEFt6mb5y4/5Q==' + + def test_file_metadata_with_versions(self, provider, blob_list_with_versions_response): + """Test file metadata with version information""" + parsed = provider._convert_xml_to_blob_list(blob_list_with_versions_response) + + blobs = parsed.get('Blob', []) + assert len(blobs) > 0, "No blobs found in test fixture" + + # Get the first blob (current version) + blob_data = blobs[0] + metadata = AzureBlobStorageFileMetadata(blob_data) + + assert metadata.name == 'report.pdf' + assert '/report.pdf' in metadata.path + assert metadata.size == 524288 + assert metadata.extra['blob_type'] == 'BlockBlob' + + def test_file_metadata_minimal(self): + """Test file metadata with minimal required fields""" + headers = { + 'Content-Length': '100', + 'Last-Modified': 'Mon, 15 Jul 2025 12:00:00 GMT', + 'ETag': '"0x8D1A2B3C4D5E999"', + 'Content-Type': 'application/octet-stream' + } + + path = WaterButlerPath('/minimal.txt') + metadata = AzureBlobStorageFileMetadataHeaders(path.path, headers) + + assert metadata.name == 'minimal.txt' + assert metadata.size == 100 + assert metadata.content_type == 'application/octet-stream' + assert metadata.etag == '0x8D1A2B3C4D5E999' + + def test_file_metadata_special_characters(self, provider, special_characters_blobs): + """Test handling special characters in filenames""" + parsed = 
provider._convert_xml_to_blob_list(special_characters_blobs) + + blobs = parsed.get('Blob', []) + assert len(blobs) >= 2, "Expected at least 2 blobs with special characters" + + # Test first blob (file with spaces) + blob_data1 = blobs[0] + metadata1 = AzureBlobStorageFileMetadata(blob_data1) + assert 'file with spaces.txt' in metadata1.name + + # Test second blob (special characters) + blob_data2 = blobs[1] + metadata2 = AzureBlobStorageFileMetadata(blob_data2) + assert 'special' in metadata2.name or '@' in metadata2.name + + def test_file_metadata_serialization(self): + """Test JSON API serialization of file metadata""" + headers = { + 'Content-Length': '2048000', + 'Content-Type': 'application/pdf', + 'ETag': '"0x8D1A2B3C4D5E700"', + 'Last-Modified': 'Mon, 15 Jul 2025 14:30:00 GMT' + } + + path = WaterButlerPath('/report.pdf') + metadata = AzureBlobStorageFileMetadataHeaders(path.path, headers) + + assert metadata.name == 'report.pdf' + assert metadata.size == 2048000 + assert metadata.content_type == 'application/pdf' + assert metadata.etag == '0x8D1A2B3C4D5E700' + + +class TestAzureBlobFolderMetadata: + """Test folder metadata transformation""" + + def test_folder_metadata_from_path(self): + """Test creating folder metadata from blob prefix""" + folder_data = {'Name': 'documents/'} + metadata = AzureBlobStorageFolderMetadata(folder_data) + + assert metadata.name == 'documents' + assert '/documents/' in metadata.path + + def test_folder_metadata_from_list_response(self, provider, blob_list_xml): + """Test creating folder metadata from blob prefix in list response""" + parsed = provider._convert_xml_to_blob_list(blob_list_xml) + + # Get folder prefixes from the parsed result + prefixes = parsed.get('BlobPrefix', []) + if len(prefixes) > 0: + folder_data = prefixes[0] # Get the first prefix + metadata = AzureBlobStorageFolderMetadata(folder_data) + + assert 'subfolder' in metadata.name + assert '/subfolder/' in metadata.path + + def test_folder_metadata_root(self): + """Test root folder metadata""" + folder_data = {'Name': ''} + metadata = AzureBlobStorageFolderMetadata(folder_data) + + assert metadata.name == '' + + def test_folder_metadata_nested(self): + """Test nested folder paths""" + test_cases = [ + ('a/', 'a'), + ('a/b/', 'b'), + ('a/b/c/', 'c'), + ('path/to/deep/folder/', 'folder') + ] + + for folder_path, expected_name in test_cases: + folder_data = {'Name': folder_path} + metadata = AzureBlobStorageFolderMetadata(folder_data) + assert metadata.name == expected_name + + def test_folder_metadata_serialization(self): + """Test basic properties of folder metadata""" + folder_data = {'Name': 'projects/'} + metadata = AzureBlobStorageFolderMetadata(folder_data) + + assert metadata.name == 'projects' + assert '/projects/' in metadata.path + + +class TestMetadataListProcessing: + """Test processing lists of metadata from API responses""" + + def test_process_empty_folder(self, provider, empty_list_xml): + """Test processing empty folder response""" + parsed = provider._convert_xml_to_blob_list(empty_list_xml) + + blobs = parsed.get('Blob', []) + prefixes = parsed.get('BlobPrefix', []) + + assert len(blobs) == 0 + assert len(prefixes) == 0 + + def test_process_root_listing(self, provider, root_list_xml): + """Test processing root container listing""" + parsed = provider._convert_xml_to_blob_list(root_list_xml) + + blobs = parsed.get('Blob', []) + assert len(blobs) >= 0 + + prefixes = parsed.get('BlobPrefix', []) + folder_names = [p['Name'].rstrip('/') for p in prefixes] + assert 
isinstance(folder_names, list) + + def test_mixed_content_listing(self, provider, blob_list_xml): + """Test folder with both files and subfolders""" + parsed = provider._convert_xml_to_blob_list(blob_list_xml) + + blobs = parsed.get('Blob', []) + prefixes = parsed.get('BlobPrefix', []) + + assert len(blobs) >= 0 + assert len(prefixes) >= 0 diff --git a/tests/providers/azureblobstorage/test_provider.py b/tests/providers/azureblobstorage/test_provider.py new file mode 100644 index 0000000000..62c3315b3d --- /dev/null +++ b/tests/providers/azureblobstorage/test_provider.py @@ -0,0 +1,551 @@ +import pytest + + +import aiohttpretty + +from waterbutler.core import exceptions +from waterbutler.core.path import WaterButlerPath +from waterbutler.providers.azureblobstorage.provider import AzureBlobStorageProvider + +from tests.providers.azureblobstorage.fixtures import ( + auth, credentials, settings, provider, file_content, + file_stream, large_file_stream, large_file_content, + file_like, large_file_like, blob_list_xml, root_list_xml, + empty_folder_list_xml, folder_placeholder_headers, + folder_exists_xml, folder_validation_response_xml, + error_authentication_failed_xml, error_authorization_failure_xml, + error_not_found_xml, error_internal_error_xml, folder_not_found_response_xml +) + + +class TestValidatePath: + """Test path validation and parsing""" + + @pytest.mark.asyncio + @pytest.mark.aiohttpretty + async def test_validate_v1_path_file(self, provider): + """Test file path validation""" + blob_url = provider.build_url(provider.container, provider._get_blob_path('test-file.txt')) + + aiohttpretty.register_uri( + 'HEAD', + blob_url, + status=200, + headers={'Content-Type': 'text/plain', 'Content-Length': '100'} + ) + + wb_path_v1 = await provider.validate_v1_path('/test-file.txt') + wb_path_v0 = await provider.validate_path('/test-file.txt') + + assert wb_path_v1 == wb_path_v0 + assert not wb_path_v1.is_dir + + @pytest.mark.asyncio + @pytest.mark.aiohttpretty + async def test_validate_v1_path_folder(self, provider, folder_validation_response_xml): + """Test folder path validation""" + list_url = provider.build_url(provider.container) + + aiohttpretty.register_uri( + 'GET', + list_url, + params={ + 'restype': 'container', + 'comp': 'list', + 'prefix': 'test/test-folder/', + 'delimiter': '/', + 'maxresults': '1' + }, + body=folder_validation_response_xml, + headers={'Content-Type': 'application/xml'}, + status=200 + ) + + wb_path_v1 = await provider.validate_v1_path('/test-folder/') + wb_path_v0 = await provider.validate_path('/test-folder/') + + assert wb_path_v1.is_dir + assert wb_path_v0.is_dir + assert wb_path_v1.path == 'test-folder/' + assert wb_path_v0.path == 'test-folder/' + + @pytest.mark.asyncio + @pytest.mark.aiohttpretty + async def test_validate_v1_path_folder_not_found(self, provider, folder_not_found_response_xml): + """Test folder path validation for non-existent folder""" + list_url = provider.build_url(provider.container) + + aiohttpretty.register_uri( + 'GET', + list_url, + params={ + 'restype': 'container', + 'comp': 'list', + 'prefix': 'test/nonexistent-folder/', + 'delimiter': '/', + 'maxresults': '1' + }, + body=folder_not_found_response_xml, + headers={'Content-Type': 'application/xml'}, + status=200 + ) + + with pytest.raises(exceptions.NotFoundError): + await provider.validate_v1_path('/nonexistent-folder/') + + wb_path_v0 = await provider.validate_path('/nonexistent-folder/') + assert wb_path_v0.is_dir + assert wb_path_v0.path == 'nonexistent-folder/' + + +class 
TestDownload: + """Test file download operations""" + + @pytest.mark.asyncio + @pytest.mark.aiohttpretty + async def test_download_file(self, provider): + """Test basic file download""" + blob_url = provider.build_url(provider.container, provider._get_blob_path('test-file.txt')) + + aiohttpretty.register_uri( + 'GET', + blob_url, + body=b'test file content', + headers={ + 'Content-Type': 'text/plain', + 'Content-Length': '17', + 'ETag': '"0x8D1A2B3C4D5E6F7"', + 'Last-Modified': 'Mon, 15 Jul 2025 07:28:00 GMT' + }, + status=200 + ) + + path = WaterButlerPath('/test-file.txt') + result = await provider.download(path) + content = await result.read() + assert content == b'test file content' + + @pytest.mark.asyncio + @pytest.mark.aiohttpretty + async def test_download_range(self, provider): + """Test download with byte range""" + blob_url = provider.build_url(provider.container, provider._get_blob_path('test-file.txt')) + + aiohttpretty.register_uri( + 'GET', + blob_url, + body=b'te', + headers={ + 'Content-Range': 'bytes 0-1/17', + 'Content-Type': 'text/plain', + 'Content-Length': '2' + }, + status=206 + ) + + path = WaterButlerPath('/test-file.txt') + result = await provider.download(path, range=(0, 1)) + content = await result.read() + assert content == b'te' + + @pytest.mark.asyncio + @pytest.mark.aiohttpretty + async def test_download_unauthorized(self, provider, error_authentication_failed_xml): + """Test download with invalid OAuth token""" + blob_url = provider.build_url(provider.container, provider._get_blob_path('test-file.txt')) + + aiohttpretty.register_uri( + 'GET', + blob_url, + status=401, + headers={'Content-Type': 'application/xml'}, + body=error_authentication_failed_xml + ) + + path = WaterButlerPath('/test-file.txt') + with pytest.raises(exceptions.DownloadError): + await provider.download(path) + + +class TestUpload: + """Test file upload operations""" + + @pytest.mark.asyncio + @pytest.mark.aiohttpretty + async def test_upload_file(self, provider, file_stream): + """Test basic file upload""" + blob_url = provider.build_url(provider.container, provider._get_blob_path('test-upload.txt')) + + aiohttpretty.register_uri( + 'HEAD', + blob_url, + responses=[ + {'status': 404}, + { + 'status': 200, + 'headers': { + 'Content-Type': 'application/octet-stream', + 'Content-Length': '39', + 'ETag': '"0x8D1A2B3C4D5E6F8"', + 'Last-Modified': 'Mon, 15 Jul 2025 08:00:00 GMT' + } + } + ] + ) + + aiohttpretty.register_uri( + 'PUT', + blob_url, + headers={ + 'ETag': '"0x8D1A2B3C4D5E6F8"', + 'Last-Modified': 'Mon, 15 Jul 2025 08:00:00 GMT' + }, + status=201 + ) + + path = WaterButlerPath('/test-upload.txt') + metadata, created = await provider.upload(file_stream, path) + + assert created is True + assert metadata.name == 'test-upload.txt' + + @pytest.mark.asyncio + @pytest.mark.aiohttpretty + async def test_upload_overwrite(self, provider, file_stream): + """Test upload overwriting existing file""" + blob_url = provider.build_url(provider.container, provider._get_blob_path('existing-file.txt')) + + aiohttpretty.register_uri( + 'HEAD', + blob_url, + responses=[ + { + 'status': 200, + 'headers': { + 'ETag': '"0x8D1A2B3C4D5E6F7"', + 'Content-Length': '1024', + 'Last-Modified': 'Mon, 15 Jul 2025 07:00:00 GMT' + } + }, + { + 'status': 200, + 'headers': { + 'Content-Type': 'application/octet-stream', + 'Content-Length': '39', + 'ETag': '"0x8D1A2B3C4D5E6F9"', + 'Last-Modified': 'Mon, 15 Jul 2025 08:30:00 GMT' + } + } + ] + ) + + aiohttpretty.register_uri( + 'PUT', + blob_url, + headers={ + 'ETag': 
'"0x8D1A2B3C4D5E6F9"', + 'Last-Modified': 'Mon, 15 Jul 2025 08:30:00 GMT' + }, + status=201 + ) + + path = WaterButlerPath('/existing-file.txt') + metadata, created = await provider.upload(file_stream, path, conflict='replace') + + assert created is False + assert metadata.name == 'existing-file.txt' + + @pytest.mark.asyncio + @pytest.mark.aiohttpretty + async def test_upload_large_file_simplified(self, provider, large_file_stream): + """Test large file upload""" + blob_url = provider.build_url(provider.container, provider._get_blob_path('large-file.bin')) + + aiohttpretty.register_uri( + 'HEAD', + blob_url, + responses=[ + {'status': 404}, + { + 'status': 200, + 'headers': { + 'Content-Type': 'application/octet-stream', + 'Content-Length': '10485760', # 10MB + 'ETag': '"0x8D1A2B3C4D5E6FA"', + 'Last-Modified': 'Mon, 15 Jul 2025 09:00:00 GMT' + } + } + ] + ) + + aiohttpretty.register_uri( + 'PUT', + blob_url, + headers={ + 'ETag': '"0x8D1A2B3C4D5E6FA"', + 'Last-Modified': 'Mon, 15 Jul 2025 09:00:00 GMT' + }, + status=201 + ) + + aiohttpretty.register_uri( + 'PUT', + f'{blob_url}?comp=block&blockid=*', + status=201, + match_querystring=False + ) + + aiohttpretty.register_uri( + 'PUT', + f'{blob_url}?comp=blocklist', + headers={'ETag': '"0x8D1A2B3C4D5E6FA"'}, + status=201 + ) + + path = WaterButlerPath('/large-file.bin') + metadata, created = await provider.upload(large_file_stream, path) + + assert created is True + assert metadata.name == 'large-file.bin' + + +class TestMetadata: + """Test metadata operations""" + + @pytest.mark.asyncio + @pytest.mark.aiohttpretty + async def test_metadata_file(self, provider): + """Test getting file metadata""" + blob_url = provider.build_url(provider.container, provider._get_blob_path('test-file.txt')) + + aiohttpretty.register_uri( + 'HEAD', + blob_url, + headers={ + 'Content-Type': 'text/plain', + 'Content-Length': '1024', + 'ETag': '"0x8D1A2B3C4D5E6F7"', + 'Last-Modified': 'Mon, 15 Jul 2025 07:28:00 GMT' + }, + status=200 + ) + + path = WaterButlerPath('/test-file.txt') + metadata = await provider.metadata(path) + + assert metadata.name == 'test-file.txt' + assert metadata.size == 1024 + assert metadata.content_type == 'text/plain' + + @pytest.mark.asyncio + @pytest.mark.aiohttpretty + async def test_metadata_folder(self, provider, blob_list_xml): + """Test listing folder contents""" + list_url = provider.build_url(provider.container) + + aiohttpretty.register_uri( + 'GET', + list_url, + params={'restype': 'container', 'comp': 'list', 'prefix': 'test/folder/', 'delimiter': '/'}, + body=blob_list_xml, + headers={'Content-Type': 'application/xml'}, + status=200 + ) + + path = WaterButlerPath('/folder/') + metadata_list = await provider.metadata(path) + + assert isinstance(metadata_list, list) + + @pytest.mark.asyncio + @pytest.mark.aiohttpretty + async def test_metadata_root(self, provider, root_list_xml): + """Test listing root container contents""" + list_url = provider.build_url(provider.container) + + aiohttpretty.register_uri( + 'GET', + list_url, + params={'restype': 'container', 'comp': 'list', 'prefix': 'test/', 'delimiter': '/'}, + body=root_list_xml, + headers={'Content-Type': 'application/xml'}, + status=200 + ) + + path = WaterButlerPath('/') + metadata_list = await provider.metadata(path) + assert isinstance(metadata_list, list) + + +class TestDelete: + """Test delete operations""" + + @pytest.mark.asyncio + @pytest.mark.aiohttpretty + async def test_delete_file(self, provider): + """Test deleting a file""" + blob_url = 
provider.build_url(provider.container, provider._get_blob_path('test-file.txt')) + + aiohttpretty.register_uri('DELETE', blob_url, status=202) + + path = WaterButlerPath('/test-file.txt') + await provider.delete(path) + + assert aiohttpretty.has_call(method='DELETE', uri=blob_url) + + @pytest.mark.asyncio + @pytest.mark.aiohttpretty + async def test_delete_not_found(self, provider, error_not_found_xml): + """Test deleting non-existent file""" + blob_url = provider.build_url(provider.container, provider._get_blob_path('missing-file.txt')) + + aiohttpretty.register_uri( + 'DELETE', + blob_url, + status=404, + headers={'Content-Type': 'application/xml'}, + body=error_not_found_xml + ) + + path = WaterButlerPath('/missing-file.txt') + await provider.delete(path) + + +class TestCreateFolder: + """Test folder creation""" + + @pytest.mark.asyncio + @pytest.mark.aiohttpretty + async def test_create_folder(self, provider, empty_folder_list_xml, folder_placeholder_headers): + """Test creating a folder (placeholder blob)""" + list_url = provider.build_url(provider.container) + + aiohttpretty.register_uri( + 'GET', + list_url, + params={'restype': 'container', 'comp': 'list', 'prefix': 'test/newfolder/', 'delimiter': '/'}, + body=empty_folder_list_xml, + headers={'Content-Type': 'application/xml'}, + status=200 + ) + + placeholder_url = provider.build_url(provider.container, provider._get_blob_path('newfolder/.osfkeep')) + + aiohttpretty.register_uri( + 'HEAD', + provider.build_url(provider.container, provider._get_blob_path('newfolder')), + status=404 + ) + + aiohttpretty.register_uri( + 'PUT', + placeholder_url, + headers=folder_placeholder_headers, + status=201 + ) + + path = WaterButlerPath('/newfolder/') + metadata = await provider.create_folder(path) + assert metadata.name == 'newfolder' + + @pytest.mark.asyncio + @pytest.mark.aiohttpretty + async def test_create_folder_already_exists(self, provider, folder_exists_xml): + """Test creating a folder that already exists""" + list_url = provider.build_url(provider.container) + + aiohttpretty.register_uri( + 'GET', + list_url, + params={'restype': 'container', 'comp': 'list', 'prefix': 'test/existingfolder/', 'delimiter': '/'}, + body=folder_exists_xml, + headers={'Content-Type': 'application/xml'}, + status=200 + ) + + path = WaterButlerPath('/existingfolder/') + with pytest.raises(exceptions.FolderNamingConflict): + await provider.create_folder(path) + + +class TestAuthentication: + """Test authentication and provider initialization""" + + def test_provider_initialization(self, auth, credentials, settings): + """Test provider initialization with OAuth credentials""" + provider_instance = AzureBlobStorageProvider(auth, credentials, settings) + + assert provider_instance.account_name == settings['account_name'] + assert provider_instance.container == settings['container'] + assert provider_instance.auth_token == credentials['token'] + + def test_provider_validation_missing_credentials(self, auth, settings): + """Test provider fails with missing credentials""" + with pytest.raises(ValueError, match="token is required"): + AzureBlobStorageProvider(auth, {}, settings) + + def test_provider_validation_missing_settings(self, auth, credentials): + """Test provider fails with missing settings""" + with pytest.raises(ValueError, match="account_name is required"): + AzureBlobStorageProvider(auth, credentials, {'container': 'test'}) + + with pytest.raises(ValueError, match="container is required"): + AzureBlobStorageProvider(auth, credentials, {'account_name': 
'test'}) + + +class TestErrorHandling: + """Test core error scenarios""" + + @pytest.mark.asyncio + @pytest.mark.aiohttpretty + async def test_expired_token(self, provider, error_authentication_failed_xml): + """Test handling of an expired OAuth token""" + blob_url = provider.build_url(provider.container, provider._get_blob_path('test-file.txt')) + + aiohttpretty.register_uri( + 'GET', + blob_url, + status=401, + headers={'Content-Type': 'application/xml'}, + body=error_authentication_failed_xml + ) + + path = WaterButlerPath('/test-file.txt') + with pytest.raises(exceptions.DownloadError): + await provider.download(path) + + @pytest.mark.asyncio + @pytest.mark.aiohttpretty + async def test_insufficient_permissions(self, provider, error_authorization_failure_xml): + """Test handling of insufficient permissions""" + blob_url = provider.build_url(provider.container, provider._get_blob_path('test-file.txt')) + + aiohttpretty.register_uri( + 'DELETE', + blob_url, + status=403, + headers={'Content-Type': 'application/xml'}, + body=error_authorization_failure_xml + ) + + path = WaterButlerPath('/test-file.txt') + with pytest.raises(exceptions.DeleteError): + await provider.delete(path) + + @pytest.mark.asyncio + @pytest.mark.aiohttpretty + async def test_server_error(self, provider, error_internal_error_xml): + """Test handling of server errors""" + blob_url = provider.build_url(provider.container, provider._get_blob_path('test-file.txt')) + + aiohttpretty.register_uri( + 'GET', + blob_url, + status=500, + headers={'Content-Type': 'application/xml'}, + body=error_internal_error_xml + ) + + path = WaterButlerPath('/test-file.txt') + with pytest.raises(exceptions.DownloadError): + await provider.download(path) diff --git a/waterbutler/core/streams/__init__.py b/waterbutler/core/streams/__init__.py index 8f5230589d..6feeb4780d 100644 --- a/waterbutler/core/streams/__init__.py +++ b/waterbutler/core/streams/__init__.py @@ -4,7 +4,7 @@ from waterbutler.core.streams.base import CutoffStream # noqa from waterbutler.core.streams.base import StringStream # noqa from waterbutler.core.streams.base import EmptyStream # noqa - +from waterbutler.core.streams.base import ByteStream # noqa from waterbutler.core.streams.file import FileStreamReader # noqa from waterbutler.core.streams.file import PartialFileStreamReader # noqa diff --git a/waterbutler/core/streams/base.py b/waterbutler/core/streams/base.py index 449e7922ed..ad44f246ec 100644 --- a/waterbutler/core/streams/base.py +++ b/waterbutler/core/streams/base.py @@ -220,6 +220,24 @@ async def _read(self, n=-1): return await asyncio.StreamReader.read(self, n) +class ByteStream(BaseStream): + def __init__(self, data): + super().__init__() + if not isinstance(data, bytes): + raise TypeError('Data must be bytes, found {!r}'.format(type(data))) + + self._size = len(data) + self.feed_data(data) + self.feed_eof() + + @property + def size(self): + return self._size + + async def _read(self, n=-1): + return await asyncio.StreamReader.read(self, n) + + class EmptyStream(BaseStream): """An empty stream with size 0 that returns nothing when read. Useful for representing empty folders when building zipfiles.
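A quick illustration of how the new ByteStream behaves; this demo is not part of the patch and assumes only the ByteStream class added above plus WaterButler's existing BaseStream.read() interface. The whole payload is fed into the asyncio stream machinery at construction, so the size is known up front and reads simply drain the buffer:

import asyncio

from waterbutler.core import streams


async def demo():
    stream = streams.ByteStream(b'block payload')
    assert stream.size == 13  # size is captured at construction time
    assert await stream.read() == b'block payload'  # drains the eagerly fed buffer
    assert await stream.read() == b''  # EOF was already fed, so further reads return nothing


asyncio.get_event_loop().run_until_complete(demo())

This is the shape that _upload_blocks (further down in provider.py) relies on: each chunk read from the source stream is wrapped in a ByteStream so it can be re-sent as a Put Block request body with a definite Content-Length.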
diff --git a/waterbutler/providers/azureblobstorage/__init__.py b/waterbutler/providers/azureblobstorage/__init__.py new file mode 100644 index 0000000000..2012c53c63 --- /dev/null +++ b/waterbutler/providers/azureblobstorage/__init__.py @@ -0,0 +1,3 @@ +from waterbutler.providers.azureblobstorage.provider import AzureBlobStorageProvider + +__all__ = ['AzureBlobStorageProvider'] diff --git a/waterbutler/providers/azureblobstorage/exceptions.py b/waterbutler/providers/azureblobstorage/exceptions.py new file mode 100644 index 0000000000..3dafef1110 --- /dev/null +++ b/waterbutler/providers/azureblobstorage/exceptions.py @@ -0,0 +1,31 @@ +from waterbutler.core import exceptions + + +class AzureBlobStorageError(exceptions.ProviderError): + """Base exception for the Azure Blob Storage provider.""" + pass + + +class AzureBlobStorageNotFoundError(AzureBlobStorageError, exceptions.NotFoundError): + """Exception for when a blob is not found.""" + pass + + +class AzureBlobStorageUploadError(AzureBlobStorageError, exceptions.UploadError): + """Exception for upload errors.""" + pass + + +class AzureBlobStorageDownloadError(AzureBlobStorageError, exceptions.DownloadError): + """Exception for download errors.""" + pass + + +class AzureBlobStorageDeleteError(AzureBlobStorageError, exceptions.DeleteError): + """Exception for delete errors.""" + pass + + +class AzureBlobStorageMetadataError(AzureBlobStorageError, exceptions.MetadataError): + """Exception for metadata errors.""" + pass diff --git a/waterbutler/providers/azureblobstorage/metadata.py b/waterbutler/providers/azureblobstorage/metadata.py new file mode 100644 index 0000000000..ccff598056 --- /dev/null +++ b/waterbutler/providers/azureblobstorage/metadata.py @@ -0,0 +1,124 @@ +import os +import logging + +from waterbutler.core import utils +from waterbutler.core import metadata + +logger = logging.getLogger(__name__) + + +def strip_char(value, prefix): + if value.startswith(prefix): + return value[len(prefix):] + return value + + +class AzureBlobStorageMetadata(metadata.BaseMetadata): + + @property + def provider(self): + return 'azureblobstorage' + + @property + def name(self): + return os.path.split(self.path)[1] + + @property + def created_utc(self): + return None + + +class AzureBlobStorageFileMetadataHeaders(AzureBlobStorageMetadata, metadata.BaseFileMetadata): + + def __init__(self, path, headers): + self._path = path + super().__init__(dict(headers)) + + @property + def path(self): + return '/' + strip_char(self._path, self.raw.get('base_folder', '')) + + @property + def size(self): + return int(self.raw['Content-Length']) + + @property + def content_type(self): + return self.raw['Content-Type'] + + @property + def created_utc(self): + creation_time = self.raw.get('x-ms-creation-time') + return utils.normalize_datetime(creation_time) + + @property + def modified(self): + return self.raw.get('Last-Modified') + + @property + def etag(self): + return self.raw.get('ETag', '').strip('"') + + @property + def extra(self): + return { + 'md5': self.raw.get('Content-MD5', ''), + 'etag': self.raw.get('ETag', '').strip('"') + } + + +class AzureBlobStorageFileMetadata(AzureBlobStorageMetadata, metadata.BaseFileMetadata): + + @property + def path(self): + return '/' + strip_char(self.raw['Name'], self.raw.get('base_folder', '')) + + @property + def size(self): + return int(self.raw['Properties']['Content-Length']) + + @property + def content_type(self): + return self.raw['Properties']['Content-Type'] + + @property + def created_utc(self): + creation_time =
self.raw['Properties'].get('CreationTime') + return utils.normalize_datetime(creation_time) + + @property + def modified(self): + return self.raw['Properties'].get('Last-Modified') + + @property + def etag(self): + return self.raw['Properties'].get('Etag', '').strip('"') + + @property + def extra(self): + return { + 'md5': self.raw['Properties'].get('Content-MD5', ''), + 'etag': self.raw['Properties'].get('Etag', '').strip('"'), + 'blob_type': self.raw['Properties'].get('BlobType', ''), + 'access_tier': self.raw['Properties'].get('AccessTier', ''), + 'creation_time': self.raw['Properties'].get('CreationTime', ''), + 'lease_status': self.raw['Properties'].get('LeaseStatus', ''), + 'lease_state': self.raw['Properties'].get('LeaseState', ''), + 'server_encrypted': self.raw['Properties'].get('ServerEncrypted', '') + } + + +class AzureBlobStorageFolderMetadata(AzureBlobStorageMetadata, metadata.BaseFolderMetadata): + + @property + def name(self): + if not self.raw['Name'] or self.raw['Name'] == '/': + return '' + name_parts = self.raw['Name'].rstrip('/').split('/') + return name_parts[-1] if name_parts else '' + + @property + def path(self): + if self.raw.get('base_folder', ''): + return '/' + strip_char(self.raw['Name'], self.raw.get('base_folder', '')) + return '/' + self.raw['Name'] diff --git a/waterbutler/providers/azureblobstorage/provider.py b/waterbutler/providers/azureblobstorage/provider.py new file mode 100644 index 0000000000..0d37c5383f --- /dev/null +++ b/waterbutler/providers/azureblobstorage/provider.py @@ -0,0 +1,470 @@ +import base64 +import hashlib +import asyncio +import logging +import datetime +import xml.etree.ElementTree as ET +import uuid + +from waterbutler.core import streams +from waterbutler.core import provider +from waterbutler.core import exceptions +from waterbutler.core.path import WaterButlerPath +from waterbutler.core.streams import StringStream, ByteStream + +from waterbutler.providers.azureblobstorage.metadata import AzureBlobStorageFileMetadata +from waterbutler.providers.azureblobstorage.metadata import AzureBlobStorageFolderMetadata +from waterbutler.providers.azureblobstorage.metadata import AzureBlobStorageFileMetadataHeaders + +MAX_UPLOAD_BLOCK_SIZE = 4 * 1024 * 1024 # 4MB +MAX_UPLOAD_ONCE_SIZE = 64 * 1024 * 1024 # 64MB +UPLOAD_PARALLEL_NUM = 2 # must be more than 1 +API_VERSION = '2025-07-05' + +logger = logging.getLogger(__name__) + + +class AzureBlobStorageProvider(provider.BaseProvider): + """Provider for Azure Blob Storage cloud storage service using OAuth2 authentication.""" + + NAME = 'azureblobstorage' + + def __init__(self, auth, credentials, settings, **kwargs): + """ + :param dict auth: Not used + :param dict credentials: Dict containing OAuth2 'token' + :param dict settings: Dict containing 'container', 'account_name', and optional 'prefix' + """ + super().__init__(auth, credentials, settings, **kwargs) + + self.account_name = settings.get('account_name') + self.container = settings.get('container') + self.base_folder = settings.get('base_folder', '/') + self.auth_token = credentials.get('token') + + # Set BASE_URL for the parent class build_url method + self.BASE_URL = f"https://{self.account_name}.blob.core.windows.net" + + if not self.account_name: + raise ValueError("account_name is required") + if not self.container: + raise ValueError("container is required") + if not self.auth_token: + raise ValueError("token is required") + + def _get_blob_path(self, blob_name=None): + if blob_name: + return f"{self.base_folder}{blob_name}" if 
self.base_folder else blob_name + return self.base_folder or '/' + + def build_url(self, *segments, **query): + processed_segments = [] + + for segment in segments: + if isinstance(segment, str) and segment: + if '/' in segment and segment != '/': + components = [comp for comp in segment.split('/') if comp] + processed_segments.extend(components) + else: + if segment != '/': + processed_segments.append(segment) + + return super().build_url(*processed_segments, **query) + + @property + def default_headers(self): + return { + 'Authorization': f'Bearer {self.auth_token}', + 'x-ms-version': API_VERSION, + 'x-ms-client-request-id': str(uuid.uuid4()), + 'User-Agent': 'WaterButler-AzureBlobStorage/1.0', + 'x-ms-date': datetime.datetime.now(datetime.timezone.utc).strftime('%a, %d %b %Y %H:%M:%S GMT') + } + + async def validate_v1_path(self, path, **kwargs): + """Validate that the path exists in storage.""" + if path == '/': + return WaterButlerPath(path) + + implicit_folder = path.endswith('/') + + if implicit_folder: + url = self.build_url(self.container) + params = { + 'restype': 'container', + 'comp': 'list', + 'prefix': self.base_folder + path.lstrip('/'), + 'delimiter': '/', + 'maxresults': '1' # String to avoid aiohttpretty TypeError with int params + } + + resp = await self.make_request( + 'GET', url, params=params, + expects=(200,), + throws=exceptions.MetadataError + ) + + body = await resp.text() + parsed = self._convert_xml_to_blob_list(body) + if len(parsed['Blob']) == 0 and len(parsed['BlobPrefix']) == 0: + raise exceptions.NotFoundError(str(path)) + else: + url = self.build_url(self.container, self.base_folder, path) + logger.info(f"Validating path: {path} with URL: {url}") + resp = await self.make_request( + 'HEAD', url, + expects=(200, 404), + throws=exceptions.MetadataError + ) + + if resp.status == 404: + raise exceptions.NotFoundError(str(path)) + + return WaterButlerPath(path) + + async def validate_path(self, path, **kwargs): + """Simple path validation.""" + return WaterButlerPath(path) + + def can_duplicate_names(self): + return True + + def can_intra_copy(self, dest_provider, path=None): + return False + + def can_intra_move(self, dest_provider, path=None): + return False + + async def intra_copy(self, dest_provider, source_path, dest_path): + raise NotImplementedError() + + async def download(self, path, accept_url=False, version=None, range=None, **kwargs): + """Download a blob from Azure Storage.""" + if not path.is_file: + raise exceptions.DownloadError('No file specified for download', code=400) + + clean_path = path.path[1:] if path.path.startswith('/') else path.path + url = self.build_url(self.container, self._get_blob_path(clean_path)) + + resp = await self.make_request( + 'GET', url, + expects=(200, 206), + range=range, + throws=exceptions.DownloadError + ) + + return streams.ResponseStreamReader(resp) + + async def upload(self, stream, path, conflict='replace', block_id_prefix=None, **kwargs): + """Upload a stream to Azure Blob Storage.""" + path, exists = await self.handle_name_conflict(path, conflict=conflict) + + if block_id_prefix is None: + block_id_prefix = str(uuid.uuid4()) + + # Use single-request upload for small files, block upload for large files + if stream.size <= MAX_UPLOAD_ONCE_SIZE: + await self._upload_at_once(stream, path) + else: + await self._upload_blocks(stream, path, block_id_prefix) + + metadata = await self.metadata(path, **kwargs) + return metadata, not exists + + async def _upload_at_once(self, stream, path): + """Upload a small file in one request.""" + clean_path =
path.path[1:] if path.path.startswith('/') else path.path + url = self.build_url(self.container, self._get_blob_path(clean_path)) + + stream.add_writer('md5', streams.HashStreamWriter(hashlib.md5)) + headers = { + 'Content-Length': str(stream.size), + 'x-ms-blob-type': 'BlockBlob' + } + + await self.make_request( + 'PUT', url, headers=headers, data=stream, + expects=(201,), throws=exceptions.UploadError + ) + + async def _upload_blocks(self, stream, path, block_id_prefix): + """Upload large file using block upload.""" + block_id_list = [] + lock = asyncio.Lock() + + async def sub_upload(): + while True: + async with lock: + chunk = await stream.read(MAX_UPLOAD_BLOCK_SIZE) + if not chunk: + return + + sub_stream = ByteStream(chunk) + block_id = self._format_block_id(block_id_prefix, len(block_id_list)) + block_id_list.append(block_id) + + await self._put_block(sub_stream, path, block_id) + + # Run parallel uploads + tasks = [sub_upload() for _ in range(UPLOAD_PARALLEL_NUM)] + await asyncio.gather(*tasks) + + # Commit block list + await self._put_block_list(path, block_id_list) + + async def _put_block(self, stream, path, block_id): + """Upload a single block.""" + clean_path = path.path[1:] if path.path.startswith('/') else path.path + url = self.build_url(self.container, self._get_blob_path(clean_path)) + + stream.add_writer('md5', streams.HashStreamWriter(hashlib.md5)) + params = {'comp': 'block', 'blockid': block_id} + headers = {'Content-Length': str(stream.size)} + + await self.make_request( + 'PUT', url, headers=headers, params=params, data=stream, + expects=(201,), throws=exceptions.UploadError + ) + + async def _put_block_list(self, path, block_id_list): + """Commit block list to create blob.""" + xml_data = '' + for block_id in block_id_list: + xml_data += f'{block_id}' + xml_data += '' + + clean_path = path.path[1:] if path.path.startswith('/') else path.path + url = self.build_url(self.container, self._get_blob_path(clean_path)) + + stream = StringStream(xml_data) + params = {'comp': 'blocklist'} + headers = { + 'Content-Length': str(stream.size), + 'Content-Type': 'application/xml' + } + + await self.make_request( + 'PUT', url, headers=headers, params=params, data=stream, + expects=(201,), throws=exceptions.UploadError + ) + + @staticmethod + def _format_block_id(prefix, index): + """Format block ID for Azure Storage.""" + block_string = f"{prefix}_{index:05d}" + return base64.urlsafe_b64encode(block_string.encode('utf-8')).decode('utf-8') + + async def delete(self, path, confirm_delete=0, **kwargs): + if path.is_root and confirm_delete != 1: + raise exceptions.DeleteError( + 'confirm_delete=1 is required for deleting root provider folder', + code=400 + ) + + if path.is_file: + await self._delete_blob(path) + else: + await self._delete_folder(path) + + async def _delete_blob(self, path): + clean_path = path.path[1:] if path.path.startswith('/') else path.path + url = self.build_url(self.container, self._get_blob_path(clean_path)) + + await self.make_request( + 'DELETE', url, + expects=(202, 404), throws=exceptions.DeleteError + ) + + async def _delete_folder(self, path, **kwargs): + # List all blobs in the folder + url = self.build_url(self.container) + prefix = self._get_blob_path(path.path if not path.is_root else '') + params = { + 'restype': 'container', + 'comp': 'list', + 'prefix': prefix + } + + resp = await self.make_request( + 'GET', url, params=params, + expects=(200,), throws=exceptions.DeleteError + ) + + body = await resp.text() + blob_names = 
+    async def _delete_folder(self, path, **kwargs):
+        # List all blobs in the folder
+        url = self.build_url(self.container)
+        prefix = self._get_blob_path(path.path if not path.is_root else '')
+        params = {
+            'restype': 'container',
+            'comp': 'list',
+            'prefix': prefix
+        }
+
+        resp = await self.make_request(
+            'GET', url, params=params,
+            expects=(200,), throws=exceptions.DeleteError
+        )
+
+        body = await resp.text()
+        blob_names = self._parse_blob_list(body)
+
+        if not blob_names and not path.is_root:
+            raise exceptions.DeleteError('Folder not found', code=404)
+
+        # Delete each blob
+        for blob_name in blob_names:
+            url = self.build_url(self.container, blob_name)
+            await self.make_request(
+                'DELETE', url,
+                expects=(202, 404), throws=exceptions.DeleteError
+            )
+
+    async def metadata(self, path, revision=None, **kwargs):
+        if path.is_dir:
+            metadata = await self._metadata_folder(path)
+            for item in metadata:
+                item.raw['base_folder'] = self.base_folder
+        else:
+            metadata = await self._metadata_file(path, revision=revision)
+            metadata.raw['base_folder'] = self.base_folder
+
+        return metadata
+
+    async def _metadata_file(self, path, revision=None):
+        if revision == 'Latest':
+            revision = None
+
+        clean_path = path.path[1:] if path.path.startswith('/') else path.path
+        url = self.build_url(self.container, self._get_blob_path(clean_path))
+
+        resp = await self.make_request(
+            'HEAD', url,
+            expects=(200,), throws=exceptions.MetadataError
+        )
+
+        return AzureBlobStorageFileMetadataHeaders(path.path, resp.headers)
+
+    async def _metadata_folder(self, path):
+        """Get metadata for a folder (list contents)."""
+        url = self.build_url(self.container)
+        prefix = self._get_blob_path(path.path if not path.is_root else '')
+        items = []
+        marker = None
+        while True:
+            params = {
+                'restype': 'container',
+                'comp': 'list',
+                'prefix': prefix,
+                'delimiter': '/',
+            }
+            if marker:
+                params['marker'] = marker
+
+            resp = await self.make_request(
+                'GET', url, params=params,
+                expects=(200,), throws=exceptions.MetadataError
+            )
+
+            body = await resp.text()
+            parsed = self._convert_xml_to_blob_list(body)
+            blobs = parsed.get('Blob', [])
+            prefixes = parsed.get('BlobPrefix', [])
+
+            if not blobs and not prefixes and marker is None and not path.is_root:
+                raise exceptions.NotFoundError(str(path))
+
+            for folder_md in [
+                AzureBlobStorageFolderMetadata(item)
+                for item in prefixes if item['Name'] != path.path
+            ]:
+                folder_md.raw['base_folder'] = self.base_folder
+                items.append(folder_md)
+
+            for blob in blobs:
+                if blob['Name'].endswith('.osfkeep'):
+                    continue
+                file_md = AzureBlobStorageFileMetadata(blob)
+                file_md.raw['base_folder'] = self.base_folder
+                items.append(file_md)
+
+            marker = parsed.get('NextMarker')
+            if not marker:
+                break
+
+        return items
+
+    async def create_folder(self, path, folder_precheck=True, **kwargs):
+        """Create a folder by uploading a marker file."""
+        WaterButlerPath.validate_folder(path)
+
+        if folder_precheck:
+            # exists() returns False for non-existent, [] for an empty folder,
+            # or metadata for a non-empty one
+            folder_exists = await self.exists(path)
+            if folder_exists is not False:
+                raise exceptions.FolderNamingConflict(path.name)
+            file_exists = await self.exists(await self.validate_path('/' + path.path[:-1]))
+            if file_exists is not False:
+                raise exceptions.FolderNamingConflict(path.name)
+
+        # Create .osfkeep file to represent the folder
+        marker_path = path.path + '.osfkeep'
+        clean_path = marker_path[1:] if marker_path.startswith('/') else marker_path
+        url = self.build_url(self.container, self._get_blob_path(clean_path))
+
+        headers = {
+            'Content-Length': '0',
+            'x-ms-blob-type': 'BlockBlob'
+        }
+
+        await self.make_request(
+            'PUT', url, headers=headers, data=b'',
+            expects=(201,), throws=exceptions.CreateFolderError
+        )
+
+        metadata = AzureBlobStorageFolderMetadata({'Name': path.path})
+        metadata.raw['base_folder'] = self.base_folder
+        return metadata
+
+    async def revisions(self, path, **kwargs):
+        return []
+
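# Illustrative sketch (not part of the patch): the .osfkeep convention used
# by create_folder() and filtered by _metadata_folder() above. The folder
# names are made-up examples and marker_blob_for() is a hypothetical helper.
def marker_blob_for(folder_path):
    # '/photos/2024/' is stored as the zero-byte blob 'photos/2024/.osfkeep'
    return folder_path.lstrip('/') + '.osfkeep'

assert marker_blob_for('/photos/2024/') == 'photos/2024/.osfkeep'

# Listings hide the marker, so the folder surfaces only via its BlobPrefix:
names = ['photos/2024/.osfkeep', 'photos/2024/cat.jpg']
assert [n for n in names if not n.endswith('.osfkeep')] == ['photos/2024/cat.jpg']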
+ def _parse_blob_list(self, xml_body, prefix=None): + try: + root = ET.fromstring(xml_body) + blobs = [] + + for blob_elem in root.findall('.//Blob'): + name_elem = blob_elem.find('Name') + if name_elem is not None: + blob_name = name_elem.text + if not prefix or blob_name.startswith(prefix): + blobs.append(blob_name) + + return blobs + except ET.ParseError as e: + logger.error(f"Failed to parse blob list XML: {e}") + return [] + + def _convert_xml_to_blob_list(self, xml_body): + try: + root = ET.fromstring(xml_body) + + blobs_elem = root.find('.//Blobs') + if blobs_elem is None: + return {} + + result = { + 'Prefix': root.find('.//Prefix').text if root.find('.//Prefix') is not None else '', + 'Delimiter': root.find('.//Delimiter').text if root.find('.//Delimiter') is not None else '', + 'Blob': [], + 'BlobPrefix': [], + 'NextMarker': root.find('.//NextMarker').text if root.find('.//NextMarker') is not None else None + } + + for blob in blobs_elem.findall('Blob'): + blob_data = {'Name': blob.find('Name').text if blob.find('Name') is not None else ''} + + properties = blob.find('Properties') + if properties is not None: + props_dict = {} + for prop in properties: + if prop.text is not None: + props_dict[prop.tag] = prop.text + blob_data['Properties'] = props_dict + + result['Blob'].append(blob_data) + + for prefix in blobs_elem.findall('BlobPrefix'): + name_elem = prefix.find('Name') + if name_elem is not None: + result['BlobPrefix'].append({'Name': name_elem.text}) + logger.info(f'Parsed XML to blob list: {result}') + return result + + except ET.ParseError as e: + logger.error(f"Failed to parse XML with native parser: {e}") + return {} diff --git a/waterbutler/providers/azureblobstorage/settings.py b/waterbutler/providers/azureblobstorage/settings.py new file mode 100644 index 0000000000..4c4a88ab5b --- /dev/null +++ b/waterbutler/providers/azureblobstorage/settings.py @@ -0,0 +1,6 @@ +from waterbutler import settings + +config = settings.child('AZUREBLOBSTORAGE_PROVIDER_CONFIG') + +# Azure Blob Storage settings +BASE_URL = 'https://{}.blob.core.windows.net' From 562e9d8275a4949c244e0601c5be52fdebbe6db5 Mon Sep 17 00:00:00 2001 From: Joey An Date: Thu, 28 Aug 2025 23:07:39 +0900 Subject: [PATCH 2/5] fix the review comments --- waterbutler/core/streams/__init__.py | 2 +- waterbutler/core/streams/base.py | 18 ----- .../providers/azureblobstorage/exceptions.py | 31 -------- .../providers/azureblobstorage/provider.py | 74 +++++++++---------- .../providers/azureblobstorage/settings.py | 3 +- 5 files changed, 40 insertions(+), 88 deletions(-) delete mode 100644 waterbutler/providers/azureblobstorage/exceptions.py diff --git a/waterbutler/core/streams/__init__.py b/waterbutler/core/streams/__init__.py index 6feeb4780d..8f5230589d 100644 --- a/waterbutler/core/streams/__init__.py +++ b/waterbutler/core/streams/__init__.py @@ -4,7 +4,7 @@ from waterbutler.core.streams.base import CutoffStream # noqa from waterbutler.core.streams.base import StringStream # noqa from waterbutler.core.streams.base import EmptyStream # noqa -from waterbutler.core.streams.base import ByteStream # noqa + from waterbutler.core.streams.file import FileStreamReader # noqa from waterbutler.core.streams.file import PartialFileStreamReader # noqa diff --git a/waterbutler/core/streams/base.py b/waterbutler/core/streams/base.py index ad44f246ec..449e7922ed 100644 --- a/waterbutler/core/streams/base.py +++ b/waterbutler/core/streams/base.py @@ -220,24 +220,6 @@ async def _read(self, n=-1): return await 
asyncio.StreamReader.read(self, n) -class ByteStream(BaseStream): - def __init__(self, data): - super().__init__() - if not isinstance(data, bytes): - raise TypeError('Data must be either bytes, found {!r}'.format(type(data))) - - self._size = len(data) - self.feed_data(data) - self.feed_eof() - - @property - def size(self): - return self._size - - async def _read(self, n=-1): - return (await asyncio.StreamReader.read(self, n)) - - class EmptyStream(BaseStream): """An empty stream with size 0 that returns nothing when read. Useful for representing empty folders when building zipfiles. diff --git a/waterbutler/providers/azureblobstorage/exceptions.py b/waterbutler/providers/azureblobstorage/exceptions.py deleted file mode 100644 index 3dafef1110..0000000000 --- a/waterbutler/providers/azureblobstorage/exceptions.py +++ /dev/null @@ -1,31 +0,0 @@ -from waterbutler.core import exceptions - - -class AzureBlobStorageError(exceptions.ProviderError): - """Base exception for Azure Blob Storage provider.""" - pass - - -class AzureBlobStorageNotFoundError(AzureBlobStorageError, exceptions.NotFoundError): - """Exception for when a blob is not found.""" - pass - - -class AzureBlobStorageUploadError(AzureBlobStorageError, exceptions.UploadError): - """Exception for upload errors.""" - pass - - -class AzureBlobStorageDownloadError(AzureBlobStorageError, exceptions.DownloadError): - """Exception for download errors.""" - pass - - -class AzureBlobStorageDeleteError(AzureBlobStorageError, exceptions.DeleteError): - """Exception for delete errors.""" - pass - - -class AzureBlobStorageMetadataError(AzureBlobStorageError, exceptions.MetadataError): - """Exception for metadata errors.""" - pass diff --git a/waterbutler/providers/azureblobstorage/provider.py b/waterbutler/providers/azureblobstorage/provider.py index 0d37c5383f..a0f6d74d82 100644 --- a/waterbutler/providers/azureblobstorage/provider.py +++ b/waterbutler/providers/azureblobstorage/provider.py @@ -10,30 +10,39 @@ from waterbutler.core import provider from waterbutler.core import exceptions from waterbutler.core.path import WaterButlerPath -from waterbutler.core.streams import StringStream, ByteStream +from waterbutler.core.streams import StringStream +from waterbutler.providers.azureblobstorage import settings from waterbutler.providers.azureblobstorage.metadata import AzureBlobStorageFileMetadata from waterbutler.providers.azureblobstorage.metadata import AzureBlobStorageFolderMetadata from waterbutler.providers.azureblobstorage.metadata import AzureBlobStorageFileMetadataHeaders -MAX_UPLOAD_BLOCK_SIZE = 4 * 1024 * 1024 # 4MB -MAX_UPLOAD_ONCE_SIZE = 64 * 1024 * 1024 # 64MB -UPLOAD_PARALLEL_NUM = 2 # must be more than 1 -API_VERSION = '2025-07-05' logger = logging.getLogger(__name__) class AzureBlobStorageProvider(provider.BaseProvider): - """Provider for Azure Blob Storage cloud storage service using OAuth2 authentication.""" + """Provider for Microsoft Azure Blob Storage cloud storage service. + + API docs: https://docs.microsoft.com/en-us/rest/api/storageservices/blob-service-rest-api + + Quirks: + + * Empty directories are maintained using .osfkeep marker files, as Azure Blob Storage + does not natively support empty folders. These marker files are automatically created + when folders are created and filtered out during directory listings. 
+ """ NAME = 'azureblobstorage' + API_VERSION = '2025-07-05' + CHUNK_SIZE = settings.CHUNK_SIZE + CONTIGUOUS_UPLOAD_SIZE_LIMIT = settings.CONTIGUOUS_UPLOAD_SIZE_LIMIT def __init__(self, auth, credentials, settings, **kwargs): """ :param dict auth: Not used :param dict credentials: Dict containing OAuth2 'token' - :param dict settings: Dict containing 'container', 'account_name', and optional 'prefix' + :param dict settings: Dict containing 'container', 'account_name', and optional 'base_folder' """ super().__init__(auth, credentials, settings, **kwargs) @@ -59,23 +68,22 @@ def _get_blob_path(self, blob_name=None): def build_url(self, *segments, **query): processed_segments = [] - + for segment in segments: - if isinstance(segment, str) and segment: - if '/' in segment and segment != '/': + if isinstance(segment, str) and segment and segment != '/': + if '/' in segment: components = [comp for comp in segment.split('/') if comp] processed_segments.extend(components) else: - if segment != '/': - processed_segments.append(segment) - + processed_segments.append(segment) + return super().build_url(*processed_segments, **query) @property def default_headers(self): return { 'Authorization': f'Bearer {self.auth_token}', - 'x-ms-version': API_VERSION, + 'x-ms-version': self.API_VERSION, 'x-ms-client-request-id': str(uuid.uuid4()), 'User-Agent': 'WaterButler-AzureBlobStorage/1.0', 'x-ms-date': datetime.datetime.now().strftime('%a, %d %b %Y %H:%M:%S GMT') @@ -163,15 +171,15 @@ async def upload(self, stream, path, conflict='replace', block_id_prefix=None, * block_id_prefix = str(uuid.uuid4()) # Use simple upload for small files, block upload for large files - if stream.size <= MAX_UPLOAD_ONCE_SIZE: - await self._upload_at_once(stream, path) + if stream.size <= self.CONTIGUOUS_UPLOAD_SIZE_LIMIT: + await self._contiguous_upload(stream, path) else: - await self._upload_blocks(stream, path, block_id_prefix) + await self._chunked_upload(stream, path, block_id_prefix) metadata = await self.metadata(path, **kwargs) return metadata, not exists - async def _upload_at_once(self, stream, path): + async def _contiguous_upload(self, stream, path): """Upload small file in one request.""" clean_path = path.path[1:] if path.path.startswith('/') else path.path url = self.build_url(self.container, self._get_blob_path(clean_path)) @@ -187,27 +195,19 @@ async def _upload_at_once(self, stream, path): expects=(201,), throws=exceptions.UploadError ) - async def _upload_blocks(self, stream, path, block_id_prefix): + async def _chunked_upload(self, stream, path, block_id_prefix): """Upload large file using block upload.""" block_id_list = [] - lock = asyncio.Lock() - - async def sub_upload(): - while True: - async with lock: - chunk = await stream.read(MAX_UPLOAD_BLOCK_SIZE) - if not chunk: - return - - sub_stream = ByteStream(chunk) - block_id = self._format_block_id(block_id_prefix, len(block_id_list)) - block_id_list.append(block_id) - - await self._put_block(sub_stream, path, block_id) - - # Run parallel uploads - tasks = [sub_upload() for _ in range(UPLOAD_PARALLEL_NUM)] - await asyncio.gather(*tasks) + parts = [self.CHUNK_SIZE for i in range(0, stream.size // self.CHUNK_SIZE)] + if stream.size % self.CHUNK_SIZE: + parts.append(stream.size - (len(parts) * self.CHUNK_SIZE)) + + # Upload each block + for chunk_number, chunk_size in enumerate(parts): + cutoff_stream = streams.CutoffStream(stream, cutoff=chunk_size) + block_id = self._format_block_id(block_id_prefix, chunk_number) + block_id_list.append(block_id) + await 
self._put_block(cutoff_stream, path, block_id) # Commit block list await self._put_block_list(path, block_id_list) diff --git a/waterbutler/providers/azureblobstorage/settings.py b/waterbutler/providers/azureblobstorage/settings.py index 4c4a88ab5b..830b0a20d0 100644 --- a/waterbutler/providers/azureblobstorage/settings.py +++ b/waterbutler/providers/azureblobstorage/settings.py @@ -3,4 +3,5 @@ config = settings.child('AZUREBLOBSTORAGE_PROVIDER_CONFIG') # Azure Blob Storage settings -BASE_URL = 'https://{}.blob.core.windows.net' +CHUNK_SIZE = int(config.get('CHUNK_SIZE', 4 * 1024 * 1024)) # 4MB +CONTIGUOUS_UPLOAD_SIZE_LIMIT = int(config.get('CONTIGUOUS_UPLOAD_SIZE_LIMIT', 64 * 1024 * 1024)) # 64 MB From 8b9f3f8b49432eb062bbfba3d8cd09e5f2742507 Mon Sep 17 00:00:00 2001 From: An Qiuyu Date: Tue, 2 Sep 2025 23:15:30 +0800 Subject: [PATCH 3/5] fix blob listing issue at container root --- waterbutler/providers/azureblobstorage/provider.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/waterbutler/providers/azureblobstorage/provider.py b/waterbutler/providers/azureblobstorage/provider.py index a0f6d74d82..261b8b4882 100644 --- a/waterbutler/providers/azureblobstorage/provider.py +++ b/waterbutler/providers/azureblobstorage/provider.py @@ -64,7 +64,7 @@ def __init__(self, auth, credentials, settings, **kwargs): def _get_blob_path(self, blob_name=None): if blob_name: return f"{self.base_folder}{blob_name}" if self.base_folder else blob_name - return self.base_folder or '/' + return self.base_folder def build_url(self, *segments, **query): processed_segments = [] From 62d4e96377ade47af19f0bbdc4e9b84cf6d3c6fc Mon Sep 17 00:00:00 2001 From: An Qiuyu Date: Fri, 5 Sep 2025 22:13:10 +0800 Subject: [PATCH 4/5] feat: Add Azure Blob Storage provider plugin configuration --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 38793a1b94..598c98f068 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -90,6 +90,7 @@ box = "waterbutler.providers.box:BoxProvider" googledrive = "waterbutler.providers.googledrive:GoogleDriveProvider" onedrive = "waterbutler.providers.onedrive:OneDriveProvider" googlecloud = "waterbutler.providers.googlecloud:GoogleCloudProvider" +azureblobstorage = "waterbutler.providers.azureblobstorage:AzureBlobStorageProvider" [tool.pytest.ini_options] asyncio_default_fixture_loop_scope = "function" From 1135da6ab6df8d96f77d944b70c8e5ff4079cfb4 Mon Sep 17 00:00:00 2001 From: An Qiuyu Date: Tue, 9 Sep 2025 08:20:35 +0800 Subject: [PATCH 5/5] fix: add metadata dehydrate/rehydrate for rename operation --- .../providers/azureblobstorage/test_metadata.py | 6 +++--- .../providers/azureblobstorage/metadata.py | 16 ++++++++++++---- .../providers/azureblobstorage/provider.py | 3 +-- 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/tests/providers/azureblobstorage/test_metadata.py b/tests/providers/azureblobstorage/test_metadata.py index a7307fb34c..b651935309 100644 --- a/tests/providers/azureblobstorage/test_metadata.py +++ b/tests/providers/azureblobstorage/test_metadata.py @@ -19,7 +19,7 @@ def test_file_metadata_from_headers(self, blob_properties_headers): """Test creating file metadata from HTTP headers""" path = WaterButlerPath('/test-file.xlsx') - metadata = AzureBlobStorageFileMetadataHeaders(path.path, blob_properties_headers) + metadata = AzureBlobStorageFileMetadataHeaders(blob_properties_headers, path.path) assert metadata.name == 'test-file.xlsx' assert metadata.path == '/test-file.xlsx' @@ -74,7 +74,7 @@ 
def test_file_metadata_minimal(self): } path = WaterButlerPath('/minimal.txt') - metadata = AzureBlobStorageFileMetadataHeaders(path.path, headers) + metadata = AzureBlobStorageFileMetadataHeaders(headers, path.path) assert metadata.name == 'minimal.txt' assert metadata.size == 100 @@ -108,7 +108,7 @@ def test_file_metadata_serialization(self): } path = WaterButlerPath('/report.pdf') - metadata = AzureBlobStorageFileMetadataHeaders(path.path, headers) + metadata = AzureBlobStorageFileMetadataHeaders(headers, path.path) assert metadata.name == 'report.pdf' assert metadata.size == 2048000 diff --git a/waterbutler/providers/azureblobstorage/metadata.py b/waterbutler/providers/azureblobstorage/metadata.py index ccff598056..b1936cfdd9 100644 --- a/waterbutler/providers/azureblobstorage/metadata.py +++ b/waterbutler/providers/azureblobstorage/metadata.py @@ -2,9 +2,6 @@ from waterbutler.core import utils from waterbutler.core import metadata -import logging - -logger = logging.getLogger(__name__) def strip_char(str, chars): @@ -30,10 +27,21 @@ def created_utc(self): class AzureBlobStorageFileMetadataHeaders(AzureBlobStorageMetadata, metadata.BaseFileMetadata): - def __init__(self, path, headers): + def __init__(self, headers, path): self._path = path super().__init__(dict(headers)) + def _dehydrate(self): + payload = super()._dehydrate() + payload['_path'] = self._path + return payload + + @classmethod + def _rehydrate(cls, payload): + args = super()._rehydrate(payload) + args.append(payload['_path']) + return args + @property def path(self): return '/' + strip_char(self._path, self.raw.get('base_folder', '')) diff --git a/waterbutler/providers/azureblobstorage/provider.py b/waterbutler/providers/azureblobstorage/provider.py index 261b8b4882..965f1ed5f5 100644 --- a/waterbutler/providers/azureblobstorage/provider.py +++ b/waterbutler/providers/azureblobstorage/provider.py @@ -327,7 +327,7 @@ async def _metadata_file(self, path, revision=None): expects=(200,), throws=exceptions.MetadataError ) - return AzureBlobStorageFileMetadataHeaders(path.path, resp.headers) + return AzureBlobStorageFileMetadataHeaders(resp.headers, path.path) async def _metadata_folder(self, path): """Get metadata for a folder (list contents).""" @@ -462,7 +462,6 @@ def _convert_xml_to_blob_list(self, xml_body): name_elem = prefix.find('Name') if name_elem is not None: result['BlobPrefix'].append({'Name': name_elem.text}) - logger.info(f'Parsed XML to blob list: {result}') return result except ET.ParseError as e:
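For reference, a minimal sketch of the round trip that the dehydrate/rehydrate pair in PATCH 5/5 is meant to support, assuming the base metadata class serializes self.raw in _dehydrate() and returns the constructor argument list from _rehydrate(); the header values below are made-up:

from waterbutler.providers.azureblobstorage.metadata import AzureBlobStorageFileMetadataHeaders

headers = {'Content-Length': '100', 'Content-Type': 'text/plain'}
md = AzureBlobStorageFileMetadataHeaders(headers, '/minimal.txt')

payload = md._dehydrate()  # raw headers plus the private '_path' (per the hunk above)
assert payload['_path'] == '/minimal.txt'

# _rehydrate() appends the path last, matching the (headers, path)
# argument order introduced in this patch.
args = AzureBlobStorageFileMetadataHeaders._rehydrate(payload)
restored = AzureBlobStorageFileMetadataHeaders(*args)
assert restored.name == 'minimal.txt'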