diff --git a/.cursor/rules/storage-provider.mdc b/.cursor/rules/storage-provider.mdc
new file mode 100644
index 000000000000..fe26f031e225
--- /dev/null
+++ b/.cursor/rules/storage-provider.mdc
@@ -0,0 +1,464 @@
+
+# Cursor Rule: Implementing New Storage Providers in Label Studio
+
+## Overview
+This rule describes the process and best practices for adding a new storage provider to Label Studio using the declarative provider schema system.
+
+See the comprehensive overview of storages in @io_storages/README.md.
+
+## Architecture Overview
+
+Label Studio supports two types of cloud storage:
+1. **Import Storages** (Source Cloud Storages) - for importing tasks/data
+2. **Export Storages** (Target Cloud Storages) - for exporting annotations
+
+Each storage type follows this inheritance hierarchy:
+```mermaid
+graph TD
+    Storage-->ImportStorage
+    Storage-->ExportStorage
+
+    ProjectStorageMixin-->NewImportStorage
+    ImportStorage-->NewImportStorageBase
+    ExportStorage-->NewExportStorage
+
+    NewImportStorageBase-->NewImportStorage
+
+    subgraph New Provider
+      NewImportStorage
+      NewImportStorageBase
+      NewExportStorage
+    end
+```
+
+## Backend Implementation
+
+### 1. Create Storage Models
+
+#### File Structure
+Create these files in `label_studio/io_storages/yourprovider/`:
+- `__init__.py`
+- `models.py` - Core storage models
+- `serializers.py` - API serializers
+- `api.py` - API views
+- `utils.py` - Provider-specific utilities
+- `form_layout.yml` - Form layout (optional, for compatibility)
+
+#### Storage Mixin Pattern
+```python
+# models.py
+import logging
+from django.conf import settings
+from django.db import models
+from django.utils.translation import gettext_lazy as _
+from io_storages.base_models import (
+    ImportStorage, ExportStorage, ImportStorageLink, ExportStorageLink,
+    ProjectStorageMixin
+    )
+from io_storages.utils import StorageObject, load_tasks_json
+
+logger = logging.getLogger(__name__)
+
+class YourProviderStorageMixin(models.Model):
+    """Base mixin containing common fields for your provider"""
+
+    # Common fields
+    bucket = models.TextField(_('bucket'), null=True, blank=True, help_text='Bucket name')
+    prefix = models.TextField(_('prefix'), null=True, blank=True, help_text='Bucket prefix')
+    regex_filter = models.TextField(_('regex_filter'), null=True, blank=True,
+                                    help_text='Cloud storage regex for filtering objects')
+    use_blob_urls = models.BooleanField(_('use_blob_urls'), default=False,
+                                        help_text='Interpret objects as BLOBs and generate URLs')
+
+    # Provider-specific credentials
+    api_key = models.TextField(_('api_key'), null=True, blank=True, help_text='API Key')
+    secret_key = models.TextField(_('secret_key'), null=True, blank=True, help_text='Secret Key')
+    endpoint_url = models.TextField(_('endpoint_url'), null=True, blank=True, help_text='API Endpoint')
+
+    def get_client(self):
+        """Initialize and return provider client"""
+        # Implement provider-specific client initialization
+        # Cache clients to avoid repeated initialization
+        pass
+
+    def validate_connection(self, client=None):
+        """Validate storage connection and credentials"""
+        # Required method - implement provider-specific validation
+        # Should raise appropriate exceptions for different error types
+        pass
+
+    class Meta:
+        abstract = True
+
+class YourProviderImportStorageBase(YourProviderStorageMixin, ImportStorage):
+    """Base class for import functionality"""
+
+    def iter_objects(self):
+        """Iterate over storage objects"""
+        # Implement provider-specific object iteration
+        # Apply regex_filter if specified
+        # Skip directories and empty objects
+        pass
+
+    def iter_keys(self):
+        """Iterate over object keys (required by the ImportStorage base class)"""
+        for obj in self.iter_objects():
+            yield obj.key  # adapt to your provider's object attribute
+
+    def get_unified_metadata(self, obj):
+        """Return unified 'key'/'last_modified'/'size' metadata (required by ImportStorage)"""
+        return {
+            'key': obj.key,
+            'last_modified': obj.last_modified,
+            'size': obj.size,
+        }
+
+    def 
get_data(self, key) -> list[StorageObject]: + """Get task data from storage object""" + uri = f'{self.url_scheme}://{self.bucket}/{key}' + if self.use_blob_urls: + # Return blob URL + data_key = settings.DATA_UNDEFINED_NAME + task = {data_key: uri} + return [StorageObject(key=key, task_data=task)] + + # Load and parse JSON task data + obj_data = self.get_object_data(key) + return load_tasks_json(obj_data, key) + + def generate_http_url(self, url): + """Generate HTTP URL for storage object""" + # Implement provider-specific URL generation + # Support both presigned URLs and proxy mode + pass + + def can_resolve_url(self, url: str) -> bool: + """Check if this storage can resolve given URL""" + # Check if URL matches this storage's pattern + pass + + class Meta: + abstract = True + +class YourProviderImportStorage(ProjectStorageMixin, YourProviderImportStorageBase): + """Concrete import storage implementation""" + class Meta: + abstract = False + +class YourProviderExportStorage(YourProviderStorageMixin, ExportStorage): + """Export storage implementation""" + + def save_annotation(self, annotation): + """Save annotation to storage""" + # Serialize annotation data + ser_annotation = self._get_serialized_data(annotation) + + # Generate storage key + key = YourProviderExportStorageLink.get_key(annotation) + if self.prefix: + key = f"{self.prefix}/{key}" + + # Save to storage + # Handle provider-specific upload logic + + # Create storage link + YourProviderExportStorageLink.create(annotation, self) + + def delete_annotation(self, annotation): + """Delete annotation from storage""" + # Delete from storage + # Remove storage link + pass + +# Storage link models +class YourProviderImportStorageLink(ImportStorageLink): + storage = models.ForeignKey(YourProviderImportStorage, on_delete=models.CASCADE, related_name='links') + +class YourProviderExportStorageLink(ExportStorageLink): + storage = models.ForeignKey(YourProviderExportStorage, on_delete=models.CASCADE, related_name='links') + +# Signal handlers for automatic export +from django.db.models.signals import post_save, pre_delete +from django.dispatch import receiver +from tasks.models import Annotation + +@receiver(post_save, sender=Annotation) +def export_annotation_to_yourprovider_storages(sender, instance, **kwargs): + # Auto-export logic + pass + +@receiver(pre_delete, sender=Annotation) +def delete_annotation_from_yourprovider_storages(sender, instance, **kwargs): + # Auto-delete logic + pass +``` + +### 2. 
Create Serializers
+
+```python
+# serializers.py
+import os
+
+from rest_framework import serializers
+from rest_framework.exceptions import ValidationError
+from io_storages.serializers import ImportStorageSerializer, ExportStorageSerializer
+from .models import YourProviderImportStorage, YourProviderExportStorage
+
+class YourProviderStorageSerializerMixin:
+    """Common serializer functionality"""
+    secure_fields = ['api_key', 'secret_key']  # Fields to hide in responses
+
+    def to_representation(self, instance):
+        result = super().to_representation(instance)
+        # Hide secure fields in responses
+        for field in self.secure_fields:
+            result.pop(field, None)
+        return result
+
+    def validate(self, data):
+        """Validate storage configuration"""
+        data = super().validate(data)
+
+        # Create temporary storage instance for validation
+        storage = self.instance or self.Meta.model(**data)
+        if self.instance:
+            for key, value in data.items():
+                setattr(storage, key, value)
+
+        try:
+            storage.validate_connection()
+        except Exception as e:
+            raise ValidationError(f"Connection failed: {str(e)}")
+
+        return data
+
+class YourProviderImportStorageSerializer(YourProviderStorageSerializerMixin, ImportStorageSerializer):
+    type = serializers.ReadOnlyField(default=os.path.basename(os.path.dirname(__file__)))
+
+    class Meta:
+        model = YourProviderImportStorage
+        fields = '__all__'
+
+class YourProviderExportStorageSerializer(YourProviderStorageSerializerMixin, ExportStorageSerializer):
+    type = serializers.ReadOnlyField(default=os.path.basename(os.path.dirname(__file__)))
+
+    class Meta:
+        model = YourProviderExportStorage
+        fields = '__all__'
+```
+
+### 3. Create API Views
+
+```python
+# api.py
+from django.utils.decorators import method_decorator
+from drf_spectacular.utils import extend_schema
+from io_storages.api import (
+    ImportStorageListAPI, ImportStorageDetailAPI, ImportStorageSyncAPI,
+    ImportStorageValidateAPI, ImportStorageFormLayoutAPI,
+    ExportStorageListAPI, ExportStorageDetailAPI, ExportStorageSyncAPI,
+    ExportStorageValidateAPI, ExportStorageFormLayoutAPI
+)
+from .models import YourProviderImportStorage, YourProviderExportStorage
+from .serializers import YourProviderImportStorageSerializer, YourProviderExportStorageSerializer
+
+@method_decorator(
+    name='get',
+    decorator=extend_schema(
+        tags=['Storage: YourProvider'],
+        summary='List YourProvider import storage',
+        description='Get a list of all YourProvider import storage connections.',
+    ),
+)
+@method_decorator(
+    name='post',
+    decorator=extend_schema(
+        tags=['Storage: YourProvider'],
+        summary='Create new YourProvider storage',
+        description='Create new YourProvider import storage',
+    ),
+)
+class YourProviderImportStorageListAPI(ImportStorageListAPI):
+    queryset = YourProviderImportStorage.objects.all()
+    serializer_class = YourProviderImportStorageSerializer
+
+class YourProviderImportStorageDetailAPI(ImportStorageDetailAPI):
+    queryset = YourProviderImportStorage.objects.all()
+    serializer_class = YourProviderImportStorageSerializer
+
+class YourProviderImportStorageSyncAPI(ImportStorageSyncAPI):
+    serializer_class = YourProviderImportStorageSerializer
+
+class YourProviderImportStorageValidateAPI(ImportStorageValidateAPI):
+    serializer_class = YourProviderImportStorageSerializer
+
+class YourProviderImportStorageFormLayoutAPI(ImportStorageFormLayoutAPI):
+    pass
+
+# Export APIs follow same pattern
+class YourProviderExportStorageListAPI(ExportStorageListAPI):
+    queryset = YourProviderExportStorage.objects.all()
+    serializer_class = 
YourProviderExportStorageSerializer
+
+# ... other export APIs
+```
+
+### 4. Register URLs
+
+Add to `label_studio/io_storages/urls.py`:
+
+```python
+# In urlpatterns, add:
+path('api/storages/yourprovider/', include(('io_storages.yourprovider.urls', 'io_storages'), namespace='yourprovider-api')),
+
+# Import the APIs at the top
+from io_storages.yourprovider.api import (
+    YourProviderImportStorageListAPI,
+    YourProviderImportStorageDetailAPI,
+    # ... other APIs
+)
+```
+
+Create `label_studio/io_storages/yourprovider/urls.py`:
+
+```python
+from django.urls import path
+from . import api
+
+urlpatterns = [
+    # Import storage URLs
+    path('import/', api.YourProviderImportStorageListAPI.as_view(), name='yourprovider-import-list'),
+    path('import/<int:pk>/', api.YourProviderImportStorageDetailAPI.as_view(), name='yourprovider-import-detail'),
+    path('import/<int:pk>/sync/', api.YourProviderImportStorageSyncAPI.as_view(), name='yourprovider-import-sync'),
+    path('import/validate/', api.YourProviderImportStorageValidateAPI.as_view(), name='yourprovider-import-validate'),
+    path('import/form-layout/', api.YourProviderImportStorageFormLayoutAPI.as_view(), name='yourprovider-import-form-layout'),
+
+    # Export storage URLs
+    path('export/', api.YourProviderExportStorageListAPI.as_view(), name='yourprovider-export-list'),
+    path('export/<int:pk>/', api.YourProviderExportStorageDetailAPI.as_view(), name='yourprovider-export-detail'),
+    path('export/<int:pk>/sync/', api.YourProviderExportStorageSyncAPI.as_view(), name='yourprovider-export-sync'),
+    path('export/validate/', api.YourProviderExportStorageValidateAPI.as_view(), name='yourprovider-export-validate'),
+    path('export/form-layout/', api.YourProviderExportStorageFormLayoutAPI.as_view(), name='yourprovider-export-form-layout'),
+]
+```
+
+## Steps to Add a New Storage Provider
+
+1. **Create a Provider Config File**
+   - Add a new file under `web/lib/app-common/src/blocks/StorageProviderForm/providers/` named after your provider (e.g., `myProvider.ts`).
+
+2. **Define Fields**
+   - Use the `FieldDefinition` type for each field.
+   - Each field should specify:
+     - `name`: Unique string identifier
+     - `type`: One of `text`, `password`, `select`, `toggle`, `counter`, etc.
+     - `label`: User-facing label
+     - `required`: Boolean (if applicable)
+     - `placeholder`: Example value (if applicable)
+     - `description`: (optional) Help text for the user
+     - `autoComplete`: (optional) For password fields
+     - `accessKey`: Boolean for credential fields (enables edit mode handling)
+     - `options`: For select fields
+     - `min`, `max`, `step`: For counter fields
+     - `schema`: Zod schema for validation, with `.default()` for default values
+
+3. **Assemble the Layout**
+   - Use the `layout` array to group fields into rows.
+   - Each row is an object with a `fields` array listing the field names in order.
+   - Omit fields like `title`, `regex_filter`, and `use_blob_urls` from the provider schema; these are handled globally or in the preview step.
+
+4. **Validation**
+   - Use Zod for all field validation.
+   - Use `.default()` for default values where appropriate.
+   - For optional fields, use `.optional().default("")` or similar.
+
+5. **Credential Fields**
+   - Mark credential fields (e.g., API keys, secrets) with `accessKey: true`.
+   - Use `type: "password"` and set `autoComplete` as needed.
+   - Provide a realistic placeholder.
+
+6. **Placeholders and Descriptions**
+   - Always provide a meaningful placeholder for each field.
+   - Add a description if the field may be confusing or has special requirements.
+
+7. **Export the Provider**
+   - Export your provider config as the default export from the file.
+
+8. **Register the Provider**
+   - Add your provider to the central registry in `providers/index.ts`.
+
+## Example Field Definition
+```ts
+{
+  name: "api_key",
+  type: "password",
+  label: "API Key",
+  required: true,
+  accessKey: true,
+  placeholder: "sk-...",
+  autoComplete: "off",
+  schema: z.string().min(1, "API Key is required"),
+}
+```
+
+## Best Practices
+- Do **not** include global fields like `title`, `regex_filter`, or `use_blob_urls` in provider configs.
+- Use `.default()` in Zod schemas for all fields that should have a default value.
+- Use `accessKey: true` for any field that is a credential or secret.
+- Keep field and layout definitions minimal and focused on provider-specific configuration.
+- Test your provider in both create and edit modes to ensure correct behavior.
+
+## Testing
+
+Create tests in `label_studio/io_storages/tests/test_yourprovider.py`:
+
+```python
+from django.test import TestCase
+from io_storages.yourprovider.models import YourProviderImportStorage
+
+class TestYourProviderStorage(TestCase):
+    def test_connection_validation(self):
+        # Test connection validation logic
+        pass
+
+    def test_object_iteration(self):
+        # Test object listing and filtering
+        pass
+
+    def test_data_loading(self):
+        # Test task data loading
+        pass
+```
+
+## Implementation Checklist
+
+### Backend Implementation
+- [ ] Create provider directory structure
+- [ ] Implement storage mixin with common fields
+- [ ] Create import storage base class with required methods:
+  - [ ] `iter_objects()` - iterate over storage objects
+  - [ ] `iter_keys()` - iterate over storage object keys
+  - [ ] `get_unified_metadata()` - return `key`, `last_modified`, and `size` for an object
+  - [ ] `get_data()` - load task data from objects
+  - [ ] `generate_http_url()` - create HTTP URLs
+  - [ ] `can_resolve_url()` - check URL resolution capability
+  - [ ] `validate_connection()` - validate credentials and connectivity
+- [ ] Create concrete import/export storage classes
+- [ ] Implement storage link models
+- [ ] Create serializers with validation logic
+- [ ] Implement API views following existing patterns
+- [ ] Register URLs in storage URL configuration
+- [ ] Add signal handlers for auto-export functionality
+- [ ] Create database migrations
+
+### Frontend Implementation
+- [ ] Create provider configuration file with:
+  - [ ] All required fields with proper types
+  - [ ] Zod validation schemas
+  - [ ] Meaningful labels and placeholders
+  - [ ] Proper field layout definition
+- [ ] Register provider in central registry
+- [ ] Mark credential fields with `accessKey: true`
+- [ ] Test form rendering and validation
+- [ ] Verify edit mode behavior for credentials
+
+### Testing & Documentation
+- [ ] Write backend unit tests
+- [ ] Test connection validation
+- [ ] Test object iteration and filtering
+- [ ] Test task data loading
+- [ ] Test frontend form functionality
+- [ ] Test both create and edit modes
+- [ ] Update API documentation
+- [ ] Add provider to storage documentation
+
+### Integration & Deployment
+- [ ] Test end-to-end storage workflow
+- [ ] Verify task import/export functionality
+- [ ] Test URL resolution and proxy functionality
+- [ ] Test with both presigned URLs and proxy mode
+- [ ] Verify error handling and user feedback
+- [ ] Test storage sync and status reporting
+
+When in doubt, use this checklist. Following these patterns ensures complete requirements coverage and consistency with existing storage providers.
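+
+## Example Provider Config
+
+For reference, here is a minimal sketch of a complete provider config that follows the steps above. The `ProviderConfig` type name and its import path are illustrative assumptions; use the actual types exported by the `StorageProviderForm` module in your codebase.
+
+```ts
+// providers/myProvider.ts (hypothetical provider)
+import { z } from "zod";
+import type { ProviderConfig } from "../types"; // adjust to the real types module
+
+const myProvider: ProviderConfig = {
+  name: "myprovider",
+  title: "My Provider",
+  fields: [
+    {
+      name: "bucket",
+      type: "text",
+      label: "Bucket Name",
+      required: true,
+      placeholder: "my-bucket",
+      schema: z.string().min(1, "Bucket name is required"),
+    },
+    {
+      name: "prefix",
+      type: "text",
+      label: "Bucket Prefix",
+      placeholder: "data-set-1/",
+      schema: z.string().optional().default(""),
+    },
+    {
+      name: "api_key",
+      type: "password",
+      label: "API Key",
+      required: true,
+      accessKey: true,
+      placeholder: "sk-...",
+      autoComplete: "off",
+      schema: z.string().min(1, "API Key is required"),
+    },
+  ],
+  // Row 1: bucket and prefix side by side; row 2: credentials
+  layout: [
+    { fields: ["bucket", "prefix"] },
+    { fields: ["api_key"] },
+  ],
+};
+
+export default myProvider;
+```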
diff --git a/docs/source/guide/security.md b/docs/source/guide/security.md
index 7429c4235332..84a8e9247bca 100644
--- a/docs/source/guide/security.md
+++ b/docs/source/guide/security.md
@@ -111,9 +111,9 @@ Below, both are explained from a security perspective.
 After connecting a storage to a project, you have several options to load tasks into the project. Depending on the option, you need to provide specific permissions:
 
-* **Sync media files** (**LIST** permission required): Storage Sync automatically creates Label Studio tasks based on the file list in your storage when **Treat every bucket object as a source file** is enabled. Label Studio does not read the file content; it simply references the files (e.g., `{"image": "s3://bucket/1.jpg"}`).
+* **Sync media files** (**LIST** permission required): Storage Sync automatically creates Label Studio tasks based on the file list in your storage when the **Files** import method is selected. Label Studio does not read the file content; it simply references the files (e.g., `{"image": "s3://bucket/1.jpg"}`).
 
-* **Sync JSON task files** (**LIST** and **GET** permissions required): Storage Sync reads Label Studio tasks from JSON files in your bucket and loads the entire JSON content into the Label Studio database when "Treat every bucket object as a source file" is enabled.
+* **Sync JSON task files** (**LIST** and **GET** permissions required): Storage Sync reads Label Studio tasks from JSON files in your bucket and loads the entire JSON content into the Label Studio database when the **Tasks** import method is selected.
 
 * **No sync** (**none** permissions required): You can manually import JSON files containing Label Studio tasks and reference storage URIs (e.g., `{"image": "s3://bucket/1.jpg"}`) inside tasks.
diff --git a/docs/source/guide/storage.md b/docs/source/guide/storage.md
index 852c82d487c3..9c37e9f8c124 100644
--- a/docs/source/guide/storage.md
+++ b/docs/source/guide/storage.md
@@ -25,8 +25,8 @@ Set up the following cloud and other storage systems with Label Studio:
 When working with an external cloud storage connection, keep the following in mind:
 
 * For Source storage:
-  * When "Treat every bucket object as a source file" is checked, Label Studio doesn’t import the data stored in the bucket, but instead creates *references* to the objects. Therefore, you have full access control on the data to be synced and shown on the labeling screen.
-  * When "Treat every bucket object as a source file" is unchecked, bucket files are assumed to be immutable; the only way to push an updated file's state to Label Studio is to upload it with a new filename or delete all tasks that are associated with that file and resync.
+  * When the **Files** import method is selected, Label Studio doesn’t import the data stored in the bucket, but instead creates *references* to the objects. Therefore, you have full access control on the data to be synced and shown on the labeling screen.
+  * When the **Tasks** import method is selected, bucket files are assumed to be immutable; the only way to push an updated file's state to Label Studio is to upload it with a new filename to storage or delete all tasks that are associated with that file and resync.
 * Sync operations with external buckets only goes one way. It either creates tasks from objects on the bucket (Source storage) or pushes annotations to the output bucket (Target storage). Changing something on the bucket side doesn't guarantee consistency in results.
* We recommend using a separate bucket folder for each Label Studio project.
 * Storage Regions: To minimize latency and improve efficiency, store data in cloud storage buckets that are geographically closer to your team rather than near the Label Studio server.
@@ -57,7 +57,7 @@ Task data synced from cloud storage is not stored in Label Studio. Instead, the
 * If you set the import method to "Files", Label Studio backend will only need LIST permissions and won't download any data from your buckets.
 
-* If you set the import method to "JSON", Label Studio backend will require GET permissions to read JSON files and convert them to Label Studio tasks.
+* If you set the import method to "Tasks", Label Studio backend will require GET permissions to read JSON files and convert them to Label Studio tasks.
 
 When your users access labeling, the backend will attempt to resolve URI (e.g., s3://) to URL (https://) links. URLs will be returned to the frontend and loaded by the user's browser. To load these URLs, the browser will require HEAD and GET permissions from your Cloud Storage. The HEAD request is made at the beginning and allows the browser to determine the size of the audio, video, or other files. The browser then makes a GET request to retrieve the file body.
@@ -73,11 +73,14 @@ Source storage functionality can be divided into two parts:
 
 #### Import method
 
+!!! info
+    The "Treat every bucket object as a source file" toggle has been replaced by the "Import method" dropdown.
+
 Label Studio Source Storages feature an "Import method" dropdown. This setting enables two different methods of loading tasks into Label Studio.
 
-###### JSON
+###### Tasks
 
-When set to "JSON", tasks in JSON or JSONL/NDJSON format can be loaded directly from storage buckets into Label Studio. This approach is particularly helpful when dealing with complex tasks that involve multiple media sources.
+When set to "Tasks", tasks in JSON, JSONL/NDJSON, or Parquet format can be loaded directly from storage buckets into Label Studio. This approach is particularly helpful when dealing with complex tasks that involve multiple media sources.
 
@@ -392,7 +395,7 @@ After you [configure access to your S3 bucket](#Configure-access-to-your-S3-buck
     - In the **Session Token** field, specify a session token of the temporary security credentials for an AWS account with access to your S3 bucket.
     - In the **Import method** dropdown, choose how to import your data:
       - **Files** - Automatically creates a task for each storage object (e.g. JPG, MP3, TXT). Use this if your bucket contains BLOB storage files such as JPG, MP3, or similar file types.
-      - **JSON** - Treat each JSON or JSONL file as a task definition (one or more tasks per file). Use this if you have multiple JSON files in the bucket with one task per JSON file.
+      - **Tasks** - Treat each JSON, JSONL, or Parquet file as a task definition (one or more tasks per file). Use this if you have multiple JSON files in the bucket with one task per JSON file.
     - (Optional) Enable **Scan all sub-folders** to include files from all nested folders within your S3 bucket prefix.
     - In the **Use pre-signed URLs (On) / Proxy through Label Studio (Off)** toggle, choose how media is loaded:
       - **ON** (Pre-signed URLs) - All data bypasses the platform and user browsers directly read data from storage.
@@ -559,7 +562,7 @@ In the Label Studio UI, do the following to set up the connection:
     - In the **External ID** field, specify the external ID that identifies Label Studio to your AWS account. 
You can find the external ID on your **Organization** page.
     - In the **Import method** dropdown, choose how to import your data:
       - **Files** - Automatically creates a task for each storage object (e.g. JPG, MP3, TXT). Use this if your bucket contains BLOB storage files such as JPG, MP3, or similar file types.
-      - **JSON** - Treat each JSON or JSONL file as a task definition (one or more tasks per file). Use this if you have multiple JSON files in the bucket with one task per JSON file.
+      - **Tasks** - Treat each JSON, JSONL, or Parquet file as a task definition (one or more tasks per file). Use this if you have multiple JSON files in the bucket with one task per JSON file.
     - Enable **Scan all sub-folders** to include files from all nested folders within your S3 bucket prefix.
     - In the **Use pre-signed URLs (On) / Proxy through Label Studio (Off)** toggle, choose how media is loaded:
      - **ON** (Pre-signed URLs) - All data bypasses the platform and user browsers directly read data from storage.
@@ -703,7 +706,7 @@ In the Label Studio UI, do the following to set up the connection:
     - In the **File Filter Regex** field, specify a regular expression to filter bucket objects. Use `.*` to collect all objects.
     - In the **Import method** dropdown, choose how to import your data:
       - **Files** - Automatically creates a task for each storage object (e.g. JPG, MP3, TXT). Use this if your bucket contains BLOB storage files such as JPG, MP3, or similar file types.
-      - **JSON** - Treat each JSON or JSONL file as a task definition (one or more tasks per file). Use this if you have multiple JSON files in the bucket with one task per JSON file.
+      - **Tasks** - Treat each JSON, JSONL, or Parquet file as a task definition (one or more tasks per file). Use this if you have multiple JSON files in the bucket with one task per JSON file.
     - In the **Use pre-signed URLs (On) / Proxy through Label Studio (Off)** toggle, choose how media is loaded:
       - **ON** (Pre-signed URLs) - All data bypasses the platform and user browsers directly read data from storage.
       - **OFF** (Proxy) - The platform proxies media using its own backend.
@@ -1034,7 +1037,7 @@ Select the **GCS (WIF auth)** storage type and then complete the following field
 | Bucket Name | Enter the name of the Google Cloud bucket. |
 | Bucket Prefix | Optionally, enter the folder name within the bucket that you would like to use. For example, `data-set-1` or `data-set-1/subfolder-2`. |
 | File Name Filter | Optionally, specify a regular expression to filter bucket objects. |
-| [Treat every bucket object as a source file](#Treat-every-bucket-object-as-a-source-file) | Enable this option if your bucket contains BLOB storage files such as JPG, MP3, or similar file types. This setting creates a URL for each bucket object to use for labeling, such as `gs://my-gcs-bucket/image.jpg`. Leave this option disabled if you have are specifying your tasks in JSON files. |
+| Import method | Choose how to interpret your data:<br>
**Files** - Automatically creates a task for each storage object (e.g. JPG, MP3, TXT). Use this if your bucket contains BLOB storage files such as JPG, MP3, or similar file types.
**Tasks** - Treat each JSON, JSONL, or Parquet file as a task definition (one or more tasks per file). Use this if you have multiple JSON files in the bucket with one task per JSON file. |
 | [Use pre-signed URLs](#Pre-signed-URLs-vs-storage-proxies) | **ON** - Label Studio generates a pre-signed URL to load media.<br>
**OFF** - The platform proxies media using its own backend. |
 | Pre-signed URL counter | Adjust the counter for how many minutes the pre-signed URLs are valid. |
 | Workload Identity Pool ID | This is the ID you specified when creating the Workload Identity Pool. You can find this in Google Cloud Console under **IAM & Admin > Workload Identity Pools**. |
@@ -1159,7 +1162,7 @@ In the Label Studio UI, do the following to set up the connection:
    - In the **File Filter Regex** field, specify a regular expression to filter bucket objects. Use `.*` to collect all objects.
    - In the **Account Name** field, specify the account name for the Azure storage. You can also set this field as an environment variable,`AZURE_BLOB_ACCOUNT_NAME`.
    - In the **Account Key** field, specify the secret key to access the storage account. You can also set this field as an environment variable,`AZURE_BLOB_ACCOUNT_KEY`.
-   - Enable **Treat every bucket object as a source file** if your bucket contains BLOB storage files such as JPG, MP3, or similar file types. This setting creates a URL for each bucket object to use for labeling, for example `azure-blob://container-name/image.jpg`. Leave this option disabled if you have multiple JSON files in the bucket with one task per JSON file.
+   - Set **Import method** to **"Files"** if your bucket contains BLOB storage files such as JPG, MP3, or similar file types. This setting creates a URL for each bucket object to use for labeling, for example `azure-blob://container-name/image.jpg`. Set this option to **"Tasks"** if you have multiple JSON/JSONL/Parquet files containing task definitions in the bucket.
    - Choose whether to disable [**Use pre-signed URLs**](#Pre-signed-URLs-vs-storage-proxies), or [shared access signatures](https://docs.microsoft.com/en-us/rest/api/storageservices/delegate-access-with-shared-access-signature).
      - **ON** - Label Studio generates a pre-signed URL to load media.
      - **OFF** - The platform proxies media using its own backend.
@@ -1218,7 +1221,9 @@ In the Label Studio UI, do the following to set up the connection:
    - In the **Host** field, specify the IP of the server hosting the database, or `localhost`.
    - In the **Port** field, specify the port that you can use to access the database.
    - In the **File Filter Regex** field, specify a regular expression to filter database objects. Use `.*` to collect all objects.
-   - Enable **Treat every bucket object as a source file** if your database contains files such as JPG, MP3, or similar file types. This setting creates a URL for each database object to use for labeling. Leave this option disabled if you have multiple JSON files in the database, with one task per JSON file.
+   - In the **Import method** dropdown, choose how to import your data:
+     - **Files** - Automatically creates a task for each storage object (e.g. JPG, MP3, TXT). Use this if your database contains BLOB storage files such as JPG, MP3, or similar file types.
+     - **Tasks** - Treat each JSON, JSONL, or Parquet file as a task definition (one or more tasks per file). Use this if you have multiple JSON files in the database with one task per JSON file.
 8. Click **Add Storage**.
 9. Repeat these steps for **Target Storage** to sync completed data annotations to a database.
@@ -1268,9 +1273,9 @@ In the Label Studio UI, do the following to set up the connection:
 If you are using Windows, ensure that you use backslashes when entering your **Absolute local path**.
 
 1. (Optional) In the **File Filter Regex** field, specify a regular expression to filter bucket objects. 
Use `.*` to collect all objects.
-2. (Optional) Toggle **Treat every bucket object as a source file**.
-   - Enable this option if you want to create Label Studio tasks from media files automatically, such as JPG, MP3, or similar file types. Use this option for labeling configurations with one source tag.
-   - Disable this option if you want to import tasks in Label Studio JSON format directly from your storage. Use this option for complex labeling configurations with HyperText or multiple source tags.
+2. (Optional) In the **Import method** dropdown, choose how to import your data:
+   - **Files** - Automatically creates a task for each storage object (e.g. JPG, MP3, TXT). Use this if you want to create Label Studio tasks from media files automatically. Use this option for labeling configurations with one source tag.
+   - **Tasks** - Treat each JSON, JSONL, or Parquet file as a task definition (one or more tasks per file). Use this if you want to import tasks in Label Studio JSON format directly from your storage. Use this option for complex labeling configurations with HyperText or multiple source tags.
 3. Click **Add Storage**.
 4. Repeat these steps for **Add Target Storage** to use a local file directory for exporting.
@@ -1283,7 +1288,7 @@ In those cases, you have to repeat all stages above to create local storage, but
 Differences with instruction above:
 
 - **7. File Filter Regex** - stay empty (because you will specify it inside tasks)
-- **8. Treat every bucket object as a source file** - switch off (because you will specify it inside tasks)
+- **8. Import method** - select **"Tasks"** (because you will specify file references inside your JSON task definitions)
 
 Your window will look like this: Screenshot of the local storage settings for user task.
diff --git a/docs/source/guide/troubleshooting.md b/docs/source/guide/troubleshooting.md
index 054037d8f077..1a2459b5dccd 100644
--- a/docs/source/guide/troubleshooting.md
+++ b/docs/source/guide/troubleshooting.md
@@ -113,8 +113,8 @@ Label Studio does not support labeling PDF files directly. However, you can conv
 When working with an external Cloud Storage connection (S3, GCS, Azure), keep the following in mind:
 
 * For Source storage:
-  * When "Treat every bucket object as a source file" is checked, Label Studio doesn’t import the data stored in the bucket, but instead creates *references* to the objects. Therefore, you have full access control on the data to be synced and shown on the labeling screen.
-  * When "Treat every bucket object as a source file" is unchecked, bucket files are assumed to be immutable; the only way to push an updated file's state to Label Studio is to upload it with a new filename or delete all tasks that are associated with that file and resync.
+  * When the **Files** import method is selected, Label Studio doesn’t import the data stored in the bucket, but instead creates *references* to the objects. Therefore, you have full access control on the data to be synced and shown on the labeling screen.
+  * When the **Tasks** import method is selected, bucket files are assumed to be immutable; the only way to push an updated file's state to Label Studio is to upload it with a new filename to storage or delete all tasks that are associated with that file and resync.
 * Sync operations with external buckets only goes one way. It either creates tasks from objects on the bucket (Source storage) or pushes annotations to the output bucket (Target storage). Changing something on the bucket side doesn’t guarantee consistency in results.
* We recommend using a separate bucket folder for each Label Studio project.
@@ -169,11 +169,11 @@ First, check that you have specified the correct credentials (see the sections a
 Then go to the cloud storage settings page and click **Edit** next to the cloud connection. From here, you can check the following:
 
 * The **File Filter Regex** is set and correct. When no filters are specified, all found items are skipped. The filter should be a valid regular expression, not a wildcard (e.g. `.*` is a valid, `*.` is not valid)
-* **Treat every bucket object as a source file** should be toggled `ON` if you work with images, audio, text files or any other binary content stored in the bucket.
+* **Import method** should be set to `Files` if you work with images, audio, text files, or any other binary content stored in the bucket.
 
-    This instructs Label Studio to create URI endpoints and store this as a labeling task payload, and resolve them into presigned `https` URLs when opening the labeling screen.
+    This instructs Label Studio to create tasks automatically with URI links (like `s3://bucket/1.jpg`), and resolve them into presigned `https` URLs when opening the labeling screen.
 
-    If you store JSON tasks in the Label Studio format in your bucket - turn this toggle `OFF`.
+    If you store JSON/JSONL tasks in the Label Studio format or Parquet files in your bucket, set this option to `Tasks`.
 
 * Check for rq worker failures. An easy way to check rq workers is complete an export operation.
@@ -181,8 +181,8 @@ Then go to the cloud storage settings page and click **Edit** next to the cloud
 ### JSON files from a cloud storage are not synced and the Data Manager is empty
 
-1. Edit the storage settings to enable **Treat every bucket object as a source file**. If you see tasks in the Data Manager, proceed to step 2.
-2. Disable **Treat every bucket object as a source file**.
+1. Edit the storage settings and set **Import method** to "Files". If you see tasks in the Data Manager, proceed to step 2.
+2. Set **Import method** to "Tasks".
 
 If you don’t see tasks in the Data Manager, your bucket doesn’t have GET permissions, only LIST permissions.
@@ -193,7 +193,7 @@ If there is only LIST permission, Label Studio can scan the bucket for the exist
 If the tasks sync to Label Studio but don't appear the way that you expect, maybe with URLs instead of images or with one task where you expect to see many, check the following:
 
 - If you're placing JSON files in [cloud storage](storage.html), ensure that if you have multiple tasks in the same file, they are all formatted the same way (for example, you cannot have 1 task with the raw contents of the `data` field and another task that contains annotations and predictions in the same file).
-- If you're syncing image or audio files, make sure **Treat every bucket object as a source file** is enabled.
+- If you're syncing image or audio files, make sure **Import method** is set to "Files".
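+
+For example, with the "Tasks" import method, a consistent multi-task JSON file looks like this (a minimal illustration; the bucket and object names are hypothetical):
+
+```json
+[
+  { "data": { "image": "s3://my-bucket/1.jpg" } },
+  { "data": { "image": "s3://my-bucket/2.jpg" } }
+]
+```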
### Unable to access local storage when using Windows diff --git a/label_studio/core/all_urls.json b/label_studio/core/all_urls.json index 2b4d0e6dc840..b78b98c3a2ba 100644 --- a/label_studio/core/all_urls.json +++ b/label_studio/core/all_urls.json @@ -677,6 +677,12 @@ "name": "storages:api:storage-s3-form", "decorators": "" }, + { + "url": "/api/storages/s3/files", + "module": "io_storages.api.ImportStorageListFilesAPI", + "name": "storages:api:storage-s3-list-files", + "decorators": "" + }, { "url": "/api/storages/export/s3", "module": "io_storages.s3.api.S3ExportStorageListAPI", @@ -737,6 +743,12 @@ "name": "storages:api:storage-azure-form", "decorators": "" }, + { + "url": "/api/storages/azure/files", + "module": "io_storages.api.ImportStorageListFilesAPI", + "name": "storages:api:storage-azure-list-files", + "decorators": "" + }, { "url": "/api/storages/export/azure", "module": "io_storages.azure_blob.api.AzureBlobExportStorageListAPI", @@ -797,6 +809,12 @@ "name": "storages:api:storage-gcs-form", "decorators": "" }, + { + "url": "/api/storages/gcs/files", + "module": "io_storages.api.ImportStorageListFilesAPI", + "name": "storages:api:storage-gcs-list-files", + "decorators": "" + }, { "url": "/api/storages/export/gcs", "module": "io_storages.gcs.api.GCSExportStorageListAPI", @@ -857,6 +875,12 @@ "name": "storages:api:storage-redis-form", "decorators": "" }, + { + "url": "/api/storages/redis/files", + "module": "io_storages.api.ImportStorageListFilesAPI", + "name": "storages:api:storage-redis-list-files", + "decorators": "" + }, { "url": "/api/storages/export/redis", "module": "io_storages.redis.api.RedisExportStorageListAPI", @@ -917,6 +941,12 @@ "name": "storages:api:storage-localfiles-form", "decorators": "" }, + { + "url": "/api/storages/localfiles/files", + "module": "io_storages.api.ImportStorageListFilesAPI", + "name": "storages:api:storage-localfiles-list-files", + "decorators": "" + }, { "url": "/api/storages/export/localfiles", "module": "io_storages.localfiles.api.LocalFilesExportStorageListAPI", diff --git a/label_studio/core/settings/base.py b/label_studio/core/settings/base.py index 85b4602fce95..42097c63f586 100644 --- a/label_studio/core/settings/base.py +++ b/label_studio/core/settings/base.py @@ -691,6 +691,7 @@ def collect_versions_dummy(**kwargs): FUTURE_SAVE_TASK_TO_STORAGE_JSON_EXT = get_bool_env('FUTURE_SAVE_TASK_TO_STORAGE_JSON_EXT', default=True) STORAGE_IN_PROGRESS_TIMER = float(get_env('STORAGE_IN_PROGRESS_TIMER', 5.0)) STORAGE_EXPORT_CHUNK_SIZE = int(get_env('STORAGE_EXPORT_CHUNK_SIZE', 100)) +DEFAULT_STORAGE_LIST_LIMIT = int(get_env('DEFAULT_STORAGE_LIST_LIMIT', 100)) USE_NGINX_FOR_EXPORT_DOWNLOADS = get_bool_env('USE_NGINX_FOR_EXPORT_DOWNLOADS', False) USE_NGINX_FOR_UPLOADS = get_bool_env('USE_NGINX_FOR_UPLOADS', True) diff --git a/label_studio/io_storages/api.py b/label_studio/io_storages/api.py index 6b6e79d1e378..0164ed36357b 100644 --- a/label_studio/io_storages/api.py +++ b/label_studio/io_storages/api.py @@ -3,6 +3,7 @@ import inspect import logging import os +import time from core.permissions import all_permissions from core.utils.io import read_yaml @@ -11,7 +12,7 @@ from io_storages.serializers import ExportStorageSerializer, ImportStorageSerializer from projects.models import Project from rest_framework import generics, status -from rest_framework.exceptions import NotFound, PermissionDenied, ValidationError +from rest_framework.exceptions import NotFound, ValidationError from rest_framework.parsers import FormParser, JSONParser, MultiPartParser 
from rest_framework.response import Response
 from rest_framework.settings import api_settings
@@ -150,30 +151,51 @@ class StorageValidateAPI(generics.CreateAPIView):
     parser_classes = (JSONParser, FormParser, MultiPartParser)
 
     def create(self, request, *args, **kwargs):
-        storage_id = request.data.get('id')
-        instance = None
-        if storage_id:
-            instance = generics.get_object_or_404(self.serializer_class.Meta.model.objects.all(), pk=storage_id)
-            if not instance.has_permission(request.user):
-                raise PermissionDenied()
-
-        # combine instance fields with request.data
-        serializer = self.get_serializer(data=request.data)
-        serializer.is_valid(raise_exception=True)
-        # if storage exists, we have to use instance from DB,
-        # because instance from serializer won't have credentials, they were popped intentionally
-        if instance:
-            instance = serializer.update(instance, serializer.validated_data)
-        else:
-            instance = serializer.Meta.model(**serializer.validated_data)
-
-        # double check: not all storages validate connection in serializer, just make another explicit check here
+        from .functions import validate_storage_instance
+
+        validate_storage_instance(request, self.serializer_class)
+        return Response()
+
+
+@extend_schema(exclude=True)
+class ImportStorageListFilesAPI(generics.CreateAPIView):
+
+    permission_required = all_permissions.projects_change
+    parser_classes = (JSONParser, FormParser, MultiPartParser)
+    serializer_class = None  # Default serializer
+
+    def __init__(self, serializer_class=None, *args, **kwargs):
+        self.serializer_class = serializer_class
+        super().__init__(*args, **kwargs)
+
+    @extend_schema(exclude=True)
+    def create(self, request, *args, **kwargs):
+        from .functions import validate_storage_instance
+
+        instance = validate_storage_instance(request, self.serializer_class)
+        limit = int(request.data.get('limit', settings.DEFAULT_STORAGE_LIST_LIMIT))
+
         try:
-            instance.validate_connection()
+            files = []
+            start_time = time.time()
+            timeout_seconds = 30
+
+            for obj in instance.iter_objects():
+                files.append(instance.get_unified_metadata(obj))
+
+                # Check if we've reached the file limit
+                if len(files) >= limit:
+                    files.append({'key': None, 'last_modified': None, 'size': None})
+                    break
+
+                # Check if we've exceeded the timeout
+                if time.time() - start_time > timeout_seconds:
+                    files.append({'key': '... storage scan timeout reached ...', 'last_modified': None, 'size': None})
+                    break
+
+            return Response({'files': files})
         except Exception as exc:
-            logger.error(f'Error validating storage connection: {exc}')
-            raise ValidationError('Error validating storage connection')
-        return Response()
+            raise ValidationError(exc)
 
 
 @extend_schema(exclude=True)
diff --git a/label_studio/io_storages/azure_blob/form_layout.yml b/label_studio/io_storages/azure_blob/form_layout.yml
index 2011a21c2258..1633ed3e021d 100644
--- a/label_studio/io_storages/azure_blob/form_layout.yml
+++ b/label_studio/io_storages/azure_blob/form_layout.yml
@@ -59,7 +59,7 @@ ImportStorage:
       - value: true
         label: "Files - Automatically creates a task for each storage object (e.g. 
JPG, MP3, TXT)" - value: false - label: "JSON - Treat each JSON or JSONL file as a task definition (one or more tasks per file)" + label: "Tasks - Treat each JSON or JSONL file as a task definition (one or more tasks per file)" # 2 columns grid - columnCount: 2 diff --git a/label_studio/io_storages/azure_blob/models.py b/label_studio/io_storages/azure_blob/models.py index 87e25a3bbe70..c28fa39a2efa 100644 --- a/label_studio/io_storages/azure_blob/models.py +++ b/label_studio/io_storages/azure_blob/models.py @@ -198,7 +198,7 @@ class AzureBlobImportStorageBase(AzureBlobStorageMixin, ImportStorage): _('presign_ttl'), default=1, help_text='Presigned URLs TTL (in minutes)' ) - def iterkeys(self): + def iter_objects(self): container = self.get_container() prefix = str(self.prefix) if self.prefix else '' files = container.list_blobs(name_starts_with=prefix) @@ -212,7 +212,19 @@ def iterkeys(self): if regex and not regex.match(file.name): logger.debug(file.name + ' is skipped by regex filter') continue - yield file.name + yield file + + def iter_keys(self): + for obj in self.iter_objects(): + yield obj.name + + @staticmethod + def get_unified_metadata(obj): + return { + 'key': obj.name, + 'last_modified': obj.last_modified, + 'size': obj.size, + } def get_data(self, key) -> list[StorageObject]: if self.use_blob_urls: diff --git a/label_studio/io_storages/base_models.py b/label_studio/io_storages/base_models.py index bf06f40626d1..d4ae5834f68e 100644 --- a/label_studio/io_storages/base_models.py +++ b/label_studio/io_storages/base_models.py @@ -10,7 +10,7 @@ from concurrent.futures import ThreadPoolExecutor from dataclasses import asdict from datetime import datetime -from typing import Union +from typing import Any, Iterator, Union from urllib.parse import urljoin import django_rq @@ -230,8 +230,29 @@ class Meta: class ImportStorage(Storage): - def iterkeys(self): - return iter(()) + def iter_objects(self) -> Iterator[Any]: + """ + Returns: + Iterator[Any]: An iterator for objects in the storage. + """ + raise NotImplementedError + + def iter_keys(self) -> Iterator[str]: + """ + Returns: + Iterator[str]: An iterator of keys for each object in the storage. + """ + raise NotImplementedError + + def get_unified_metadata(self, obj: Any) -> dict: + """ + Args: + obj: The storage object to get metadata for + Returns: + dict: A dictionary of metadata for the object with keys: + 'key', 'last_modified', 'size'. + """ + raise NotImplementedError def get_data(self, key) -> list[StorageObject]: raise NotImplementedError @@ -430,7 +451,7 @@ def _scan_and_create_links(self, link_class): ) tasks_for_webhook = [] - for key in self.iterkeys(): + for key in self.iter_keys(): # w/o Dataflow # pubsub.push(topic, key) # -> GF.pull(topic, key) + env -> add_task() @@ -456,7 +477,7 @@ def _scan_and_create_links(self, link_class): raise UnsupportedFileFormatError( f'File "{key}" is not a JSON/JSONL/Parquet file. 
Only .json, .jsonl, and .parquet files can be processed.\n' f"If you're trying to import non-JSON data (images, audio, text, etc.), " - f'edit storage settings and enable "Treat every bucket object as a source file"' + f'edit storage settings and enable "Tasks" import method' ) try: @@ -466,7 +487,7 @@ def _scan_and_create_links(self, link_class): raise ValueError( f'Error loading JSON from file "{key}".\nIf you\'re trying to import non-JSON data ' f'(images, audio, text, etc.), edit storage settings and enable ' - f'"Treat every bucket object as a source file"' + f'"Tasks" import method' ) if not flag_set('fflag_feat_dia_2092_multitasks_per_storage_link'): diff --git a/label_studio/io_storages/functions.py b/label_studio/io_storages/functions.py index e3ff33be39fc..e2a11a3a6601 100644 --- a/label_studio/io_storages/functions.py +++ b/label_studio/io_storages/functions.py @@ -1,7 +1,9 @@ import logging from typing import Dict, Iterable, List, Union +from django.shortcuts import get_object_or_404 from io_storages.base_models import ImportStorage +from rest_framework.exceptions import PermissionDenied, ValidationError from .azure_blob.api import AzureBlobExportStorageListAPI, AzureBlobImportStorageListAPI from .gcs.api import GCSExportStorageListAPI, GCSImportStorageListAPI @@ -11,6 +13,57 @@ logger = logging.getLogger(__name__) +def validate_storage_instance(request, serializer_class): + """ + Preload and prepare a storage instance from request data. + + This function handles the common logic for loading existing storage instances + or creating new ones from request data, including permission checks and + serializer validation. + + Args: + request: The HTTP request containing storage data + serializer_class: The serializer class to use for validation + + Returns: + The prepared storage instance + + Raises: + PermissionDenied: If user doesn't have permission to access the storage + ValidationError: If serializer validation fails + """ + if not serializer_class or not hasattr(serializer_class, 'Meta'): + raise ValidationError('Invalid or missing serializer class') + + storage_id = request.data.get('id') + instance = None + + if storage_id: + instance = get_object_or_404(serializer_class.Meta.model.objects.all(), pk=storage_id) + if not instance.has_permission(request.user): + raise PermissionDenied() + + # combine instance fields with request.data + serializer = serializer_class(data=request.data) + serializer.is_valid(raise_exception=True) + + # if storage exists, we have to use instance from DB, + # because instance from serializer won't have credentials, they were popped intentionally + if instance: + instance = serializer.update(instance, serializer.validated_data) + else: + instance = serializer_class.Meta.model(**serializer.validated_data) + + # double check: not all storages validate connection in serializer, just make another explicit check here + try: + instance.validate_connection() + except Exception as exc: + logger.error(f'Error validating storage connection: {exc}') + raise ValidationError('Error validating storage connection') + + return instance + + def get_storage_list(): return [ { diff --git a/label_studio/io_storages/gcs/form_layout.yml b/label_studio/io_storages/gcs/form_layout.yml index a479fc3c757e..92a4ffbe5401 100644 --- a/label_studio/io_storages/gcs/form_layout.yml +++ b/label_studio/io_storages/gcs/form_layout.yml @@ -63,7 +63,7 @@ ImportStorage: - value: true label: "Files - Automatically creates a task for each storage object (e.g. 
JPG, MP3, TXT)" - value: false - label: "JSON - Treat each JSON or JSONL file as a task definition (one or more tasks per file)" + label: "Tasks - Treat each JSON or JSONL file as a task definition (one or more tasks per file)" # 2 columns grid - columnCount: 2 diff --git a/label_studio/io_storages/gcs/models.py b/label_studio/io_storages/gcs/models.py index 304b14a49f87..2609dcec722a 100644 --- a/label_studio/io_storages/gcs/models.py +++ b/label_studio/io_storages/gcs/models.py @@ -176,15 +176,26 @@ class GCSImportStorageBase(GCSStorageMixin, ImportStorage): _('presign_ttl'), default=1, help_text='Presigned URLs TTL (in minutes)' ) - def iterkeys(self): + def iter_objects(self): return GCS.iter_blobs( client=self.get_client(), bucket_name=self.bucket, prefix=self.prefix, regex_filter=self.regex_filter, - return_key=True, + return_key=False, ) + def iter_keys(self): + for obj in self.iter_objects(): + yield obj.name + + def get_unified_metadata(self, obj): + return { + 'key': obj.name, + 'last_modified': obj.updated, + 'size': obj.size, + } + def get_data(self, key) -> list[StorageObject]: if self.use_blob_urls: task = {settings.DATA_UNDEFINED_NAME: GCS.get_uri(self.bucket, key)} diff --git a/label_studio/io_storages/localfiles/form_layout.yml b/label_studio/io_storages/localfiles/form_layout.yml index c9146a0bd115..8c0c5f2c928d 100644 --- a/label_studio/io_storages/localfiles/form_layout.yml +++ b/label_studio/io_storages/localfiles/form_layout.yml @@ -36,7 +36,7 @@ ImportStorage: - value: true label: "Files - Automatically creates a task for each storage object (e.g. JPG, MP3, TXT)" - value: false - label: "JSON - Treat each JSON or JSONL file as a task definition (one or more tasks per file)" + label: "Tasks - Treat each JSON or JSONL file as a task definition (one or more tasks per file)" ExportStorage: - columnCount: 3 diff --git a/label_studio/io_storages/localfiles/models.py b/label_studio/io_storages/localfiles/models.py index c64130288235..74f74ca96b69 100644 --- a/label_studio/io_storages/localfiles/models.py +++ b/label_studio/io_storages/localfiles/models.py @@ -5,6 +5,7 @@ import logging import os import re +from datetime import datetime, timezone from pathlib import Path from urllib.parse import quote @@ -66,7 +67,7 @@ class LocalFilesImportStorageBase(LocalFilesMixin, ImportStorage): def can_resolve_url(self, url): return False - def iterkeys(self): + def iter_objects(self): path = Path(self.path) regex = re.compile(str(self.regex_filter)) if self.regex_filter else None # For better control of imported tasks, file reading has been changed to ascending order of filenames. @@ -77,7 +78,19 @@ def iterkeys(self): if regex and not regex.match(key): logger.debug(key + ' is skipped by regex filter') continue - yield str(file) + yield file + + def iter_keys(self): + for obj in self.iter_objects(): + yield str(obj) + + def get_unified_metadata(self, obj): + stat = obj.stat() + return { + 'key': str(obj), + 'last_modified': datetime.fromtimestamp(stat.st_mtime, tz=timezone.utc), + 'size': stat.st_size, + } def get_data(self, key) -> list[StorageObject]: path = Path(key) diff --git a/label_studio/io_storages/redis/form_layout.yml b/label_studio/io_storages/redis/form_layout.yml index 38de52df3fc6..6a4efbb7bd09 100644 --- a/label_studio/io_storages/redis/form_layout.yml +++ b/label_studio/io_storages/redis/form_layout.yml @@ -55,7 +55,7 @@ ImportStorage: - value: true label: "Files - Automatically creates a task for each storage object (e.g. 
JPG, MP3, TXT)" - value: false - label: "JSON - Treat each JSON or JSONL file as a task definition (one or more tasks per file)" + label: "Tasks - Treat each JSON or JSONL file as a task definition (one or more tasks per file)" ExportStorage: - columnCount: 2 diff --git a/label_studio/io_storages/redis/models.py b/label_studio/io_storages/redis/models.py index 1557752b7609..3f43c5de1921 100644 --- a/label_studio/io_storages/redis/models.py +++ b/label_studio/io_storages/redis/models.py @@ -84,12 +84,24 @@ class RedisImportStorageBase(ImportStorage, RedisStorageMixin): def can_resolve_url(self, url): return False - def iterkeys(self): + def iter_objects(self): client = self.get_client() path = str(self.path) for key in client.keys(path + '*'): yield key + def iter_keys(self): + for key in self.iter_objects(): + yield key + + def get_unified_metadata(self, obj): + self.get_client() + return { + 'key': obj, + 'last_modified': '', + 'size': self.client.get(self.key), + } + def get_data(self, key) -> list[StorageObject]: client = self.get_client() value_str = client.get(key) diff --git a/label_studio/io_storages/s3/form_layout.yml b/label_studio/io_storages/s3/form_layout.yml index 2fca946765b0..c5dd61bff462 100644 --- a/label_studio/io_storages/s3/form_layout.yml +++ b/label_studio/io_storages/s3/form_layout.yml @@ -112,7 +112,7 @@ ImportStorage: - value: true label: "Files - Automatically creates a task for each storage object (e.g. JPG, MP3, TXT)" - value: false - label: "JSON - Treat each JSON or JSONL file as a task definition (one or more tasks per file)" + label: "Tasks - Treat each JSON or JSONL file as a task definition (one or more tasks per file)" # 2 column grid - columnCount: 2 diff --git a/label_studio/io_storages/s3/models.py b/label_studio/io_storages/s3/models.py index 1aa217598329..e366afebfcd2 100644 --- a/label_studio/io_storages/s3/models.py +++ b/label_studio/io_storages/s3/models.py @@ -192,7 +192,7 @@ class S3ImportStorageBase(S3StorageMixin, ImportStorage): ) @catch_and_reraise_from_none - def iterkeys(self): + def iter_objects(self): client, bucket = self.get_client_and_bucket() if self.prefix: list_kwargs = {'Prefix': self.prefix.rstrip('/') + '/'} @@ -210,7 +210,20 @@ def iterkeys(self): if regex and not regex.match(key): logger.debug(key + ' is skipped by regex filter') continue - yield key + logger.debug(f's3 {key} has passed the regex filter') + yield obj + + @catch_and_reraise_from_none + def iter_keys(self): + for obj in self.iter_objects(): + yield obj.key + + def get_unified_metadata(self, obj): + return { + 'key': obj.key, + 'last_modified': obj.last_modified, + 'size': obj.size, + } @catch_and_reraise_from_none def scan_and_create_links(self): diff --git a/label_studio/io_storages/tests/test_import_storage_list_files_api.py b/label_studio/io_storages/tests/test_import_storage_list_files_api.py new file mode 100644 index 000000000000..878d350cbc56 --- /dev/null +++ b/label_studio/io_storages/tests/test_import_storage_list_files_api.py @@ -0,0 +1,261 @@ +"""This file and its contents are licensed under the Apache License 2.0. Please see the included NOTICE for copyright information and LICENSE for a copy of the license. 
+""" +import unittest +from unittest.mock import MagicMock, patch + +import pytest +from io_storages.api import ImportStorageListFilesAPI +from io_storages.serializers import ImportStorageSerializer +from rest_framework import status +from rest_framework.exceptions import ValidationError + + +class TestImportStorageListFilesAPI(unittest.TestCase): + """Unit tests for ImportStorageListFilesAPI. + + This test class validates the file listing functionality of storage imports, + including limit handling, timeout behavior, and error scenarios. + """ + + def setUp(self): + """Set up test dependencies and mock objects for each test.""" + self.api = ImportStorageListFilesAPI(serializer_class=ImportStorageSerializer) + self.user = MagicMock() + self.storage_instance = MagicMock() + + # Configure default storage instance behavior + self.storage_instance.get_unified_metadata.side_effect = lambda obj: { + 'key': f'file_{obj}.txt', + 'last_modified': '2024-01-01T00:00:00Z', + 'size': 1024, + } + + def _create_mock_request(self, data=None): + """Helper method to create mock request with data attribute.""" + request = MagicMock() + request.data = data or {} + request.user = self.user + return request + + @patch('io_storages.functions.validate_storage_instance') + def test_successful_file_listing_under_limit(self, mock_validate): + """Test successful file listing when object count is under the limit. + + This test validates: + - Successful API response with 200 status + - Correct file metadata structure in response + - No limit marker added when under limit + - Proper integration with validate_storage_instance + """ + # Setup: Configure mock storage with 5 objects (under default limit of 100) + mock_validate.return_value = self.storage_instance + self.storage_instance.iter_objects.return_value = range(5) + + # Create mock request with data attribute + request = self._create_mock_request({'id': 1, 'limit': 100}) + + # Execute: Call the API + response = self.api.create(request) + + # Validate: Check response structure and content + assert response.status_code == status.HTTP_200_OK + assert response.data is not None + assert 'files' in response.data + assert len(response.data['files']) == 5 + + # Verify file metadata structure + for i, file_data in enumerate(response.data['files']): + assert file_data['key'] == f'file_{i}.txt' + assert file_data['last_modified'] == '2024-01-01T00:00:00Z' + assert file_data['size'] == 1024 + + # Verify storage validation was called + mock_validate.assert_called_once_with(request, ImportStorageSerializer) + + @patch('io_storages.functions.validate_storage_instance') + def test_file_listing_reaches_limit(self, mock_validate): + """Test file listing behavior when reaching the specified limit. 
+ + This test validates: + - Limit enforcement stops iteration at specified count + - Limit marker is added as final entry when limit is reached + - File count matches exactly the specified limit + 1 (for marker) + """ + # Setup: Configure mock storage with many objects, set limit to 3 + mock_validate.return_value = self.storage_instance + self.storage_instance.iter_objects.return_value = range(100) # More than limit + + request = self._create_mock_request({'id': 1, 'limit': 3}) + + # Execute: Call the API + response = self.api.create(request) + + # Validate: Check limit enforcement + assert response.status_code == status.HTTP_200_OK + assert response.data is not None + assert len(response.data['files']) == 4 # 3 files + 1 limit marker + + # Verify first 3 entries are real files + for i in range(3): + assert response.data['files'][i]['key'] == f'file_{i}.txt' + + # Verify limit marker + limit_marker = response.data['files'][3] + assert limit_marker['key'] is None + assert limit_marker['last_modified'] is None + assert limit_marker['size'] is None + + @patch('io_storages.functions.validate_storage_instance') + def test_uses_default_limit_when_not_specified(self, mock_validate): + """Test that API uses DEFAULT_STORAGE_LIST_LIMIT when limit not in request. + + This test validates: + - Fallback to settings.DEFAULT_STORAGE_LIST_LIMIT (100) when no limit specified + - Proper handling of request data without limit parameter + """ + # Setup: Configure mock storage and no limit in request + mock_validate.return_value = self.storage_instance + self.storage_instance.iter_objects.return_value = range(50) # Under default limit + + request = self._create_mock_request({'id': 1}) + + # Execute: Call the API + response = self.api.create(request) + + # Validate: Should process all 50 files without limit marker + assert response.status_code == status.HTTP_200_OK + assert response.data is not None + assert len(response.data['files']) == 50 # No limit marker + + @patch('io_storages.api.time') + @patch('io_storages.functions.validate_storage_instance') + def test_timeout_handling(self, mock_validate, mock_time): + """Test timeout handling when file scanning exceeds 30 seconds. + + This test validates: + - Timeout detection after 30 seconds + - Timeout marker added to response + - Processing stops when timeout is reached + - Time checking occurs during iteration + """ + # Setup: Configure time mock to simulate timeout + mock_validate.return_value = self.storage_instance + mock_time.time.side_effect = [0, 35] # Start at 0, check at 35 seconds (timeout) + self.storage_instance.iter_objects.return_value = range(100) + + request = self._create_mock_request({'id': 1, 'limit': 100}) + + # Execute: Call the API + response = self.api.create(request) + + # Validate: Check timeout behavior + assert response.status_code == status.HTTP_200_OK + assert response.data is not None + assert len(response.data['files']) == 2 # 1 file + 1 timeout marker + + # Verify first entry is a real file + assert response.data['files'][0]['key'] == 'file_0.txt' + + # Verify timeout marker + timeout_marker = response.data['files'][1] + assert timeout_marker['key'] == '... storage scan timeout reached ...' + assert timeout_marker['last_modified'] is None + assert timeout_marker['size'] is None + + @patch('io_storages.functions.validate_storage_instance') + def test_iter_objects_exception_raises_validation_error(self, mock_validate): + """Test that exceptions during object iteration are converted to ValidationError. 
+ + This test validates: + - Exception handling during storage object iteration + - Proper conversion to ValidationError for API consistency + - Error message preservation for debugging + """ + # Setup: Configure storage to raise exception during iteration + mock_validate.return_value = self.storage_instance + test_exception = Exception('Storage connection failed') + self.storage_instance.iter_objects.side_effect = test_exception + + request = self._create_mock_request({'id': 1}) + + # Execute & Validate: Should raise ValidationError + with pytest.raises(ValidationError) as exc_info: + self.api.create(request) + + # Verify exception details + assert str(exc_info.value.detail[0]) == 'Storage connection failed' + + @patch('io_storages.functions.validate_storage_instance') + def test_get_unified_metadata_exception_raises_validation_error(self, mock_validate): + """Test that exceptions during metadata retrieval are converted to ValidationError. + + This test validates: + - Exception handling during metadata extraction + - Proper error propagation from get_unified_metadata + - ValidationError conversion for API consistency + """ + # Setup: Configure storage to raise exception during metadata retrieval + mock_validate.return_value = self.storage_instance + self.storage_instance.iter_objects.return_value = range(1) + test_exception = Exception('Metadata extraction failed') + self.storage_instance.get_unified_metadata.side_effect = test_exception + + request = self._create_mock_request({'id': 1}) + + # Execute & Validate: Should raise ValidationError + with pytest.raises(ValidationError) as exc_info: + self.api.create(request) + + # Verify exception details + assert str(exc_info.value.detail[0]) == 'Metadata extraction failed' + + @patch('io_storages.functions.validate_storage_instance') + def test_validate_storage_instance_exception_propagates(self, mock_validate): + """Test that validate_storage_instance exceptions are properly propagated. + + This test validates: + - Exception handling during storage validation + - Proper propagation of validation errors + - No additional error wrapping occurs + """ + # Setup: Configure validate_storage_instance to raise ValidationError + validation_error = ValidationError('Invalid storage configuration') + mock_validate.side_effect = validation_error + + request = self._create_mock_request({'invalid': 'data'}) + + # Execute & Validate: Should propagate ValidationError + with pytest.raises(ValidationError) as exc_info: + self.api.create(request) + + # Verify it's the same exception + assert exc_info.value == validation_error + + def test_api_initialization_with_serializer_class(self): + """Test API initialization with custom serializer class. + + This test validates: + - Proper initialization with custom serializer + - Serializer class assignment + - Instance creation without errors + """ + # Execute: Initialize API with custom serializer + custom_serializer = ImportStorageSerializer + api = ImportStorageListFilesAPI(serializer_class=custom_serializer) + + # Validate: Check serializer assignment + assert api.serializer_class == custom_serializer + + def test_api_initialization_without_serializer_class(self): + """Test API initialization without serializer class (default behavior). 
+ + This test validates: + - Initialization with None serializer (default) + - No errors during instance creation + - Proper handling of missing serializer + """ + # Execute: Initialize API without serializer + api = ImportStorageListFilesAPI() + + # Validate: Check default serializer state + assert api.serializer_class is None
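The routes that follow wire this API up as a `files` endpoint per provider (`/api/storages/s3/files`, `/api/storages/gcs/files`, and so on). A hedged sketch of calling one of them, with the request body inferred from the tests above (`id` is the storage primary key; `limit` is optional and falls back to `DEFAULT_STORAGE_LIST_LIMIT`); the host and token are placeholders:

```python
import requests

resp = requests.post(
    'http://localhost:8080/api/storages/s3/files',
    headers={'Authorization': 'Token <your-api-token>'},
    json={'id': 1, 'limit': 100},
)
resp.raise_for_status()
for f in resp.json()['files']:
    # A trailing entry with key=None means the limit was reached; a key of
    # '... storage scan timeout reached ...' means the 30-second scan timeout hit.
    print(f['key'], f['size'], f['last_modified'])
```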
diff --git a/label_studio/io_storages/urls.py b/label_studio/io_storages/urls.py index 8c98b04951c7..42e686eb8ff1 100644 --- a/label_studio/io_storages/urls.py +++ b/label_studio/io_storages/urls.py @@ -9,6 +9,7 @@ AllImportStorageListAPI, AllImportStorageTypesAPI, ) +from io_storages.api import ImportStorageListFilesAPI from io_storages.azure_blob.api import ( AzureBlobExportStorageDetailAPI, AzureBlobExportStorageFormLayoutAPI, @@ -18,6 +19,7 @@ AzureBlobImportStorageDetailAPI, AzureBlobImportStorageFormLayoutAPI, AzureBlobImportStorageListAPI, + AzureBlobImportStorageSerializer, AzureBlobImportStorageSyncAPI, AzureBlobImportStorageValidateAPI, ) @@ -30,6 +32,7 @@ GCSImportStorageDetailAPI, GCSImportStorageFormLayoutAPI, GCSImportStorageListAPI, + GCSImportStorageSerializer, GCSImportStorageSyncAPI, GCSImportStorageValidateAPI, ) @@ -42,6 +45,7 @@ LocalFilesImportStorageDetailAPI, LocalFilesImportStorageFormLayoutAPI, LocalFilesImportStorageListAPI, + LocalFilesImportStorageSerializer, LocalFilesImportStorageSyncAPI, LocalFilesImportStorageValidateAPI, ) @@ -54,6 +58,7 @@ RedisImportStorageDetailAPI, RedisImportStorageFormLayoutAPI, RedisImportStorageListAPI, + RedisImportStorageSerializer, RedisImportStorageSyncAPI, RedisImportStorageValidateAPI, ) @@ -66,6 +71,7 @@ S3ImportStorageDetailAPI, S3ImportStorageFormLayoutAPI, S3ImportStorageListAPI, + S3ImportStorageSerializer, S3ImportStorageSyncAPI, S3ImportStorageValidateAPI, ) @@ -85,6 +91,11 @@ path('s3/<int:pk>/sync', S3ImportStorageSyncAPI.as_view(), name='storage-s3-sync'), path('s3/validate', S3ImportStorageValidateAPI.as_view(), name='storage-s3-validate'), path('s3/form', S3ImportStorageFormLayoutAPI.as_view(), name='storage-s3-form'), + path( + 's3/files', + ImportStorageListFilesAPI().as_view(serializer_class=S3ImportStorageSerializer), + name='storage-s3-list-files', + ), path('export/s3', S3ExportStorageListAPI.as_view(), name='export-storage-s3-list'), path('export/s3/<int:pk>', S3ExportStorageDetailAPI.as_view(), name='export-storage-s3-detail'), path('export/s3/<int:pk>/sync', S3ExportStorageSyncAPI.as_view(), name='export-storage-s3-sync'), @@ -96,6 +107,11 @@ path('azure/<int:pk>/sync', AzureBlobImportStorageSyncAPI.as_view(), name='storage-azure-sync'), path('azure/validate', AzureBlobImportStorageValidateAPI.as_view(), name='storage-azure-validate'), path('azure/form', AzureBlobImportStorageFormLayoutAPI.as_view(), name='storage-azure-form'), + path( + 'azure/files', + ImportStorageListFilesAPI().as_view(serializer_class=AzureBlobImportStorageSerializer), + name='storage-azure-list-files', + ), path('export/azure', AzureBlobExportStorageListAPI.as_view(), name='export-storage-azure-list'), path('export/azure/<int:pk>', AzureBlobExportStorageDetailAPI.as_view(), name='export-storage-azure-detail'), path('export/azure/<int:pk>/sync', AzureBlobExportStorageSyncAPI.as_view(), name='export-storage-azure-sync'), @@ -107,6 +123,11 @@ path('gcs/<int:pk>/sync', GCSImportStorageSyncAPI.as_view(), name='storage-gcs-sync'), path('gcs/validate', GCSImportStorageValidateAPI.as_view(), name='storage-gcs-validate'), path('gcs/form', GCSImportStorageFormLayoutAPI.as_view(), name='storage-gcs-form'), + path( + 'gcs/files', + ImportStorageListFilesAPI().as_view(serializer_class=GCSImportStorageSerializer), + name='storage-gcs-list-files', + ), path('export/gcs', GCSExportStorageListAPI.as_view(), name='export-storage-gcs-list'), path('export/gcs/<int:pk>', GCSExportStorageDetailAPI.as_view(), name='export-storage-gcs-detail'), path('export/gcs/<int:pk>/sync', GCSExportStorageSyncAPI.as_view(), name='export-storage-gcs-sync'), @@ -118,6 +139,11 @@ path('redis/<int:pk>/sync', RedisImportStorageSyncAPI.as_view(), name='storage-redis-sync'), path('redis/validate', RedisImportStorageValidateAPI.as_view(), name='storage-redis-validate'), path('redis/form', RedisImportStorageFormLayoutAPI.as_view(), name='storage-redis-form'), + path( + 'redis/files', + ImportStorageListFilesAPI().as_view(serializer_class=RedisImportStorageSerializer), + name='storage-redis-list-files', + ), path('export/redis', RedisExportStorageListAPI.as_view(), name='export-storage-redis-list'), path('export/redis/<int:pk>', RedisExportStorageDetailAPI.as_view(), name='export-storage-redis-detail'), path('export/redis/<int:pk>/sync', RedisExportStorageSyncAPI.as_view(), name='export-storage-redis-sync'), @@ -132,6 +158,11 @@ path('localfiles/<int:pk>/sync', LocalFilesImportStorageSyncAPI.as_view(), name='storage-localfiles-sync'), path('localfiles/validate', LocalFilesImportStorageValidateAPI.as_view(), name='storage-localfiles-validate'), path('localfiles/form', LocalFilesImportStorageFormLayoutAPI.as_view(), name='storage-localfiles-form'), + path( + 'localfiles/files', + ImportStorageListFilesAPI().as_view(serializer_class=LocalFilesImportStorageSerializer), + name='storage-localfiles-list-files', + ), path('export/localfiles', LocalFilesExportStorageListAPI.as_view(), name='export-storage-localfiles-list'), path( 'export/localfiles/<int:pk>',
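Every provider registers the same `files` route, so a hypothetical new provider needs exactly one more entry; the provider name and serializer below are illustrative, following the pattern above:

```python
path(
    'yourprovider/files',
    ImportStorageListFilesAPI().as_view(serializer_class=YourProviderImportStorageSerializer),
    name='storage-yourprovider-list-files',
),
```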
diff --git a/label_studio/io_storages/utils.py b/label_studio/io_storages/utils.py index 5f6ede637006..f5eb74a1255c 100644 --- a/label_studio/io_storages/utils.py +++ b/label_studio/io_storages/utils.py @@ -154,7 +154,7 @@ def _error_wrapper(exc: Optional[Exception] = None): raise ValueError( ( f'Can’t import JSON-formatted tasks from {key}. If you’re trying to import binary objects, ' - f'perhaps you forgot to enable "Treat every bucket object as a source file" option?' + f'perhaps you forgot to enable the "Tasks" import method?' ) ) from exc diff --git a/web/apps/labelstudio/src/app/ErrorBoundary.jsx b/web/apps/labelstudio/src/app/ErrorBoundary.jsx index be249054a30c..dd2de5c6b75b 100644 --- a/web/apps/labelstudio/src/app/ErrorBoundary.jsx +++ b/web/apps/labelstudio/src/app/ErrorBoundary.jsx @@ -19,6 +19,7 @@ export default class ErrorBoundary extends Component { } componentDidCatch(error, { componentStack }) { + console.error(error); // Capture the error in Sentry, so we can fix it directly // Don't make the users copy and paste the stacktrace, it's not actionable captureException(error, { diff --git a/web/apps/labelstudio/src/components/Error/Error.scss b/web/apps/labelstudio/src/components/Error/Error.scss index a99898b7d311..e2540a2f7909 100644 --- a/web/apps/labelstudio/src/components/Error/Error.scss +++ b/web/apps/labelstudio/src/components/Error/Error.scss @@ -1,13 +1,13 @@ .inline-error { width: 100%; - padding: 16px; - border-radius: 5px; + border-radius: 0.5rem; box-sizing: border-box; - background-color: var(--color-neutral-background); + background-color: var(--color-negative-background); } .error-message { max-width: 100%; + padding: 1rem; &__heidi { display: block; @@ -15,64 +15,62 @@ } &__title { - text-transform: uppercase; - text-align: center; - font-size: 20px; - margin: 32px auto; color: var(--color-negative-content); + font-size: 1.25rem; + font-weight: 600; } &__detail { - font-size: 24px; - font-weight: bold; + font-size: 1rem; + font-weight: 400; color: var(--color-neutral-content); - margin: 16px 0; + margin-top: 0.5rem; white-space: pre-line; word-break: break-word; } &__exception { - margin: 15px 0; + margin: 0.5rem 1rem; } - &__stacktrace { - margin: 16px 0; + &__stacktrace { + margin: 0.5rem 1rem; padding: 16px; overflow: auto; line-height: 26px; max-height: 200px; white-space: pre; border-radius: 5px; - content: var(--color-neutral-content); - background-color: var(--color-neutral-surface); + color: var(--color-neutral-content); font-family: var(--font-mono); } &__version { font-size: 14px; - font-weight: bold; - margin: 16px 0; + font-weight: 600; + margin: 0.5rem 0; + color: var(--color-neutral-content-subtlest); } &__validation { padding: 0; - margin: 16px 0; + margin: 0.5rem 1rem; list-style-type: none; max-height: 300px; overflow-y: auto; } &__message { - margin: 5px 0; - color: var(--color-neutral-content-subtle); + color: var(--color-neutral-content-subtler); padding: 0; white-space: pre-line; + line-height: 1.4; word-break: break-word; } &__actions { display: flex; - padding: 16px 0 0; + margin: 1rem; } &__slack { @@ -86,4 +84,21 @@ margin-right: 8px; } } -} \ No newline at end of file + + &_kind_paused { + padding: 32px; + } + + &_kind_paused &__detail { + margin-block: 16px; + } + + &_kind_paused &__actions { + margin-inline: 0; + } +} + +.paused-error .modal-ls__content { + border-radius: 16px; + overflow: hidden; +} diff --git a/web/apps/labelstudio/src/components/Form/Elements/Counter/Counter.jsx b/web/apps/labelstudio/src/components/Form/Elements/Counter/Counter.jsx index d32437243564..8a8dee801043 100644 --- a/web/apps/labelstudio/src/components/Form/Elements/Counter/Counter.jsx +++ b/web/apps/labelstudio/src/components/Form/Elements/Counter/Counter.jsx @@ -85,6 +85,13 @@ const Counter = ({ label, className, validate, required, skip, labelProps, ...pr if (type === "decrease") return decrease(); }; + // Update currentValue when props.value changes + React.useEffect(() => { + if (props.value !== undefined && props.value !== null) {
setCurrentValue(normalizeValue(Number(props.value))); + } + }, [props.value]); + const field = (
{/* [markup lost in extraction: Counter field JSX, followed by the header of a hunk in the Label form element where {text} gains a wrapping span, matching the Label.scss change below] */} - {text} + {text} {tooltip && (
{tooltipIcon ? tooltipIcon : } diff --git a/web/apps/labelstudio/src/components/Form/Elements/Label/Label.scss b/web/apps/labelstudio/src/components/Form/Elements/Label/Label.scss index bf60737eb870..333430c37051 100644 --- a/web/apps/labelstudio/src/components/Form/Elements/Label/Label.scss +++ b/web/apps/labelstudio/src/components/Form/Elements/Label/Label.scss @@ -89,7 +89,7 @@ width: 100%; } - &[data-required] &__text::after { + &[data-required] &__text span::after { content: "Required"; font-size: 0.825rem; color: var(--color-neutral-content-subtler); diff --git a/web/apps/labelstudio/src/components/Modal/Modal.jsx b/web/apps/labelstudio/src/components/Modal/Modal.jsx index a3c8bf7593fb..7dd60ddd6669 100644 --- a/web/apps/labelstudio/src/components/Modal/Modal.jsx +++ b/web/apps/labelstudio/src/components/Modal/Modal.jsx @@ -9,6 +9,8 @@ import { Button } from "@humansignal/ui"; import { Space } from "../Space/Space"; import { Modal } from "./ModalPopup"; import { ToastProvider, ToastViewport } from "@humansignal/ui"; +import { QueryClientProvider } from "@tanstack/react-query"; +import { queryClient } from "../../utils/query-client"; const standaloneModal = (props) => { const modalRef = createRef(); @@ -33,6 +35,7 @@ const standaloneModal = (props) => { , , , + , ] } > diff --git a/web/apps/labelstudio/src/config/ApiConfig.js b/web/apps/labelstudio/src/config/ApiConfig.js index 3acbfd36ad13..62c8f95cb572 100644 --- a/web/apps/labelstudio/src/config/ApiConfig.js +++ b/web/apps/labelstudio/src/config/ApiConfig.js @@ -50,6 +50,7 @@ export const API_CONFIG = { updateStorage: "PATCH:/storages/:target?/:type/:pk", syncStorage: "POST:/storages/:target?/:type/:pk/sync", validateStorage: "POST:/storages/:target?/:type/validate", + storageFiles: "POST:/storages/:target?/:type/files", // ML mlBackends: "GET:/ml", diff --git a/web/apps/labelstudio/src/pages/Settings/StorageSettings/StorageSet.jsx b/web/apps/labelstudio/src/pages/Settings/StorageSettings/StorageSet.jsx index 65c8265d377a..99371483d154 100644 --- a/web/apps/labelstudio/src/pages/Settings/StorageSettings/StorageSet.jsx +++ b/web/apps/labelstudio/src/pages/Settings/StorageSettings/StorageSet.jsx @@ -9,12 +9,16 @@ import { StorageCard } from "./StorageCard"; import { StorageForm } from "./StorageForm"; import { useAtomValue } from "jotai"; import { useStorageCard } from "./hooks/useStorageCard"; +import { ff } from "@humansignal/core"; +import { StorageProviderForm } from "@humansignal/app-common/blocks/StorageProviderForm"; +import { providers } from "./providers"; export const StorageSet = ({ title, target, rootClass, buttonLabel }) => { const api = useContext(ApiContext); const project = useAtomValue(projectAtom); const storageTypesQueryKey = ["storage-types", target]; const storagesQueryKey = ["storages", target, project?.id]; + const useNewStorageScreen = ff.isActive(ff.FF_NEW_STORAGES) && target !== "export"; const { storageTypes, @@ -32,15 +36,38 @@ export const StorageSet = ({ title, target, rootClass, buttonLabel }) => { const showStorageFormModal = useCallback( (storage) => { - const action = storage ? "Edit" : "Add"; + const action = storage ? "Edit" : "Connect"; const actionTarget = target === "export" ? 
"Target" : "Source"; const title = `${action} ${actionTarget} Storage`; const modalRef = modal({ title, closeOnClickOutside: false, - style: { width: 760 }, - body: ( + style: { width: 960 }, + bare: useNewStorageScreen, + onHidden: () => { + // Reset state when modal is closed (including Escape key) + // This ensures clean state for next modal open + }, + body: useNewStorageScreen ? ( + { + modalRef.close(); + fetchStorages(); + }} + onHide={() => { + // This will be called when the modal is closed via Escape key + // The state reset is handled inside StorageProviderForm + }} + /> + ) : ( { }} /> ), - footer: ( - <> - - Learn more - {" "} - about importing data and saving annotations to Cloud Storage. - - ), }); }, [project, fetchStorages, target, rootClass], diff --git a/web/apps/labelstudio/src/pages/Settings/StorageSettings/hooks/useStorageCard.tsx b/web/apps/labelstudio/src/pages/Settings/StorageSettings/hooks/useStorageCard.tsx index 63cff702610e..4891ae47cd33 100644 --- a/web/apps/labelstudio/src/pages/Settings/StorageSettings/hooks/useStorageCard.tsx +++ b/web/apps/labelstudio/src/pages/Settings/StorageSettings/hooks/useStorageCard.tsx @@ -34,7 +34,7 @@ function useStorageTypes(target: "import" | "export") { const { data, isLoading, isSuccess, refetch } = useQuery({ queryKey: storageTypesQueryKey, async queryFn() { - const result = await api.callApi("storageTypes", { + const result = await api.callApi<{ title: string; name: string }[]>("storageTypes", { params: { target }, errorFilter: () => true, }); diff --git a/web/apps/labelstudio/src/pages/Settings/StorageSettings/providers/azure.ts b/web/apps/labelstudio/src/pages/Settings/StorageSettings/providers/azure.ts new file mode 100644 index 000000000000..40f601e36908 --- /dev/null +++ b/web/apps/labelstudio/src/pages/Settings/StorageSettings/providers/azure.ts @@ -0,0 +1,69 @@ +import type { ProviderConfig } from "@humansignal/app-common/blocks/StorageProviderForm/types/provider"; +import { IconCloudProviderAzure } from "@humansignal/icons"; +import { z } from "zod"; + +export const azureProvider: ProviderConfig = { + name: "azure", + title: "Azure Blob Storage", + description: "Configure your Azure Blob Storage connection with all required Label Studio settings", + icon: IconCloudProviderAzure, + fields: [ + { + name: "container", + type: "text", + label: "Container Name", + required: true, + placeholder: "my-azure-container", + schema: z.string().min(1, "Container name is required"), + }, + { + name: "account_name", + type: "password", + label: "Account Name", + autoComplete: "off", + accessKey: true, + placeholder: "mystorageaccount", + schema: z.string().min(1, "Account Name is required"), + }, + { + name: "account_key", + type: "password", + label: "Account Key", + autoComplete: "new-password", + accessKey: true, + placeholder: "Your storage account key", + schema: z.string().min(1, "Account Key is required"), + }, + { + name: "presign", + type: "toggle", + label: "Use pre-signed URLs (On) / Proxy through the platform (Off)", + description: + "When pre-signed URLs are enabled, all data bypasses the platform and user browsers directly read data from storage", + schema: z.boolean().default(true), + }, + { + name: "presign_ttl", + type: "counter", + label: "Expire pre-signed URLs (minutes)", + min: 1, + max: 10080, + step: 1, + schema: z.number().min(1).max(10080).default(15), + // dependency: "presign" // Not implemented in UI yet + }, + ], + layout: [ + { + fields: ["container"], + }, + { + fields: ["account_name", 
"account_key"], + }, + { + fields: ["presign", "presign_ttl"], + }, + ], +}; + +export default azureProvider; diff --git a/web/apps/labelstudio/src/pages/Settings/StorageSettings/providers/gcs.ts b/web/apps/labelstudio/src/pages/Settings/StorageSettings/providers/gcs.ts new file mode 100644 index 000000000000..59fcda06bfb1 --- /dev/null +++ b/web/apps/labelstudio/src/pages/Settings/StorageSettings/providers/gcs.ts @@ -0,0 +1,69 @@ +import { z } from "zod"; +import type { ProviderConfig } from "@humansignal/app-common/blocks/StorageProviderForm/types/provider"; +import { IconCloudProviderGCS } from "@humansignal/icons"; + +export const gcsProvider: ProviderConfig = { + name: "gcs", + title: "Google Cloud Storage", + description: "Configure your Google Cloud Storage connection with all required Label Studio settings", + icon: IconCloudProviderGCS, + fields: [ + { + name: "bucket", + type: "text", + label: "Bucket Name", + required: true, + schema: z.string().min(1, "Bucket name is required"), + }, + { + name: "google_application_credentials", + type: "password", + label: "Google Application Credentials", + description: "Paste the contents of credentials.json in this field OR leave it blank to use ADC.", + autoComplete: "new-password", + accessKey: true, + schema: z.string().optional().default(""), // JSON validation could be added if needed + }, + { + name: "google_project_id", + type: "text", + label: "Google Project ID", + description: "Leave blank to inherit from Google Application Credentials.", + schema: z.string().optional().default(""), + }, + { + name: "presign", + type: "toggle", + label: "Use pre-signed URLs (On) / Proxy through the platform (Off)", + description: + "When pre-signed URLs are enabled, all data bypasses the platform and user browsers directly read data from storage", + schema: z.boolean().default(true), + }, + { + name: "presign_ttl", + type: "counter", + label: "Expire pre-signed URLs (minutes)", + min: 1, + max: 10080, + step: 1, + schema: z.number().min(1).max(10080).default(15), + // dependency: "presign" // Not implemented in UI yet + }, + ], + layout: [ + { + fields: ["bucket"], + }, + { + fields: ["google_application_credentials"], + }, + { + fields: ["google_project_id"], + }, + { + fields: ["presign", "presign_ttl"], + }, + ], +}; + +export default gcsProvider; diff --git a/web/apps/labelstudio/src/pages/Settings/StorageSettings/providers/index.ts b/web/apps/labelstudio/src/pages/Settings/StorageSettings/providers/index.ts new file mode 100644 index 000000000000..0795f79cf0d3 --- /dev/null +++ b/web/apps/labelstudio/src/pages/Settings/StorageSettings/providers/index.ts @@ -0,0 +1,13 @@ +import azureProvider from "./azure"; +import gcsProvider from "./gcs"; +import localFilesProvider from "./localFiles"; +import redisProvider from "./redis"; +import { s3Provider } from "./s3"; + +export const providers = { + s3: s3Provider, + gcs: gcsProvider, + azure: azureProvider, + redis: redisProvider, + localfiles: localFilesProvider, +}; diff --git a/web/apps/labelstudio/src/pages/Settings/StorageSettings/providers/localFiles.ts b/web/apps/labelstudio/src/pages/Settings/StorageSettings/providers/localFiles.ts new file mode 100644 index 000000000000..926859ccf4bc --- /dev/null +++ b/web/apps/labelstudio/src/pages/Settings/StorageSettings/providers/localFiles.ts @@ -0,0 +1,27 @@ +import { z } from "zod"; +import type { ProviderConfig } from "@humansignal/app-common/blocks/StorageProviderForm/types/provider"; +import { IconDocument } from "@humansignal/icons"; + +export 
const localFilesProvider: ProviderConfig = { + name: "localfiles", + title: "Local Files", + description: "Configure your local file storage connection with all required Label Studio settings", + icon: IconDocument, + fields: [ + { + name: "path", + type: "text", + label: "Absolute local path", + required: true, + placeholder: "/data/my-folder/", + schema: z.string().min(1, "Path is required"), + }, + ], + layout: [ + { + fields: ["path"], + }, + ], +}; + +export default localFilesProvider; diff --git a/web/apps/labelstudio/src/pages/Settings/StorageSettings/providers/redis.ts b/web/apps/labelstudio/src/pages/Settings/StorageSettings/providers/redis.ts new file mode 100644 index 000000000000..3f5d124fa526 --- /dev/null +++ b/web/apps/labelstudio/src/pages/Settings/StorageSettings/providers/redis.ts @@ -0,0 +1,48 @@ +import { z } from "zod"; +import type { ProviderConfig } from "@humansignal/app-common/blocks/StorageProviderForm/types/provider"; +import { IconCloudProviderRedis } from "@humansignal/icons"; + +export const redisProvider: ProviderConfig = { + name: "redis", + title: "Redis Storage", + description: "Configure your Redis storage connection with all required Label Studio settings", + icon: IconCloudProviderRedis, + fields: [ + { + name: "db", + type: "text", + label: "Database Number (db)", + placeholder: "1", + schema: z.string().default("1"), + }, + { + name: "password", + type: "password", + label: "Password", + autoComplete: "new-password", + placeholder: "Your redis password", + schema: z.string().optional().default(""), + }, + { + name: "host", + type: "text", + label: "Host", + placeholder: "redis://example.com", + schema: z.string(), + }, + { + name: "port", + type: "text", + label: "Port", + placeholder: "6379", + schema: z.string().default("6379"), + }, + ], + layout: [ + { + fields: ["host", "port", "db", "password"], + }, + ], +}; + +export default redisProvider; diff --git a/web/apps/labelstudio/src/pages/Settings/StorageSettings/providers/s3.ts b/web/apps/labelstudio/src/pages/Settings/StorageSettings/providers/s3.ts new file mode 100644 index 000000000000..1e1a6de29a5d --- /dev/null +++ b/web/apps/labelstudio/src/pages/Settings/StorageSettings/providers/s3.ts @@ -0,0 +1,93 @@ +import { z } from "zod"; +import type { ProviderConfig } from "@humansignal/app-common/blocks/StorageProviderForm/types/provider"; +import { IconCloudProviderS3 } from "@humansignal/icons"; + +export const s3Provider: ProviderConfig = { + name: "s3", + title: "Amazon S3", + description: "Configure your AWS S3 connection with all required Label Studio settings", + icon: IconCloudProviderS3, + fields: [ + { + name: "bucket", + type: "text", + label: "Bucket Name", + required: true, + placeholder: "my-storage-bucket", + schema: z.string().min(1, "Bucket name is required"), + }, + { + name: "region_name", + type: "text", + label: "Region Name", + placeholder: "us-east-1", + schema: z.string().optional().default(""), + }, + { + name: "s3_endpoint", + type: "text", + label: "S3 Endpoint", + placeholder: "https://s3.amazonaws.com", + schema: z.string().optional().default(""), + }, + { + name: "aws_access_key_id", + type: "password", + label: "Access Key ID", + required: true, + placeholder: "AKIAIOSFODNN7EXAMPLE", + autoComplete: "off", + accessKey: true, + schema: z.string().min(1, "Access Key ID is required"), + }, + { + name: "aws_secret_access_key", + type: "password", + label: "Secret Access Key", + required: true, + placeholder: "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY", + autoComplete: 
"new-password", + accessKey: true, + schema: z.string().min(1, "Secret Access Key is required"), + }, + { + name: "aws_session_token", + type: "password", + label: "Session Token", + placeholder: "Session token (optional)", + autoComplete: "new-password", + schema: z.string().optional().default(""), + }, + { + name: "presign", + type: "toggle", + label: "Use pre-signed URLs (On) / Proxy through the platform (Off)", + description: + "When pre-signed URLs are enabled, all data bypasses the platform and user browsers directly read data from storage", + schema: z.boolean().default(true), + }, + { + name: "presign_ttl", + type: "counter", + label: "Expire pre-signed URLs (minutes)", + min: 1, + max: 10080, + step: 1, + schema: z.number().min(1).max(10080).default(15), + }, + ], + layout: [ + { + fields: ["bucket"], + }, + { + fields: ["region_name", "s3_endpoint"], + }, + { + fields: ["aws_access_key_id", "aws_secret_access_key", "aws_session_token"], + }, + { + fields: ["presign", "presign_ttl"], + }, + ], +}; diff --git a/web/apps/labelstudio/src/providers/ApiProvider.tsx b/web/apps/labelstudio/src/providers/ApiProvider.tsx index 3b9674ffd35b..98510d14337b 100644 --- a/web/apps/labelstudio/src/providers/ApiProvider.tsx +++ b/web/apps/labelstudio/src/providers/ApiProvider.tsx @@ -128,6 +128,7 @@ const handleError = async ( const errorDetails = errorFormatter(result); // Allow inline error handling + console.log(showGlobalError); if (!showGlobalError) { return errorDetails.isShutdown; } diff --git a/web/libs/app-common/src/blocks/StorageProviderForm/README.md b/web/libs/app-common/src/blocks/StorageProviderForm/README.md new file mode 100644 index 000000000000..a0f08ad72573 --- /dev/null +++ b/web/libs/app-common/src/blocks/StorageProviderForm/README.md @@ -0,0 +1,216 @@ +# Storage Provider Configuration System + +This system allows you to easily add new storage providers by defining their configuration in a declarative way. + +## How to Add a New Provider + +### 1. Create Provider Configuration + +Create a new file in `providers/` directory (e.g., `providers/myProvider.ts`): + +```typescript +import { z } from "zod"; +import { ProviderConfig } from "../types/provider"; + +export const myProvider: ProviderConfig = { + name: "myprovider", + title: "My Storage Provider", + description: "Configure your My Storage Provider connection", + fields: [ + { + name: "api_key", + type: "password", + label: "API Key", + required: true, + placeholder: "Enter your API key", + schema: z.string().min(1, "API Key is required"), + }, + { + name: "endpoint", + type: "text", + label: "API Endpoint", + required: true, + placeholder: "https://api.mystorage.com", + schema: z.string().url("Must be a valid URL"), + }, + { + name: "use_ssl", + type: "toggle", + label: "Use SSL", + description: "Enable SSL for secure connections", + schema: z.boolean().default(true), // Default value defined in schema + }, + { + name: "timeout", + type: "counter", + label: "Connection Timeout (seconds)", + min: 1, + max: 300, + step: 5, + schema: z.number().min(1).max(300).default(30), // Default value defined in schema + }, + ], + layout: [ + { + fields: ["api_key"], + }, + { + fields: ["endpoint"], + }, + { + fields: ["use_ssl", "timeout"], + }, + ], +}; +``` + +### 2. 
Register the Provider + +Add your provider to the registry in `providers/index.ts`: + +```typescript +import { myProvider } from "./myProvider"; + +export const providerRegistry: Record<string, ProviderConfig> = { + s3: s3Provider, + gcs: gcsProvider, + azure: azureProvider, + redis: redisProvider, + localfiles: localFilesProvider, + myprovider: myProvider, // Add your provider here +}; +``` + +## Field Types + +### Available Field Types + +- `text`: Regular text input +- `password`: Password input (hidden) +- `number`: Numeric input +- `select`: Dropdown selection +- `toggle`: Boolean toggle switch +- `counter`: Numeric counter with min/max +- `textarea`: Multi-line text input + +### Field Properties + +```typescript +{ + name: string; // Field name (used in form data) + type: FieldType; // Field type (see above) + label: string; // Display label + description?: string; // Help text + placeholder?: string; // Placeholder text + required?: boolean; // Whether field is required + schema: z.ZodTypeAny; // Zod validation schema with defaults + options?: Array<{ value: string; label: string }>; // For select fields + min?: number; // For number/counter fields + max?: number; // For number/counter fields + step?: number; // For number/counter fields + autoComplete?: string; // For input fields + gridCols?: number; // How many columns this field should span (1-12) +} +``` + +## Default Values + +Default values are defined directly in the Zod schema using `.default()`: + +```typescript +{ + name: "use_ssl", + type: "toggle", + label: "Use SSL", + schema: z.boolean().default(true), // Default: true +}, +{ + name: "timeout", + type: "counter", + label: "Connection Timeout", + schema: z.number().min(1).max(300).default(30), // Default: 30 +}, +{ + name: "region", + type: "select", + label: "Region", + schema: z.string().default("us-east-1"), // Default: "us-east-1" +}, +``` + +The system automatically extracts these default values from the schemas using the `extractDefaultValues()` function. + +## Layout Configuration + +The `layout` array defines how fields are arranged in rows: + +```typescript +layout: [ + { + fields: ["field1"], // Single field on one row + }, + { + fields: ["field2", "field3"], // Two fields on the same row + }, + { + fields: ["field4"], // Another single field + }, +] +``` + +## Validation + +Each field includes a Zod schema for validation: + +```typescript +{ + name: "api_key", + type: "password", + label: "API Key", + required: true, + schema: z.string().min(1, "API Key is required"), +} +``` + +The system automatically assembles all field schemas into a complete validation schema for the entire form. + +## Helper Functions + +The system provides several helper functions: + +- `getProviderConfig(providerName)`: Get provider configuration +- `getProviderSchema(providerName)`: Get validation schema for provider +- `getProviderDefaultValues(providerName)`: Get default values for provider +- `extractDefaultValues(fields)`: Extract defaults from field schemas + +## Example: Complete Provider + +See `providers/example.ts` for a complete example of a new provider configuration. + +## Benefits + +1. **Declarative**: Define providers in a simple configuration object +2. **Type-safe**: Full TypeScript support with Zod validation +3. **Flexible**: Easy to add new field types and layouts +4. **Maintainable**: All provider logic in one place +5. **Consistent**: All providers follow the same structure +6. **Extensible**: Easy to add new features to all providers at once +7.
**Single Source of Truth**: Default values are defined in the schema, not separately + +## Migration from Old System + +The old system used hardcoded React components for each provider. The new system: + +1. Uses a generic `ProviderForm` component +2. Renders fields based on configuration +3. Handles validation automatically +4. Makes adding new providers much easier +5. Uses Zod schemas for both validation and defaults + +To migrate an existing provider: + +1. Extract field definitions from the old component +2. Create a new provider configuration file +3. Define the layout and validation schemas with defaults +4. Register the provider in the registry +5. Remove the old component file \ No newline at end of file diff --git a/web/libs/app-common/src/blocks/StorageProviderForm/Steps/index.ts b/web/libs/app-common/src/blocks/StorageProviderForm/Steps/index.ts new file mode 100644 index 000000000000..ea64abfbfdde --- /dev/null +++ b/web/libs/app-common/src/blocks/StorageProviderForm/Steps/index.ts @@ -0,0 +1,5 @@ +export { Stepper } from "./stepper"; +export { ProviderSelectionStep } from "./provider-selection-step"; +export { ProviderDetailsStep } from "./provider-details-step"; +export { PreviewStep } from "./preview-step"; +export { ReviewStep } from "./review-step"; diff --git a/web/libs/app-common/src/blocks/StorageProviderForm/Steps/preview-step.tsx b/web/libs/app-common/src/blocks/StorageProviderForm/Steps/preview-step.tsx new file mode 100644 index 000000000000..2bf7cbf3c941 --- /dev/null +++ b/web/libs/app-common/src/blocks/StorageProviderForm/Steps/preview-step.tsx @@ -0,0 +1,318 @@ +import { Label, Toggle, Select } from "@humansignal/ui"; +import { Form, Input } from "apps/labelstudio/src/components/Form"; +import { IconDocument, IconSearch } from "@humansignal/icons"; +import { formatDistanceToNow } from "date-fns"; +import type { ForwardedRef } from "react"; +import { InlineError } from "apps/labelstudio/src/components/Error/InlineError"; + +interface PreviewStepProps { + formData: any; + formState: any; + setFormState: (updater: (prevState: any) => any) => void; + handleChange: (e: React.ChangeEvent) => void; + action: string; + target: string; + type: string; + project: string; + storage?: any; + onSubmit: () => void; + formRef: ForwardedRef<any>; + filesPreview: any[] | null; + formatSize: (bytes: number) => string; + onImportSettingsChange?: () => void; +} + +const regexFilters = [ + { + title: "Images", + regex: ".*\\.(jpe?g|png|gif)$", + blob: true, + }, + { + title: "Videos", + regex: ".*\\.(mp4|avi|mov|wmv|webm)$", + blob: true, + }, + { + title: "Audio", + regex: ".*\\.(mp3|wav|ogg|flac)$", + blob: true, + }, + { + title: "Tabular", + regex: ".*\\.(csv|tsv)$", + blob: true, + }, + { + title: "JSON", + regex: ".*\\.json$", + blob: false, + }, + { + title: "JSONL", + regex: ".*\\.jsonl$", + blob: false, + }, + { + title: "Parquet", + regex: ".*\\.parquet$", + blob: false, + }, + { + title: "All Tasks Files", + regex: ".*\\.(json|jsonl|parquet)$", + blob: false, + }, +] as const; + +export const PreviewStep = ({ + formData, + formState, + setFormState, + handleChange, + action, + target, + type, + project, + storage, + onSubmit, + formRef, + filesPreview, + formatSize, + onImportSettingsChange, +}: PreviewStepProps) => { + return (
+ {/* [JSX markup lost in extraction] Two-column layout: heading "Configure Import Settings & Preview Data" with subtitle "Set up filters for your files and preview what will be synchronized"; column headers "Import Configuration" and "Files Preview" (the latter with its header button). The left column holds a path/bucket-prefix section (hidden for localfiles, which has its own path field), a file-filter section driven by regexFilters, and an import-method section; the right column renders the files preview. */}
+ ); +}; diff --git a/web/libs/app-common/src/blocks/StorageProviderForm/Steps/provider-selection-step.tsx b/web/libs/app-common/src/blocks/StorageProviderForm/Steps/provider-selection-step.tsx new file mode 100644 index 000000000000..93e82deac407 --- /dev/null +++ b/web/libs/app-common/src/blocks/StorageProviderForm/Steps/provider-selection-step.tsx @@ -0,0 +1,53 @@ +import { Label } from "@humansignal/ui"; +import { useEffect } from "react"; +import { ProviderGrid } from "../components"; +import type { ProviderConfig } from "../types/provider"; + +interface ProviderSelectionStepProps { + formData: { + provider: string; + }; + errors: { + provider?: string; + }; + handleSelectChange: (name: string, value: string) => void; + setFormState: (updater: (prevState: any) => any) => void; + storageTypesLoading?: boolean; + target?: "import" | "export"; + providers: Record<string, ProviderConfig>; +} + +export const ProviderSelectionStep = ({ + formData, + errors, + handleSelectChange, + providers, +}: ProviderSelectionStepProps) => { + // Set default provider if none is selected and we have options + useEffect(() => { + if (!formData.provider && Object.entries(providers).length > 0) { + handleSelectChange("provider", Object.values(providers)[0].name); + } + }, [providers, formData.provider, handleSelectChange]); + + return (
+ {/* [JSX markup lost in extraction] Heading "Choose your cloud storage provider" with subtitle "Select the cloud storage service where your data is stored", followed by a ProviderGrid over providers with the current formData.provider selected, onSelect={(providerName) => handleSelectChange("provider", providerName)} and error={errors.provider}. */}
+ ); +}; diff --git a/web/libs/app-common/src/blocks/StorageProviderForm/Steps/review-step.tsx b/web/libs/app-common/src/blocks/StorageProviderForm/Steps/review-step.tsx new file mode 100644 index 000000000000..3f3b56acf122 --- /dev/null +++ b/web/libs/app-common/src/blocks/StorageProviderForm/Steps/review-step.tsx @@ -0,0 +1,107 @@ +interface ReviewStepProps { + formData: any; + filesPreview?: any; + formatSize?: (bytes: number) => string; +} + +export const ReviewStep = ({ formData, filesPreview, formatSize }: ReviewStepProps) => { + const getProviderDisplayName = (provider: string) => { + const providerMap: Record<string, string> = { + s3: "Amazon S3", + gcs: "Google Cloud Storage", + azure: "Azure Blob Storage", + redis: "Redis", + localfiles: "Local Files", + }; + return providerMap[provider] || provider; + }; + + const getBucketName = () => { + return formData.bucket || formData.container || "Not specified"; + }; + + const getFileCount = () => { + if (!filesPreview) return "0 files"; + + // Check if the last file is the "preview limit reached" indicator + const lastFile = filesPreview[filesPreview.length - 1]; + const hasMoreFiles = lastFile && lastFile.key === null; + + if (hasMoreFiles) { + // Subtract 1 to exclude the placeholder file + const visibleFileCount = filesPreview.length - 1; + return `More than ${visibleFileCount} files`; + } + + return `${filesPreview.length} files`; + }; + + const getTotalSize = () => { + if (!filesPreview || !formatSize) return "0 Bytes"; + + // Check if the last file is the "preview limit reached" indicator + const lastFile = filesPreview[filesPreview.length - 1]; + const hasMoreFiles = lastFile && lastFile.key === null; + + // Calculate total size excluding the placeholder file if it exists + const filesToCount = hasMoreFiles ? filesPreview.slice(0, -1) : filesPreview; + const totalBytes = filesToCount.reduce((sum: number, file: any) => sum + (file.size || 0), 0); + + if (hasMoreFiles) { + return `More than ${formatSize(totalBytes)}`; + } + + return formatSize(totalBytes); + }; + + return (
+ {/* [JSX markup lost in extraction] Heading "Ready to Connect" with subtitle "Review your connection details and confirm to start importing". Connection-details rows: "Provider" shows getProviderDisplayName(formData.provider); "Storage Location" shows getBucketName(); "Prefix" shows formData.prefix (only when set); when filesPreview is present, "Files to import" shows getFileCount() and "Total size" shows getTotalSize(). A final "Import Process" section reads: "Files will be imported in the background. You can continue working while the import is in progress." */}
+ ); +}; diff --git a/web/libs/app-common/src/blocks/StorageProviderForm/Steps/stepper.tsx b/web/libs/app-common/src/blocks/StorageProviderForm/Steps/stepper.tsx new file mode 100644 index 000000000000..afd958287ee9 --- /dev/null +++ b/web/libs/app-common/src/blocks/StorageProviderForm/Steps/stepper.tsx @@ -0,0 +1,116 @@ +import { cn } from "@humansignal/ui"; + +interface StepperProps { + steps: { title: string }[]; + currentStep: number; + onStepClick?: (stepIndex: number) => void; + isEditMode?: boolean; +} + +const MAX_STEPS_COUNT = 4; + +export const Stepper = ({ steps, currentStep, onStepClick, isEditMode = false }: StepperProps) => { + // Calculate progress that aligns with circle centers + const calculateProgressWidth = () => { + if (currentStep === 0) return 0; + if (currentStep >= steps.length - 1) return 100; + + // Calculate the position of the current step circle (left edge) + const stepWidth = 100 / MAX_STEPS_COUNT; + // Stop at the current step's circle, not extend to the next step + const progressToCurrentStep = currentStep * stepWidth; + + return Math.max(0, Math.min(100, progressToCurrentStep)); + }; + + const handleStepClick = (stepIndex: number) => { + // In edit mode, allow clicking on all steps + // In create mode, only allow clicking on completed steps + if (onStepClick && (isEditMode || stepIndex < currentStep)) { + onStepClick(stepIndex); + } + }; + + return ( +
+ {/* [JSX markup lost in extraction] Step titles row: each step.title is rendered, highlighted ("text-primary-content font-semibold" vs "text-neutral-content-subtle") once currentStep >= index, and clickable via handleStepClick in edit mode or for completed steps. Below, a progress bar whose fill width comes from calculateProgressWidth(), with step circles positioned along it: completed circles render a check icon, the rest the 1-based step number; completed/current/upcoming circle styling and clickability follow the isEditMode rules enforced by handleStepClick. */}
+ ); +}; diff --git a/web/libs/app-common/src/blocks/StorageProviderForm/atoms.ts b/web/libs/app-common/src/blocks/StorageProviderForm/atoms.ts new file mode 100644 index 000000000000..f4c01c9826eb --- /dev/null +++ b/web/libs/app-common/src/blocks/StorageProviderForm/atoms.ts @@ -0,0 +1,28 @@ +import { atom } from "jotai"; + +export interface FormState { + currentStep: number; + formData: { + project: number; + provider: string; + title: string; + use_blob_urls: boolean; + recursive_scan: boolean; + regex_filter: string; + [key: string]: any; + }; + isComplete: boolean; +} + +export const formStateAtom = atom({ + currentStep: 0, + formData: { + project: 0, + provider: "s3", + title: "", + use_blob_urls: false, + recursive_scan: true, + regex_filter: "", + }, + isComplete: false, +}); diff --git a/web/libs/app-common/src/blocks/StorageProviderForm/components/field-renderer.tsx b/web/libs/app-common/src/blocks/StorageProviderForm/components/field-renderer.tsx new file mode 100644 index 000000000000..68900c7fb06d --- /dev/null +++ b/web/libs/app-common/src/blocks/StorageProviderForm/components/field-renderer.tsx @@ -0,0 +1,165 @@ +import type React from "react"; +import { Label, Toggle, Select } from "@humansignal/ui"; +import Counter from "apps/labelstudio/src/components/Form/Elements/Counter/Counter"; +import Input from "apps/labelstudio/src/components/Form/Elements/Input/Input"; +import type { FieldDefinition } from "../types/common"; + +interface FieldRendererProps { + field: FieldDefinition; + value: any; + onChange: (name: string, value: any) => void; + onBlur?: (name: string, value: any) => void; + error?: string; + isEditMode?: boolean; +} + +export const FieldRenderer: React.FC = ({ + field, + value, + onChange, + onBlur, + error, + isEditMode = false, +}) => { + const handleInputChange = (e: React.ChangeEvent) => { + const { name, value: inputValue, type } = e.target; + const parsedValue = type === "number" ? Number(inputValue) : inputValue; + onChange(name, parsedValue); + }; + + const handleInputBlur = (e: React.FocusEvent) => { + if (onBlur) { + const { name, value: inputValue, type } = e.target; + const parsedValue = type === "number" ? Number(inputValue) : inputValue; + onBlur(name, parsedValue); + } + }; + + const handleToggleChange = (checked: boolean) => { + onChange(field.name, checked); + }; + + const handleSelectChange = (value: string) => { + onChange(field.name, value); + }; + + const handleCounterChange = (e: any) => { + onChange(field.name, Number(e.target.value)); + }; + + // Common props for Input component + const getInputProps = () => ({ + validate: "", + skip: false, + labelProps: {}, + ghost: false, + tooltip: "", + tooltipIcon: null, + required: field.required, + label: field.label, + description: field.description || "", + footer: error ?
error /* [wrapper markup lost in extraction] */ : "", + className: error ? "border-red-500" : "", + placeholder: field.placeholder, + autoComplete: field.autoComplete, + }); + + // Enhanced description for access key fields in edit mode + const getEnhancedDescription = () => { + return field.description || ""; + }; + + switch (field.type) { + case "text": + case "password": + return ( {/* [Input markup lost in extraction] */} ); + + case "number": + return ( {/* [numeric Input markup lost in extraction] */} ); + + case "textarea": + return ( {/* [textarea markup lost in extraction] */} ); + + case "select": + return (