Skip to content

Commit ff2cd38

Browse files
authored
Merge pull request #150 from Labelbox/ms/bulk-export-datarows
[DIAG-101] bulk export queued data rows
2 parents ceb664f + 3285e48 commit ff2cd38

File tree

4 files changed

+96
-41
lines changed

4 files changed

+96
-41
lines changed

labelbox/schema/bulk_import_request.py

Lines changed: 16 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -119,12 +119,12 @@ class BulkImportRequest(DbObject):
119119
def inputs(self) -> List[Dict[str, Any]]:
120120
"""
121121
Inputs for each individual annotation uploaded.
122-
This should match the ndjson annotations that you have uploaded.
123-
122+
This should match the ndjson annotations that you have uploaded.
123+
124124
Returns:
125125
Uploaded ndjson.
126126
127-
* This information will expire after 24 hours.
127+
* This information will expire after 24 hours.
128128
"""
129129
return self._fetch_remote_ndjson(self.input_file_url)
130130

@@ -137,7 +137,7 @@ def errors(self) -> List[Dict[str, Any]]:
137137
List of dicts containing error messages. Empty list means there were no errors
138138
See `BulkImportRequest.statuses` for more details.
139139
140-
* This information will expire after 24 hours.
140+
* This information will expire after 24 hours.
141141
"""
142142
self.wait_until_done()
143143
return self._fetch_remote_ndjson(self.error_file_url)
@@ -150,14 +150,14 @@ def statuses(self) -> List[Dict[str, Any]]:
150150
Returns:
151151
A status for each annotation if the upload is done running.
152152
See below table for more details
153-
153+
154154
.. list-table::
155155
:widths: 15 150
156-
:header-rows: 1
156+
:header-rows: 1
157157
158158
* - Field
159159
- Description
160-
* - uuid
160+
* - uuid
161161
- Specifies the annotation for the status row.
162162
* - dataRow
163163
- JSON object containing the Labelbox data row ID for the annotation.
@@ -166,7 +166,7 @@ def statuses(self) -> List[Dict[str, Any]]:
166166
* - errors
167167
- An array of error messages included when status is FAILURE. Each error has a name, message and optional (key might not exist) additional_info.
168168
169-
* This information will expire after 24 hours.
169+
* This information will expire after 24 hours.
170170
"""
171171
self.wait_until_done()
172172
return self._fetch_remote_ndjson(self.status_file_url)
@@ -390,27 +390,26 @@ def create_from_local_file(cls,
390390

391391
def _validate_ndjson(lines: Iterable[Dict[str, Any]],
392392
project: "labelbox.Project") -> None:
393-
"""
393+
"""
394394
Client side validation of an ndjson object.
395395
396396
Does not guarentee that an upload will succeed for the following reasons:
397397
* We are not checking the data row types which will cause the following errors to slip through
398398
* Missing frame indices will not causes an error for videos
399-
* Uploaded annotations for the wrong data type will pass (Eg. entity on images)
399+
* Uploaded annotations for the wrong data type will pass (Eg. entity on images)
400400
* We are not checking bounds of an asset (Eg. frame index, image height, text location)
401-
401+
402402
Args:
403403
lines (Iterable[Dict[str,Any]]): An iterable of ndjson lines
404404
project (Project): id of project for which predictions will be imported
405-
405+
406406
Raises:
407407
MALValidationError: Raise for invalid NDJson
408408
UuidError: Duplicate UUID in upload
409409
"""
410-
data_row_ids = {
411-
data_row.uid: data_row for dataset in project.datasets()
412-
for data_row in dataset.data_rows()
413-
}
410+
export_url = project.export_queued_data_rows()
411+
data_row_json = ndjson.loads(requests.get(export_url).text)
412+
data_row_ids = {row['id'] for row in data_row_json}
414413
feature_schemas = get_mal_schemas(project.ontology())
415414
uids: Set[str] = set()
416415
for idx, line in enumerate(lines):
@@ -539,7 +538,7 @@ def build(cls: Any, data: Union[dict, BaseModel]) -> "NDBase":
539538
raises:
540539
KeyError: data does not contain the determinant fields for any of the types supported by this SpecialUnion
541540
ValidationError: Error while trying to construct a specific object in the union
542-
541+
543542
"""
544543
if isinstance(data, BaseModel):
545544
data = data.dict()

labelbox/schema/project.py

Lines changed: 49 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from labelbox.schema.data_row import DataRow
1212
from labelbox.orm import query
1313
from labelbox.schema.bulk_import_request import BulkImportRequest
14-
from labelbox.exceptions import InvalidQueryError
14+
from labelbox.exceptions import InvalidQueryError, LabelboxError
1515
from labelbox.orm.db_object import DbObject, Updateable, Deletable
1616
from labelbox.orm.model import Entity, Field, Relationship
1717
from labelbox.pagination import PaginatedCollection
@@ -160,6 +160,39 @@ def labels(self, datasets=None, order_by=None):
160160
return PaginatedCollection(self.client, query_str, {id_param: self.uid},
161161
["project", "labels"], Label)
162162

163+
def export_queued_data_rows(self, timeout_seconds=120):
164+
""" Returns all data rows that are currently enqueued for this project.
165+
166+
Args:
167+
timeout_seconds (float): Max waiting time, in seconds.
168+
Returns:
169+
URL of the data file with this DataRow information. If the server didn't
170+
generate during the `timeout_seconds` period, None is returned.
171+
"""
172+
id_param = "projectId"
173+
query_str = """mutation GetQueuedDataRowsExportUrlPyApi($%s: ID!)
174+
{exportQueuedDataRows(data:{projectId: $%s }) {downloadUrl createdAt status} }
175+
""" % (id_param, id_param)
176+
sleep_time = 2
177+
while True:
178+
res = self.client.execute(query_str, {id_param: self.uid})
179+
res = res["exportQueuedDataRows"]
180+
if res["status"] == "COMPLETE":
181+
return res["downloadUrl"]
182+
elif res["status"] == "FAILED":
183+
raise LabelboxError("Data row export failed.")
184+
185+
timeout_seconds -= sleep_time
186+
if timeout_seconds <= 0:
187+
raise LabelboxError(
188+
f"Unable to export data rows within {timeout_seconds} seconds."
189+
)
190+
191+
logger.debug(
192+
"Project '%s' queued data row export, waiting for server...",
193+
self.uid)
194+
time.sleep(sleep_time)
195+
163196
def export_labels(self, timeout_seconds=60):
164197
""" Calls the server-side Label exporting that generates a JSON
165198
payload, and returns the URL to that payload.
@@ -193,13 +226,13 @@ def export_labels(self, timeout_seconds=60):
193226
time.sleep(sleep_time)
194227

195228
def export_issues(self, status=None):
196-
""" Calls the server-side Issues exporting that
229+
""" Calls the server-side Issues exporting that
197230
returns the URL to that payload.
198231
199232
Args:
200233
status (string): valid values: Open, Resolved
201234
Returns:
202-
URL of the data file with this Project's issues.
235+
URL of the data file with this Project's issues.
203236
"""
204237
id_param = "projectId"
205238
status_param = "status"
@@ -229,14 +262,14 @@ def export_issues(self, status=None):
229262
def upsert_instructions(self, instructions_file: str):
230263
"""
231264
* Uploads instructions to the UI. Running more than once will replace the instructions
232-
265+
233266
Args:
234267
instructions_file (str): Path to a local file.
235268
* Must be either a pdf, text, or html file.
236269
237270
Raises:
238271
ValueError:
239-
* project must be setup
272+
* project must be setup
240273
* instructions file must end with one of ".text", ".txt", ".pdf", ".html"
241274
"""
242275

@@ -267,18 +300,18 @@ def upsert_instructions(self, instructions_file: str):
267300

268301
self.client.execute(
269302
"""mutation UpdateFrontendWithExistingOptionsPyApi (
270-
$frontendId: ID!,
271-
$optionsId: ID!,
272-
$name: String!,
273-
$description: String!,
303+
$frontendId: ID!,
304+
$optionsId: ID!,
305+
$name: String!,
306+
$description: String!,
274307
$customizationOptions: String!
275308
) {
276309
updateLabelingFrontend(
277-
where: {id: $frontendId},
310+
where: {id: $frontendId},
278311
data: {name: $name, description: $description}
279312
) {id}
280313
updateLabelingFrontendOptions(
281-
where: {id: $optionsId},
314+
where: {id: $optionsId},
282315
data: {customizationOptions: $customizationOptions}
283316
) {id}
284317
}""", {
@@ -390,10 +423,10 @@ def validate_labeling_parameter_overrides(self, data):
390423

391424
def set_labeling_parameter_overrides(self, data):
392425
""" Adds labeling parameter overrides to this project.
393-
426+
394427
See information on priority here:
395428
https://docs.labelbox.com/en/configure-editor/queue-system#reservation-system
396-
429+
397430
>>> project.set_labeling_parameter_overrides([
398431
>>> (data_row_1, 2, 3), (data_row_2, 1, 4)])
399432
@@ -407,11 +440,11 @@ def set_labeling_parameter_overrides(self, data):
407440
- Minimum priority is 1.
408441
* Priority is not the queue position.
409442
- The position is determined by the relative priority.
410-
- E.g. [(data_row_1, 5,1), (data_row_2, 2,1), (data_row_3, 10,1)]
443+
- E.g. [(data_row_1, 5,1), (data_row_2, 2,1), (data_row_3, 10,1)]
411444
will be assigned in the following order: [data_row_2, data_row_1, data_row_3]
412445
* Datarows with parameter overrides will appear before datarows without overrides.
413446
* The priority only effects items in the queue.
414-
- Assigning a priority will not automatically add the item back into the queue.
447+
- Assigning a priority will not automatically add the item back into the queue.
415448
Number of labels:
416449
* The number of times a data row should be labeled.
417450
- Creates duplicate data rows in a project (one for each number of labels).
@@ -458,7 +491,7 @@ def unset_labeling_parameter_overrides(self, data_rows):
458491
def upsert_review_queue(self, quota_factor):
459492
""" Sets the the proportion of total assets in a project to review.
460493
461-
More information can be found here:
494+
More information can be found here:
462495
https://docs.labelbox.com/en/quality-assurance/review-labels#configure-review-percentage
463496
464497
Args:

tests/integration/conftest.py

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
from collections import namedtuple
22
from enum import Enum
33
from datetime import datetime
4-
from labelbox.orm.db_object import experimental
54
from random import randint
65
from string import ascii_letters
76
from types import SimpleNamespace
@@ -14,6 +13,7 @@
1413
from labelbox.schema.invite import Invite
1514
from labelbox.pagination import PaginatedCollection
1615
from labelbox.schema.user import User
16+
from labelbox import LabelingFrontend
1717
from labelbox import Client
1818

1919
IMG_URL = "https://picsum.photos/200/300"
@@ -83,7 +83,7 @@ def get_invites(client):
8383
Do not use. Only for testing.
8484
"""
8585
query_str = """query GetOrgInvitationsPyApi($from: ID, $first: PageSize) {
86-
organization { id invites(from: $from, first: $first) {
86+
organization { id invites(from: $from, first: $first) {
8787
nodes { id createdAt organizationRoleName inviteeEmail } nextCursor }}}"""
8888
invites = PaginatedCollection(
8989
client,
@@ -199,13 +199,13 @@ def organization(client):
199199
def project_based_user(client, rand_gen):
200200
email = rand_gen(str)
201201
# Use old mutation because it doesn't require users to accept email invites
202-
query_str = """mutation MakeNewUserPyApi {
203-
addMembersToOrganization(
204-
data: {
205-
emails: ["%[email protected]"],
202+
query_str = """mutation MakeNewUserPyApi {
203+
addMembersToOrganization(
204+
data: {
205+
emails: ["%[email protected]"],
206206
orgRoleId: "%s",
207207
projectRoles: []
208-
}
208+
}
209209
) {
210210
newUserId
211211
}
@@ -227,3 +227,17 @@ def project_pack(client):
227227
yield projects
228228
for proj in projects:
229229
proj.delete()
230+
231+
232+
@pytest.fixture
233+
def configured_project(project, client, rand_gen):
234+
dataset = client.create_dataset(name=rand_gen(str), projects=project)
235+
dataset.create_data_row(row_data=IMG_URL)
236+
editor = list(
237+
project.client.get_labeling_frontends(
238+
where=LabelingFrontend.name == "editor"))[0]
239+
empty_ontology = {"tools": [], "classifications": []}
240+
project.setup(editor, empty_ontology)
241+
yield project
242+
dataset.delete()
243+
project.delete()

tests/integration/test_project.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
1-
import pytest
21
import json
2+
33
import requests
4+
import ndjson
5+
import pytest
46

57
from labelbox import Project, LabelingFrontend
68
from labelbox.exceptions import InvalidQueryError
@@ -100,3 +102,10 @@ def test_attach_instructions(client, project):
100102
with pytest.raises(ValueError) as exc_info:
101103
project.upsert_instructions('/tmp/file.invalid_file_extension')
102104
assert "instructions_file must end with one of" in str(exc_info.value)
105+
106+
107+
def test_queued_data_row_export(configured_project):
108+
url = configured_project.export_queued_data_rows()
109+
assert url
110+
result = ndjson.loads(requests.get(url).text)
111+
assert len(result) == 1

0 commit comments

Comments
 (0)