Skip to content

Commit 2952d33

Browse files
authored
MRG: Merge pull request #525 from octue/fix/fix-serialisation-of-datasets-instantiated-from-files
Fix serialisation of datasets instantiated from files
2 parents 476c930 + 642b5e4 commit 2952d33

File tree

9 files changed

+650
-439
lines changed

9 files changed

+650
-439
lines changed

docs/source/inter_service_compatibility.rst

Lines changed: 109 additions & 105 deletions
Large diffs are not rendered by default.

octue/metadata/recorded_questions.jsonl

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -50,3 +50,4 @@
5050
{"parent_sdk_version": "0.17.0", "question": {"data": "{\"input_values\": {\"height\": 4, \"width\": 72}, \"input_manifest\": \"{\\n \\\"datasets\\\": {\\n \\\"my_dataset\\\": \\\"/var/folders/sk/hf5fbp616c77tsys9lz55qn40000gp/T/tmp5az2xpjx\\\"\\n },\\n \\\"id\\\": \\\"9c7cde00-f78f-4c01-b46f-80dd738ab95f\\\",\\n \\\"name\\\": null\\n}\"}", "attributes": {"question_uuid": "b9a02ab8-c5e2-4182-9979-f65923beef8e", "forward_logs": "1"}}}
5151
{"parent_sdk_version": "0.16.0", "question": {"data": "{\"input_values\": {\"height\": 4, \"width\": 72}, \"input_manifest\": \"{\\n \\\"datasets\\\": {\\n \\\"my_dataset\\\": {\\n \\\"files\\\": [\\n {\\n \\\"_cloud_metadata\\\": {},\\n \\\"cloud_path\\\": null,\\n \\\"id\\\": \\\"d04f8c48-c0b4-4ed6-a5ce-308d69ab0d3f\\\",\\n \\\"labels\\\": [],\\n \\\"name\\\": \\\"a_test_file.csv\\\",\\n \\\"path\\\": \\\"/var/folders/sk/hf5fbp616c77tsys9lz55qn40000gp/T/tmphspb3xz1/path-within-dataset/a_test_file.csv\\\",\\n \\\"tags\\\": {},\\n \\\"timestamp\\\": null\\n },\\n {\\n \\\"_cloud_metadata\\\": {},\\n \\\"cloud_path\\\": null,\\n \\\"id\\\": \\\"b3492362-59b3-4d17-b08f-b88ebd1bd2cd\\\",\\n \\\"labels\\\": [],\\n \\\"name\\\": \\\"another_test_file.csv\\\",\\n \\\"path\\\": \\\"/var/folders/sk/hf5fbp616c77tsys9lz55qn40000gp/T/tmphspb3xz1/path-within-dataset/another_test_file.csv\\\",\\n \\\"tags\\\": {},\\n \\\"timestamp\\\": null\\n }\\n ],\\n \\\"id\\\": \\\"59eb7ee8-3908-4a23-8c08-568b0f6d24dd\\\",\\n \\\"labels\\\": [],\\n \\\"name\\\": \\\"tmphspb3xz1\\\",\\n \\\"path\\\": \\\"/var/folders/sk/hf5fbp616c77tsys9lz55qn40000gp/T/tmphspb3xz1\\\",\\n \\\"tags\\\": {}\\n }\\n },\\n \\\"id\\\": \\\"6a9c77d9-3f25-4fa6-8657-e3c44761c624\\\",\\n \\\"keys\\\": null,\\n \\\"name\\\": null,\\n \\\"path\\\": \\\".\\\"\\n}\"}", "attributes": {"question_uuid": "694169c2-16a6-488e-bc23-bef05c1cfdac", "forward_logs": "1"}}}
5252
{"parent_sdk_version": "0.38.0", "question": {"data": "{\"input_values\": {\"height\": 4, \"width\": 72}, \"input_manifest\": \"{\\n \\\"datasets\\\": {\\n \\\"my_dataset\\\": \\\"/var/folders/sk/hf5fbp616c77tsys9lz55qn40000gp/T/tmpc_nmpiz2\\\"\\n },\\n \\\"id\\\": \\\"739b1ae5-99f8-4bd0-b344-38f35d74b8a8\\\",\\n \\\"name\\\": null\\n}\"}", "attributes": {"question_uuid": "7bb5e29f-c8de-4fb4-923b-b5c854a24f49", "forward_logs": "1", "allow_save_diagnostics_data_on_crash": "1", "octue_sdk_version": "0.38.0"}}}
53+
{"parent_sdk_version": "0.38.1", "question": {"data": "{\"input_values\": {\"height\": 4, \"width\": 72}, \"input_manifest\": \"{\\n \\\"datasets\\\": {\\n \\\"my_dataset\\\": \\\"/var/folders/sk/hf5fbp616c77tsys9lz55qn40000gp/T/tmpt1uu7685\\\"\\n },\\n \\\"id\\\": \\\"8b3c39f9-5232-4898-998d-934defdc5626\\\",\\n \\\"name\\\": null\\n}\"}", "attributes": {"question_uuid": "a96e6ec5-3798-4160-b45f-6f248834f740", "forward_logs": "1", "allow_save_diagnostics_data_on_crash": "1", "octue_sdk_version": "0.38.1"}}}

octue/metadata/version_compatibilities.json

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

octue/resources/analysis.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -131,7 +131,7 @@ def finalise(self, upload_output_datasets_to=None):
131131
serialised_strands["output_values"] = json.dumps(self.output_values, cls=OctueJSONEncoder)
132132

133133
if self.output_manifest:
134-
serialised_strands["output_manifest"] = self.output_manifest.to_primitive()
134+
serialised_strands["output_manifest"] = self.output_manifest.serialise()
135135

136136
self.twine.validate(**serialised_strands)
137137
self._finalised = True

octue/resources/dataset.py

Lines changed: 13 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -73,6 +73,7 @@ def __init__(
7373
self._recursive = recursive
7474
self._ignore_stored_metadata = ignore_stored_metadata
7575
self._cloud_metadata = {}
76+
self._instantiated_from_files_argument = False
7677

7778
if files:
7879
if not any((isinstance(files, list), isinstance(files, set), isinstance(files, tuple))):
@@ -82,6 +83,7 @@ def __init__(
8283
)
8384

8485
self.files = self._instantiate_datafiles(files)
86+
self._instantiated_from_files_argument = True
8587
return
8688

8789
if storage.path.is_cloud_path(self.path):
@@ -377,13 +379,18 @@ def to_primitive(self, include_files=True):
377379
"""
378380
serialised_dataset = super().to_primitive()
379381

380-
if self.exists_in_cloud:
381-
path_type = "cloud_path"
382-
else:
383-
path_type = "local_path"
384-
385382
if include_files:
386-
serialised_dataset["files"] = sorted(getattr(datafile, path_type) for datafile in self.files)
383+
serialised_dataset["files"] = []
384+
385+
for datafile in self.files:
386+
if datafile.exists_in_cloud:
387+
datafile_path_type = "cloud_path"
388+
else:
389+
datafile_path_type = "local_path"
390+
391+
serialised_dataset["files"].append(getattr(datafile, datafile_path_type))
392+
393+
serialised_dataset["files"] = sorted(serialised_dataset["files"])
387394

388395
return serialised_dataset
389396

octue/resources/manifest.py

Lines changed: 11 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -114,7 +114,14 @@ def to_primitive(self):
114114
:return dict:
115115
"""
116116
self_as_primitive = super().to_primitive()
117-
self_as_primitive["datasets"] = {name: dataset.path for name, dataset in self.datasets.items()}
117+
self_as_primitive["datasets"] = {}
118+
119+
for name, dataset in self.datasets.items():
120+
if dataset._instantiated_from_files_argument:
121+
self_as_primitive["datasets"][name] = dataset.to_primitive()
122+
else:
123+
self_as_primitive["datasets"][name] = dataset.path
124+
118125
return self_as_primitive
119126

120127
def _instantiate_datasets(self, datasets):
@@ -150,9 +157,9 @@ def _instantiate_dataset(self, key_and_dataset):
150157
if isinstance(dataset, str):
151158
return (key, Dataset(path=dataset, recursive=True))
152159

153-
# If `dataset` is a dictionary including a "path" key:
154-
if storage.path.is_cloud_path(dataset["path"]):
155-
return (key, Dataset(path=dataset["path"], recursive=True))
160+
# If `dataset` is a cloud dataset and is represented as a dictionary including a "path" key:
161+
if storage.path.is_cloud_path(dataset.get("path", "")):
162+
return (key, Dataset(path=dataset["path"], files=dataset.get("files"), recursive=True))
156163

157164
return (key, Dataset(**dataset))
158165

0 commit comments

Comments
 (0)