Skip to content

Commit 267f3ed

Browse files
authored
MRG: Merge pull request #445 from octue/enhancement/add-simpler-dataset-instantiation
Unify cloud and local dataset instantiation
2 parents 280bb5f + 4a356d1 commit 267f3ed

File tree

7 files changed

+233
-157
lines changed

7 files changed

+233
-157
lines changed

octue/resources/dataset.py

Lines changed: 104 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import logging
66
import os
77
import tempfile
8+
import warnings
89

910
import coolname
1011
import requests
@@ -40,6 +41,7 @@ class Dataset(Labelable, Taggable, Serialisable, Identifiable, Hashable, Metadat
4041
:param str|None path:
4142
:param dict|octue.resources.tag.TagDict|None tags:
4243
:param iter(str)|octue.resources.label.LabelSet|None labels:
44+
:param bool recursive: if `True`, include in the dataset all files in the subdirectories recursively contained within the dataset directory
4345
:param bool hypothetical: if `True`, ignore any metadata stored for this dataset locally or in the cloud and use whatever is given at instantiation
4446
:return None:
4547
"""
@@ -50,10 +52,45 @@ class Dataset(Labelable, Taggable, Serialisable, Identifiable, Hashable, Metadat
5052
# Paths to files are added to the serialisation in `Dataset.to_primitive`.
5153
_SERIALISE_FIELDS = (*_METADATA_ATTRIBUTES, "path")
5254

53-
def __init__(self, files=None, name=None, id=None, path=None, tags=None, labels=None):
55+
def __init__(
56+
self,
57+
files=None,
58+
name=None,
59+
id=None,
60+
path=None,
61+
tags=None,
62+
labels=None,
63+
recursive=False,
64+
hypothetical=False,
65+
):
5466
super().__init__(name=name, id=id, tags=tags, labels=labels)
5567
self.path = path or os.getcwd()
56-
self.files = self._instantiate_datafiles(files or [])
68+
self.files = FilterSet()
69+
70+
if files:
71+
self.files = self._instantiate_datafiles(files)
72+
return
73+
74+
if storage.path.is_cloud_path(self.path):
75+
self._instantiate_from_cloud(path=self.path, recursive=recursive, hypothetical=hypothetical)
76+
else:
77+
self._instantiate_from_local_directory(path=self.path, recursive=recursive, hypothetical=hypothetical)
78+
79+
if hypothetical:
80+
logger.debug("Ignored stored metadata for %r.", self)
81+
return
82+
83+
if self.metadata(include_sdk_version=False) != {
84+
"name": name or self.name,
85+
"id": id or self.id,
86+
"tags": TagDict(tags),
87+
"labels": LabelSet(labels),
88+
}:
89+
logger.warning(
90+
"Overriding metadata given at instantiation with stored metadata for %r - set `hypothetical` to `True` "
91+
"at instantiation to avoid this.",
92+
self,
93+
)
5794

5895
@classmethod
5996
def from_local_directory(cls, path_to_directory, recursive=False, hypothetical=False, **kwargs):
@@ -66,26 +103,13 @@ def from_local_directory(cls, path_to_directory, recursive=False, hypothetical=F
66103
:param kwargs: other keyword arguments for the `Dataset` instantiation
67104
:return Dataset:
68105
"""
69-
datafiles = FilterSet()
70-
71-
for level, (directory_path, _, filenames) in enumerate(os.walk(path_to_directory)):
72-
for filename in filenames:
73-
74-
if filename == METADATA_FILENAME:
75-
continue
76-
77-
if not recursive and level > 0:
78-
break
79-
80-
datafiles.add(Datafile(path=os.path.join(directory_path, filename)))
81-
82-
dataset = Dataset(path=path_to_directory, files=datafiles, **kwargs)
83-
84-
if not hypothetical:
85-
dataset._use_local_metadata()
106+
warnings.warn(
107+
"The `Dataset.from_local_directory` class method is now deprecated. Please use the `Dataset` constructor "
108+
"instead, passing it the `path`, `recursive`, and `hypothetical` kwargs as necessary.",
109+
category=DeprecationWarning,
110+
)
86111

87-
dataset._warn_about_metadata_override(hypothetical=hypothetical, **kwargs)
88-
return dataset
112+
return Dataset(path=path_to_directory, recursive=recursive, hypothetical=hypothetical, **kwargs)
89113

90114
@classmethod
91115
def from_cloud(
@@ -107,32 +131,16 @@ def from_cloud(
107131
:param kwargs: other keyword arguments for the `Dataset` instantiation
108132
:return Dataset:
109133
"""
134+
warnings.warn(
135+
"The `Dataset.from_cloud` class method is now deprecated. Please use the `Dataset` constructor instead, "
136+
"passing it the `path`, `recursive`, and `hypothetical` kwargs as necessary.",
137+
category=DeprecationWarning,
138+
)
139+
110140
if not cloud_path:
111141
cloud_path = translate_bucket_name_and_path_in_bucket_to_cloud_path(bucket_name, path_to_dataset_directory)
112142

113-
bucket_name = storage.path.split_bucket_name_from_cloud_path(cloud_path)[0]
114-
115-
dataset = Dataset(path=cloud_path, **kwargs)
116-
117-
if not hypothetical:
118-
dataset._use_cloud_metadata()
119-
120-
if not dataset.files:
121-
dataset.files = FilterSet(
122-
Datafile(path=storage.path.generate_gs_path(bucket_name, blob.name))
123-
for blob in GoogleCloudStorageClient().scandir(
124-
cloud_path,
125-
recursive=recursive,
126-
filter=(
127-
lambda blob: (
128-
not blob.name.endswith(METADATA_FILENAME) and SIGNED_METADATA_DIRECTORY not in blob.name
129-
)
130-
),
131-
)
132-
)
133-
134-
dataset._warn_about_metadata_override(hypothetical=hypothetical, **kwargs)
135-
return dataset
143+
return Dataset(path=cloud_path, recursive=recursive, hypothetical=hypothetical, **kwargs)
136144

137145
@property
138146
def name(self):
@@ -434,7 +442,7 @@ def download(iterable_element):
434442
for path in executor.map(download, files_and_paths):
435443
logger.debug("Downloaded datafile to %r.", path)
436444

437-
logger.info("Downloaded %r dataset to %r.", self.name, local_directory)
445+
logger.info("Downloaded %r to %r.", self, local_directory)
438446

439447
def to_primitive(self, include_files=True):
440448
"""Convert the dataset to a dictionary of primitives, converting its files into their paths for a lightweight
@@ -455,6 +463,57 @@ def to_primitive(self, include_files=True):
455463

456464
return serialised_dataset
457465

466+
def _instantiate_from_cloud(self, path, recursive=True, hypothetical=False):
467+
"""Instantiate the dataset from a cloud directory.
468+
469+
:param str path: the cloud path to a directory in cloud storage
470+
:param bool recursive: if `True`, include in the dataset all files in the subdirectories recursively contained within the dataset directory
471+
:param bool hypothetical: if `True`, ignore any metadata stored for this dataset in the cloud and use whatever is given at instantiation
472+
:return None:
473+
"""
474+
if not hypothetical:
475+
self._use_cloud_metadata()
476+
477+
if not self.files:
478+
bucket_name = storage.path.split_bucket_name_from_cloud_path(path)[0]
479+
480+
self.files = FilterSet(
481+
Datafile(path=storage.path.generate_gs_path(bucket_name, blob.name))
482+
for blob in GoogleCloudStorageClient().scandir(
483+
path,
484+
recursive=recursive,
485+
filter=(
486+
lambda blob: (
487+
not blob.name.endswith(METADATA_FILENAME) and SIGNED_METADATA_DIRECTORY not in blob.name
488+
)
489+
),
490+
)
491+
)
492+
493+
def _instantiate_from_local_directory(self, path, recursive=False, hypothetical=False):
494+
"""Instantiate the dataset from a local directory.
495+
496+
:param str path: the path to a local directory
497+
:param bool recursive: if `True`, include in the dataset all files in the subdirectories recursively contained within the dataset directory
498+
:param bool hypothetical: if `True`, ignore any metadata stored for this dataset locally and use whatever is given at instantiation
499+
:return None:
500+
"""
501+
self.files = FilterSet()
502+
503+
for level, (directory_path, _, filenames) in enumerate(os.walk(path)):
504+
for filename in filenames:
505+
506+
if filename == METADATA_FILENAME:
507+
continue
508+
509+
if not recursive and level > 0:
510+
break
511+
512+
self.files.add(Datafile(path=os.path.join(directory_path, filename)))
513+
514+
if not hypothetical:
515+
self._use_local_metadata()
516+
458517
def _instantiate_datafiles(self, files):
459518
"""Instantiate and add the given files to a `FilterSet`.
460519
@@ -561,27 +620,3 @@ def _datafile_path_relative_to_self(self, datafile, path_type):
561620
datafile_path = datafile_path.split("?")[0]
562621

563622
return storage.path.relpath(datafile_path, dataset_path)
564-
565-
def _warn_about_metadata_override(self, hypothetical, **kwargs):
566-
"""Issue a warning about instantiation metadata override if `hypothetical` is `False` and the dataset's metadata
567-
is different from the provided instantiation keyword arguments.
568-
569-
:param bool hypothetical: if `True`, don't raise any warnings.
570-
:param kwargs: the Dataset instantiation keyword arguments
571-
:return None:
572-
"""
573-
if hypothetical:
574-
logger.debug("Ignored stored metadata for %r.", self)
575-
return
576-
577-
if self.metadata(include_sdk_version=False) != {
578-
"name": kwargs.get("name") or self.name,
579-
"id": kwargs.get("id") or self.id,
580-
"tags": TagDict(kwargs.get("tags")),
581-
"labels": LabelSet(kwargs.get("labels")),
582-
}:
583-
logger.warning(
584-
"Overriding metadata given at instantiation with stored metadata for %r - set `hypothetical` to `True` "
585-
"at instantiation to avoid this.",
586-
self,
587-
)

octue/resources/manifest.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ def from_cloud(cls, cloud_path=None, bucket_name=None, path_to_manifest_file=Non
4949

5050
return Manifest(
5151
id=serialised_manifest["id"],
52-
datasets={key: Dataset.from_cloud(dataset) for key, dataset in serialised_manifest["datasets"].items()},
52+
datasets={key: Dataset(path=dataset) for key, dataset in serialised_manifest["datasets"].items()},
5353
)
5454

5555
@property
@@ -153,15 +153,11 @@ def _instantiate_dataset(self, key_and_dataset):
153153

154154
# If `dataset` is just a path to a dataset:
155155
if isinstance(dataset, str):
156-
157-
if storage.path.is_cloud_path(dataset):
158-
return (key, Dataset.from_cloud(cloud_path=dataset, recursive=True))
159-
160-
return (key, Dataset.from_local_directory(path_to_directory=dataset, recursive=True))
156+
return (key, Dataset(path=dataset, recursive=True))
161157

162158
# If `dataset` is a dictionary including a "path" key:
163159
if storage.path.is_cloud_path(dataset["path"]):
164-
return (key, Dataset.from_cloud(cloud_path=dataset["path"], recursive=True))
160+
return (key, Dataset(path=dataset["path"], recursive=True))
165161

166162
return (key, Dataset(**dataset))
167163

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "octue"
3-
version = "0.23.3"
3+
version = "0.23.4"
44
description = "A package providing template applications for data services, and a python SDK to the Octue API."
55
readme = "README.md"
66
authors = ["Thomas Clark <[email protected]>", "cortadocodes <[email protected]>"]

tests/resources/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,4 +15,4 @@ def create_dataset_with_two_files(dataset_directory_path):
1515
with open(path, "w") as f:
1616
f.write(str(data))
1717

18-
return Dataset.from_local_directory(dataset_directory_path)
18+
return Dataset(path=dataset_directory_path)

tests/resources/test_analysis.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ def test_finalise_with_upload(self):
147147
)
148148
)
149149

150-
downloaded_dataset = Dataset.from_cloud(signed_url_for_dataset)
150+
downloaded_dataset = Dataset(path=signed_url_for_dataset)
151151
self.assertEqual(downloaded_dataset.name, "the_dataset")
152152
self.assertEqual(len(downloaded_dataset.files), 1)
153153
self.assertEqual(downloaded_dataset.labels, {"one", "two", "three"})

0 commit comments

Comments
 (0)