@@ -5,6 +5,7 @@
 import logging
 import os
 import tempfile
+import warnings
 
 import coolname
 import requests
@@ -40,6 +41,7 @@ class Dataset(Labelable, Taggable, Serialisable, Identifiable, Hashable, Metadat
         :param str|None path:
         :param dict|octue.resources.tag.TagDict|None tags:
         :param iter(str)|octue.resources.label.LabelSet|None labels:
+        :param bool recursive: if `True`, include all files from the dataset directory's subdirectories, recursively
         :param bool hypothetical: if `True`, ignore any metadata stored for this dataset locally or in the cloud and use whatever is given at instantiation
         :return None:
         """
@@ -50,10 +52,45 @@ class Dataset(Labelable, Taggable, Serialisable, Identifiable, Hashable, Metadat
     # Paths to files are added to the serialisation in `Dataset.to_primitive`.
     _SERIALISE_FIELDS = (*_METADATA_ATTRIBUTES, "path")
 
-    def __init__(self, files=None, name=None, id=None, path=None, tags=None, labels=None):
+    def __init__(
+        self,
+        files=None,
+        name=None,
+        id=None,
+        path=None,
+        tags=None,
+        labels=None,
+        recursive=False,
+        hypothetical=False,
+    ):
         super().__init__(name=name, id=id, tags=tags, labels=labels)
         self.path = path or os.getcwd()
-        self.files = self._instantiate_datafiles(files or [])
+        self.files = FilterSet()
+
+        if files:
+            self.files = self._instantiate_datafiles(files)
+            return
+
+        if storage.path.is_cloud_path(self.path):
+            self._instantiate_from_cloud(path=self.path, recursive=recursive, hypothetical=hypothetical)
+        else:
+            self._instantiate_from_local_directory(path=self.path, recursive=recursive, hypothetical=hypothetical)
+
+        if hypothetical:
+            logger.debug("Ignored stored metadata for %r.", self)
+            return
+
+        if self.metadata(include_sdk_version=False) != {
+            "name": name or self.name,
+            "id": id or self.id,
+            "tags": TagDict(tags),
+            "labels": LabelSet(labels),
+        }:
+            logger.warning(
+                "Overriding metadata given at instantiation with stored metadata for %r - set `hypothetical` to `True` "
+                "at instantiation to avoid this.",
+                self,
+            )
 
     @classmethod
     def from_local_directory(cls, path_to_directory, recursive=False, hypothetical=False, **kwargs):
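With this change, the constructor alone covers both storage backends: `storage.path.is_cloud_path` routes a cloud path to `_instantiate_from_cloud` and anything else to `_instantiate_from_local_directory`. A minimal usage sketch (the directory and bucket names are illustrative):

    # Local directory, including files in subdirectories.
    dataset = Dataset(path="my_dataset_directory", recursive=True)

    # Cloud directory; stored cloud metadata is used unless `hypothetical=True`.
    dataset = Dataset(path="gs://my-bucket/my_dataset", recursive=True)

    # Keep the metadata given here, ignoring anything stored locally or in the cloud.
    dataset = Dataset(path="my_dataset_directory", tags={"foo": "bar"}, hypothetical=True)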
@@ -66,26 +103,13 @@ def from_local_directory(cls, path_to_directory, recursive=False, hypothetical=F
         :param kwargs: other keyword arguments for the `Dataset` instantiation
         :return Dataset:
         """
-        datafiles = FilterSet()
-
-        for level, (directory_path, _, filenames) in enumerate(os.walk(path_to_directory)):
-            for filename in filenames:
-
-                if filename == METADATA_FILENAME:
-                    continue
-
-                if not recursive and level > 0:
-                    break
-
-                datafiles.add(Datafile(path=os.path.join(directory_path, filename)))
-
-        dataset = Dataset(path=path_to_directory, files=datafiles, **kwargs)
-
-        if not hypothetical:
-            dataset._use_local_metadata()
+        warnings.warn(
+            "The `Dataset.from_local_directory` class method is now deprecated. Please use the `Dataset` constructor "
+            "instead, passing it the `path`, `recursive`, and `hypothetical` kwargs as necessary.",
+            category=DeprecationWarning,
+        )
 
-        dataset._warn_about_metadata_override(hypothetical=hypothetical, **kwargs)
-        return dataset
+        return Dataset(path=path_to_directory, recursive=recursive, hypothetical=hypothetical, **kwargs)
 
     @classmethod
     def from_cloud(
@@ -107,32 +131,16 @@ def from_cloud(
         :param kwargs: other keyword arguments for the `Dataset` instantiation
         :return Dataset:
         """
+        warnings.warn(
+            "The `Dataset.from_cloud` class method is now deprecated. Please use the `Dataset` constructor instead, "
+            "passing it the `path`, `recursive`, and `hypothetical` kwargs as necessary.",
+            category=DeprecationWarning,
+        )
+
         if not cloud_path:
             cloud_path = translate_bucket_name_and_path_in_bucket_to_cloud_path(bucket_name, path_to_dataset_directory)
 
-        bucket_name = storage.path.split_bucket_name_from_cloud_path(cloud_path)[0]
-
-        dataset = Dataset(path=cloud_path, **kwargs)
-
-        if not hypothetical:
-            dataset._use_cloud_metadata()
-
-        if not dataset.files:
-            dataset.files = FilterSet(
-                Datafile(path=storage.path.generate_gs_path(bucket_name, blob.name))
-                for blob in GoogleCloudStorageClient().scandir(
-                    cloud_path,
-                    recursive=recursive,
-                    filter=(
-                        lambda blob: (
-                            not blob.name.endswith(METADATA_FILENAME) and SIGNED_METADATA_DIRECTORY not in blob.name
-                        )
-                    ),
-                )
-            )
-
-        dataset._warn_about_metadata_override(hypothetical=hypothetical, **kwargs)
-        return dataset
+        return Dataset(path=cloud_path, recursive=recursive, hypothetical=hypothetical, **kwargs)
 
     @property
     def name(self):
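Both classmethods now delegate to the constructor after emitting a `DeprecationWarning`. A hedged sketch of how calling code or a test might surface the warning, using only the standard library (the cloud path is illustrative; note that `DeprecationWarning` is silenced by default outside `__main__`, hence the filter):

    import warnings

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        dataset = Dataset.from_cloud(cloud_path="gs://my-bucket/my_dataset")

    assert issubclass(caught[0].category, DeprecationWarning)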
@@ -434,7 +442,7 @@ def download(iterable_element):
             for path in executor.map(download, files_and_paths):
                 logger.debug("Downloaded datafile to %r.", path)
 
-        logger.info("Downloaded %r dataset to %r.", self.name, local_directory)
+        logger.info("Downloaded %r to %r.", self, local_directory)
 
     def to_primitive(self, include_files=True):
         """Convert the dataset to a dictionary of primitives, converting its files into their paths for a lightweight
@@ -455,6 +463,57 @@ def to_primitive(self, include_files=True):
 
         return serialised_dataset
 
+    def _instantiate_from_cloud(self, path, recursive=True, hypothetical=False):
+        """Instantiate the dataset from a cloud directory.
+
+        :param str path: the cloud path to a directory in cloud storage
+        :param bool recursive: if `True`, include all files from the dataset directory's subdirectories, recursively
+        :param bool hypothetical: if `True`, ignore any metadata stored for this dataset in the cloud and use whatever is given at instantiation
+        :return None:
+        """
+        if not hypothetical:
+            self._use_cloud_metadata()
+
+        if not self.files:
+            bucket_name = storage.path.split_bucket_name_from_cloud_path(path)[0]
+
+            self.files = FilterSet(
+                Datafile(path=storage.path.generate_gs_path(bucket_name, blob.name))
+                for blob in GoogleCloudStorageClient().scandir(
+                    path,
+                    recursive=recursive,
+                    filter=(
+                        lambda blob: (
+                            not blob.name.endswith(METADATA_FILENAME) and SIGNED_METADATA_DIRECTORY not in blob.name
+                        )
+                    ),
+                )
+            )
+
+    def _instantiate_from_local_directory(self, path, recursive=False, hypothetical=False):
+        """Instantiate the dataset from a local directory.
+
+        :param str path: the path to a local directory
+        :param bool recursive: if `True`, include all files from the dataset directory's subdirectories, recursively
+        :param bool hypothetical: if `True`, ignore any metadata stored for this dataset locally and use whatever is given at instantiation
+        :return None:
+        """
+        self.files = FilterSet()
+
+        for level, (directory_path, _, filenames) in enumerate(os.walk(path)):
+            for filename in filenames:
+
+                if filename == METADATA_FILENAME:
+                    continue
+
+                if not recursive and level > 0:
+                    break
+
+                self.files.add(Datafile(path=os.path.join(directory_path, filename)))
+
+        if not hypothetical:
+            self._use_local_metadata()
+
     def _instantiate_datafiles(self, files):
         """Instantiate and add the given files to a `FilterSet`.
 
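The non-recursive branch relies on `os.walk` yielding the starting directory first: `level` is the enumeration index of each yielded directory (not its depth), so `level > 0` means "any directory after the starting one". A minimal standalone sketch of the same idiom (the function name is illustrative):

    import os

    def list_files(path, recursive=False):
        """List file paths under `path`, descending into subdirectories only if `recursive` is `True`."""
        paths = []

        for level, (directory_path, _, filenames) in enumerate(os.walk(path)):
            if not recursive and level > 0:
                break  # `os.walk` is top-down, so every directory after the first (level 0) is a subdirectory.

            paths.extend(os.path.join(directory_path, filename) for filename in filenames)

        return paths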
@@ -561,27 +620,3 @@ def _datafile_path_relative_to_self(self, datafile, path_type):
             datafile_path = datafile_path.split("?")[0]
 
         return storage.path.relpath(datafile_path, dataset_path)
-
-    def _warn_about_metadata_override(self, hypothetical, **kwargs):
-        """Issue a warning about instantiation metadata override if `hypothetical` is `False` and the dataset's metadata
-        is different from the provided instantiation keyword arguments.
-
-        :param bool hypothetical: if `True`, don't raise any warnings.
-        :param kwargs: the `Dataset` instantiation keyword arguments
-        :return None:
-        """
-        if hypothetical:
-            logger.debug("Ignored stored metadata for %r.", self)
-            return
-
-        if self.metadata(include_sdk_version=False) != {
-            "name": kwargs.get("name") or self.name,
-            "id": kwargs.get("id") or self.id,
-            "tags": TagDict(kwargs.get("tags")),
-            "labels": LabelSet(kwargs.get("labels")),
-        }:
-            logger.warning(
-                "Overriding metadata given at instantiation with stored metadata for %r - set `hypothetical` to `True` "
-                "at instantiation to avoid this.",
-                self,
-            )
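The override warning removed here now lives inline in `__init__` (see the first hunk above). A hedged sketch of the behaviour it guards, assuming an illustrative local directory that already has stored metadata differing from the given tags:

    # Stored metadata wins and the override warning is logged:
    dataset = Dataset(path="my_dataset_directory", tags={"manufacturer": "other"})

    # The given metadata wins and stored metadata is ignored:
    dataset = Dataset(path="my_dataset_directory", tags={"manufacturer": "other"}, hypothetical=True)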