Skip to content

Commit 8d5b2be

Browse files
committed
Avoid name conflicts when adding WARCs to collection
Append a numeric -<index> suffix to the WARC name, incrementing the index until there is no conflict
1 parent e89924b commit 8d5b2be

File tree

2 files changed

+34
-9
lines changed

2 files changed

+34
-9
lines changed

pywb/manager/manager.py

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -147,18 +147,29 @@ def add_archives(self, archives, unpack_wacz=False):
147147
if invalid_archives:
148148
logging.warning(f'Invalid archives weren\'t added: {", ".join(invalid_archives)}')
149149

150+
def _rename_warc(self, source_dir, warc_basename):
    """Return a variant of ``warc_basename`` not yet present in ``self.archive_dir``.

    A ``-<n>`` counter, starting at 1, is inserted in front of the file
    extension and incremented until no file with the resulting name exists
    in the archive directory, e.g. ``example.warc`` -> ``example-1.warc``
    and ``example.warc.gz`` -> ``example-1.warc.gz``.

    :param source_dir: directory the WARC is being copied from. Not used
        for the lookup; kept so existing callers keep working.
    :param warc_basename: file name (no directory part) that conflicts
        with a file already in the archive directory.
    :return: the first free file name, without any directory part.
    """
    # Keep the extension at the end of the name: appending the counter
    # after it (``example.warc-1``) would drop the ``.warc``/``.gz`` suffix.
    stem, ext = os.path.splitext(warc_basename)
    if ext.lower() == '.gz':
        # Treat a compressed double extension such as ``.warc.gz`` as one unit.
        stem, inner_ext = os.path.splitext(stem)
        ext = inner_ext + ext

    dupe_idx = 1
    while True:
        new_basename = f'{stem}-{dupe_idx}{ext}'
        if not os.path.exists(os.path.join(self.archive_dir, new_basename)):
            return new_basename
        dupe_idx += 1
159+
150160
def _add_warc(self, warc):
    """Copy a single WARC file into ``self.archive_dir``.

    If a file with the same base name is already present there, a new
    name is requested from ``self._rename_warc`` so that the existing
    file is not overwritten.

    :param warc: path to the WARC file to add.
    :return: path of the copied file inside the archive directory.
    """
    src_path = os.path.abspath(warc)
    src_dir = os.path.dirname(src_path)
    warc_basename = os.path.basename(src_path)

    # don't overwrite existing warcs with duplicate names
    name_taken = os.path.exists(os.path.join(self.archive_dir, warc_basename))
    if name_taken:
        warc_basename = self._rename_warc(src_dir, warc_basename)
        logging.info(f'Warc {os.path.basename(warc)} already exists - renamed to {warc_basename}.')

    dest_path = os.path.join(self.archive_dir, warc_basename)
    shutil.copy2(src_path, dest_path)
    logging.info(f'Copied {warc} to {self.archive_dir} as {warc_basename}')
    return dest_path
162173

163174
def _add_wacz_unpacked(self, wacz):
164175
wacz = os.path.abspath(wacz)

tests/test_manager.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,20 @@ def test_add_valid_wacz_unpacked(self, tmp_path):
2020
with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f:
2121
assert '"filename": "valid_example_1-0.warc"' in f.read()
2222

23+
def test_add_valid_wacz_unpacked_dupe_name(self, tmp_path):
    """Adding the same WACZ twice keeps both WARCs, the second one renamed."""
    manager = self.get_test_collections_manager(tmp_path)

    # Unpack the same WACZ twice so the second pass hits a name conflict
    for _ in range(2):
        manager._add_wacz_unpacked(VALID_WACZ_PATH)

    # Both the original and the renamed copy must be in the archive dir
    archived = os.listdir(manager.archive_dir)
    assert 'valid_example_1-0.warc' in archived
    assert 'valid_example_1-0-1.warc' in archived

    # The index must exist and reference both file names
    assert manager.DEF_INDEX_FILE in os.listdir(manager.indexes_dir)
    index_path = os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE)
    with open(index_path, 'r') as index_file:
        index_text = index_file.read()
    assert '"filename": "valid_example_1-0.warc"' in index_text
    assert '"filename": "valid_example_1-0-1.warc"' in index_text
36+
2337
def test_add_invalid_wacz_unpacked(self, tmp_path, caplog):
2438
"""Test if adding an invalid wacz file to a collection fails"""
2539
manager = self.get_test_collections_manager(tmp_path)

0 commit comments

Comments
 (0)