Skip to content

Commit e358ce0

Browse files
committed
Use vcf model in vcf code
1 parent 0a96dbf commit e358ce0

File tree

3 files changed

+73
-107
lines changed

3 files changed

+73
-107
lines changed

python/tests/test_vcf.py

Lines changed: 53 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
import os
3030
import tempfile
3131
import textwrap
32+
import warnings
3233

3334
import msprime
3435
import numpy as np
@@ -335,12 +336,13 @@ class TestInterface:
335336
def test_bad_ploidy(self):
336337
ts = msprime.simulate(10, mutation_rate=0.1, random_seed=2)
337338
for bad_ploidy in [-1, 0]:
338-
with pytest.raises(ValueError, match="Ploidy must be >= 1"):
339+
with pytest.raises(ValueError, match="Ploidy must be a positive integer"):
339340
ts.write_vcf(io.StringIO, bad_ploidy)
340341
# Non divisible
341342
for bad_ploidy in [3, 7]:
342343
with pytest.raises(
343-
ValueError, match="Sample size must be divisible by ploidy"
344+
ValueError,
345+
match="Number of sample nodes 10 is not a multiple of ploidy",
344346
):
345347
ts.write_vcf(io.StringIO, bad_ploidy)
346348

@@ -349,17 +351,32 @@ def test_individuals_no_nodes_default_args(self):
349351
tables = ts1.dump_tables()
350352
tables.individuals.add_row()
351353
ts2 = tables.tree_sequence()
352-
assert ts1.as_vcf(allow_position_zero=True) == ts2.as_vcf(
353-
allow_position_zero=True
354-
)
354+
# ts1 should work as it has no individuals
355+
ts1.as_vcf(allow_position_zero=True)
356+
# ts2 should fail as it has individuals but no nodes
357+
with warnings.catch_warnings(record=True) as w:
358+
with pytest.raises(ValueError, match="No samples in resulting VCF model"):
359+
ts2.as_vcf(allow_position_zero=True)
360+
assert len(w) == 2
361+
assert "At least one sample node does not have an individual ID" in str(
362+
w[0].message
363+
)
364+
assert "Individual 0 has no nodes associated with it." in str(w[1].message)
355365

356366
def test_individuals_no_nodes_as_argument(self):
357367
ts1 = msprime.simulate(10, mutation_rate=0.1, random_seed=2)
358368
tables = ts1.dump_tables()
359369
tables.individuals.add_row()
360370
ts2 = tables.tree_sequence()
361-
with pytest.raises(ValueError, match="0 not associated with a node"):
362-
ts2.as_vcf(individuals=[0])
371+
with warnings.catch_warnings(record=True) as w:
372+
with pytest.raises(
373+
ValueError, match="Individual 0 has no nodes associated with it."
374+
):
375+
ts2.as_vcf(individuals=[0])
376+
assert len(w) == 1
377+
assert "At least one sample node does not have an individual ID" in str(
378+
w[0].message
379+
)
363380

364381
def test_ploidy_with_sample_individuals(self):
365382
ts = msprime.sim_ancestry(3, random_seed=2)
@@ -378,7 +395,7 @@ def test_ploidy_with_no_node_individuals(self):
378395
def test_empty_individuals(self):
379396
ts = msprime.sim_ancestry(3, random_seed=2)
380397
ts = tsutil.insert_branch_sites(ts)
381-
with pytest.raises(ValueError, match="List of sample individuals empty"):
398+
with pytest.raises(ValueError, match="No samples in resulting VCF model"):
382399
ts.as_vcf(individuals=[])
383400

384401
def test_duplicate_individuals(self):
@@ -397,12 +414,17 @@ def test_mixed_sample_non_sample_individuals(self):
397414
tables.nodes.individual = individual
398415
ts = tables.tree_sequence()
399416
ts = tsutil.insert_branch_sites(ts)
400-
with pytest.raises(
401-
ValueError, match="0 has nodes that are sample and non-sample"
402-
):
403-
ts.as_vcf()
404-
# but it's OK if we run without the affected individual
405-
assert len(ts.as_vcf(individuals=[1, 2], allow_position_zero=True)) > 0
417+
with warnings.catch_warnings(record=True) as w:
418+
ts.map_to_vcf_model()
419+
assert len(w) == 2
420+
assert (
421+
"Individual 0 has both sample and non-sample nodes associated with it."
422+
in str(w[0].message)
423+
)
424+
assert "Individual 3 has no nodes associated with it." in str(w[1].message)
425+
with warnings.catch_warnings(record=True) as w:
426+
assert len(ts.as_vcf(individuals=[1, 2], allow_position_zero=True)) > 0
427+
assert len(w) == 0
406428

407429
def test_samples_with_and_without_individuals(self):
408430
ts = tskit.Tree.generate_balanced(3).tree_sequence
@@ -414,19 +436,19 @@ def test_samples_with_and_without_individuals(self):
414436
tables.nodes.individual = individual
415437
ts = tables.tree_sequence()
416438
ts = tsutil.insert_branch_sites(ts)
417-
with pytest.raises(
418-
ValueError, match="Sample nodes must either all be associated"
419-
):
420-
ts.as_vcf()
421-
# But it's OK if explicitly specify that sample
422-
assert len(ts.as_vcf(individuals=[0], allow_position_zero=True)) > 0
439+
with warnings.catch_warnings(record=True) as w:
440+
ts.as_vcf(allow_position_zero=True)
441+
assert len(w) == 1
442+
assert "At least one sample node does not have an individual ID" in str(
443+
w[0].message
444+
)
423445

424446
def test_bad_individuals(self):
425447
ts = msprime.simulate(10, mutation_rate=0.1, random_seed=2)
426448
ts = tsutil.insert_individuals(ts, ploidy=2)
427-
with pytest.raises(ValueError, match="Invalid individual IDs provided."):
449+
with pytest.raises(ValueError, match="Invalid individual ID"):
428450
ts.write_vcf(io.StringIO(), individuals=[0, -1])
429-
with pytest.raises(ValueError, match="Invalid individual IDs provided."):
451+
with pytest.raises(ValueError, match="Invalid individual ID"):
430452
ts.write_vcf(io.StringIO(), individuals=[1, 2, ts.num_individuals])
431453

432454
def test_ploidy_positional(self):
@@ -527,20 +549,17 @@ def test_bad_length_individuals(self):
527549
ts = tsutil.insert_individuals(ts, ploidy=2)
528550
with pytest.raises(
529551
ValueError,
530-
match="individual_names must have length equal to"
531-
" the number of individuals",
552+
match="The number of individuals does not match the number of names",
532553
):
533554
ts.write_vcf(io.StringIO(), individual_names=[])
534555
with pytest.raises(
535556
ValueError,
536-
match="individual_names must have length equal to"
537-
" the number of individuals",
557+
match="The number of individuals does not match the number of names",
538558
):
539559
ts.write_vcf(io.StringIO(), individual_names=["x" for _ in range(4)])
540560
with pytest.raises(
541561
ValueError,
542-
match="individual_names must have length equal to"
543-
" the number of individuals",
562+
match="The number of individuals does not match the number of names",
544563
):
545564
ts.write_vcf(
546565
io.StringIO(),
@@ -549,8 +568,7 @@ def test_bad_length_individuals(self):
549568
)
550569
with pytest.raises(
551570
ValueError,
552-
match="individual_names must have length equal to"
553-
" the number of individuals",
571+
match="The number of individuals does not match the number of names",
554572
):
555573
ts.write_vcf(
556574
io.StringIO(),
@@ -563,14 +581,12 @@ def test_bad_length_ploidy(self):
563581
assert ts.num_sites > 0
564582
with pytest.raises(
565583
ValueError,
566-
match="individual_names must have length equal to"
567-
" the number of individuals",
584+
match="The number of individuals does not match the number of names",
568585
):
569586
ts.write_vcf(io.StringIO(), ploidy=2, individual_names=[])
570587
with pytest.raises(
571588
ValueError,
572-
match="individual_names must have length equal to"
573-
" the number of individuals",
589+
match="The number of individuals does not match the number of names",
574590
):
575591
ts.write_vcf(
576592
io.StringIO(), ploidy=2, individual_names=["x" for _ in range(4)]
@@ -907,7 +923,7 @@ def test_no_individuals_ploidy_3_names(self):
907923
expected = textwrap.dedent(s)
908924
assert (
909925
drop_header(
910-
ts.as_vcf(ploidy=3, individual_names="A", allow_position_zero=True)
926+
ts.as_vcf(ploidy=3, individual_names=["A"], allow_position_zero=True)
911927
)
912928
== expected
913929
)
@@ -940,7 +956,7 @@ def test_individual_0(self):
940956
def test_individual_1(self):
941957
ts = self.ts()
942958
s = """\
943-
#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\ttsk_0
959+
#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\ttsk_1
944960
1\t0\t0\t0\t1\t.\tPASS\t.\tGT\t0
945961
1\t2\t1\t0\t1\t.\tPASS\t.\tGT\t1
946962
1\t4\t2\t0\t1\t.\tPASS\t.\tGT\t1
@@ -954,7 +970,7 @@ def test_individual_1(self):
954970
def test_reversed(self):
955971
ts = self.ts()
956972
s = """\
957-
#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\ttsk_0\ttsk_1
973+
#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\ttsk_1\ttsk_0
958974
1\t0\t0\t0\t1\t.\tPASS\t.\tGT\t0\t1|0
959975
1\t2\t1\t0\t1\t.\tPASS\t.\tGT\t1\t0|1
960976
1\t4\t2\t0\t1\t.\tPASS\t.\tGT\t1\t0|0

python/tskit/trees.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10513,6 +10513,8 @@ def sample_nodes_by_ploidy(self, ploidy):
1051310513
:return: A 2D array of node IDs, where each row has length `ploidy`.
1051410514
:rtype: numpy.ndarray
1051510515
"""
10516+
if ploidy <= 0 or ploidy != int(ploidy):
10517+
raise ValueError("Ploidy must be a positive integer")
1051610518
sample_node_ids = np.flatnonzero(self.nodes_flags & tskit.NODE_IS_SAMPLE)
1051710519
num_samples = len(sample_node_ids)
1051810520
if num_samples == 0:

python/tskit/vcf.py

Lines changed: 18 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@
2525
"""
2626
import numpy as np
2727

28-
import tskit
2928
from . import provenance
3029

3130

@@ -69,14 +68,24 @@ def __init__(
6968
self.contig_id = contig_id
7069
self.isolated_as_missing = isolated_as_missing
7170

72-
self.__make_sample_mapping(ploidy, individuals)
73-
if individual_names is None:
74-
individual_names = [f"tsk_{j}" for j in range(self.num_individuals)]
75-
self.individual_names = individual_names
76-
if len(self.individual_names) != self.num_individuals:
77-
raise ValueError(
78-
"individual_names must have length equal to the number of individuals"
79-
)
71+
vcf_model = tree_sequence.map_to_vcf_model(
72+
individuals=individuals, ploidy=ploidy, individual_names=individual_names
73+
)
74+
self.individual_names = vcf_model.individuals_name
75+
self.individual_ploidies = [
76+
len(nodes[nodes >= 0]) for nodes in vcf_model.individuals_nodes
77+
]
78+
self.num_individuals = len(self.individual_names)
79+
80+
if len(vcf_model.individuals_nodes) == 0:
81+
raise ValueError("No samples in resulting VCF model")
82+
83+
# Flatten the array of node IDs, filtering out the -1 padding values
84+
self.samples = []
85+
for row in vcf_model.individuals_nodes:
86+
for node_id in row:
87+
if node_id != -1:
88+
self.samples.append(node_id)
8089

8190
# Transform coordinates for VCF
8291
if position_transform is None:
@@ -126,68 +135,7 @@ def __init__(
126135
'"position_transform = lambda x: np.fmax(1, x)"'
127136
)
128137

129-
def __make_sample_mapping(self, ploidy, individuals):
130-
"""
131-
Compute the sample IDs for each VCF individual and the template for
132-
writing out genotypes.
133-
"""
134-
ts = self.tree_sequence
135-
self.samples = None
136-
self.individual_ploidies = []
137-
138-
# Cannot use "ploidy" when *any* individuals are present.
139-
if ts.num_individuals > 0 and ploidy is not None:
140-
raise ValueError(
141-
"Cannot specify ploidy when individuals are present in tables "
142-
)
143-
144-
if individuals is None:
145-
# Find all sample nodes that reference individuals
146-
individuals = np.unique(ts.nodes_individual[ts.samples()])
147-
if len(individuals) == 1 and individuals[0] == tskit.NULL:
148-
# No samples refer to individuals
149-
individuals = None
150-
else:
151-
# np.unique sorts the argument, so if NULL (-1) is present it
152-
# will be the first value.
153-
if individuals[0] == tskit.NULL:
154-
raise ValueError(
155-
"Sample nodes must either all be associated with individuals "
156-
"or not associated with any individuals"
157-
)
158-
else:
159-
individuals = np.array(individuals, dtype=np.int32)
160-
if len(individuals) == 0:
161-
raise ValueError("List of sample individuals empty")
162138

163-
if individuals is not None:
164-
self.samples = []
165-
# FIXME this could probably be done more efficiently.
166-
for i in individuals:
167-
if i < 0 or i >= self.tree_sequence.num_individuals:
168-
raise ValueError("Invalid individual IDs provided.")
169-
ind = self.tree_sequence.individual(i)
170-
if len(ind.nodes) == 0:
171-
raise ValueError(f"Individual {i} not associated with a node")
172-
is_sample = {ts.node(u).is_sample() for u in ind.nodes}
173-
if len(is_sample) != 1:
174-
raise ValueError(
175-
f"Individual {ind.id} has nodes that are sample and "
176-
"non-samples"
177-
)
178-
self.samples.extend(ind.nodes)
179-
self.individual_ploidies.append(len(ind.nodes))
180-
else:
181-
if ploidy is None:
182-
ploidy = 1
183-
if ploidy < 1:
184-
raise ValueError("Ploidy must be >= 1")
185-
if ts.num_samples % ploidy != 0:
186-
raise ValueError("Sample size must be divisible by ploidy")
187-
self.individual_ploidies = np.full(
188-
ts.sample_size // ploidy, ploidy, dtype=np.int32
189-
)
190-
self.num_individuals = len(self.individual_ploidies)
191139

192140
def __write_header(self, output):
193141
print("##fileformat=VCFv4.2", file=output)

0 commit comments

Comments
 (0)