Skip to content

Commit 9680c5c

Browse files
committed
Add individual name arguements and output
1 parent 46095b8 commit 9680c5c

File tree

1 file changed

+76
-40
lines changed

1 file changed

+76
-40
lines changed

python/tskit/trees.py

Lines changed: 76 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,12 @@
5757
LEGACY_MS_LABELS = "legacy_ms"
5858

5959

60+
@dataclass
61+
class VcfModelMapping:
62+
individuals_nodes: np.ndarray
63+
individuals_name: np.ndarray
64+
65+
6066
class CoalescenceRecord(NamedTuple):
6167
left: float
6268
right: float
@@ -10520,7 +10526,13 @@ def sample_nodes_by_ploidy(self, ploidy):
1052010526
sample_node_ids = sample_node_ids.reshape((num_samples_per_individual, ploidy))
1052110527
return sample_node_ids
1052210528

10523-
def map_samples_to_vcf(self, individuals=None, ploidy=None):
10529+
def map_to_vcf_model(
10530+
self,
10531+
individuals=None,
10532+
ploidy=None,
10533+
name_metadata_key=None,
10534+
individual_names=None,
10535+
):
1052410536
"""
1052510537
Returns a list of lists of node IDs, where each sublist contains the
1052610538
sample nodes associated with the same individual.
@@ -10529,6 +10541,11 @@ def map_samples_to_vcf(self, individuals=None, ploidy=None):
1052910541
only the nodes for the specified individuals are returned.
1053010542
"""
1053110543

10544+
if name_metadata_key is not None and individual_names is not None:
10545+
raise ValueError(
10546+
"Cannot specify both name_metadata_key and individual_names"
10547+
)
10548+
1053210549
if self.num_individuals > 0 and ploidy is not None:
1053310550
raise ValueError(
1053410551
"Cannot specify ploidy when individuals are present in the tree sequence"
@@ -10548,47 +10565,66 @@ def map_samples_to_vcf(self, individuals=None, ploidy=None):
1054810565
if self.num_individuals == 0 and individuals is None:
1054910566
if ploidy is None:
1055010567
ploidy = 1
10551-
return self.sample_nodes_by_ploidy(ploidy)
10552-
10553-
individuals_nodes = []
10554-
if individuals is None:
10555-
for ind in self.individuals():
10556-
if len(ind.nodes) == 0:
10557-
warnings.warn(
10558-
f"Individual {ind.id} has no nodes associated with it.",
10559-
stacklevel=1,
10568+
individuals_nodes = self.sample_nodes_by_ploidy(ploidy)
10569+
else:
10570+
individuals_nodes = []
10571+
ts_individual_names = []
10572+
if individuals is None:
10573+
for ind in self.individuals():
10574+
if len(ind.nodes) == 0:
10575+
warnings.warn(
10576+
f"Individual {ind.id} has no nodes associated with it.",
10577+
stacklevel=1,
10578+
)
10579+
continue
10580+
is_sample = np.array(
10581+
[self.node_flags(u) & tskit.NODE_IS_SAMPLE for u in ind.nodes]
1056010582
)
10561-
continue
10562-
is_sample = np.array(
10563-
[self.node_flags(u) & tskit.NODE_IS_SAMPLE for u in ind.nodes]
10564-
)
10565-
if all(is_sample):
10583+
if all(is_sample):
10584+
individuals_nodes.append(ind.nodes)
10585+
if name_metadata_key is not None:
10586+
ts_individual_names.append(ind.metadata[name_metadata_key])
10587+
else:
10588+
ts_individual_names.append(f"tsk_{ind.id}")
10589+
elif all(~is_sample):
10590+
continue
10591+
else:
10592+
warnings.warn(
10593+
f"Individual {ind.id} has both sample and non-sample nodes "
10594+
"associated with it.",
10595+
stacklevel=1,
10596+
)
10597+
else:
10598+
for i in individuals:
10599+
if i < 0 or i >= self.num_individuals:
10600+
raise ValueError(f"Invalid individual ID {i}")
10601+
ind = self.individual(i)
10602+
if len(ind.nodes) == 0:
10603+
raise ValueError(
10604+
f"Individual {i} has no nodes associated with it."
10605+
)
1056610606
individuals_nodes.append(ind.nodes)
10567-
elif all(~is_sample):
10568-
continue
10569-
else:
10570-
warnings.warn(
10571-
f"Individual {ind.id} has both sample and non-sample nodes "
10572-
"associated with it.",
10573-
stacklevel=1,
10574-
)
10575-
else:
10576-
for i in individuals:
10577-
if i < 0 or i >= self.num_individuals:
10578-
raise ValueError(f"Invalid individual ID {i}")
10579-
ind = self.individual(i)
10580-
if len(ind.nodes) == 0:
10581-
raise ValueError(f"Individual {i} has no nodes associated with it.")
10582-
individuals_nodes.append(ind.nodes)
10583-
10584-
if len(individuals_nodes) == 0:
10585-
return np.array([], dtype=np.int32).reshape((0, 0))
10586-
10587-
max_nodes = max(len(nodes) for nodes in individuals_nodes)
10588-
result = np.full((len(individuals_nodes), max_nodes), -1, dtype=np.int32)
10589-
for i, nodes in enumerate(individuals_nodes):
10590-
result[i, : len(nodes)] = nodes
10591-
return result
10607+
if name_metadata_key is not None:
10608+
ts_individual_names.append(ind.metadata[name_metadata_key])
10609+
else:
10610+
ts_individual_names.append(f"tsk_{ind.id}")
10611+
10612+
max_nodes = max(len(nodes) for nodes in individuals_nodes)
10613+
result = np.full((len(individuals_nodes), max_nodes), -1, dtype=np.int32)
10614+
for i, nodes in enumerate(individuals_nodes):
10615+
result[i, : len(nodes)] = nodes
10616+
individuals_nodes = result
10617+
10618+
if individual_names is None:
10619+
individual_names = ts_individual_names
10620+
individual_names = np.array(individual_names, dtype=object)
10621+
10622+
if len(individuals_nodes) != len(individual_names):
10623+
raise ValueError(
10624+
"The number of individuals does not match the number of names"
10625+
)
10626+
10627+
return VcfModelMapping(result, individual_names)
1059210628

1059310629
############################################
1059410630
#

0 commit comments

Comments
 (0)