57
57
LEGACY_MS_LABELS = "legacy_ms"
58
58
59
59
60
+ @dataclass
61
+ class VcfModelMapping :
62
+ individuals_nodes : np .ndarray
63
+ individuals_name : np .ndarray
64
+
65
+
60
66
class CoalescenceRecord (NamedTuple ):
61
67
left : float
62
68
right : float
@@ -10520,7 +10526,13 @@ def sample_nodes_by_ploidy(self, ploidy):
10520
10526
sample_node_ids = sample_node_ids .reshape ((num_samples_per_individual , ploidy ))
10521
10527
return sample_node_ids
10522
10528
10523
- def map_samples_to_vcf (self , individuals = None , ploidy = None ):
10529
+ def map_to_vcf_model (
10530
+ self ,
10531
+ individuals = None ,
10532
+ ploidy = None ,
10533
+ name_metadata_key = None ,
10534
+ individual_names = None ,
10535
+ ):
10524
10536
"""
10525
10537
Returns a list of lists of node IDs, where each sublist contains the
10526
10538
sample nodes associated with the same individual.
@@ -10529,6 +10541,11 @@ def map_samples_to_vcf(self, individuals=None, ploidy=None):
10529
10541
only the nodes for the specified individuals are returned.
10530
10542
"""
10531
10543
10544
+ if name_metadata_key is not None and individual_names is not None :
10545
+ raise ValueError (
10546
+ "Cannot specify both name_metadata_key and individual_names"
10547
+ )
10548
+
10532
10549
if self .num_individuals > 0 and ploidy is not None :
10533
10550
raise ValueError (
10534
10551
"Cannot specify ploidy when individuals are present in the tree sequence"
@@ -10548,47 +10565,66 @@ def map_samples_to_vcf(self, individuals=None, ploidy=None):
10548
10565
if self .num_individuals == 0 and individuals is None :
10549
10566
if ploidy is None :
10550
10567
ploidy = 1
10551
- return self .sample_nodes_by_ploidy (ploidy )
10552
-
10553
- individuals_nodes = []
10554
- if individuals is None :
10555
- for ind in self .individuals ():
10556
- if len (ind .nodes ) == 0 :
10557
- warnings .warn (
10558
- f"Individual { ind .id } has no nodes associated with it." ,
10559
- stacklevel = 1 ,
10568
+ individuals_nodes = self .sample_nodes_by_ploidy (ploidy )
10569
+ else :
10570
+ individuals_nodes = []
10571
+ ts_individual_names = []
10572
+ if individuals is None :
10573
+ for ind in self .individuals ():
10574
+ if len (ind .nodes ) == 0 :
10575
+ warnings .warn (
10576
+ f"Individual { ind .id } has no nodes associated with it." ,
10577
+ stacklevel = 1 ,
10578
+ )
10579
+ continue
10580
+ is_sample = np .array (
10581
+ [self .node_flags (u ) & tskit .NODE_IS_SAMPLE for u in ind .nodes ]
10560
10582
)
10561
- continue
10562
- is_sample = np .array (
10563
- [self .node_flags (u ) & tskit .NODE_IS_SAMPLE for u in ind .nodes ]
10564
- )
10565
- if all (is_sample ):
10583
+ if all (is_sample ):
10584
+ individuals_nodes .append (ind .nodes )
10585
+ if name_metadata_key is not None :
10586
+ ts_individual_names .append (ind .metadata [name_metadata_key ])
10587
+ else :
10588
+ ts_individual_names .append (f"tsk_{ ind .id } " )
10589
+ elif all (~ is_sample ):
10590
+ continue
10591
+ else :
10592
+ warnings .warn (
10593
+ f"Individual { ind .id } has both sample and non-sample nodes "
10594
+ "associated with it." ,
10595
+ stacklevel = 1 ,
10596
+ )
10597
+ else :
10598
+ for i in individuals :
10599
+ if i < 0 or i >= self .num_individuals :
10600
+ raise ValueError (f"Invalid individual ID { i } " )
10601
+ ind = self .individual (i )
10602
+ if len (ind .nodes ) == 0 :
10603
+ raise ValueError (
10604
+ f"Individual { i } has no nodes associated with it."
10605
+ )
10566
10606
individuals_nodes .append (ind .nodes )
10567
- elif all (~ is_sample ):
10568
- continue
10569
- else :
10570
- warnings .warn (
10571
- f"Individual { ind .id } has both sample and non-sample nodes "
10572
- "associated with it." ,
10573
- stacklevel = 1 ,
10574
- )
10575
- else :
10576
- for i in individuals :
10577
- if i < 0 or i >= self .num_individuals :
10578
- raise ValueError (f"Invalid individual ID { i } " )
10579
- ind = self .individual (i )
10580
- if len (ind .nodes ) == 0 :
10581
- raise ValueError (f"Individual { i } has no nodes associated with it." )
10582
- individuals_nodes .append (ind .nodes )
10583
-
10584
- if len (individuals_nodes ) == 0 :
10585
- return np .array ([], dtype = np .int32 ).reshape ((0 , 0 ))
10586
-
10587
- max_nodes = max (len (nodes ) for nodes in individuals_nodes )
10588
- result = np .full ((len (individuals_nodes ), max_nodes ), - 1 , dtype = np .int32 )
10589
- for i , nodes in enumerate (individuals_nodes ):
10590
- result [i , : len (nodes )] = nodes
10591
- return result
10607
+ if name_metadata_key is not None :
10608
+ ts_individual_names .append (ind .metadata [name_metadata_key ])
10609
+ else :
10610
+ ts_individual_names .append (f"tsk_{ ind .id } " )
10611
+
10612
+ max_nodes = max (len (nodes ) for nodes in individuals_nodes )
10613
+ result = np .full ((len (individuals_nodes ), max_nodes ), - 1 , dtype = np .int32 )
10614
+ for i , nodes in enumerate (individuals_nodes ):
10615
+ result [i , : len (nodes )] = nodes
10616
+ individuals_nodes = result
10617
+
10618
+ if individual_names is None :
10619
+ individual_names = ts_individual_names
10620
+ individual_names = np .array (individual_names , dtype = object )
10621
+
10622
+ if len (individuals_nodes ) != len (individual_names ):
10623
+ raise ValueError (
10624
+ "The number of individuals does not match the number of names"
10625
+ )
10626
+
10627
+ return VcfModelMapping (result , individual_names )
10592
10628
10593
10629
############################################
10594
10630
#
0 commit comments