@@ -43,7 +43,6 @@ class FivePSeqCounts:
43
43
downsample_constant = None
44
44
outlier_probability = None
45
45
46
-
47
46
config = None
48
47
alignment = None
49
48
annotation = None
@@ -56,12 +55,19 @@ class FivePSeqCounts:
56
55
meta_count_series_term = None
57
56
frame_counts_df_start = None
58
57
frame_counts_df_term = None
58
+ codon_genome_usage_df = None
59
59
codon_count_df = None
60
60
amino_acid_count_df = None
61
61
dicodon_count_df = None
62
62
dipeptide_count_df = None
63
63
tricodon_count_df = None
64
64
tripeptide_count_df = None
65
+ codon_stats_df = None
66
+ amino_acid_stats_df = None
67
+
68
+ codon_genome_usage_df = None
69
+ amino_acid_genome_usage_df = None
70
+
65
71
start_codon_dict = None
66
72
stop_codon_dict = None
67
73
canonical_transcript_index = None
@@ -80,10 +86,10 @@ class FivePSeqCounts:
80
86
TRIPEPTIDE_POS = - 11
81
87
DIPEPTIDE_POS = - 14
82
88
83
-
84
89
missing_chroms = []
85
90
86
- def __init__ (self , alignment , annotation , genome , config , downsample_constant , is_geneset = False , transcript_filter = None ):
91
+ def __init__ (self , alignment , annotation , genome , config , downsample_constant , is_geneset = False ,
92
+ transcript_filter = None ):
87
93
"""
88
94
Initializes a FivePSeqCounts object with Alignment and Annotation instances.
89
95
@@ -210,7 +216,8 @@ def generate_transcript_descriptors(self):
210
216
211
217
count_vector_downsampled = self .get_count_vector (transcript , span_size = 0 ,
212
218
region = self .FULL_LENGTH , downsample = True )
213
- self .transcript_descriptors .at [transcript_ind , self .NUMBER_READS_DOWNSAMPLED ] = int (np .sum (count_vector_downsampled ))
219
+ self .transcript_descriptors .at [transcript_ind , self .NUMBER_READS_DOWNSAMPLED ] = int (
220
+ np .sum (count_vector_downsampled ))
214
221
215
222
self .logger .info ("Done generating transcript descriptors" )
216
223
@@ -487,7 +494,7 @@ def get_sequence(self, transcript, transcript_span_size, desired_span_size):
487
494
return desired_seq
488
495
489
496
def get_cds_sequence_safe (self , transcript , span_size ):
490
- #NOTE a dangerous code here. Works correctly only if the input span size is the same as in the transcript.
497
+ # NOTE a dangerous code here. Works correctly only if the input span size is the same as in the transcript.
491
498
# TOCHANGE
492
499
try :
493
500
sequence = transcript .get_sequence (self .genome .genome_dict )
@@ -684,7 +691,6 @@ def get_unique_sequences(self, region, span_before, span_after):
684
691
i += 1
685
692
return sequences
686
693
687
-
688
694
def get_amino_acid_pauses (self ):
689
695
if self .amino_acid_count_df is None :
690
696
self .compute_codon_pauses ()
@@ -745,7 +751,7 @@ def compute_codon_pauses(self, dist_from=-30, dist_to=3, downsample=True):
745
751
746
752
if self .config .args .no_mask :
747
753
mask_dist = 0
748
- self .logger .info ("Transcript boundaries will not be masked" )
754
+ self .logger .info ("Transcript boundaries will not be masked" )
749
755
else :
750
756
if hasattr (config .args , "codon_mask_size" ):
751
757
mask_dist = config .args .codon_mask_size
@@ -757,17 +763,22 @@ def compute_codon_pauses(self, dist_from=-30, dist_to=3, downsample=True):
757
763
columns = range (dist_from , dist_to ))
758
764
759
765
dicodon_count_df = pd .DataFrame (data = 0 , index = Codons .get_dicodon_table ().keys (),
760
- columns = range (dist_from + 3 , dist_to + 3 ))
766
+ columns = range (dist_from + 3 , dist_to + 3 ))
761
767
762
768
dipeptide_count_df = pd .DataFrame (data = 0 , index = Codons .get_dipeptide_list (),
763
- columns = range (dist_from + 3 , dist_to + 3 ))
769
+ columns = range (dist_from + 3 , dist_to + 3 ))
764
770
765
771
tricodon_count_df = pd .DataFrame (data = 0 , index = Codons .get_tricodon_table ().keys (),
766
772
columns = range (dist_from + 6 , dist_to + 6 ))
767
773
768
774
tripeptide_count_df = pd .DataFrame (data = 0 , index = Codons .get_tripeptide_list (),
769
775
columns = range (dist_from + 6 , dist_to + 6 ))
770
776
777
+ self .codon_genome_usage_df = pd .DataFrame (data = 0 , index = Codons .CODON_TABLE .keys (),
778
+ columns = ['abs' , 'fraction' ])
779
+ self .amino_acid_genome_usage_df = pd .DataFrame (data = 0 , index = Codons .AMINO_ACID_TABLE .keys (),
780
+ columns = ['abs' , 'fraction' ])
781
+
771
782
counter = 1
772
783
773
784
transcript_assembly = self .annotation .get_transcript_assembly (
@@ -805,6 +816,14 @@ def compute_codon_pauses(self, dist_from=-30, dist_to=3, downsample=True):
805
816
count_vector = [0 ] * (- 1 * dist_from ) + count_vector + [0 ] * dist_to
806
817
cds_sequence = '' .join ('N' * (- 1 * dist_from )) + cds_sequence + '' .join ('N' * dist_to )
807
818
819
+ # store genome usage stats
820
+ for i in range (0 , len (cds_sequence ), 3 ):
821
+ codon = cds_sequence [i : i + 3 ].upper ()
822
+ if codon in self .codon_genome_usage_df .index :
823
+ self .codon_genome_usage_df .at [codon , "abs" ] += 1
824
+ amino_acid = Codons .CODON_TABLE .get (codon )
825
+ self .amino_acid_genome_usage_df .at [amino_acid , "abs" ] += 1
826
+
808
827
# identify 3nt bins with non-zero counts
809
828
ind = np .array (range (0 , len (count_vector ), 3 ))
810
829
hits = [sum (count_vector [i :i + 3 ]) > 0 for i in ind ]
@@ -849,6 +868,10 @@ def compute_codon_pauses(self, dist_from=-30, dist_to=3, downsample=True):
849
868
self .logger .warn ("Index out of range: i: %d, j: %d, p: %d, d: %d. %s"
850
869
% (i , j , p , d , str (e )))
851
870
871
+ self .codon_genome_usage_df .loc [:, "fraction" ] = self .codon_genome_usage_df .loc [:, "abs" ] / sum (
872
+ self .codon_genome_usage_df .loc [:, "abs" ])
873
+ self .amino_acid_genome_usage_df .loc [:, "fraction" ] = self .amino_acid_genome_usage_df .loc [:, "abs" ] / sum (
874
+ self .amino_acid_genome_usage_df .loc [:, "abs" ])
852
875
self .amino_acid_count_df = self .codon_to_amino_acid_count_df (codon_count_df )
853
876
self .tripeptide_count_df = self .filter_codon_counts (tripeptide_count_df , self .get_tripeptide_pos ())
854
877
self .dipeptide_count_df = self .filter_codon_counts (dipeptide_count_df , self .get_dipeptide_pos ())
@@ -881,7 +904,6 @@ def codon_to_amino_acid_count_df(self, codon_count_df):
881
904
882
905
return amino_acid_count_df
883
906
884
-
885
907
def get_tripeptide_pos (self ):
886
908
887
909
if hasattr (config .args , "tripeptide_pos" ):
@@ -900,8 +922,7 @@ def get_dipeptide_pos(self):
900
922
901
923
return pos
902
924
903
-
904
- def filter_codon_counts (self , codon_count_df , pos , top = 50 ):
925
+ def filter_codon_counts (self , codon_count_df , pos , top = 50 ):
905
926
"""
906
927
Filter the di/tricodon (or di/tripeptide) counts to exclude low counts (rowSums less than the specified threshold) and
907
928
to include only the top di/tricodons with highest relative counts at the given position
@@ -916,10 +937,97 @@ def filter_codon_counts(self, codon_count_df, pos, top = 50):
916
937
(top , pos ))
917
938
918
939
codon_filtered_df = codon_count_df [codon_count_df .sum (1 ) >= self .COUNT_THRESHOLD ]
919
- pos_rel_counts = codon_filtered_df [pos ]/ codon_filtered_df .sum (1 )
920
- codon_filtered_df = codon_filtered_df .iloc [sorted (range (len (pos_rel_counts )), reverse = True , key = lambda k : pos_rel_counts [k ])[0 :top ]]
940
+ pos_rel_counts = codon_filtered_df [pos ] / codon_filtered_df .sum (1 )
941
+ codon_filtered_df = codon_filtered_df .iloc [
942
+ sorted (range (len (pos_rel_counts )), reverse = True , key = lambda k : pos_rel_counts [k ])[0 :top ]]
921
943
return codon_filtered_df
922
944
945
+ def get_amino_acid_stats (self ):
946
+ if self .amino_acid_stats_df is None :
947
+ self .amino_acid_stats_df = self .compute_codon_stats_amino_acid ()
948
+
949
+ return self .amino_acid_stats_df
950
+
951
+ def get_codon_stats (self ):
952
+ if self .codon_stats_df is None :
953
+ self .codon_stats_df = self .compute_codon_stats_codon ()
954
+
955
+ return self .codon_stats_df
956
+
957
+ def compute_codon_genome_usage (self ):
958
+ self .codon_genome_usage_df = pd .DataFrame (data = 0 , index = Codons .CODON_TABLE .keys (),
959
+ columns = ['abs' , 'fraction' ])
960
+ self .amino_acid_genome_usage_df = pd .DataFrame (data = 0 , index = Codons .AMINO_ACID_TABLE .keys (),
961
+ columns = ['abs' , 'fraction' ])
962
+
963
+ def compute_codon_stats_amino_acid (self ):
964
+ return self .compute_codon_stats (self .get_amino_acid_pauses (), self .amino_acid_genome_usage_df )
965
+
966
+ def compute_codon_stats_codon (self ):
967
+ return self .compute_codon_stats (self .get_codon_pauses (), self .codon_genome_usage_df )
968
+
969
+ def compute_codon_stats (self , codon_counts , codon_genome_usage , until = - 3 ):
970
+ """
971
+ Counts usage and frame protection stats for each codon/amino-acid.
972
+
973
+ The following dataframe will be generated based on codon counts table:
974
+
975
+ codon/aminoacid FPI Frame peak(pos) peak(scale) usage(sum of counts) genome_presence
976
+
977
+ :return: dataframe
978
+ """
979
+
980
+ self .logger .info ("Counting codon usage statistics" )
981
+
982
+ try :
983
+ stop_ind = codon_counts .keys ().to_list ().index (until )
984
+ codon_counts = codon_counts .iloc [:, 0 :stop_ind ]
985
+ f2 = sum ([codon_counts .iloc [:, i ] for i in reversed (range (stop_ind - 1 , - 1 , - 3 ))])
986
+ f1 = sum ([codon_counts .iloc [:, i ] for i in reversed (range (stop_ind - 2 , - 1 , - 3 ))])
987
+ f0 = sum ([codon_counts .iloc [:, i ] for i in reversed (range (stop_ind - 3 , - 1 , - 3 ))])
988
+
989
+ codon_stats = pd .DataFrame (list (zip (f0 , f1 , f2 )), columns = ['F0' , 'F1' , 'F2' ])
990
+
991
+ codon_stats ['FPI' ] = np .zeros (len (codon_stats ))
992
+ codon_stats ['F' ] = np .zeros (len (codon_stats ))
993
+ codon_stats ['F_perc' ] = np .zeros (len (codon_stats ))
994
+
995
+ for i in range (len (codon_stats )):
996
+ fpi , fmax , fperc = CountManager .fpi_stats_from_frame_counts (codon_stats .iloc [i , :])
997
+ codon_stats .loc [i , 'FPI' ] = fpi
998
+ codon_stats .loc [i , 'F' ] = fmax
999
+ codon_stats .loc [i , 'F_perc' ] = fperc
1000
+
1001
+ codon_stats ['peak_pos' ] = [np .argmax (codon_counts .iloc [i , :]) for i in range (len (codon_stats ))]
1002
+ codon_stats ['peak_scale' ] = np .zeros (len (codon_stats ))
1003
+
1004
+
1005
+ for i in range (len (codon_stats )):
1006
+ for i in range (len (codon_stats )):
1007
+ counts = list (codon_counts .iloc [i , :])
1008
+ if sum (counts ) > 0 :
1009
+ frame = int (codon_stats .loc [i , 'F' ])
1010
+ frame_inds = [j for j in reversed (range (len (counts ) - 3 + frame , - 1 , - 3 ))]
1011
+ frame_counts = [counts [j ] for j in frame_inds ]
1012
+ codon_stats .loc [i , 'peak_scale' ] = len (frame_counts ) * max (frame_counts ) / sum (frame_counts )
1013
+ codon_stats .loc [i , 'peak_pos' ] = codon_counts .columns [frame_inds [np .argmax (frame_counts )]]
1014
+
1015
+ codon_stats ['usage' ] = list (sum ([codon_counts .iloc [:, i ] for i in range (0 , stop_ind )]))
1016
+ codon_stats ['genome_usage_abs' ] = list (codon_genome_usage .loc [:, 'abs' ])
1017
+ codon_stats ['genome_usage_fraction' ] = list (codon_genome_usage .loc [:, 'fraction' ])
1018
+ usage_norm = codon_stats ['usage' ] / codon_stats ['genome_usage_fraction' ]
1019
+ usage_norm /= sum (usage_norm )
1020
+ codon_stats ['usage_normalized' ] = usage_norm
1021
+
1022
+ codon_stats .index = codon_counts .index
1023
+
1024
+ return codon_stats
1025
+
1026
+ except :
1027
+ self .logger .warning ("Could not compute codon stats. Codon counts dataframe did not have column %d." % until )
1028
+ return None
1029
+ # exclude the counts downstream from -3
1030
+
923
1031
@preconditions (lambda loci_file : str )
924
1032
def get_pauses_from_loci (self , loci_file , read_locations = READ_LOCATIONS_ALL ):
925
1033
"""
@@ -1590,9 +1698,9 @@ def read_count_dict(file_path):
1590
1698
:return: float
1591
1699
"""
1592
1700
count_freq_dict = {}
1593
- dict_mat = pd .read_csv (file_path , header = None , delimiter = "\t " , index_col = 0 )
1701
+ dict_mat = pd .read_csv (file_path , header = None , delimiter = "\t " , index_col = 0 )
1594
1702
for i in range (len (dict_mat )):
1595
- count_freq_dict [dict_mat .index [i ]] = dict_mat .iloc [i ,0 ]
1703
+ count_freq_dict [dict_mat .index [i ]] = dict_mat .iloc [i , 0 ]
1596
1704
1597
1705
return collections .OrderedDict (sorted (count_freq_dict .items ()))
1598
1706
@@ -1725,4 +1833,30 @@ def combine_amino_acid_dfs(amino_acid_df_dict, lib_size_dict=None):
1725
1833
1726
1834
return amino_acid_df_combined
1727
1835
1836
+ @staticmethod
1837
+ def fpi_stats_from_frame_counts (frame_counts ):
1838
+ """
1839
+ Takes as input a vector named [F0, F1, F2] and returns:
1840
+ (fpi, fmax, f_perc)
1841
+ fpi = frame protection index of the maximum frame
1842
+ fmax = the maximum frame
1843
+ f_perc = the fraction of counts in the maximum frame
1844
+
1845
+ :param frame_counts:
1846
+ :return:
1847
+ """
1848
+ f_counts = (frame_counts ['F0' ], frame_counts ['F1' ], frame_counts ['F2' ])
1849
+ fmax = np .argmax (f_counts )
1850
+ nom = f_counts [fmax ]
1851
+ if nom == 0 :
1852
+ fpi = None
1853
+ f_perc = None
1854
+ else :
1855
+ denom = (sum (f_counts ) - nom ) / 2.
1856
+ if denom == 0 :
1857
+ fpi = np .log2 (float (nom ) / 0.5 )
1858
+ else :
1859
+ fpi = np .log2 (float (nom / denom ))
1860
+ f_perc = 100 * (float (f_counts [fmax ]) / sum (f_counts ))
1728
1861
1862
+ return fpi , fmax , f_perc
0 commit comments