Skip to content

Commit 9a53152

Browse files
1.0.0 stable
1 parent bd2c532 commit 9a53152

File tree

6 files changed

+175
-32
lines changed

6 files changed

+175
-32
lines changed

README.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,10 @@ User guide
1313
Below is a quick manual to get you started.
1414
For detailed instructions and explanations on fivepseq output, please see the user guide at: https://fivepseq.readthedocs.io/en/latest/.
1515

16+
Citation
17+
------------
18+
`Nersisyan L, Ropat M, Pelechano V. Improved computational analysis of ribosome dynamics from 5′P degradome data using fivepseq. NAR Genomics and Bioinformatics, 2:4, 2020. <https://doi.org/10.1093/nargab/lqaa099>`_
19+
1620
Installation
1721
------------
1822
Install dependencies:

fivepseq/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = '1.0b7'
1+
__version__ = '1.0.0'

fivepseq/logic/algorithms/count_stats/count_stats.py

Lines changed: 3 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -371,23 +371,11 @@ def compute_fpi_per_transcript(self):
371371
columns=[self.COUNT, "F", self.FRAME_COUNT,
372372
self.FRAME_PERC, self.FPI])
373373
for index, row in frame_counts_df.iterrows():
374-
f_counts = (row['F0'], row['F1'], row['F2'])
375-
fmax = np.argmax(f_counts)
376-
nom = f_counts[fmax]
377-
if nom == 0:
378-
fpi = None
379-
f_perc = None
380-
else:
381-
denom = (sum(f_counts) - nom) / 2.
382-
if denom == 0:
383-
fpi = np.log2(float(nom) / 0.5)
384-
else:
385-
fpi = np.log2(float(nom / denom))
386-
f_perc = 100 * (float(f_counts[fmax]) / sum(f_counts))
374+
fpi, fmax, f_perc = CountManager.fpi_stats_from_frame_counts(row)
387375

388-
transcript_fpi_df.at[index, self.COUNT] = sum(f_counts)
376+
transcript_fpi_df.at[index, self.COUNT] = sum(row)
389377
transcript_fpi_df.at[index, 'F'] = fmax
390-
transcript_fpi_df.at[index, self.FRAME_COUNT] = f_counts[fmax]
378+
transcript_fpi_df.at[index, self.FRAME_COUNT] = row[fmax]
391379
transcript_fpi_df.at[index, self.FRAME_PERC] = f_perc
392380
transcript_fpi_df.at[index, self.FPI] = fpi
393381

fivepseq/logic/algorithms/general_pipelines/count_pipeline.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ def run(self):
3636
self.run_frame_counts()
3737
self.run_transcript_indices()
3838
self.run_codon_counts()
39+
self.run_codon_stats()
3940
self.run_loci_counts()
4041
self.run_queue_analysis()
4142

@@ -198,6 +199,17 @@ def run_codon_counts(self):
198199
if not self.skip(self.fivepseq_out.get_file_path(self.fivepseq_out.TRIPEPTIDE_PAUSES_FILE)):
199200
self.fivepseq_out.write_df_to_file(self.fivepseq_counts.get_tripeptide_pauses(),
200201
self.fivepseq_out.TRIPEPTIDE_PAUSES_FILE)
202+
def run_codon_stats(self):
    """
    Writes the amino acid and codon statistics tables to their output files.

    For each table, the output path is first passed to self.skip(); the table is
    requested from self.fivepseq_counts and written only when skip() returns a false value.
    """
    out = self.fivepseq_out
    counts = self.fivepseq_counts

    # (output file name, getter producing the table), processed in this order:
    # amino acid stats first, then codon stats.
    # generate more than needed for visualization
    stats_outputs = (
        (out.AMINO_ACID_STATS_FILE, counts.get_amino_acid_stats),
        (out.CODON_STATS_FILE, counts.get_codon_stats),
    )

    for file_name, get_stats in stats_outputs:
        if self.skip(out.get_file_path(file_name)):
            continue
        # the getter is invoked only here, so skipped files trigger no computation
        out.write_df_to_file(get_stats(), file_name)
201213

202214
def run_loci_counts(self):
203215
# loci pauses

fivepseq/logic/structures/fivepseq_counts.py

Lines changed: 150 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,6 @@ class FivePSeqCounts:
4343
downsample_constant = None
4444
outlier_probability = None
4545

46-
4746
config = None
4847
alignment = None
4948
annotation = None
@@ -56,12 +55,19 @@ class FivePSeqCounts:
5655
meta_count_series_term = None
5756
frame_counts_df_start = None
5857
frame_counts_df_term = None
58+
codon_genome_usage_df = None
5959
codon_count_df = None
6060
amino_acid_count_df = None
6161
dicodon_count_df = None
6262
dipeptide_count_df = None
6363
tricodon_count_df = None
6464
tripeptide_count_df = None
65+
codon_stats_df = None
66+
amino_acid_stats_df = None
67+
68+
codon_genome_usage_df = None
69+
amino_acid_genome_usage_df = None
70+
6571
start_codon_dict = None
6672
stop_codon_dict = None
6773
canonical_transcript_index = None
@@ -80,10 +86,10 @@ class FivePSeqCounts:
8086
TRIPEPTIDE_POS = -11
8187
DIPEPTIDE_POS = -14
8288

83-
8489
missing_chroms = []
8590

86-
def __init__(self, alignment, annotation, genome, config, downsample_constant, is_geneset = False, transcript_filter=None):
91+
def __init__(self, alignment, annotation, genome, config, downsample_constant, is_geneset=False,
92+
transcript_filter=None):
8793
"""
8894
Initializes a FivePSeqCounts object with Alignment and Annotation instances.
8995
@@ -210,7 +216,8 @@ def generate_transcript_descriptors(self):
210216

211217
count_vector_downsampled = self.get_count_vector(transcript, span_size=0,
212218
region=self.FULL_LENGTH, downsample=True)
213-
self.transcript_descriptors.at[transcript_ind, self.NUMBER_READS_DOWNSAMPLED] = int(np.sum(count_vector_downsampled))
219+
self.transcript_descriptors.at[transcript_ind, self.NUMBER_READS_DOWNSAMPLED] = int(
220+
np.sum(count_vector_downsampled))
214221

215222
self.logger.info("Done generating transcript descriptors")
216223

@@ -487,7 +494,7 @@ def get_sequence(self, transcript, transcript_span_size, desired_span_size):
487494
return desired_seq
488495

489496
def get_cds_sequence_safe(self, transcript, span_size):
490-
#NOTE a dangerous code here. Works correctly only if the input span size is the same as in the transcript.
497+
# NOTE a dangerous code here. Works correctly only if the input span size is the same as in the transcript.
491498
# TOCHANGE
492499
try:
493500
sequence = transcript.get_sequence(self.genome.genome_dict)
@@ -684,7 +691,6 @@ def get_unique_sequences(self, region, span_before, span_after):
684691
i += 1
685692
return sequences
686693

687-
688694
def get_amino_acid_pauses(self):
689695
if self.amino_acid_count_df is None:
690696
self.compute_codon_pauses()
@@ -745,7 +751,7 @@ def compute_codon_pauses(self, dist_from=-30, dist_to=3, downsample=True):
745751

746752
if self.config.args.no_mask:
747753
mask_dist = 0
748-
self.logger.info("Transcript boundaries will not be masked" )
754+
self.logger.info("Transcript boundaries will not be masked")
749755
else:
750756
if hasattr(config.args, "codon_mask_size"):
751757
mask_dist = config.args.codon_mask_size
@@ -757,17 +763,22 @@ def compute_codon_pauses(self, dist_from=-30, dist_to=3, downsample=True):
757763
columns=range(dist_from, dist_to))
758764

759765
dicodon_count_df = pd.DataFrame(data=0, index=Codons.get_dicodon_table().keys(),
760-
columns=range(dist_from + 3, dist_to + 3))
766+
columns=range(dist_from + 3, dist_to + 3))
761767

762768
dipeptide_count_df = pd.DataFrame(data=0, index=Codons.get_dipeptide_list(),
763-
columns=range(dist_from + 3, dist_to + 3))
769+
columns=range(dist_from + 3, dist_to + 3))
764770

765771
tricodon_count_df = pd.DataFrame(data=0, index=Codons.get_tricodon_table().keys(),
766772
columns=range(dist_from + 6, dist_to + 6))
767773

768774
tripeptide_count_df = pd.DataFrame(data=0, index=Codons.get_tripeptide_list(),
769775
columns=range(dist_from + 6, dist_to + 6))
770776

777+
self.codon_genome_usage_df = pd.DataFrame(data=0, index=Codons.CODON_TABLE.keys(),
778+
columns=['abs', 'fraction'])
779+
self.amino_acid_genome_usage_df = pd.DataFrame(data=0, index=Codons.AMINO_ACID_TABLE.keys(),
780+
columns=['abs', 'fraction'])
781+
771782
counter = 1
772783

773784
transcript_assembly = self.annotation.get_transcript_assembly(
@@ -805,6 +816,14 @@ def compute_codon_pauses(self, dist_from=-30, dist_to=3, downsample=True):
805816
count_vector = [0] * (-1 * dist_from) + count_vector + [0] * dist_to
806817
cds_sequence = ''.join('N' * (-1 * dist_from)) + cds_sequence + ''.join('N' * dist_to)
807818

819+
# store genome usage stats
820+
for i in range(0, len(cds_sequence), 3):
821+
codon = cds_sequence[i: i + 3].upper()
822+
if codon in self.codon_genome_usage_df.index:
823+
self.codon_genome_usage_df.at[codon, "abs"] += 1
824+
amino_acid = Codons.CODON_TABLE.get(codon)
825+
self.amino_acid_genome_usage_df.at[amino_acid, "abs"] += 1
826+
808827
# identify 3nt bins with non-zero counts
809828
ind = np.array(range(0, len(count_vector), 3))
810829
hits = [sum(count_vector[i:i + 3]) > 0 for i in ind]
@@ -849,6 +868,10 @@ def compute_codon_pauses(self, dist_from=-30, dist_to=3, downsample=True):
849868
self.logger.warn("Index out of range: i: %d, j: %d, p: %d, d: %d. %s"
850869
% (i, j, p, d, str(e)))
851870

871+
self.codon_genome_usage_df.loc[:, "fraction"] = self.codon_genome_usage_df.loc[:, "abs"] / sum(
872+
self.codon_genome_usage_df.loc[:, "abs"])
873+
self.amino_acid_genome_usage_df.loc[:, "fraction"] = self.amino_acid_genome_usage_df.loc[:, "abs"] / sum(
874+
self.amino_acid_genome_usage_df.loc[:, "abs"])
852875
self.amino_acid_count_df = self.codon_to_amino_acid_count_df(codon_count_df)
853876
self.tripeptide_count_df = self.filter_codon_counts(tripeptide_count_df, self.get_tripeptide_pos())
854877
self.dipeptide_count_df = self.filter_codon_counts(dipeptide_count_df, self.get_dipeptide_pos())
@@ -881,7 +904,6 @@ def codon_to_amino_acid_count_df(self, codon_count_df):
881904

882905
return amino_acid_count_df
883906

884-
885907
def get_tripeptide_pos(self):
886908

887909
if hasattr(config.args, "tripeptide_pos"):
@@ -900,8 +922,7 @@ def get_dipeptide_pos(self):
900922

901923
return pos
902924

903-
904-
def filter_codon_counts(self, codon_count_df, pos, top = 50):
925+
def filter_codon_counts(self, codon_count_df, pos, top=50):
905926
"""
906927
Filter the di/tricodon (or di/tripeptide) counts to exclude low counts (rowSums less than the specified threshold) and
907928
to include only the top di/tricodons with highest relative counts at the given position
@@ -916,10 +937,97 @@ def filter_codon_counts(self, codon_count_df, pos, top = 50):
916937
(top, pos))
917938

918939
codon_filtered_df = codon_count_df[codon_count_df.sum(1) >= self.COUNT_THRESHOLD]
919-
pos_rel_counts = codon_filtered_df[pos]/codon_filtered_df.sum(1)
920-
codon_filtered_df = codon_filtered_df.iloc[sorted(range(len(pos_rel_counts)), reverse=True, key=lambda k: pos_rel_counts[k])[0:top]]
940+
pos_rel_counts = codon_filtered_df[pos] / codon_filtered_df.sum(1)
941+
codon_filtered_df = codon_filtered_df.iloc[
942+
sorted(range(len(pos_rel_counts)), reverse=True, key=lambda k: pos_rel_counts[k])[0:top]]
921943
return codon_filtered_df
922944

945+
def get_amino_acid_stats(self):
    """
    Returns the amino acid statistics table, computing it on first access.

    The result of compute_codon_stats_amino_acid() is stored in
    self.amino_acid_stats_df and reused by later calls as long as it is not None.

    :return: the content of self.amino_acid_stats_df
    """
    stats = self.amino_acid_stats_df
    if stats is None:
        stats = self.compute_codon_stats_amino_acid()
        self.amino_acid_stats_df = stats
    return stats
950+
951+
def get_codon_stats(self):
    """
    Returns the codon statistics table, computing it on first access.

    The result of compute_codon_stats_codon() is stored in self.codon_stats_df
    and reused by later calls as long as it is not None.

    :return: the content of self.codon_stats_df
    """
    stats = self.codon_stats_df
    if stats is None:
        stats = self.compute_codon_stats_codon()
        self.codon_stats_df = stats
    return stats
956+
957+
def compute_codon_genome_usage(self):
    """
    (Re)initialises the codon and amino acid genome usage tables.

    Both tables get the columns 'abs' and 'fraction', filled with zeros, and are
    indexed by the keys of Codons.CODON_TABLE and Codons.AMINO_ACID_TABLE respectively.

    NOTE(review): despite its name this method does not count anything; the actual
    counting is done inside compute_codon_pauses, which creates the same tables itself.
    """
    usage_columns = ['abs', 'fraction']

    self.codon_genome_usage_df = pd.DataFrame(
        data=0,
        index=Codons.CODON_TABLE.keys(),
        columns=list(usage_columns))

    self.amino_acid_genome_usage_df = pd.DataFrame(
        data=0,
        index=Codons.AMINO_ACID_TABLE.keys(),
        columns=list(usage_columns))
962+
963+
def compute_codon_stats_amino_acid(self):
    """
    Computes the statistics table for amino acids.

    :return: the result of compute_codon_stats() applied to the amino acid pauses
        and the amino acid genome usage table
    """
    # fetch the counts first: the usage table is read only after this call has returned
    amino_acid_counts = self.get_amino_acid_pauses()
    genome_usage = self.amino_acid_genome_usage_df
    return self.compute_codon_stats(amino_acid_counts, genome_usage)
965+
966+
def compute_codon_stats_codon(self):
    """
    Computes the statistics table for codons.

    :return: the result of compute_codon_stats() applied to the codon pauses
        and the codon genome usage table
    """
    # fetch the counts first: the usage table is read only after this call has returned
    codon_counts = self.get_codon_pauses()
    genome_usage = self.codon_genome_usage_df
    return self.compute_codon_stats(codon_counts, genome_usage)
968+
969+
def compute_codon_stats(self, codon_counts, codon_genome_usage, until=-3):
    """
    Counts usage and frame protection stats for each codon/amino-acid.

    Only the columns of codon_counts located before the column labelled `until`
    are used. The returned dataframe has one row per row of codon_counts (same index)
    and the following columns:

        F0, F1, F2              counts summed over every third column, going backwards from
                                the last used column (F2 contains the last used column)
        FPI, F, F_perc          output of CountManager.fpi_stats_from_frame_counts for F0-F2
        peak_pos                column label of the highest count within frame F
        peak_scale              highest count within frame F relative to the mean count in that frame
        usage                   sum of all the used counts in the row
        genome_usage_abs        'abs' column of codon_genome_usage
        genome_usage_fraction   'fraction' column of codon_genome_usage
        usage_normalized        usage / genome_usage_fraction, scaled to sum up to 1

    :param codon_counts: dataframe of counts; its column labels must include `until`
    :param codon_genome_usage: dataframe with 'abs' and 'fraction' columns; its rows are
        matched to the rows of codon_counts by position, not by label
    :param until: label of the first column to exclude from the statistics
    :return: dataframe, or None if the statistics could not be computed
    """

    self.logger.info("Counting codon usage statistics")

    if codon_counts is None:
        self.logger.warning("Could not compute codon stats. No codon counts were provided.")
        return None

    try:
        stop_ind = codon_counts.keys().to_list().index(until)
    except ValueError:
        self.logger.warning("Could not compute codon stats. Codon counts dataframe did not have column %d." % until)
        return None

    # Best-effort computation: any failure is reported with its actual cause and results in None,
    # so that the caller can go on without the stats table.
    try:
        # exclude the counts downstream from `until`
        codon_counts = codon_counts.iloc[:, 0:stop_ind]

        f2 = sum([codon_counts.iloc[:, i] for i in reversed(range(stop_ind - 1, -1, -3))])
        f1 = sum([codon_counts.iloc[:, i] for i in reversed(range(stop_ind - 2, -1, -3))])
        f0 = sum([codon_counts.iloc[:, i] for i in reversed(range(stop_ind - 3, -1, -3))])

        codon_stats = pd.DataFrame(list(zip(f0, f1, f2)), columns=['F0', 'F1', 'F2'])
        n_rows = len(codon_stats)

        codon_stats['FPI'] = np.zeros(n_rows)
        codon_stats['F'] = np.zeros(n_rows)
        codon_stats['F_perc'] = np.zeros(n_rows)

        for i in range(n_rows):
            fpi, fmax, fperc = CountManager.fpi_stats_from_frame_counts(codon_stats.iloc[i, :])
            codon_stats.loc[i, 'FPI'] = fpi
            codon_stats.loc[i, 'F'] = fmax
            codon_stats.loc[i, 'F_perc'] = fperc

        # NOTE(review): rows without any counts keep this initial peak_pos value,
        # rows with counts get it replaced by a column label in the loop below.
        codon_stats['peak_pos'] = [np.argmax(codon_counts.iloc[i, :]) for i in range(n_rows)]
        codon_stats['peak_scale'] = np.zeros(n_rows)

        for i in range(n_rows):
            counts = list(codon_counts.iloc[i, :])
            if sum(counts) > 0:
                frame = int(codon_stats.loc[i, 'F'])
                # positions of the columns belonging to the preferred frame, in ascending order
                frame_inds = list(reversed(range(len(counts) - 3 + frame, -1, -3)))
                frame_counts = [counts[j] for j in frame_inds]
                # float() keeps this a true division on Python 2 as well
                codon_stats.loc[i, 'peak_scale'] = \
                    float(len(frame_counts) * max(frame_counts)) / sum(frame_counts)
                codon_stats.loc[i, 'peak_pos'] = codon_counts.columns[frame_inds[np.argmax(frame_counts)]]

        codon_stats['usage'] = list(codon_counts.sum(axis=1))
        # NOTE(review): assumes codon_genome_usage lists its rows in the same order as codon_counts
        codon_stats['genome_usage_abs'] = list(codon_genome_usage.loc[:, 'abs'])
        codon_stats['genome_usage_fraction'] = list(codon_genome_usage.loc[:, 'fraction'])

        # Entries absent from the genome (fraction of 0 or NaN) would produce inf/NaN here and
        # turn the whole normalized column into NaN, so they get a normalized usage of 0 instead.
        fraction = codon_stats['genome_usage_fraction']
        usage_norm = (codon_stats['usage'] / fraction).where(fraction > 0, 0.)
        usage_norm_total = usage_norm.sum()
        if usage_norm_total > 0:
            usage_norm = usage_norm / usage_norm_total
        codon_stats['usage_normalized'] = usage_norm

        codon_stats.index = codon_counts.index

        return codon_stats

    except Exception as e:
        self.logger.warning("Could not compute codon stats. Reason: %s" % str(e))
        return None
1030+
9231031
@preconditions(lambda loci_file: str)
9241032
def get_pauses_from_loci(self, loci_file, read_locations=READ_LOCATIONS_ALL):
9251033
"""
@@ -1590,9 +1698,9 @@ def read_count_dict(file_path):
15901698
:return: float
15911699
"""
15921700
count_freq_dict = {}
1593-
dict_mat = pd.read_csv(file_path, header = None, delimiter="\t", index_col=0)
1701+
dict_mat = pd.read_csv(file_path, header=None, delimiter="\t", index_col=0)
15941702
for i in range(len(dict_mat)):
1595-
count_freq_dict[dict_mat.index[i]] = dict_mat.iloc[i,0]
1703+
count_freq_dict[dict_mat.index[i]] = dict_mat.iloc[i, 0]
15961704

15971705
return collections.OrderedDict(sorted(count_freq_dict.items()))
15981706

@@ -1725,4 +1833,30 @@ def combine_amino_acid_dfs(amino_acid_df_dict, lib_size_dict=None):
17251833

17261834
return amino_acid_df_combined
17271835

1836+
@staticmethod
def fpi_stats_from_frame_counts(frame_counts):
    """
    Computes frame protection statistics from the counts in the three frames.

    The frame with the highest count is the preferred frame (the first one in case of ties).
    The frame protection index (fpi) is the log2 ratio of the count in the preferred frame
    to the mean count of the other two frames; if the other two frames are both empty,
    0.5 is used as the denominator instead.

    :param frame_counts: a vector with elements named 'F0', 'F1' and 'F2'
        (anything that can be indexed with these names, e.g. a dict or a dataframe row)
    :return: tuple (fpi, fmax, f_perc)
        fpi = frame protection index of the preferred frame; None if its count is 0
        fmax = index of the preferred frame: 0, 1 or 2
        f_perc = percentage (0-100) of the counts falling into the preferred frame;
                 None if its count is 0
    """
    f_counts = (frame_counts['F0'], frame_counts['F1'], frame_counts['F2'])
    # plain int, so that callers do not receive a numpy scalar as frame index
    fmax = int(np.argmax(f_counts))
    nom = f_counts[fmax]

    if nom == 0:
        # the highest count is zero: no statistics can be computed
        return None, fmax, None

    total = sum(f_counts)
    denom = (total - nom) / 2.
    if denom == 0:
        # all the counts are in a single frame: use a pseudo-count to avoid division by zero
        denom = 0.5

    fpi = np.log2(float(nom) / denom)
    f_perc = 100 * (float(nom) / total)

    return fpi, fmax, f_perc

fivepseq/util/writers.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ class FivePSeqOut:
2626
TRICODON_PAUSES_FILE = "tricodon_pauses.txt"
2727
TRIPEPTIDE_PAUSES_FILE = "tripeptide_pauses.txt"
2828
DIPEPTIDE_PAUSES_FILE = "dipeptide_pauses.txt"
29+
AMINO_ACID_STATS_FILE = "amino_acid_stats.txt"
30+
CODON_STATS_FILE = "codon_stats.txt"
2931
LOCI_PAUSES_FILE = "loci_pauses.txt"
3032
LOCI_PAUSES_FILE_PREFIX = "loci_pauses"
3133
LOCI_OVERLAPS_FILE = "loci_overlaps.txt"
@@ -206,6 +208,9 @@ def write_df_to_file(self, df, file_name):
206208
207209
:return:
208210
"""
211+
if df is None:
212+
return
213+
209214
f = self.open_file_for_writing(file_name)
210215
if f is not None:
211216
df.to_csv(f, sep="\t", header=True)

0 commit comments

Comments
 (0)