Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
117 changes: 98 additions & 19 deletions src/CAI/CAI.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
warnings.simplefilter("ignore", BiopythonWarning)


def _synonymous_codons(genetic_code_dict):
def _init_synonymous_codons(genetic_code_dict):

# invert the genetic code dictionary to map each amino acid to its codons
codons_for_amino_acid = {}
Expand All @@ -27,37 +27,39 @@ def _synonymous_codons(genetic_code_dict):


_synonymous_codons = {
k: _synonymous_codons(v.forward_table) for k, v in ct.unambiguous_dna_by_id.items()
k: _init_synonymous_codons(v.forward_table) for k, v in ct.unambiguous_dna_by_id.items()
}
_non_synonymous_codons = {
k: {codon for codon in v.keys() if len(v[codon]) == 1}
for k, v in _synonymous_codons.items()
}


def RSCU(sequences, genetic_code=11):
r"""Calculates the relative synonymous codon usage (RSCU) for a set of sequences.
def _init_codonsbyaa(genetic_code_dict):

RSCU is 'the observed frequency of [a] codon divided by the frequency
expected under the assumption of equal usage of the synonymous codons for an
amino acid' (page 1283).
# invert the genetic code dictionary to map each amino acid to its codons
codons_for_amino_acid = {}
for codon, amino_acid in genetic_code_dict.items():
codons_for_amino_acid[amino_acid] = codons_for_amino_acid.get(amino_acid, [])
codons_for_amino_acid[amino_acid].append(codon)

In math terms, it is
# create dictionary of codons by amino acid
# Example: {'L': ['CTT', 'CTG', 'CTA', 'CTC', 'TTA', 'TTG'], 'M': ['ATG']...}
return codons_for_amino_acid

.. math::
_codons_by_aa = {
k: _init_codonsbyaa(v.forward_table) for k, v in ct.unambiguous_dna_by_id.items()
}

\frac{X_{ij}}{\frac{1}{n_i}\sum_{j=1}^{n_i}x_{ij}}

"where :math:`X` is the number of occurrences of the :math:`j` th codon for
the :math:`i` th amino acid, and :math:`n` is the number (from one to six)
of alternative codons for the :math:`i` th amino acid" (page 1283).
def Count3mers(sequences):
r"""Counts the 3-mers in frame in a set of sequences.

Args:
sequences (list): The reference set of sequences.
genetic_code (int, optional): The translation table to use. Defaults to 11, the standard genetic code.
sequences (list): The set of sequences.

Returns:
dict: The relative synonymous codon usage.
dict: Counts of in-frame 3-mers.

Raises:
ValueError: When an invalid sequence is provided or a list is not provided.
Expand All @@ -66,7 +68,6 @@ def RSCU(sequences, genetic_code=11):
if not isinstance(sequences, (list, tuple)):
raise ValueError(
"Be sure to pass a list of sequences, not a single sequence. "
"To find the RSCU of a single sequence, pass it as a one element list."
)

# ensure all input sequences are divisible by three
Expand All @@ -81,10 +82,88 @@ def RSCU(sequences, genetic_code=11):
(sequence[i : i + 3].upper() for i in range(0, len(sequence), 3))
for sequence in sequences
)
codons = chain.from_iterable(
threemers = chain.from_iterable(
sequences
) # flat list of all codons (to be used for counting)
counts = Counter(codons)
counts = Counter(threemers)

return counts


def CodonFrequencyByAA(sequences, genetic_code=11, outputtype="count"):
r"""Calculates the synonymous codon frequency by AA for a set of sequences.

Args:
sequences (list): The reference set of sequences.
genetic_code (int, optional): The translation table to use. Defaults to 11, the standard genetic code.
outputtype: "count" to return codon counts, "freq" to return frequencies proportional to aa.

Returns:
dict: For each amino acid, a dict of codon counts or frequencies

Raises:
ValueError: When an invalid sequence is provided or a list is not provided.
"""

counts = Count3mers(sequences)

# determine the synonymous codons for the genetic code
codons_by_aa = _codons_by_aa[genetic_code]

# hold the result as it is being calculated
counts_by_aa = {}

# calculate counts organized by aa
for aa, codons in codons_by_aa.items():
miniresult = {}
for codon in codons:
# note: check what happens if zero?
miniresult[codon] = counts[codon]
counts_by_aa[aa] = miniresult

if (outputtype == "count") :
return counts_by_aa
elif (outputtype == "freq") :
freqs_by_aa = {}
for aa, codons in codons_by_aa.items():
minifreqs = {}
miniresult = counts_by_aa[aa]
total_by_aa = sum(miniresult.values())
for codon in codons:
minifreqs[codon] = miniresult[codon] / total_by_aa
freqs_by_aa[aa] = minifreqs
return freqs_by_aa


def RSCU(sequences, genetic_code=11):
r"""Calculates the relative synonymous codon usage (RSCU) for a set of sequences.

RSCU is 'the observed frequency of [a] codon divided by the frequency
expected under the assumption of equal usage of the synonymous codons for an
amino acid' (page 1283).

In math terms, it is

.. math::

\frac{X_{ij}}{\frac{1}{n_i}\sum_{j=1}^{n_i}x_{ij}}

"where :math:`X` is the number of occurrences of the :math:`j` th codon for
the :math:`i` th amino acid, and :math:`n` is the number (from one to six)
of alternative codons for the :math:`i` th amino acid" (page 1283).

Args:
sequences (list): The reference set of sequences.
genetic_code (int, optional): The translation table to use. Defaults to 11, the standard genetic code.

Returns:
dict: The relative synonymous codon usage.

Raises:
ValueError: When an invalid sequence is provided or a list is not provided.
"""

counts = Count3mers(sequences)

# "if a certain codon is never used in the reference set... assign [its
# count] a value of 0.5" (page 1285)
Expand Down