This code snippet is part of a bioinformatics script designed to process sequence data, validate it, and perform various operations on sequences. Here's a summary of its functionalities in 10 points:
IncorrectSequenceLetter
for handling invalid sequence letters.Sequence
class that serves as a base for all sequence types, with methods for validation, accessing identifier and sequence, calculating molecular weight, checking for subsequence, and defining equality and concatenation operations.NucleotideSequence
) and protein sequences (ProteinSequence
), with the former having additional methods for translation and the latter serving as a straightforward extension of the Sequence
class.NucleotideSequence
into DNASequence
and RNASequence
classes, each with methods relevant to their type, such as transcribe
for DNA and reverse_transcribe
for RNA.FASTA_iterator
that iterates over sequences in a FASTA file, yielding sequence objects of a specified class.FASTA_to_protein
function.sys.argv
mechanism to accept command-line arguments for input files and output options, allowing the script to operate in a flexible manner based on user input.glob
and os
modules to search for FASTA files in specified directories or use default locations if no arguments are provided.sort_and_print
function that sorts protein sequences by molecular weight and either prints them to the console or writes them to a specified output file, completing the script's primary objective of processing and organizing sequence data.
# Importing necessary modules
import sequence_dictionaries as vary
import sys
import glob
import os
# Custom exception class for incorrect sequence letters
class IncorrectSequenceLetter(ValueError):
def __init__(self, letter, class_name):
self.letter = letter
self.class_name = class_name
def __str__(self):
return f"The sequence item {self.letter} is not found in the alphabet of class {self.class_name}"
# Base class for all sequence types
class Sequence():
alphabet = vary.protein_letters + vary.dna_letters + vary.rna_letters
def __init__(self, identifier, sequence):
self.__identifier = str(identifier)
self.__sequence = str(sequence)
self.validate_sequence()
def validate_sequence(self):
for letter in self.__sequence:
if letter not in self.alphabet:
raise IncorrectSequenceLetter(letter, self.__class__.__name__)
def get_identifier(self):
return self.__identifier
def get_sequence(self):
return self.__sequence
def get_mw(self):
if isinstance(self, ProteinSequence):
sequence_molecular_weight = 0
for element in self.__sequence:
sequence_molecular_weight += vary.protein_weights[element]
return float(sequence_molecular_weight)
elif isinstance(self, DNASequence):
sequence_molecular_weight = 0
for element in self.__sequence:
sequence_molecular_weight += vary.dna_weights[element]
return float(sequence_molecular_weight)
elif isinstance(self, RNASequence):
sequence_molecular_weight = 0
for element in self.__sequence:
sequence_molecular_weight += vary.rna_weights[element]
return float(sequence_molecular_weight)
def has_subsequence(self, protein):
if protein.__sequence in self.__sequence:
return True
else:
return False
def __len__(self):
return len(self.__sequence)
def __eq__(self, other):
return self.__sequence == other.get_sequence()
def __ne__ (self, other):
return self.__sequence != other.get_sequence()
def __add__(self, other):
if not isinstance(other, self.__class__):
raise TypeError(f"Addition not supported between instances of '{self.__class__.__name__}' and
'{other.__class__.__name__}'")
concatenated_identifier = self.get_identifier() + "+" + other.get_identifier()
concatenated_sequence = self.get_sequence() + other.get_sequence()
return self.__class__(concatenated_identifier, concatenated_sequence)
def __getitem__ (self, i):
return self.__sequence[i]
def __contains__(self, string):
return string in self.__sequence
def __lt__(self, other):
return self.get_mw() < other.get_mw() def __hash__(self): return hash((self.__identifier, self.__sequence)) # Class for
nucleotide sequences class NucleotideSequence(Sequence): alphabet=vary.dna_letters + vary.rna_letters def
translate(self): protein_sequence="" codons=[self.get_sequence()[i:i+3] for i in range(0, len(self.get_sequence()),
3)] for codon in codons: if isinstance(self, RNASequence): if codon in vary.rna_table.keys(): protein_sequence
+=vary.rna_table[codon] elif isinstance(self, DNASequence): if codon in vary.dna_table.keys(): protein_sequence
+=vary.dna_table[codon] return protein_sequence # Class for protein sequences class ProteinSequence(Sequence):
alphabet=vary.protein_letters pass # Class for DNA sequences class DNASequence(NucleotideSequence):
alphabet=vary.dna_letters def transcribe(self): return self.get_sequence().replace("T", "U" ) # Class for RNA
sequences class RNASequence(NucleotideSequence): def reverse_transcribe(self): reverse_transcription=""
intermediate_sequence="" sequence_splited=[self.get_sequence()[i:i+3] for i in range(0, len(self.get_sequence()),
3)] for base in sequence_splited: if base in vary.rna_table.keys(): intermediate_sequence +=vary.rna_table[base] for
element in intermediate_sequence: if element in vary.dna_table_back.keys(): reverse_transcription
+=vary.dna_table_back[element] return reverse_transcription # Generator function to iterate over FASTA files def
FASTA_iterator(fasta_filename, class_type=DNASequence): """ This is A Generator Function that reads a Fasta file. In each iteration, the function
must return an object class protein""" identifiers='' sequences='' with open (fasta_filename, 'r' ) as file: for
line in file: if line.startswith(">"):
if identifiers:
try:
yield class_type(identifier=identifiers, sequence=sequences )
except IncorrectSequenceLetter:
sys.stderr.write(f"Error: Invalid sequence found at identifier '{identifiers}'. Skipping this sequence.\n")
identifiers = line[1:].strip()
sequences = ''
else:
sequences += line.strip()
if identifiers:
try:
yield class_type(identifier=identifiers, sequence=sequences )
except IncorrectSequenceLetter:
sys.stderr.write(f"Error: Invalid sequence found at identifier '{identifiers}'. Skipping this sequence.\n")
# Function to convert FASTA files to protein sequences and sort them
def FASTA_to_protein(input_files):
seq_counter = 0
sorted_list = []
for input_file in input_files:
for seq in FASTA_iterator(input_file):
try:
if isinstance(seq, DNASequence) or isinstance(seq, RNASequence):
protein_sequence = seq.translate()
protein = ProteinSequence(identifier= seq.get_identifier(), sequence=protein_sequence)
sorted_list.append((protein))
seq_counter += 1
except Exception as e:
sys.stderr.write(f"Error processing {input_file}: {e}\n")
sys.stderr.write(f"{input_file} finished.\n")
sys.stderr.write(f"{seq_counter} sequences found.\n")
return sorted_list
# Function to sort and print protein sequences
def sort_and_print(files, output=None):
if len(files) == 0:
sys.stderr.write("No FASTA files found.\n")
elif len(files) == 1:
if files[0].endswith((".fa", ".fasta")):
fasta_files = files
sys.stderr.write(f"{len(fasta_files)} FASTA files found.\n")
else:
sys.stderr.write("1 file is being processed.\n")
fasta_files = []
else:
fasta_files = [file for file in files if file.endswith(".fa") or file.endswith(".fasta")]
sys.stderr.write(f"{len(fasta_files)} FASTA files found.\n")
sorted_list = FASTA_to_protein(files)
sys.stderr.write("Sorting the sequences...\n")
sorted_list.sort(key=lambda x: x.get_mw())
if output is None:
for seq in sorted_list:
print(seq.get_identifier(), len(seq.get_sequence()), "{:.2f}".format(seq.get_mw()), sep='\t')
else:
print(f"Writing to file: {output}")
with open(output, "w") as out:
for seq in sorted_list:
out.write(f"{seq.get_identifier()}\t{len(seq.get_sequence())}\t{seq.get_mw():.2f}\n")
sys.stderr.write("Sort process finished.\n")
sys.stderr.write("Program finished correctly.\n")
# Main entry point of the program
if __name__ == "__main__":
files = []
output = None
if len(sys.argv) == 1:
files = glob.glob("*.fasta") + glob.glob("*.fa")
elif len(sys.argv) == 2:
if os.path.isdir(sys.argv[1]):
files = glob.glob(sys.argv[1] + "/*.fasta") + glob.glob(sys.argv[1] + "/*.fa")
elif os.path.isfile(sys.argv[1]):
files = [sys.argv[1]]
else:
sys.stderr.write(f"Error: The argument {sys.argv[1]} is not a directory or a file.\n")
elif len(sys.argv) == 3:
if os.path.isdir(sys.argv[1]):
files = glob.glob(os.path.join(sys.argv[1], "*.fasta")) + glob.glob(os.path.join(sys.argv[1], "*.fa"))
else:
files = [sys.argv[1]]
output = sys.argv[2]
sort_and_print(files, output)