This code snippet is part of a bioinformatics script designed to process sequence data, validate it, and perform various operations on sequences. Here's a summary of its functionalities in 10 points:
IncorrectSequenceLetter for handling invalid sequence letters.Sequence class that serves as a base for all sequence types, with methods for validation, accessing identifier and sequence, calculating molecular weight, checking for subsequence, and defining equality and concatenation operations.NucleotideSequence) and protein sequences (ProteinSequence), with the former having additional methods for translation and the latter serving as a straightforward extension of the Sequence class.NucleotideSequence into DNASequence and RNASequence classes, each with methods relevant to their type, such as transcribe for DNA and reverse_transcribe for RNA.FASTA_iterator that iterates over sequences in a FASTA file, yielding sequence objects of a specified class.FASTA_to_protein function.sys.argv mechanism to accept command-line arguments for input files and output options, allowing the script to operate in a flexible manner based on user input.glob and os modules to search for FASTA files in specified directories or use default locations if no arguments are provided.sort_and_print function that sorts protein sequences by molecular weight and either prints them to the console or writes them to a specified output file, completing the script's primary objective of processing and organizing sequence data.
# Importing necessary modules
import sequence_dictionaries as vary
import sys
import glob
import os
# Custom exception class for incorrect sequence letters
class IncorrectSequenceLetter(ValueError):
def __init__(self, letter, class_name):
self.letter = letter
self.class_name = class_name
def __str__(self):
return f"The sequence item {self.letter} is not found in the alphabet of class {self.class_name}"
# Base class for all sequence types
class Sequence():
alphabet = vary.protein_letters + vary.dna_letters + vary.rna_letters
def __init__(self, identifier, sequence):
self.__identifier = str(identifier)
self.__sequence = str(sequence)
self.validate_sequence()
def validate_sequence(self):
for letter in self.__sequence:
if letter not in self.alphabet:
raise IncorrectSequenceLetter(letter, self.__class__.__name__)
def get_identifier(self):
return self.__identifier
def get_sequence(self):
return self.__sequence
def get_mw(self):
if isinstance(self, ProteinSequence):
sequence_molecular_weight = 0
for element in self.__sequence:
sequence_molecular_weight += vary.protein_weights[element]
return float(sequence_molecular_weight)
elif isinstance(self, DNASequence):
sequence_molecular_weight = 0
for element in self.__sequence:
sequence_molecular_weight += vary.dna_weights[element]
return float(sequence_molecular_weight)
elif isinstance(self, RNASequence):
sequence_molecular_weight = 0
for element in self.__sequence:
sequence_molecular_weight += vary.rna_weights[element]
return float(sequence_molecular_weight)
def has_subsequence(self, protein):
if protein.__sequence in self.__sequence:
return True
else:
return False
def __len__(self):
return len(self.__sequence)
def __eq__(self, other):
return self.__sequence == other.get_sequence()
def __ne__ (self, other):
return self.__sequence != other.get_sequence()
def __add__(self, other):
if not isinstance(other, self.__class__):
raise TypeError(f"Addition not supported between instances of '{self.__class__.__name__}' and
'{other.__class__.__name__}'")
concatenated_identifier = self.get_identifier() + "+" + other.get_identifier()
concatenated_sequence = self.get_sequence() + other.get_sequence()
return self.__class__(concatenated_identifier, concatenated_sequence)
def __getitem__ (self, i):
return self.__sequence[i]
def __contains__(self, string):
return string in self.__sequence
def __lt__(self, other):
return self.get_mw() < other.get_mw() def __hash__(self): return hash((self.__identifier, self.__sequence)) # Class for
nucleotide sequences class NucleotideSequence(Sequence): alphabet=vary.dna_letters + vary.rna_letters def
translate(self): protein_sequence="" codons=[self.get_sequence()[i:i+3] for i in range(0, len(self.get_sequence()),
3)] for codon in codons: if isinstance(self, RNASequence): if codon in vary.rna_table.keys(): protein_sequence
+=vary.rna_table[codon] elif isinstance(self, DNASequence): if codon in vary.dna_table.keys(): protein_sequence
+=vary.dna_table[codon] return protein_sequence # Class for protein sequences class ProteinSequence(Sequence):
alphabet=vary.protein_letters pass # Class for DNA sequences class DNASequence(NucleotideSequence):
alphabet=vary.dna_letters def transcribe(self): return self.get_sequence().replace("T", "U" ) # Class for RNA
sequences class RNASequence(NucleotideSequence): def reverse_transcribe(self): reverse_transcription=""
intermediate_sequence="" sequence_splited=[self.get_sequence()[i:i+3] for i in range(0, len(self.get_sequence()),
3)] for base in sequence_splited: if base in vary.rna_table.keys(): intermediate_sequence +=vary.rna_table[base] for
element in intermediate_sequence: if element in vary.dna_table_back.keys(): reverse_transcription
+=vary.dna_table_back[element] return reverse_transcription # Generator function to iterate over FASTA files def
FASTA_iterator(fasta_filename, class_type=DNASequence): """ This is A Generator Function that reads a Fasta file. In each iteration, the function
must return an object class protein""" identifiers='' sequences='' with open (fasta_filename, 'r' ) as file: for
line in file: if line.startswith(">"):
if identifiers:
try:
yield class_type(identifier=identifiers, sequence=sequences )
except IncorrectSequenceLetter:
sys.stderr.write(f"Error: Invalid sequence found at identifier '{identifiers}'. Skipping this sequence.\n")
identifiers = line[1:].strip()
sequences = ''
else:
sequences += line.strip()
if identifiers:
try:
yield class_type(identifier=identifiers, sequence=sequences )
except IncorrectSequenceLetter:
sys.stderr.write(f"Error: Invalid sequence found at identifier '{identifiers}'. Skipping this sequence.\n")
# Function to convert FASTA files to protein sequences and sort them
def FASTA_to_protein(input_files):
seq_counter = 0
sorted_list = []
for input_file in input_files:
for seq in FASTA_iterator(input_file):
try:
if isinstance(seq, DNASequence) or isinstance(seq, RNASequence):
protein_sequence = seq.translate()
protein = ProteinSequence(identifier= seq.get_identifier(), sequence=protein_sequence)
sorted_list.append((protein))
seq_counter += 1
except Exception as e:
sys.stderr.write(f"Error processing {input_file}: {e}\n")
sys.stderr.write(f"{input_file} finished.\n")
sys.stderr.write(f"{seq_counter} sequences found.\n")
return sorted_list
# Function to sort and print protein sequences
def sort_and_print(files, output=None):
if len(files) == 0:
sys.stderr.write("No FASTA files found.\n")
elif len(files) == 1:
if files[0].endswith((".fa", ".fasta")):
fasta_files = files
sys.stderr.write(f"{len(fasta_files)} FASTA files found.\n")
else:
sys.stderr.write("1 file is being processed.\n")
fasta_files = []
else:
fasta_files = [file for file in files if file.endswith(".fa") or file.endswith(".fasta")]
sys.stderr.write(f"{len(fasta_files)} FASTA files found.\n")
sorted_list = FASTA_to_protein(files)
sys.stderr.write("Sorting the sequences...\n")
sorted_list.sort(key=lambda x: x.get_mw())
if output is None:
for seq in sorted_list:
print(seq.get_identifier(), len(seq.get_sequence()), "{:.2f}".format(seq.get_mw()), sep='\t')
else:
print(f"Writing to file: {output}")
with open(output, "w") as out:
for seq in sorted_list:
out.write(f"{seq.get_identifier()}\t{len(seq.get_sequence())}\t{seq.get_mw():.2f}\n")
sys.stderr.write("Sort process finished.\n")
sys.stderr.write("Program finished correctly.\n")
# Main entry point of the program
if __name__ == "__main__":
files = []
output = None
if len(sys.argv) == 1:
files = glob.glob("*.fasta") + glob.glob("*.fa")
elif len(sys.argv) == 2:
if os.path.isdir(sys.argv[1]):
files = glob.glob(sys.argv[1] + "/*.fasta") + glob.glob(sys.argv[1] + "/*.fa")
elif os.path.isfile(sys.argv[1]):
files = [sys.argv[1]]
else:
sys.stderr.write(f"Error: The argument {sys.argv[1]} is not a directory or a file.\n")
elif len(sys.argv) == 3:
if os.path.isdir(sys.argv[1]):
files = glob.glob(os.path.join(sys.argv[1], "*.fasta")) + glob.glob(os.path.join(sys.argv[1], "*.fa"))
else:
files = [sys.argv[1]]
output = sys.argv[2]
sort_and_print(files, output)