Python Code Showcase

This code snippet is part of a bioinformatics script designed to process sequence data, validate it, and perform various operations on sequences. Here's a summary of its functionalities in 10 points:

Imports and Custom Exceptions: The script starts by importing necessary modules and defining a custom exception class IncorrectSequenceLetter for handling invalid sequence letters.
Base Sequence Class: Defines a Sequence class that serves as a base for all sequence types, with methods for validation, accessing identifier and sequence, calculating molecular weight, checking for subsequence, and defining equality and concatenation operations.
Nucleotide and Protein Sequence Classes: Introduces specialized classes for nucleotide sequences (NucleotideSequence) and protein sequences (ProteinSequence), with the former having additional methods for translation and the latter serving as a straightforward extension of the Sequence class.
DNA and RNA Sequence Classes: Further specializes NucleotideSequence into DNASequence and RNASequence classes, each with methods relevant to their type, such as transcribe for DNA and reverse_transcribe for RNA.
FASTA File Processing: Implements a generator function FASTA_iterator that iterates over sequences in a FASTA file, yielding sequence objects of a specified class.
Sequence Translation and Sorting: Provides functionality to convert DNA or RNA sequences to protein sequences and sort them based on their molecular weight through the FASTA_to_protein function.
Error Handling: Includes error handling for incorrect sequence letters and other exceptions during sequence processing and translation.
Command-line Interface: Uses the sys.argv mechanism to accept command-line arguments for input files and output options, allowing the script to operate in a flexible manner based on user input.
File and Directory Handling: Employs glob and os modules to search for FASTA files in specified directories or use default locations if no arguments are provided.
Main Functionality: The script culminates in a sort_and_print function that sorts protein sequences by molecular weight and either prints them to the console or writes them to a specified output file, completing the script's primary objective of processing and organizing sequence data.

Python Code


                # Importing necessary modules
                import sequence_dictionaries as vary
                import sys
                import glob
                import os
                
                # Custom exception class for incorrect sequence letters
                class IncorrectSequenceLetter(ValueError):
                def __init__(self, letter, class_name):
                self.letter = letter
                self.class_name = class_name
                def __str__(self):
                return f"The sequence item {self.letter} is not found in the alphabet of class {self.class_name}"
                
                # Base class for all sequence types
                class Sequence():
                alphabet = vary.protein_letters + vary.dna_letters + vary.rna_letters
                
                def __init__(self, identifier, sequence):
                self.__identifier = str(identifier)
                self.__sequence = str(sequence)
                self.validate_sequence()
                
                def validate_sequence(self):
                for letter in self.__sequence:
                if letter not in self.alphabet:
                raise IncorrectSequenceLetter(letter, self.__class__.__name__)
                
                def get_identifier(self):
                return self.__identifier
                def get_sequence(self):
                return self.__sequence
                def get_mw(self):
                if isinstance(self, ProteinSequence):
                sequence_molecular_weight = 0
                for element in self.__sequence:
                sequence_molecular_weight += vary.protein_weights[element]
                return float(sequence_molecular_weight)
                elif isinstance(self, DNASequence):
                sequence_molecular_weight = 0
                for element in self.__sequence:
                sequence_molecular_weight += vary.dna_weights[element]
                return float(sequence_molecular_weight)
                elif isinstance(self, RNASequence):
                sequence_molecular_weight = 0
                for element in self.__sequence:
                sequence_molecular_weight += vary.rna_weights[element]
                return float(sequence_molecular_weight)
                def has_subsequence(self, protein):
                if protein.__sequence in self.__sequence:
                return True
                else:
                return False
                def __len__(self):
                return len(self.__sequence)
                def __eq__(self, other):
                return self.__sequence == other.get_sequence()
                def __ne__ (self, other):
                return self.__sequence != other.get_sequence()
                def __add__(self, other):
                if not isinstance(other, self.__class__):
                raise TypeError(f"Addition not supported between instances of '{self.__class__.__name__}' and
                '{other.__class__.__name__}'")
                concatenated_identifier = self.get_identifier() + "+" + other.get_identifier()
                concatenated_sequence = self.get_sequence() + other.get_sequence()
                return self.__class__(concatenated_identifier, concatenated_sequence)
                def __getitem__ (self, i):
                return self.__sequence[i]
                def __contains__(self, string):
                return string in self.__sequence
                def __lt__(self, other):
                return self.get_mw() < other.get_mw() def __hash__(self): return hash((self.__identifier, self.__sequence)) # Class for
                    nucleotide sequences class NucleotideSequence(Sequence): alphabet=vary.dna_letters + vary.rna_letters def
                    translate(self): protein_sequence="" codons=[self.get_sequence()[i:i+3] for i in range(0, len(self.get_sequence()),
                    3)] for codon in codons: if isinstance(self, RNASequence): if codon in vary.rna_table.keys(): protein_sequence
                    +=vary.rna_table[codon] elif isinstance(self, DNASequence): if codon in vary.dna_table.keys(): protein_sequence
                    +=vary.dna_table[codon] return protein_sequence # Class for protein sequences class ProteinSequence(Sequence):
                    alphabet=vary.protein_letters pass # Class for DNA sequences class DNASequence(NucleotideSequence):
                    alphabet=vary.dna_letters def transcribe(self): return self.get_sequence().replace("T", "U" ) # Class for RNA
                    sequences class RNASequence(NucleotideSequence): def reverse_transcribe(self): reverse_transcription=""
                    intermediate_sequence="" sequence_splited=[self.get_sequence()[i:i+3] for i in range(0, len(self.get_sequence()),
                    3)] for base in sequence_splited: if base in vary.rna_table.keys(): intermediate_sequence +=vary.rna_table[base] for
                    element in intermediate_sequence: if element in vary.dna_table_back.keys(): reverse_transcription
                    +=vary.dna_table_back[element] return reverse_transcription # Generator function to iterate over FASTA files def
                    FASTA_iterator(fasta_filename, class_type=DNASequence): """ This is A Generator Function that reads a Fasta file. In each iteration, the function
                    must return an object class protein""" identifiers='' sequences='' with open (fasta_filename, 'r' ) as file: for
                    line in file: if line.startswith(">"):
                    if identifiers:
                    try:
                    yield class_type(identifier=identifiers, sequence=sequences )
                    except IncorrectSequenceLetter:
                    sys.stderr.write(f"Error: Invalid sequence found at identifier '{identifiers}'. Skipping this sequence.\n")
                    identifiers = line[1:].strip()
                    sequences = ''
                    else:
                    sequences += line.strip()
                    if identifiers:
                    try:
                    yield class_type(identifier=identifiers, sequence=sequences )
                    except IncorrectSequenceLetter:
                    sys.stderr.write(f"Error: Invalid sequence found at identifier '{identifiers}'. Skipping this sequence.\n")
                
                    # Function to convert FASTA files to protein sequences and sort them
                    def FASTA_to_protein(input_files):
                    seq_counter = 0
                    sorted_list = []
                    for input_file in input_files:
                    for seq in FASTA_iterator(input_file):
                    try:
                    if isinstance(seq, DNASequence) or isinstance(seq, RNASequence):
                    protein_sequence = seq.translate()
                    protein = ProteinSequence(identifier= seq.get_identifier(), sequence=protein_sequence)
                    sorted_list.append((protein))
                    seq_counter += 1
                    except Exception as e:
                    sys.stderr.write(f"Error processing {input_file}: {e}\n")
                    sys.stderr.write(f"{input_file} finished.\n")
                    sys.stderr.write(f"{seq_counter} sequences found.\n")
                    return sorted_list
                
                    # Function to sort and print protein sequences
                    def sort_and_print(files, output=None):
                    if len(files) == 0:
                    sys.stderr.write("No FASTA files found.\n")
                    elif len(files) == 1:
                    if files[0].endswith((".fa", ".fasta")):
                    fasta_files = files
                    sys.stderr.write(f"{len(fasta_files)} FASTA files found.\n")
                    else:
                    sys.stderr.write("1 file is being processed.\n")
                    fasta_files = []
                    else:
                    fasta_files = [file for file in files if file.endswith(".fa") or file.endswith(".fasta")]
                    sys.stderr.write(f"{len(fasta_files)} FASTA files found.\n")
                    sorted_list = FASTA_to_protein(files)
                    sys.stderr.write("Sorting the sequences...\n")
                    sorted_list.sort(key=lambda x: x.get_mw())
                
                    if output is None:
                    for seq in sorted_list:
                    print(seq.get_identifier(), len(seq.get_sequence()), "{:.2f}".format(seq.get_mw()), sep='\t')
                    else:
                    print(f"Writing to file: {output}")
                    with open(output, "w") as out:
                    for seq in sorted_list:
                    out.write(f"{seq.get_identifier()}\t{len(seq.get_sequence())}\t{seq.get_mw():.2f}\n")
                    sys.stderr.write("Sort process finished.\n")
                    sys.stderr.write("Program finished correctly.\n")
                
                    # Main entry point of the program
                    if __name__ == "__main__":
                    files = []
                    output = None
                    if len(sys.argv) == 1:
                    files = glob.glob("*.fasta") + glob.glob("*.fa")
                    elif len(sys.argv) == 2:
                    if os.path.isdir(sys.argv[1]):
                    files = glob.glob(sys.argv[1] + "/*.fasta") + glob.glob(sys.argv[1] + "/*.fa")
                    elif os.path.isfile(sys.argv[1]):
                    files = [sys.argv[1]]
                    else:
                    sys.stderr.write(f"Error: The argument {sys.argv[1]} is not a directory or a file.\n")
                    elif len(sys.argv) == 3:
                    if os.path.isdir(sys.argv[1]):
                    files = glob.glob(os.path.join(sys.argv[1], "*.fasta")) + glob.glob(os.path.join(sys.argv[1], "*.fa"))
                    else:
                    files = [sys.argv[1]]
                    output = sys.argv[2]
                    sort_and_print(files, output)