Python Code Showcase

This code snippet is part of a bioinformatics script designed to process sequence data, validate it, and perform various operations on sequences. Here's a summary of its functionalities in 10 points:

  1. Imports and Custom Exceptions: The script starts by importing necessary modules and defining a custom exception class IncorrectSequenceLetter for handling invalid sequence letters.
  2. Base Sequence Class: Defines a Sequence class that serves as a base for all sequence types, with methods for validation, accessing identifier and sequence, calculating molecular weight, checking for subsequence, and defining equality and concatenation operations.
  3. Nucleotide and Protein Sequence Classes: Introduces specialized classes for nucleotide sequences (NucleotideSequence) and protein sequences (ProteinSequence), with the former having additional methods for translation and the latter serving as a straightforward extension of the Sequence class.
  4. DNA and RNA Sequence Classes: Further specializes NucleotideSequence into DNASequence and RNASequence classes, each with methods relevant to their type, such as transcribe for DNA and reverse_transcribe for RNA.
  5. FASTA File Processing: Implements a generator function FASTA_iterator that iterates over sequences in a FASTA file, yielding sequence objects of a specified class.
  6. Sequence Translation and Sorting: Provides functionality to convert DNA or RNA sequences to protein sequences and sort them based on their molecular weight through the FASTA_to_protein function.
  7. Error Handling: Includes error handling for incorrect sequence letters and other exceptions during sequence processing and translation.
  8. Command-line Interface: Uses the sys.argv mechanism to accept command-line arguments for input files and output options, allowing the script to operate in a flexible manner based on user input.
  9. File and Directory Handling: Employs glob and os modules to search for FASTA files in specified directories or use default locations if no arguments are provided.
  10. Main Functionality: The script culminates in a sort_and_print function that sorts protein sequences by molecular weight and either prints them to the console or writes them to a specified output file, completing the script's primary objective of processing and organizing sequence data.

Python Code


                # Importing necessary modules
                import sequence_dictionaries as vary
                import sys
                import glob
                import os
                
                # Custom exception class for incorrect sequence letters
                class IncorrectSequenceLetter(ValueError):
                def __init__(self, letter, class_name):
                self.letter = letter
                self.class_name = class_name
                def __str__(self):
                return f"The sequence item {self.letter} is not found in the alphabet of class {self.class_name}"
                
                # Base class for all sequence types
                class Sequence():
                alphabet = vary.protein_letters + vary.dna_letters + vary.rna_letters
                
                def __init__(self, identifier, sequence):
                self.__identifier = str(identifier)
                self.__sequence = str(sequence)
                self.validate_sequence()
                
                def validate_sequence(self):
                for letter in self.__sequence:
                if letter not in self.alphabet:
                raise IncorrectSequenceLetter(letter, self.__class__.__name__)
                
                def get_identifier(self):
                return self.__identifier
                def get_sequence(self):
                return self.__sequence
                def get_mw(self):
                if isinstance(self, ProteinSequence):
                sequence_molecular_weight = 0
                for element in self.__sequence:
                sequence_molecular_weight += vary.protein_weights[element]
                return float(sequence_molecular_weight)
                elif isinstance(self, DNASequence):
                sequence_molecular_weight = 0
                for element in self.__sequence:
                sequence_molecular_weight += vary.dna_weights[element]
                return float(sequence_molecular_weight)
                elif isinstance(self, RNASequence):
                sequence_molecular_weight = 0
                for element in self.__sequence:
                sequence_molecular_weight += vary.rna_weights[element]
                return float(sequence_molecular_weight)
                def has_subsequence(self, protein):
                if protein.__sequence in self.__sequence:
                return True
                else:
                return False
                def __len__(self):
                return len(self.__sequence)
                def __eq__(self, other):
                return self.__sequence == other.get_sequence()
                def __ne__ (self, other):
                return self.__sequence != other.get_sequence()
                def __add__(self, other):
                if not isinstance(other, self.__class__):
                raise TypeError(f"Addition not supported between instances of '{self.__class__.__name__}' and
                '{other.__class__.__name__}'")
                concatenated_identifier = self.get_identifier() + "+" + other.get_identifier()
                concatenated_sequence = self.get_sequence() + other.get_sequence()
                return self.__class__(concatenated_identifier, concatenated_sequence)
                def __getitem__ (self, i):
                return self.__sequence[i]
                def __contains__(self, string):
                return string in self.__sequence
                def __lt__(self, other):
                return self.get_mw() < other.get_mw() def __hash__(self): return hash((self.__identifier, self.__sequence)) # Class for
                    nucleotide sequences class NucleotideSequence(Sequence): alphabet=vary.dna_letters + vary.rna_letters def
                    translate(self): protein_sequence="" codons=[self.get_sequence()[i:i+3] for i in range(0, len(self.get_sequence()),
                    3)] for codon in codons: if isinstance(self, RNASequence): if codon in vary.rna_table.keys(): protein_sequence
                    +=vary.rna_table[codon] elif isinstance(self, DNASequence): if codon in vary.dna_table.keys(): protein_sequence
                    +=vary.dna_table[codon] return protein_sequence # Class for protein sequences class ProteinSequence(Sequence):
                    alphabet=vary.protein_letters pass # Class for DNA sequences class DNASequence(NucleotideSequence):
                    alphabet=vary.dna_letters def transcribe(self): return self.get_sequence().replace("T", "U" ) # Class for RNA
                    sequences class RNASequence(NucleotideSequence): def reverse_transcribe(self): reverse_transcription=""
                    intermediate_sequence="" sequence_splited=[self.get_sequence()[i:i+3] for i in range(0, len(self.get_sequence()),
                    3)] for base in sequence_splited: if base in vary.rna_table.keys(): intermediate_sequence +=vary.rna_table[base] for
                    element in intermediate_sequence: if element in vary.dna_table_back.keys(): reverse_transcription
                    +=vary.dna_table_back[element] return reverse_transcription # Generator function to iterate over FASTA files def
                    FASTA_iterator(fasta_filename, class_type=DNASequence): """ This is A Generator Function that reads a Fasta file. In each iteration, the function
                    must return an object class protein""" identifiers='' sequences='' with open (fasta_filename, 'r' ) as file: for
                    line in file: if line.startswith(">"):
                    if identifiers:
                    try:
                    yield class_type(identifier=identifiers, sequence=sequences )
                    except IncorrectSequenceLetter:
                    sys.stderr.write(f"Error: Invalid sequence found at identifier '{identifiers}'. Skipping this sequence.\n")
                    identifiers = line[1:].strip()
                    sequences = ''
                    else:
                    sequences += line.strip()
                    if identifiers:
                    try:
                    yield class_type(identifier=identifiers, sequence=sequences )
                    except IncorrectSequenceLetter:
                    sys.stderr.write(f"Error: Invalid sequence found at identifier '{identifiers}'. Skipping this sequence.\n")
                
                    # Function to convert FASTA files to protein sequences and sort them
                    def FASTA_to_protein(input_files):
                    seq_counter = 0
                    sorted_list = []
                    for input_file in input_files:
                    for seq in FASTA_iterator(input_file):
                    try:
                    if isinstance(seq, DNASequence) or isinstance(seq, RNASequence):
                    protein_sequence = seq.translate()
                    protein = ProteinSequence(identifier= seq.get_identifier(), sequence=protein_sequence)
                    sorted_list.append((protein))
                    seq_counter += 1
                    except Exception as e:
                    sys.stderr.write(f"Error processing {input_file}: {e}\n")
                    sys.stderr.write(f"{input_file} finished.\n")
                    sys.stderr.write(f"{seq_counter} sequences found.\n")
                    return sorted_list
                
                    # Function to sort and print protein sequences
                    def sort_and_print(files, output=None):
                    if len(files) == 0:
                    sys.stderr.write("No FASTA files found.\n")
                    elif len(files) == 1:
                    if files[0].endswith((".fa", ".fasta")):
                    fasta_files = files
                    sys.stderr.write(f"{len(fasta_files)} FASTA files found.\n")
                    else:
                    sys.stderr.write("1 file is being processed.\n")
                    fasta_files = []
                    else:
                    fasta_files = [file for file in files if file.endswith(".fa") or file.endswith(".fasta")]
                    sys.stderr.write(f"{len(fasta_files)} FASTA files found.\n")
                    sorted_list = FASTA_to_protein(files)
                    sys.stderr.write("Sorting the sequences...\n")
                    sorted_list.sort(key=lambda x: x.get_mw())
                
                    if output is None:
                    for seq in sorted_list:
                    print(seq.get_identifier(), len(seq.get_sequence()), "{:.2f}".format(seq.get_mw()), sep='\t')
                    else:
                    print(f"Writing to file: {output}")
                    with open(output, "w") as out:
                    for seq in sorted_list:
                    out.write(f"{seq.get_identifier()}\t{len(seq.get_sequence())}\t{seq.get_mw():.2f}\n")
                    sys.stderr.write("Sort process finished.\n")
                    sys.stderr.write("Program finished correctly.\n")
                
                    # Main entry point of the program
                    if __name__ == "__main__":
                    files = []
                    output = None
                    if len(sys.argv) == 1:
                    files = glob.glob("*.fasta") + glob.glob("*.fa")
                    elif len(sys.argv) == 2:
                    if os.path.isdir(sys.argv[1]):
                    files = glob.glob(sys.argv[1] + "/*.fasta") + glob.glob(sys.argv[1] + "/*.fa")
                    elif os.path.isfile(sys.argv[1]):
                    files = [sys.argv[1]]
                    else:
                    sys.stderr.write(f"Error: The argument {sys.argv[1]} is not a directory or a file.\n")
                    elif len(sys.argv) == 3:
                    if os.path.isdir(sys.argv[1]):
                    files = glob.glob(os.path.join(sys.argv[1], "*.fasta")) + glob.glob(os.path.join(sys.argv[1], "*.fa"))
                    else:
                    files = [sys.argv[1]]
                    output = sys.argv[2]
                    sort_and_print(files, output)