Friday, September 12, 2025

The right way to Create a Bioinformatics AI Agent Utilizing Biopython for DNA and Protein Evaluation

Share


class BioPythonAIAgent:
   def __init__(self, e mail="[email protected]"):
       self.e mail = e mail
       Entrez.e mail = e mail
       self.sequences = {}
       self.analysis_results = {}
       self.alignments = {}
       self.bushes = {}
  
   def fetch_sequence_from_ncbi(self, accession_id, db="nucleotide", rettype="fasta"):
       strive:
           deal with = Entrez.efetch(db=db, id=accession_id, rettype=rettype, retmode="textual content")
           file = SeqIO.learn(deal with, "fasta")
           deal with.shut()
           self.sequences[accession_id] = file
           return file
       besides Exception as e:
           print(f"Error fetching sequence: {str(e)}")
           return None
  
   def create_sample_sequences(self):
       covid_spike = "MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFFSNVTWFHAIHVSGTNGTKRFDNPVLPFNDGVYFASTEKSNIIRGWIFGTTLDSKTQSLLIVNNATNVVIKVCEFQFCNDPFLGVYYHKNNKSWMESEFRVYSSANNCTFEYVSQPFLMDLEGKQGNFKNLREFVFKNIDGYFKIYSKHTPINLVRDLPQGFSALEPLVDLPIGINITRFQTLLALHRSYLTPGDSSSGWTAGAAAYYVGYLQPRTFLLKYNENGTITDAVDCALDPLSETKCTLKSFTVEKGIYQTSNFRVQPTESIVRFPNITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFSTFKCYGVSPTKLNDLCFTNVYADSFVIRGDEVRQIAPGQTGKIADYNYKLPDDFTGCVIAWNSNNLDSKVGGNYNYLYRLFRKSNLKPFERDISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVLSFELLHAPATVCGPKKSTNLVKNKCVNFNFNGLTGTGVLTESNKKFLPFQQFGRDIADTTDAVRDPQTLEILDITPCSFGGVSVITPGTNTSNQVAVLYQDVNCTEVPVAIHADQLTPTWRVYSTGSNVFQTRAGCLIGAEHVNNSYECDIPIGAGICASYQTQTNSPRRARSVASQSIIAYTMSLGAENSVAYSNNSIAIPTNFTISVTTEILPVSMTKTSVDCTMYICGDSTECSNLLLQYGSFCTQLNRALTGIAVEQDKNTQEVFAQVKQIYKTPPIKDFGGFNFSQILPDPSKPSKRSFIEDLLFNKVTLADAGFIKQYGDCLGDIAARDLICAQKFNGLTVLPPLLTDEMIAQYTSALLAGTITSGWTFGAGAALQIPFAMQMAYRFNGIGVTQNVLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVVNQNAQALNTLVKQLSSNFGAISSVLNDILSRLDKVEAEVQIDRLITGRLQSLQTYVTQQLIRAAEIRASANLAATKMSECVLGQSKRVDFCGKGYHLMSFPQSAPHGVVFLHVTYVPAQEKNFTTAPAICHDGKAHFPREGVFVSNGTHWFVTQRNFYEPQIITTDNTFVSGNCDVVIGIVNNTVYDPLQPELDSFKEELDKYFKNHTSPDVDLGDISGINASVVNIQKEIDRLNEVAKNLNESLIDLQELGKYEQYIKWPWYIWLGFIAGLIAIVMVTIMLCCMTSCCSCLKGCCSCGSCCKFDEDDSEPVLKGVKLHYT"
      
       human_insulin = "MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN"
      
       e_coli_16s = "AAATTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAACGGTAACAGGAAGCAGCTTGCTGCTTTGCTGACGAGTGGCGGACGGGTGAGTAATGTCTGGGAAACTGCCTGATGGAGGGGGATAACTACTGGAAACGGTAGCTAATACCGCATAATGTCGCAAGACCAAAGAGGGGGACCTTCGGGCCTCTTGCCATCGGATGTGCCCAGATGGGATTAGCTAGTAGGTGGGGTAACGGCTCACCTAGGCGACGATCCCTAGCTGGTCTGAGAGGATGACCAGCCACACTGGAACTGAGACACGGTCCAGACTCCTACGGGAGGCAGCAGTGGGGAATATTGCACAATGGGCGCAAGCCTGATGCAGCCATGCCGCGTGTATGAAGAAGGCCTTCGGGTTGTAAAGTACTTTCAGCGGGGAGGAAGGCGTTAAGGTTAATAACCTTGGCGATTGACGTTACCCGCAGAAGAAGCACCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTCTGTCAAGTCGGATGTGAAATCCCCGGGCTCAACCTGGGAACTGCATCTGATACTGGCAAGCTTGAGTCTCGTAGAGGGGGGTAGAATTCCAGGTGTAGCGGTGAAATGCGTAGAGATCTGGAGGAATACCGGTGGCGAAGGCGGCCCCCTGGACAAAGACTGACGCTCAGGTGCGAAAGCGTGGGGAGCAAACA"
      
       sample_sequences = [
           ("COVID_Spike", covid_spike, "SARS-CoV-2 Spike Protein"),
           ("Human_Insulin", human_insulin, "Human Insulin Precursor"),
           ("E_coli_16S", e_coli_16s, "E. coli 16S rRNA")
       ]
      
       for seq_id, seq_str, desc in sample_sequences:
           file = SeqRecord(Seq(seq_str), id=seq_id, description=desc)
           self.sequences[seq_id] = file
      
       return sample_sequences
  
   def analyze_sequence(self, sequence_id=None, sequence=None):
       if sequence_id and sequence_id in self.sequences:
           seq_record = self.sequences[sequence_id]
           seq = seq_record.seq
           description = seq_record.description
       elif sequence:
           seq = Seq(sequence)
           description = "Customized sequence"
       else:
           return None
      
       evaluation = {
           'size': len(seq),
           'composition': {}
       }
      
       for base in ['A', 'T', 'G', 'C']:
           evaluation['composition'][base] = seq.rely(base)
      
       if 'A' in evaluation['composition'] and 'T' in evaluation['composition']:
           evaluation['gc_content'] = spherical(gc_fraction(seq) * 100, 2)
           strive:
               evaluation['molecular_weight'] = spherical(molecular_weight(seq, seq_type="DNA"), 2)
           besides:
               evaluation['molecular_weight'] = len(seq) * 650
      
       strive:
           if len(seq) % 3 == 0:
               protein = seq.translate()
               evaluation['translation'] = str(protein)
               evaluation['stop_codons'] = protein.rely('*')
              
               if '*' not in str(protein)[:-1]:
                   prot_analysis = ProteinAnalysis(str(protein)[:-1])
                   evaluation['protein_mw'] = spherical(prot_analysis.molecular_weight(), 2)
                   evaluation['isoelectric_point'] = spherical(prot_analysis.isoelectric_point(), 2)
                   evaluation['protein_composition'] = prot_analysis.get_amino_acids_percent()
       besides:
           go
      
       key = sequence_id if sequence_id else "customized"
       self.analysis_results[key] = evaluation
      
       return evaluation
  
   def visualize_composition(self, sequence_id):
       if sequence_id not in self.analysis_results:
           return
      
       evaluation = self.analysis_results[sequence_id]
      
       fig = make_subplots(
           rows=2, cols=2,
           specs=[[{"type": "pie"}, {"type": "bar"}],
                  [{"colspan": 2}, None]],
           subplot_titles=("Nucleotide Composition", "Base Depend", "Sequence Properties")
       )
      
       labels = record(evaluation['composition'].keys())
       values = record(evaluation['composition'].values())
      
       fig.add_trace(
           go.Pie(labels=labels, values=values, identify="Composition"),
           row=1, col=1
       )
      
       fig.add_trace(
           go.Bar(x=labels, y=values, identify="Depend", marker_color=['red', 'blue', 'green', 'orange']),
           row=1, col=2
       )
      
       properties = ['Length', 'GC%', 'MW (kDa)']
       prop_values = [
           analysis['length'],
           evaluation.get('gc_content', 0),
           evaluation.get('molecular_weight', 0) / 1000
       ]
      
       fig.add_trace(
           go.Scatter(x=properties, y=prop_values, mode="markers+traces",
                     marker=dict(dimension=10, shade="purple"), identify="Properties"),
           row=2, col=1
       )
      
       fig.update_layout(
           title=f"Complete Evaluation: {sequence_id}",
           showlegend=False,
           peak=600
       )
      
       fig.present()
  
   def perform_multiple_sequence_alignment(self, sequence_ids):
       if len(sequence_ids) < 2:
           return None
      
       sequences = []
       for seq_id in sequence_ids:
           if seq_id in self.sequences:
               sequences.append(self.sequences[seq_id])
      
       if len(sequences) < 2:
           return None
      
       from Bio.Align import PairwiseAligner
       aligner = PairwiseAligner()
       aligner.match_score = 2
       aligner.mismatch_score = -1
       aligner.open_gap_score = -2
       aligner.extend_gap_score = -0.5
      
       alignments = []
       for i in vary(len(sequences)):
           for j in vary(i+1, len(sequences)):
               alignment = aligner.align(sequences[i].seq, sequences[j].seq)[0]
               alignments.append(alignment)
      
       return alignments
  
   def create_phylogenetic_tree(self, alignment_key=None, sequences=None):
       if alignment_key and alignment_key in self.alignments:
           alignment = self.alignments[alignment_key]
       elif sequences:
           information = []
           for i, seq in enumerate(sequences):
               file = SeqRecord(Seq(seq), id=f"seq_{i}")
               information.append(file)
           SeqIO.write(information, "temp.fasta", "fasta")
          
           strive:
               clustalw_cline = ClustalwCommandline("clustalw2", infile="temp.fasta")
               stdout, stderr = clustalw_cline()
               alignment = AlignIO.learn("temp.aln", "clustal")
               os.take away("temp.fasta")
               os.take away("temp.aln")
               os.take away("temp.dnd")
           besides:
               return None
       else:
           return None
      
       calculator = DistanceCalculator('identification')
       dm = calculator.get_distance(alignment)
      
       constructor = DistanceTreeConstructor()
       tree = constructor.upgma(dm)
      
       tree_key = f"tree_{len(self.bushes)}"
       self.bushes[tree_key] = tree
      
       return tree
  
   def visualize_tree(self, tree):
       fig, ax = plt.subplots(figsize=(10, 6))
       Phylo.draw(tree, axes=ax)
       plt.title("Phylogenetic Tree")
       plt.tight_layout()
       plt.present()
  
   def protein_structure_analysis(self, sequence_id):
       if sequence_id not in self.sequences:
           return None
      
       seq = self.sequences[sequence_id].seq
      
       strive:
           if len(seq) % 3 == 0:
               protein = seq.translate()
               if '*' not in str(protein)[:-1]:
                   prot_analysis = ProteinAnalysis(str(protein)[:-1])
                  
                   structure_analysis = {
                       'molecular_weight': prot_analysis.molecular_weight(),
                       'isoelectric_point': prot_analysis.isoelectric_point(),
                       'amino_acid_percent': prot_analysis.get_amino_acids_percent(),
                       'secondary_structure': prot_analysis.secondary_structure_fraction(),
                       'flexibility': prot_analysis.flexibility(),
                       'gravy': prot_analysis.gravy()
                   }
                  
                   return structure_analysis
       besides:
           go
      
       return None
  
   def comparative_analysis(self, sequence_ids):
       outcomes = []
      
       for seq_id in sequence_ids:
           if seq_id in self.analysis_results:
               evaluation = self.analysis_results[seq_id].copy()
               evaluation['sequence_id'] = seq_id
               outcomes.append(evaluation)
      
       df = pd.DataFrame(outcomes)
      
       if len(df) > 1:
           fig = make_subplots(
               rows=2, cols=2,
               subplot_titles=("Size Comparability", "GC Content material", "Molecular Weight", "Composition Heatmap")
           )
          
           fig.add_trace(
               go.Bar(x=df['sequence_id'], y=df['length'], identify="Size"),
               row=1, col=1
           )
          
           if 'gc_content' in df.columns:
               fig.add_trace(
                   go.Scatter(x=df['sequence_id'], y=df['gc_content'], mode="markers+traces", identify="GC%"),
                   row=1, col=2
               )
          
           if 'molecular_weight' in df.columns:
               fig.add_trace(
                   go.Bar(x=df['sequence_id'], y=df['molecular_weight'], identify="MW"),
                   row=2, col=1
               )
          
           fig.update_layout(title="Comparative Sequence Evaluation", peak=600)
           fig.present()
      
       return df
  
   def codon_usage_analysis(self, sequence_id):
       if sequence_id not in self.sequences:
           return None
      
       seq = self.sequences[sequence_id].seq
      
       if len(seq) % 3 != 0:
           return None
      
       codons = {}
       for i in vary(0, len(seq) - 2, 3):
           codon = str(seq[i:i+3])
           codons[codon] = codons.get(codon, 0) + 1
      
       codon_df = pd.DataFrame(record(codons.objects()), columns=['Codon', 'Count'])
       codon_df = codon_df.sort_values('Depend', ascending=False)
      
       fig = px.bar(codon_df.head(20), x='Codon', y='Depend',
                    title=f"High 20 Codon Utilization - {sequence_id}")
       fig.present()
      
       return codon_df
  
   def motif_search(self, sequence_id, motif_pattern):
       if sequence_id not in self.sequences:
           return []
      
       seq = str(self.sequences[sequence_id].seq)
       positions = []
      
       for i in vary(len(seq) - len(motif_pattern) + 1):
           if seq[i:i+len(motif_pattern)] == motif_pattern:
               positions.append(i)
      
       return positions
  
   def gc_content_window(self, sequence_id, window_size=100):
       if sequence_id not in self.sequences:
           return None
      
       seq = self.sequences[sequence_id].seq
       gc_values = []
       positions = []
      
       for i in vary(0, len(seq) - window_size + 1, window_size//4):
           window = seq[i:i+window_size]
           gc_values.append(gc_fraction(window) * 100)
           positions.append(i + window_size//2)
      
       fig = go.Determine()
       fig.add_trace(go.Scatter(x=positions, y=gc_values, mode="traces+markers",
                               identify=f'GC Content material (window={window_size})'))
       fig.update_layout(
           title=f"GC Content material Sliding Window Evaluation - {sequence_id}",
           xaxis_title="Place",
           yaxis_title="GC Content material (%)"
       )
       fig.present()
      
       return positions, gc_values
  
   def run_comprehensive_analysis(self, sequence_ids):
       outcomes = {}
      
       for seq_id in sequence_ids:
           if seq_id in self.sequences:
               evaluation = self.analyze_sequence(seq_id)
               self.visualize_composition(seq_id)
              
               gc_analysis = self.gc_content_window(seq_id)
               codon_analysis = self.codon_usage_analysis(seq_id)
              
               outcomes[seq_id] = {
                   'basic_analysis': evaluation,
                   'gc_window': gc_analysis,
                   'codon_usage': codon_analysis
               }
      
       if len(sequence_ids) > 1:
           comparative_df = self.comparative_analysis(sequence_ids)
           outcomes['comparative'] = comparative_df
      
       return outcomes



Source link

Read more

Read More