Biopython Case Studies¶
In [2]:
from Bio import SeqIO, Entrez, pairwise2
from Bio.Blast import NCBIWWW, NCBIXML
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from datetime import datetime
import pandas as pd
INPUT_FASTA = "myseq.fa"
/opt/anaconda3/lib/python3.13/site-packages/Bio/pairwise2.py:278: BiopythonDeprecationWarning: Bio.pairwise2 has been deprecated, and we intend to remove it in a future release of Biopython. As an alternative, please consider using Bio.Align.PairwiseAligner as a replacement, and contact the Biopython developers if you still need the Bio.pairwise2 module. warnings.warn(
Part I Identify Unknown Sequence via BLAST¶
Step 1: Import the Unknown Sequence into Python Running BLAST over the Internet¶
In [6]:
record = list(SeqIO.parse(INPUT_FASTA, "fasta"))[0]
print(f"Submitting BLAST for {record.id}")
Submitting BLAST for sequence_unknown
In [4]:
print(record)
ID: sequence_unknown
Name: sequence_unknown
Description: sequence_unknown
Number of features: 0
Seq('CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGT...GCC')
Step 2: Running BLAST over the Internet¶
In [16]:
result_handle = NCBIWWW.qblast("blastn", "nt", str(record.seq))
# blast_record = NCBIXML.read(result_handle)
print("BLAST completed")
BLAST completed
In [7]:
help(NCBIWWW.qblast)
Help on function qblast in module Bio.Blast.NCBIWWW:
qblast(
program,
database,
sequence,
url_base='https://blast.ncbi.nlm.nih.gov/Blast.cgi',
auto_format=None,
composition_based_statistics=None,
db_genetic_code=None,
endpoints=None,
entrez_query='(none)',
expect=10.0,
filter=None,
gapcosts=None,
genetic_code=None,
hitlist_size=50,
i_thresh=None,
layout=None,
lcase_mask=None,
matrix_name=None,
nucl_penalty=None,
nucl_reward=None,
other_advanced=None,
perc_ident=None,
phi_pattern=None,
query_file=None,
query_believe_defline=None,
query_from=None,
query_to=None,
searchsp_eff=None,
service=None,
threshold=None,
ungapped_alignment=None,
word_size=None,
short_query=None,
alignments=500,
alignment_view=None,
descriptions=500,
entrez_links_new_window=None,
expect_low=None,
expect_high=None,
format_entrez_query=None,
format_object=None,
format_type='XML',
ncbi_gi=None,
results_file=None,
show_overview=None,
megablast=None,
template_type=None,
template_length=None,
username='blast',
password=None
)
BLAST search using NCBI's QBLAST server.
Supports all parameters of the old qblast API for Put and Get.
Please note that NCBI uses the new Common URL API for BLAST searches
on the internet (https://blast.ncbi.nlm.nih.gov/doc/blast-help/urlapi.html).
Thus, some of the parameters used by this function are not (or are no longer)
officially supported by NCBI. Although they are still functioning, this
may change in the future.
Some useful parameters:
- program blastn, blastp, blastx, tblastn, or tblastx (lower case)
- database Which database to search against (e.g. "nr").
- sequence The sequence to search.
- ncbi_gi TRUE/FALSE whether to give 'gi' identifier.
- descriptions Number of descriptions to show. Def 500.
- alignments Number of alignments to show. Def 500.
- expect An expect value cutoff. Def 10.0.
- matrix_name Specify an alt. matrix (PAM30, PAM70, BLOSUM80, BLOSUM45).
- filter "none" turns off filtering. Default no filtering
- format_type "HTML", "Text", "ASN.1", or "XML". Def. "XML".
- entrez_query Entrez query to limit Blast search - only applies when searching nucleotide BLASTDBs
- hitlist_size Number of hits to return. Default 50
- megablast TRUE/FALSE whether to use MEga BLAST algorithm (blastn only)
- short_query TRUE/FALSE whether to adjust the search parameters for a
short query sequence. Note that this will override
manually set parameters like word size and e value. Turns
off when sequence length is > 30 residues. Default: None.
- service plain, psi, phi, rpsblast, megablast (lower case)
This function does no checking of the validity of the parameters
and passes the values to the server as is. More help is available at:
https://blast.ncbi.nlm.nih.gov/doc/blast-help/urlapi.html
Step 3: Import The BLAST Record (XML format)¶
In [17]:
blast_record = NCBIXML.read(result_handle)
In [18]:
blast_record
Out[18]:
<Bio.Blast.NCBIXML.Blast at 0x139398cd0>
Step 4: Parsing BLAST Output¶
In [19]:
len(blast_record.alignments)
Out[19]:
50
Best Match For DNA Sequence
In [21]:
E_VALUE_THRESH = 0.01
for alignment in blast_record.alignments:
for hsp in alignment.hsps:
if hsp.expect < E_VALUE_THRESH:
print("****Alignment****")
print("sequence:", alignment.title)
print("length:", alignment.length)
print("e value:", hsp.expect)
print(hsp.query)
print(hsp.match)
print(hsp.sbjct)
****Alignment**** sequence: gi|2695886215|gb|OR084930.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT003007, partial genome length: 18922 e value: 3.21162e-28 CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ****Alignment**** sequence: gi|2695885795|gb|OR084888.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT002129, partial genome length: 18942 e value: 3.21162e-28 CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ****Alignment**** sequence: gi|2695885865|gb|OR084895.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT002138, partial genome length: 18532 e value: 3.21162e-28 CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ****Alignment**** sequence: gi|2695885945|gb|OR084903.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT002151, partial genome length: 18532 e value: 3.21162e-28 CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ****Alignment**** sequence: gi|2695885725|gb|OR084881.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT002121, partial genome length: 18532 e value: 3.21162e-28 CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ****Alignment**** sequence: gi|2695886125|gb|OR084921.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT003018, partial genome length: 18941 e value: 3.21162e-28 CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ****Alignment**** sequence: gi|2695885745|gb|OR084883.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT002123, partial genome length: 18832 e value: 3.21162e-28 CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ****Alignment**** sequence: gi|2695885885|gb|OR084897.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT002140, partial genome length: 18832 e value: 3.21162e-28 CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ****Alignment**** sequence: gi|2695885655|gb|OR084874.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT002107, partial genome length: 18832 e value: 3.21162e-28 CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ****Alignment**** sequence: gi|2695885605|gb|OR084869.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT000796, partial genome length: 18921 e value: 3.21162e-28 CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ****Alignment**** sequence: gi|2695886145|gb|OR084923.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Monieka-RDCEQT003256, partial genome length: 18832 e value: 3.21162e-28 CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ****Alignment**** sequence: gi|2695885555|gb|OR084864.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT000492, partial genome length: 18532 e value: 3.21162e-28 CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ****Alignment**** sequence: gi|2695886115|gb|OR084920.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT003014, partial genome length: 18930 e value: 3.21162e-28 CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ****Alignment**** sequence: gi|2695885665|gb|OR084875.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT002111, partial genome length: 18832 e value: 3.21162e-28 CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ****Alignment**** sequence: gi|2695885845|gb|OR084893.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT002136, partial genome length: 18532 e value: 3.21162e-28 CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ****Alignment**** sequence: gi|2695885595|gb|OR084868.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT000617, partial genome length: 18532 e value: 3.21162e-28 CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ****Alignment**** sequence: gi|2695886015|gb|OR084910.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Bomongo-RDCEQT002163, partial genome length: 18932 e value: 3.21162e-28 CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ****Alignment**** sequence: gi|2695885575|gb|OR084866.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT000571, partial genome length: 18936 e value: 3.21162e-28 CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ****Alignment**** sequence: gi|2695885825|gb|OR084891.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT002134, partial genome length: 18532 e value: 3.21162e-28 CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ****Alignment**** sequence: gi|2695885385|gb|OR084847.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Tumba-RDCEQT000199, partial genome length: 18574 e value: 3.21162e-28 CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ****Alignment**** sequence: gi|2695885525|gb|OR084861.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT000456, partial genome length: 18832 e value: 3.21162e-28 CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ****Alignment**** sequence: gi|2695886025|gb|OR084911.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT002164, partial genome length: 18832 e value: 3.21162e-28 CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ****Alignment**** sequence: gi|2695885905|gb|OR084899.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT002144, partial genome length: 18832 e value: 3.21162e-28 CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ****Alignment**** sequence: gi|2695885695|gb|OR084878.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT002117, partial genome length: 18832 e value: 3.21162e-28 CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ****Alignment**** sequence: gi|3018473336|emb|OZ276552.1| Zaire ebolavirus isolate Ebola virus/VECTOR/C.porcellus-lab/COD/1976/Mayinga_NML genome assembly, complete genome: monopartite length: 18582 e value: 3.21162e-28 CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ****Alignment**** sequence: gi|2695886045|gb|OR084913.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/mbandaka-RDCEQT002502, partial genome length: 18832 e value: 3.21162e-28 CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ****Alignment**** sequence: gi|2695885635|gb|OR084872.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT002103, partial genome length: 18832 e value: 3.21162e-28 CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ****Alignment**** sequence: gi|2695886205|gb|OR084929.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT002148, partial genome length: 18939 e value: 3.21162e-28 CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ****Alignment**** sequence: gi|2695886095|gb|OR084918.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT003011, partial genome length: 18531 e value: 3.21162e-28 CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ****Alignment**** sequence: gi|2695885515|gb|OR084860.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT000301, partial genome length: 18532 e value: 3.21162e-28 CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ****Alignment**** sequence: gi|2695885855|gb|OR084894.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT002137, partial genome length: 18531 e value: 3.21162e-28 CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ****Alignment**** sequence: gi|2695885415|gb|OR084850.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT000010, partial genome length: 18943 e value: 3.21162e-28 CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ****Alignment**** sequence: gi|2695886195|gb|OR084928.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT000300, partial genome length: 18927 e value: 3.21162e-28 CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ****Alignment**** sequence: gi|2501520155|gb|OQ784121.1| Mutant Zaire ebolavirus rMA-EBOV-ZsGnLuc, complete genome length: 20233 e value: 3.21162e-28 CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ****Alignment**** sequence: gi|2695885435|gb|OR084852.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT000017, partial genome length: 18940 e value: 3.21162e-28 CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ****Alignment**** sequence: gi|2695885545|gb|OR084863.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT000474, partial genome length: 18531 e value: 3.21162e-28 CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ****Alignment**** sequence: gi|2695885465|gb|OR084855.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT000042, partial genome length: 18832 e value: 3.21162e-28 CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ****Alignment**** sequence: gi|2695885395|gb|OR084848.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Tumba-RDCEQT000029, partial genome length: 18287 e value: 3.21162e-28 CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ****Alignment**** sequence: gi|2695885585|gb|OR084867.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT000616, partial genome length: 18532 e value: 3.21162e-28 CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ****Alignment**** sequence: gi|2695885645|gb|OR084873.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT002106, partial genome length: 18832 e value: 3.21162e-28 CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ****Alignment**** sequence: gi|2695885925|gb|OR084901.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT002146, partial genome length: 18832 e value: 3.21162e-28 CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ****Alignment**** sequence: gi|2695886085|gb|OR084917.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT003008, partial genome length: 18532 e value: 3.21162e-28 CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ****Alignment**** sequence: gi|2695885445|gb|OR084853.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT000018, partial genome >gi|2695885485|gb|OR084857.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT000074, partial genome length: 18939 e value: 3.21162e-28 CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ****Alignment**** sequence: gi|2695885615|gb|OR084870.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT000869, partial genome length: 18532 e value: 3.21162e-28 CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ****Alignment**** sequence: gi|2695885815|gb|OR084890.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT002132, partial genome length: 18532 e value: 3.21162e-28 CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ****Alignment**** sequence: gi|2695885475|gb|OR084856.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT000046, partial genome length: 18831 e value: 3.21162e-28 CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ****Alignment**** sequence: gi|2695885935|gb|OR084902.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT002149, partial genome length: 18832 e value: 3.21162e-28 CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ****Alignment**** sequence: gi|2695886035|gb|OR084912.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT002501, partial genome length: 18832 e value: 3.21162e-28 CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ****Alignment**** sequence: gi|2501520146|gb|OQ784120.1| Mutant Zaire ebolavirus rMA-EBOV-nLuc-ZsG, complete genome length: 20302 e value: 3.21162e-28 CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ****Alignment**** sequence: gi|2695885875|gb|OR084896.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT002139, partial genome length: 18832 e value: 3.21162e-28 CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
Part II: Some other Useful Biopython Functions¶
In [2]:
from Bio.Seq import Seq
my_seq= Seq("ACGAGGACAGAT") ## create a DNA sequence record in BioPython
my_seq
Out[2]:
Seq('ACGAGGACAGAT')
In [3]:
print(my_seq)
ACGAGGACAGAT
Nucleotide Methods: complement, reverse_complement, trnaslate, ...etc¶
In [4]:
my_seq.complement()
Out[4]:
Seq('TGCTCCTGTCTA')
In [5]:
my_seq.reverse_complement()
Out[5]:
Seq('ATCTGTCCTCGT')
In [6]:
my_seq.translate()
Out[6]:
Seq('TRTD')
General Methods: find, count ...etc¶
Count¶
In [7]:
print(my_seq.count("A")) # count of A
print(my_seq.count("G")) # count of G
5 4
In [8]:
print(my_seq.count("ACG")) ##Counts non-overlapping occurrences
1
Find¶
In [9]:
print(my_seq)
position = my_seq.find("A")
print(position)
ACGAGGACAGAT 0
Part III: Motif Finding with Biopython¶
In [10]:
from Bio import motifs
from Bio.Seq import Seq
sequences = [
Seq("ATGCA"),
Seq("ATGGA"),
Seq("ATGTA"),
Seq("ATGCA"),
Seq("ATGCC")
]
motif = motifs.create(sequences)
In [11]:
motif
Out[11]:
<Bio.motifs.Motif at 0x1363667b0>
In [12]:
print(motif.counts)
0 1 2 3 4 A: 5.00 0.00 0.00 0.00 4.00 C: 0.00 0.00 0.00 3.00 1.00 G: 0.00 0.00 5.00 1.00 0.00 T: 0.00 5.00 0.00 1.00 0.00
In [13]:
ppm=motif.counts.normalize(pseudocounts=0.5)
print(ppm)
0 1 2 3 4 A: 0.79 0.07 0.07 0.07 0.64 C: 0.07 0.07 0.07 0.50 0.21 G: 0.07 0.07 0.79 0.21 0.07 T: 0.07 0.79 0.07 0.21 0.07
In [14]:
motif.weblogo("motif.png")
In [15]:
motif.pseudocounts = 0.5
motif.background = {"A": 0.25, "C": 0.25, "G": 0.25, "T": 0.25}
# Compute PSSM
pssm = motif.pssm
print(pssm)
0 1 2 3 4 A: 1.65 -1.81 -1.81 -1.81 1.36 C: -1.81 -1.81 -1.81 1.00 -0.22 G: -1.81 -1.81 1.65 -0.22 -1.81 T: -1.81 1.65 -1.81 -0.22 -1.81
In [16]:
rows = len(pssm.alphabet)
cols = pssm.length
cols
Out[16]:
5
In [17]:
# ---------- Helper: score a k-mer of length L ----------
def score_kmer(kmer, pssm_matrix):
L= pssm_matrix.length
kmer = str(kmer).upper()
if len(kmer) != L:
raise ValueError(f"k-mer length must be {L}")
total = 0.0
for i, base in enumerate(kmer):
if base not in pssm_matrix:
raise ValueError(f"Unknown base {base}")
total += pssm_matrix[base][i]
return total
In [18]:
def scan_sequence(seq, pssm_matrix, threshold=None, include_revcomp=True):
seq = str(seq).upper()
hits = []
L= pssm_matrix.length
for i in range(len(seq) - L + 1):
window = seq[i:i+L]
sc = score_kmer(window, pssm_matrix)
strand = "+"
if threshold is None or sc >= threshold:
hits.append((i, strand, window, sc))
if include_revcomp:
# score reverse complement of the window (so we find motif on both strands)
rc = str(Seq(window).reverse_complement())
sc_rc = score_kmer(rc, pssm_matrix)
if threshold is None or sc_rc >= threshold:
hits.append((i, "-", rc, sc_rc))
return hits
long_seq = "GGGATGCAAAATGGAATGTA"
hits = scan_sequence(long_seq, pssm, threshold=None)
print("\nAll scored windows (position, strand, seq, score):")
for h in hits:
print(h)
All scored windows (position, strand, seq, score): (0, '+', 'GGGAT', -5.577342991650724) (0, '-', 'ATCCC', 2.2744060497653344) (1, '+', 'GGATG', -7.451812109566864) (1, '-', 'CATCC', -4.644457187509261) (2, '+', 'GATGC', -5.866849608845708) (2, '-', 'GCATC', -5.866849608845708) (3, '+', 'ATGCA', 7.318800169123788) (3, '-', 'TGCAT', -9.03677461028802) (4, '+', 'TGCAA', -5.866849608845708) (4, '-', 'TTGCA', 3.859368550486491) (5, '+', 'GCAAA', -5.866849608845708) (5, '-', 'TTTGC', -2.407417990208411) (6, '+', 'CAAAA', -5.866849608845708) (6, '-', 'TTTTG', -3.992380490929567) (7, '+', 'AAAAT', -5.577342991650724) (7, '-', 'ATTTT', -0.5329488722922697) (8, '+', 'AAATG', -3.992380490929567) (8, '-', 'CATTT', -7.451812109566864) (9, '+', 'AATGG', -3.992380490929567) (9, '-', 'CCATT', -7.451812109566864) (10, '+', 'ATGGA', 6.096407747787341) (10, '-', 'TCCAT', -9.03677461028802) (11, '+', 'TGGAA', -2.407417990208411) (11, '-', 'TTCCA', 0.39993693184919343) (12, '+', 'GGAAT', -9.03677461028802) (12, '-', 'ATTCC', 2.2744060497653344) (13, '+', 'GAATG', -7.451812109566864) (13, '-', 'CATTC', -5.866849608845708) (14, '+', 'AATGT', -3.992380490929567) (14, '-', 'ACATT', -3.992380490929567) (15, '+', 'ATGTA', 6.096407747787341) (15, '-', 'TACAT', -9.03677461028802)
In [19]:
# ---------- Useful diagnostics: max/min possible score ----------
alphabet=["A", "C", "G", "T"]
L= pssm.length
max_score = sum(max(pssm[b][i] for b in alphabet) for i in range(L))
min_score = sum(min(pssm[b][i] for b in alphabet) for i in range(L))
print("\nMax possible score:", round(max_score,6))
print("Min possible score:", round(min_score,6))
Max possible score: 7.3188 Min possible score: -9.036775