Biopython Case Studies¶

Prerequisites¶

Install required packages:

pip install biopython pandas
In [2]:
from Bio import SeqIO, Entrez, pairwise2
from Bio.Blast import NCBIWWW, NCBIXML
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from datetime import datetime

import pandas as pd

INPUT_FASTA = "myseq.fa"
/opt/anaconda3/lib/python3.13/site-packages/Bio/pairwise2.py:278: BiopythonDeprecationWarning: Bio.pairwise2 has been deprecated, and we intend to remove it in a future release of Biopython. As an alternative, please consider using Bio.Align.PairwiseAligner as a replacement, and contact the Biopython developers if you still need the Bio.pairwise2 module.
  warnings.warn(

Part I Identify Unknown Sequence via BLAST¶

Step 1: Import the Unknown Sequence into Python Running BLAST over the Internet¶

In [6]:
record = list(SeqIO.parse(INPUT_FASTA, "fasta"))[0]
print(f"Submitting BLAST for {record.id}")
Submitting BLAST for sequence_unknown
In [4]:
print(record)
ID: sequence_unknown
Name: sequence_unknown
Description: sequence_unknown
Number of features: 0
Seq('CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGT...GCC')

Step 2: Running BLAST over the Internet¶

In [16]:
result_handle = NCBIWWW.qblast("blastn", "nt", str(record.seq))
# blast_record = NCBIXML.read(result_handle)
print("BLAST completed")
BLAST completed
In [7]:
help(NCBIWWW.qblast)
Help on function qblast in module Bio.Blast.NCBIWWW:

qblast(
    program,
    database,
    sequence,
    url_base='https://blast.ncbi.nlm.nih.gov/Blast.cgi',
    auto_format=None,
    composition_based_statistics=None,
    db_genetic_code=None,
    endpoints=None,
    entrez_query='(none)',
    expect=10.0,
    filter=None,
    gapcosts=None,
    genetic_code=None,
    hitlist_size=50,
    i_thresh=None,
    layout=None,
    lcase_mask=None,
    matrix_name=None,
    nucl_penalty=None,
    nucl_reward=None,
    other_advanced=None,
    perc_ident=None,
    phi_pattern=None,
    query_file=None,
    query_believe_defline=None,
    query_from=None,
    query_to=None,
    searchsp_eff=None,
    service=None,
    threshold=None,
    ungapped_alignment=None,
    word_size=None,
    short_query=None,
    alignments=500,
    alignment_view=None,
    descriptions=500,
    entrez_links_new_window=None,
    expect_low=None,
    expect_high=None,
    format_entrez_query=None,
    format_object=None,
    format_type='XML',
    ncbi_gi=None,
    results_file=None,
    show_overview=None,
    megablast=None,
    template_type=None,
    template_length=None,
    username='blast',
    password=None
)
    BLAST search using NCBI's QBLAST server.

    Supports all parameters of the old qblast API for Put and Get.

    Please note that NCBI uses the new Common URL API for BLAST searches
    on the internet (https://blast.ncbi.nlm.nih.gov/doc/blast-help/urlapi.html).
    Thus, some of the parameters used by this function are not (or are no longer)
    officially supported by NCBI. Although they are still functioning, this
    may change in the future.

    Some useful parameters:

     - program        blastn, blastp, blastx, tblastn, or tblastx (lower case)
     - database       Which database to search against (e.g. "nr").
     - sequence       The sequence to search.
     - ncbi_gi        TRUE/FALSE whether to give 'gi' identifier.
     - descriptions   Number of descriptions to show.  Def 500.
     - alignments     Number of alignments to show.  Def 500.
     - expect         An expect value cutoff.  Def 10.0.
     - matrix_name    Specify an alt. matrix (PAM30, PAM70, BLOSUM80, BLOSUM45).
     - filter         "none" turns off filtering.  Default no filtering
     - format_type    "HTML", "Text", "ASN.1", or "XML".  Def. "XML".
     - entrez_query   Entrez query to limit Blast search - only applies when searching nucleotide BLASTDBs
     - hitlist_size   Number of hits to return. Default 50
     - megablast      TRUE/FALSE whether to use MEga BLAST algorithm (blastn only)
     - short_query    TRUE/FALSE whether to adjust the search parameters for a
                      short query sequence. Note that this will override
                      manually set parameters like word size and e value. Turns
                      off when sequence length is > 30 residues. Default: None.
     - service        plain, psi, phi, rpsblast, megablast (lower case)

    This function does no checking of the validity of the parameters
    and passes the values to the server as is.  More help is available at:
    https://blast.ncbi.nlm.nih.gov/doc/blast-help/urlapi.html

Step 3: Import The BLAST Record (XML format)¶

In [17]:
blast_record = NCBIXML.read(result_handle)
In [18]:
blast_record
Out[18]:
<Bio.Blast.NCBIXML.Blast at 0x139398cd0>

BlastR.png

Step 4: Parsing BLAST Output¶

In [19]:
len(blast_record.alignments)
Out[19]:
50

Best Match For DNA Sequence

In [21]:
E_VALUE_THRESH = 0.01

for alignment in blast_record.alignments:
    for hsp in alignment.hsps:
        if hsp.expect < E_VALUE_THRESH:
            print("****Alignment****")
            print("sequence:", alignment.title)
            print("length:", alignment.length)
            print("e value:", hsp.expect)
            print(hsp.query)
            print(hsp.match)
            print(hsp.sbjct)
****Alignment****
sequence: gi|2695886215|gb|OR084930.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT003007, partial genome
length: 18922
e value: 3.21162e-28
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
****Alignment****
sequence: gi|2695885795|gb|OR084888.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT002129, partial genome
length: 18942
e value: 3.21162e-28
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
****Alignment****
sequence: gi|2695885865|gb|OR084895.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT002138, partial genome
length: 18532
e value: 3.21162e-28
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
****Alignment****
sequence: gi|2695885945|gb|OR084903.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT002151, partial genome
length: 18532
e value: 3.21162e-28
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
****Alignment****
sequence: gi|2695885725|gb|OR084881.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT002121, partial genome
length: 18532
e value: 3.21162e-28
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
****Alignment****
sequence: gi|2695886125|gb|OR084921.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT003018, partial genome
length: 18941
e value: 3.21162e-28
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
****Alignment****
sequence: gi|2695885745|gb|OR084883.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT002123, partial genome
length: 18832
e value: 3.21162e-28
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
****Alignment****
sequence: gi|2695885885|gb|OR084897.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT002140, partial genome
length: 18832
e value: 3.21162e-28
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
****Alignment****
sequence: gi|2695885655|gb|OR084874.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT002107, partial genome
length: 18832
e value: 3.21162e-28
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
****Alignment****
sequence: gi|2695885605|gb|OR084869.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT000796, partial genome
length: 18921
e value: 3.21162e-28
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
****Alignment****
sequence: gi|2695886145|gb|OR084923.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Monieka-RDCEQT003256, partial genome
length: 18832
e value: 3.21162e-28
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
****Alignment****
sequence: gi|2695885555|gb|OR084864.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT000492, partial genome
length: 18532
e value: 3.21162e-28
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
****Alignment****
sequence: gi|2695886115|gb|OR084920.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT003014, partial genome
length: 18930
e value: 3.21162e-28
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
****Alignment****
sequence: gi|2695885665|gb|OR084875.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT002111, partial genome
length: 18832
e value: 3.21162e-28
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
****Alignment****
sequence: gi|2695885845|gb|OR084893.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT002136, partial genome
length: 18532
e value: 3.21162e-28
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
****Alignment****
sequence: gi|2695885595|gb|OR084868.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT000617, partial genome
length: 18532
e value: 3.21162e-28
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
****Alignment****
sequence: gi|2695886015|gb|OR084910.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Bomongo-RDCEQT002163, partial genome
length: 18932
e value: 3.21162e-28
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
****Alignment****
sequence: gi|2695885575|gb|OR084866.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT000571, partial genome
length: 18936
e value: 3.21162e-28
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
****Alignment****
sequence: gi|2695885825|gb|OR084891.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT002134, partial genome
length: 18532
e value: 3.21162e-28
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
****Alignment****
sequence: gi|2695885385|gb|OR084847.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Tumba-RDCEQT000199, partial genome
length: 18574
e value: 3.21162e-28
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
****Alignment****
sequence: gi|2695885525|gb|OR084861.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT000456, partial genome
length: 18832
e value: 3.21162e-28
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
****Alignment****
sequence: gi|2695886025|gb|OR084911.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT002164, partial genome
length: 18832
e value: 3.21162e-28
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
****Alignment****
sequence: gi|2695885905|gb|OR084899.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT002144, partial genome
length: 18832
e value: 3.21162e-28
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
****Alignment****
sequence: gi|2695885695|gb|OR084878.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT002117, partial genome
length: 18832
e value: 3.21162e-28
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
****Alignment****
sequence: gi|3018473336|emb|OZ276552.1| Zaire ebolavirus isolate Ebola virus/VECTOR/C.porcellus-lab/COD/1976/Mayinga_NML genome assembly, complete genome: monopartite
length: 18582
e value: 3.21162e-28
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
****Alignment****
sequence: gi|2695886045|gb|OR084913.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/mbandaka-RDCEQT002502, partial genome
length: 18832
e value: 3.21162e-28
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
****Alignment****
sequence: gi|2695885635|gb|OR084872.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT002103, partial genome
length: 18832
e value: 3.21162e-28
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
****Alignment****
sequence: gi|2695886205|gb|OR084929.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT002148, partial genome
length: 18939
e value: 3.21162e-28
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
****Alignment****
sequence: gi|2695886095|gb|OR084918.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT003011, partial genome
length: 18531
e value: 3.21162e-28
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
****Alignment****
sequence: gi|2695885515|gb|OR084860.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT000301, partial genome
length: 18532
e value: 3.21162e-28
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
****Alignment****
sequence: gi|2695885855|gb|OR084894.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT002137, partial genome
length: 18531
e value: 3.21162e-28
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
****Alignment****
sequence: gi|2695885415|gb|OR084850.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT000010, partial genome
length: 18943
e value: 3.21162e-28
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
****Alignment****
sequence: gi|2695886195|gb|OR084928.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT000300, partial genome
length: 18927
e value: 3.21162e-28
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
****Alignment****
sequence: gi|2501520155|gb|OQ784121.1| Mutant Zaire ebolavirus rMA-EBOV-ZsGnLuc, complete genome
length: 20233
e value: 3.21162e-28
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
****Alignment****
sequence: gi|2695885435|gb|OR084852.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT000017, partial genome
length: 18940
e value: 3.21162e-28
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
****Alignment****
sequence: gi|2695885545|gb|OR084863.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT000474, partial genome
length: 18531
e value: 3.21162e-28
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
****Alignment****
sequence: gi|2695885465|gb|OR084855.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT000042, partial genome
length: 18832
e value: 3.21162e-28
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
****Alignment****
sequence: gi|2695885395|gb|OR084848.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Tumba-RDCEQT000029, partial genome
length: 18287
e value: 3.21162e-28
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
****Alignment****
sequence: gi|2695885585|gb|OR084867.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT000616, partial genome
length: 18532
e value: 3.21162e-28
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
****Alignment****
sequence: gi|2695885645|gb|OR084873.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT002106, partial genome
length: 18832
e value: 3.21162e-28
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
****Alignment****
sequence: gi|2695885925|gb|OR084901.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT002146, partial genome
length: 18832
e value: 3.21162e-28
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
****Alignment****
sequence: gi|2695886085|gb|OR084917.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT003008, partial genome
length: 18532
e value: 3.21162e-28
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
****Alignment****
sequence: gi|2695885445|gb|OR084853.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT000018, partial genome >gi|2695885485|gb|OR084857.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT000074, partial genome
length: 18939
e value: 3.21162e-28
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
****Alignment****
sequence: gi|2695885615|gb|OR084870.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT000869, partial genome
length: 18532
e value: 3.21162e-28
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
****Alignment****
sequence: gi|2695885815|gb|OR084890.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT002132, partial genome
length: 18532
e value: 3.21162e-28
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
****Alignment****
sequence: gi|2695885475|gb|OR084856.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT000046, partial genome
length: 18831
e value: 3.21162e-28
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
****Alignment****
sequence: gi|2695885935|gb|OR084902.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT002149, partial genome
length: 18832
e value: 3.21162e-28
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
****Alignment****
sequence: gi|2695886035|gb|OR084912.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT002501, partial genome
length: 18832
e value: 3.21162e-28
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
****Alignment****
sequence: gi|2501520146|gb|OQ784120.1| Mutant Zaire ebolavirus rMA-EBOV-nLuc-ZsG, complete genome
length: 20302
e value: 3.21162e-28
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
****Alignment****
sequence: gi|2695885875|gb|OR084896.1| Zaire ebolavirus isolate Ebolavirus/H.sapiens-wt/COD/2020/Mbandaka-RDCEQT002139, partial genome
length: 18832
e value: 3.21162e-28
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
CATGCTACGGTGCTAAAAGCATTACGCCCTATAGTGATTTTCGAGACATACTGTGTTTTTAAATATAGTATTGCC

Part II: Some other Useful Biopython Functions¶

In [2]:
from Bio.Seq import Seq
my_seq= Seq("ACGAGGACAGAT") ## create a DNA sequence record in BioPython
my_seq
Out[2]:
Seq('ACGAGGACAGAT')
In [3]:
print(my_seq)
ACGAGGACAGAT

Nucleotide Methods: complement, reverse_complement, trnaslate, ...etc¶

In [4]:
my_seq.complement()
Out[4]:
Seq('TGCTCCTGTCTA')
In [5]:
my_seq.reverse_complement()
Out[5]:
Seq('ATCTGTCCTCGT')
In [6]:
my_seq.translate()
Out[6]:
Seq('TRTD')

General Methods: find, count ...etc¶

Count¶

In [7]:
print(my_seq.count("A"))  # count of A
print(my_seq.count("G"))  # count of G
5
4
In [8]:
print(my_seq.count("ACG"))  ##Counts non-overlapping occurrences
1

Find¶

In [9]:
print(my_seq)
position = my_seq.find("A")
print(position)
ACGAGGACAGAT
0

Part III: Motif Finding with Biopython¶

In [10]:
from Bio import motifs
from Bio.Seq import Seq

sequences = [
    Seq("ATGCA"),
    Seq("ATGGA"),
    Seq("ATGTA"),
    Seq("ATGCA"),
    Seq("ATGCC")
]

motif = motifs.create(sequences)
In [11]:
motif
Out[11]:
<Bio.motifs.Motif at 0x1363667b0>
In [12]:
print(motif.counts)
        0      1      2      3      4
A:   5.00   0.00   0.00   0.00   4.00
C:   0.00   0.00   0.00   3.00   1.00
G:   0.00   0.00   5.00   1.00   0.00
T:   0.00   5.00   0.00   1.00   0.00

In [13]:
ppm=motif.counts.normalize(pseudocounts=0.5)
print(ppm)
        0      1      2      3      4
A:   0.79   0.07   0.07   0.07   0.64
C:   0.07   0.07   0.07   0.50   0.21
G:   0.07   0.07   0.79   0.21   0.07
T:   0.07   0.79   0.07   0.21   0.07

In [14]:
motif.weblogo("motif.png")

motif.png

In [15]:
motif.pseudocounts = 0.5
motif.background = {"A": 0.25, "C": 0.25, "G": 0.25, "T": 0.25}

# Compute PSSM
pssm = motif.pssm
print(pssm)
        0      1      2      3      4
A:   1.65  -1.81  -1.81  -1.81   1.36
C:  -1.81  -1.81  -1.81   1.00  -0.22
G:  -1.81  -1.81   1.65  -0.22  -1.81
T:  -1.81   1.65  -1.81  -0.22  -1.81

In [16]:
rows = len(pssm.alphabet)
cols = pssm.length
cols
Out[16]:
5
In [17]:
# ---------- Helper: score a k-mer of length L ----------
def score_kmer(kmer, pssm_matrix):
    L= pssm_matrix.length
    kmer = str(kmer).upper()
    if len(kmer) != L:
        raise ValueError(f"k-mer length must be {L}")
    total = 0.0
    for i, base in enumerate(kmer):
        if base not in pssm_matrix:
            raise ValueError(f"Unknown base {base}")
        total += pssm_matrix[base][i]
    return total
In [18]:
def scan_sequence(seq, pssm_matrix, threshold=None, include_revcomp=True):
    seq = str(seq).upper()
    hits = []
    L= pssm_matrix.length
    for i in range(len(seq) - L + 1):
        window = seq[i:i+L]
        sc = score_kmer(window, pssm_matrix)
        strand = "+"
        if threshold is None or sc >= threshold:
            hits.append((i, strand, window, sc))
        if include_revcomp:
            # score reverse complement of the window (so we find motif on both strands)
            rc = str(Seq(window).reverse_complement())
            sc_rc = score_kmer(rc, pssm_matrix)
            if threshold is None or sc_rc >= threshold:
                hits.append((i, "-", rc, sc_rc))
    return hits

long_seq = "GGGATGCAAAATGGAATGTA"
hits = scan_sequence(long_seq, pssm, threshold=None)
print("\nAll scored windows (position, strand, seq, score):")
for h in hits:
    print(h)
All scored windows (position, strand, seq, score):
(0, '+', 'GGGAT', -5.577342991650724)
(0, '-', 'ATCCC', 2.2744060497653344)
(1, '+', 'GGATG', -7.451812109566864)
(1, '-', 'CATCC', -4.644457187509261)
(2, '+', 'GATGC', -5.866849608845708)
(2, '-', 'GCATC', -5.866849608845708)
(3, '+', 'ATGCA', 7.318800169123788)
(3, '-', 'TGCAT', -9.03677461028802)
(4, '+', 'TGCAA', -5.866849608845708)
(4, '-', 'TTGCA', 3.859368550486491)
(5, '+', 'GCAAA', -5.866849608845708)
(5, '-', 'TTTGC', -2.407417990208411)
(6, '+', 'CAAAA', -5.866849608845708)
(6, '-', 'TTTTG', -3.992380490929567)
(7, '+', 'AAAAT', -5.577342991650724)
(7, '-', 'ATTTT', -0.5329488722922697)
(8, '+', 'AAATG', -3.992380490929567)
(8, '-', 'CATTT', -7.451812109566864)
(9, '+', 'AATGG', -3.992380490929567)
(9, '-', 'CCATT', -7.451812109566864)
(10, '+', 'ATGGA', 6.096407747787341)
(10, '-', 'TCCAT', -9.03677461028802)
(11, '+', 'TGGAA', -2.407417990208411)
(11, '-', 'TTCCA', 0.39993693184919343)
(12, '+', 'GGAAT', -9.03677461028802)
(12, '-', 'ATTCC', 2.2744060497653344)
(13, '+', 'GAATG', -7.451812109566864)
(13, '-', 'CATTC', -5.866849608845708)
(14, '+', 'AATGT', -3.992380490929567)
(14, '-', 'ACATT', -3.992380490929567)
(15, '+', 'ATGTA', 6.096407747787341)
(15, '-', 'TACAT', -9.03677461028802)
In [19]:
# ---------- Useful diagnostics: max/min possible score ----------
alphabet=["A", "C", "G", "T"]
L= pssm.length
max_score = sum(max(pssm[b][i] for b in alphabet) for i in range(L))
min_score = sum(min(pssm[b][i] for b in alphabet) for i in range(L))
print("\nMax possible score:", round(max_score,6))
print("Min possible score:", round(min_score,6))
Max possible score: 7.3188
Min possible score: -9.036775