Python Programming 4¶
Conditionals and Loops¶
If Else¶
In [2]:
DNA_segment = 'ATGACATGA'
codon1 = DNA_segment[0:3]
codon1
Out[2]:
'ATG'
In [3]:
# == operator tests the equality of two strings, resulting in True/False
if (codon1 == 'ATG'):
print('Codon', codon1, 'is a start codon.')
else:
print('Codon', codon1, 'is not a start codon.')
print('Done!')
Codon ATG is a start codon. Done!
If Elif¶
In [4]:
DNA_segment = 'ATGACATGACCAATC'
codon1 = DNA_segment[-3:]
# == operator tests the equality of two strings, resulting in True/False
if (codon1 == 'ATG'):
print('Codon', codon1, 'is a start codon.')
elif ((codon1 == 'TAA') or
(codon1 == 'TAG') or
(codon1 == 'TGA')):
print('Codon', codon1, 'is a stop codon.')
else:
print('Codon', codon1, 'is neither a start nor a stop codon.')
print('Done!')
Codon ATC is neither a start nor a stop codon. Done!
While¶
In [5]:
DNA_seq = 'CGGACACACAAAAAGAATGAAGGATTTTGAATCTTTATTGTGTGCGAGTAACTACGAGGAAGATTAAAGA'
print('DNA sequence:', DNA_seq)
bp = 'T'
print('Base pair:', bp)
print('str.count():', DNA_seq.count(bp))
count = 0
index = 0
while index < len(DNA_seq):
if bp == DNA_seq[index]:
count += 1
index += 1
print('Our while count T:', count)
DNA sequence: CGGACACACAAAAAGAATGAAGGATTTTGAATCTTTATTGTGTGCGAGTAACTACGAGGAAGATTAAAGA Base pair: T str.count(): 17 Our while count T: 17
Indentation matters!
In [7]:
DNA_seq = 'CGGACACACAAAAAGAATGAAGGATTTTGAATCTTTATTGTGTGCGAGTAACTACGAGGAAGATTAAAGA'
print('DNA sequence:', DNA_seq)
bp = 'T'
print('Base pair:', bp)
print('str.count():', DNA_seq.count(bp))
count = 0
index = 0
while index < len(DNA_seq):
if bp == DNA_seq[index]:
count += 1
index += 1
print('Our while count T:', count)
DNA sequence: CGGACACACAAAAAGAATGAAGGATTTTGAATCTTTATTGTGTGCGAGTAACTACGAGGAAGATTAAAGA Base pair: T str.count(): 17 Our while count T: 0 Our while count T: 0 Our while count T: 0 Our while count T: 0 Our while count T: 0 Our while count T: 0 Our while count T: 0 Our while count T: 0 Our while count T: 0 Our while count T: 0 Our while count T: 0 Our while count T: 0 Our while count T: 0 Our while count T: 0 Our while count T: 0 Our while count T: 0 Our while count T: 0 Our while count T: 1 Our while count T: 1 Our while count T: 1 Our while count T: 1 Our while count T: 1 Our while count T: 1 Our while count T: 1 Our while count T: 2 Our while count T: 3 Our while count T: 4 Our while count T: 5 Our while count T: 5 Our while count T: 5 Our while count T: 5 Our while count T: 6 Our while count T: 6 Our while count T: 7 Our while count T: 8 Our while count T: 9 Our while count T: 9 Our while count T: 10 Our while count T: 11 Our while count T: 11 Our while count T: 12 Our while count T: 12 Our while count T: 13 Our while count T: 13 Our while count T: 13 Our while count T: 13 Our while count T: 13 Our while count T: 13 Our while count T: 14 Our while count T: 14 Our while count T: 14 Our while count T: 14 Our while count T: 15 Our while count T: 15 Our while count T: 15 Our while count T: 15 Our while count T: 15 Our while count T: 15 Our while count T: 15 Our while count T: 15 Our while count T: 15 Our while count T: 15 Our while count T: 15 Our while count T: 16 Our while count T: 17 Our while count T: 17 Our while count T: 17 Our while count T: 17 Our while count T: 17 Our while count T: 17
For Loops¶
In [8]:
# Save nucleotide bases in a list
bases = ['T', 'C', 'A', 'G']
# As a warmup, print the list of bases
for base in bases:
print(base)
print('Codons starting with T:')
for second_base in bases:
print('Codons starting with T'+second_base)
for third_base in bases:
print('T'+second_base+third_base)
T C A G Codons starting with T: Codons starting with TT TTT TTC TTA TTG Codons starting with TC TCT TCC TCA TCG Codons starting with TA TAT TAC TAA TAG Codons starting with TG TGT TGC TGA TGG
In [9]:
for i in range(0,8):
print(i)
0 1 2 3 4 5 6 7
In [13]:
days = ['Day ' + str(i) for i in range(0,20)]
days
Out[13]:
['Day 0', 'Day 1', 'Day 2', 'Day 3', 'Day 4', 'Day 5', 'Day 6', 'Day 7', 'Day 8', 'Day 9', 'Day 10', 'Day 11', 'Day 12', 'Day 13', 'Day 14', 'Day 15', 'Day 16', 'Day 17', 'Day 18', 'Day 19']
In [12]:
n_samples = 50
samples = [f"S{i+1:02d}" for i in range(n_samples)]
print(samples)
['S01', 'S02', 'S03', 'S04', 'S05', 'S06', 'S07', 'S08', 'S09', 'S10', 'S11', 'S12', 'S13', 'S14', 'S15', 'S16', 'S17', 'S18', 'S19', 'S20', 'S21', 'S22', 'S23', 'S24', 'S25', 'S26', 'S27', 'S28', 'S29', 'S30', 'S31', 'S32', 'S33', 'S34', 'S35', 'S36', 'S37', 'S38', 'S39', 'S40', 'S41', 'S42', 'S43', 'S44', 'S45', 'S46', 'S47', 'S48', 'S49', 'S50']
In [30]:
import numpy as np
x = np.array([1,4,7,2,3])
val = 0
for i in range(x.size):
#val = val + x[i]
val += x[i]
print(val)
17
How about calculating sample variance $$ S^2=\frac{1}{n-1}\sum_{i=1}^{n}(X_i - \overline{X_n})^2 $$
In [18]:
x = np.array([1,4,7,2,3])
def sampvar(x):
n = x.size
sumx = 0
for i in range(n):
sumx += x[i]
xbar = sumx / n
sum2 = 0
for i in range(n):
sum2 += (x[i] - xbar)**2
s2 = sum2/(n-1)
return s2
print(sampvar(x))
print(np.var(x))
5.300000000000001 4.24
In [19]:
%timeit sampvar(x)
1.57 μs ± 31.1 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
In [20]:
%timeit np.var(x)
7.13 μs ± 29.2 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
For large arrays, NumPy becomes orders of magnitude faster
Make a plot with multiple functions on it, with a loop
In [21]:
x = np.random.rand(1_000_000)
%timeit sampvar(x)
%timeit np.var(x)
214 ms ± 1.18 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) 1.23 ms ± 93.3 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
Making multiple plots with a for loop
In [32]:
x = np.linspace(-6, 6, 1000)
# ---- Plot multiple functions on one axes ----
plt.figure(figsize=(10, 6))
for name, fn in activations:
y = fn(x)
plt.plot(x, y, label=name) # default matplotlib colors are used
# aesthetics
plt.axhline(0, linewidth=0.6)
plt.axvline(0, linewidth=0.6)
plt.title("Comparison of Activation Functions")
plt.xlabel("Input (x)")
plt.ylabel("Output")
plt.legend(loc="upper left", ncol=2, fontsize="small", framealpha=0.9)
plt.grid(True, linestyle="--", alpha=0.25)
# set y-limit a bit larger so saturating functions are visible
plt.ylim(-2.0, 6.0)
plt.tight_layout()
plt.show()
In [33]:
import numpy as np
import matplotlib.pyplot as plt
# ---- Activation functions ----
def linear(x):
return x
def hard_step(x):
# Heaviside / step at 0
return np.where(x >= 0, 1.0, 0.0)
def sigmoid(x):
return 1.0 / (1.0 + np.exp(-x))
def tanh(x):
return np.tanh(x)
def relu(x):
return np.maximum(0, x)
def leaky_relu(x, negative_slope=0.01):
return np.where(x >= 0, x, negative_slope * x)
def elu(x, alpha=1.0):
return np.where(x >= 0, x, alpha * (np.exp(x) - 1.0))
def swish(x, beta=1.0):
return x * (1.0 / (1.0 + np.exp(-beta * x)))
def gelu_approx(x):
# Approximate GELU (Hendrycks & Gimpel)
return 0.5 * x * (1.0 + np.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * x**3)))
# ---- Prepare data ----
x = np.linspace(-5.0, 5.0, 1000)
activations = [
("Linear", linear),
("Step (Heaviside)", hard_step),
("Sigmoid", sigmoid),
("Tanh", tanh),
("ReLU", relu),
("Leaky ReLU", lambda z: leaky_relu(z, negative_slope=0.1)),
("ELU (α=1.0)", elu),
("Swish (β=1.0)", swish),
("GELU (approx)", gelu_approx),
]
# ---- Plot grid ----
fig, axes = plt.subplots(3, 3, figsize=(12, 10))
axes = axes.flatten()
for ax, (name, fn) in zip(axes, activations):
y = fn(x)
ax.plot(x, y) # do not set explicit colors/styles
ax.axhline(0, linewidth=0.6) # horizontal zero line
ax.axvline(0, linewidth=0.6) # vertical zero line
ax.set_title(name, fontsize=12)
ax.set_xlim(x.min(), x.max())
# set y-limits to give a reasonable view for each activation
# for sigmoids/tanh keep tight bounds; for others allow the function to show
if name in ("Sigmoid", "Step (Heaviside)"):
ax.set_ylim(-0.1, 1.1)
elif name in ("Tanh"):
ax.set_ylim(-1.1, 1.1)
else:
ax.set_ylim(-2.5, 5.0)
ax.grid(True, linestyle='--', alpha=0.25)
plt.tight_layout()
plt.suptitle("Popular Activation Functions", fontsize=16, y=1.02)
plt.subplots_adjust(top=0.92) # make room for suptitle
# Save or show
plt.show()
# plt.savefig("activations_grid.png", dpi=150, bbox_inches="tight")
Combining function and loops
In [5]:
a=range(1, 8)
for i in a:
print(i)
1 2 3 4 5 6 7
In [6]:
DNA_seq = 'gggtgcgacgattcattgttttcggacaagtggataggcaaccactaccggtggattgtc'
print('Sequence:', DNA_seq)
# Count the number of codons
DNA_length = len(DNA_seq)
number_of_codons = int(DNA_length/3)
print(number_of_codons)
Sequence: gggtgcgacgattcattgttttcggacaagtggataggcaaccactaccggtggattgtc 20
In [8]:
codon_list = []
for i in range(number_of_codons):
codon_list.append(DNA_seq[i*3:i*3 + 3])
print('List of codons:', codon_list)
print()
List of codons: ['ggg', 'tgc', 'gac', 'gat', 'tca', 'ttg', 'ttt', 'tcg', 'gac', 'aag', 'tgg', 'ata', 'ggc', 'aac', 'cac', 'tac', 'cgg', 'tgg', 'att', 'gtc']
In [13]:
def createCodon(seq, k=3):
number_of_codons = int(len(seq)/k)
codon_list = []
for i in range(number_of_codons):
codon_list.append(DNA_seq[i*k:(i*k + k)])
return(codon_list)
createCodon(DNA_seq, k=4)
Out[13]:
['gggt', 'gcga', 'cgat', 'tcat', 'tgtt', 'ttcg', 'gaca', 'agtg', 'gata', 'ggca', 'acca', 'ctac', 'cggt', 'ggat', 'tgtc']
In [16]:
def codon_counter(seq, k=3):
codon_list=createCodon(seq, k)
codon_counter={}
for codon in codon_list:
if codon not in codon_counter:
codon_counter[codon]=1
else:
codon_counter[codon]+=1
return(codon_counter)
codon_counter(DNA_seq, k=4)
Out[16]:
{'ggg': 1,
'tgc': 1,
'gac': 2,
'gat': 1,
'tca': 1,
'ttg': 1,
'ttt': 1,
'tcg': 1,
'aag': 1,
'tgg': 2,
'ata': 1,
'ggc': 1,
'aac': 1,
'cac': 1,
'tac': 1,
'cgg': 1,
'att': 1,
'gtc': 1}
Minimizing a function with gradient descent¶
\begin{eqnarray*} f(x) = (x-3)^2 \\ \nabla f(x_{} = 2(x-3). \\ \end{eqnarray*} Make updates $$ x_{n+1} = x_{n} - \eta \nabla f(x_{n}), $$ where $\eta$ is a hyper-parameter called learning rate.
In [51]:
import numpy as np
import matplotlib.pyplot as plt
# objective: f(x) = (x - 3)^2
# gradient: f'(x) = 2*(x - 3)
def f(x):
return (x - 3.0)**2
def grad_f(x):
return 2.0 * (x - 3.0)
# hyperparams
lr = 0.1
n_steps = 80
x = 0.0 # initial guess
for step in range(1, n_steps + 1):
g = grad_f(x)
x = x - lr * g
if step % 10 == 0 or step == 1:
print(f"step {step:2d}: x = {x:.6f}, f(x) = {f(x):.6e}")
print("Converged approx to x =", x)
xmin=x
xmin
step 1: x = 0.600000, f(x) = 5.760000e+00 step 10: x = 2.677877, f(x) = 1.037629e-01 step 20: x = 2.965412, f(x) = 1.196305e-03 step 30: x = 2.996286, f(x) = 1.379246e-05 step 40: x = 2.999601, f(x) = 1.590162e-07 step 50: x = 2.999957, f(x) = 1.833332e-09 step 60: x = 2.999995, f(x) = 2.113688e-11 step 70: x = 3.000000, f(x) = 2.436917e-13 step 80: x = 3.000000, f(x) = 2.809574e-15 Converged approx to x = 2.999999946994588
Out[51]:
2.999999946994588
In [27]:
x=np.linspace(0,6, 200)
fx=(x - 3)**2
fig, ax= plt.subplots(figsize=(5,3))
ax.plot(x, fx)
ax.axvline(xmin)
plt.show()
Modules and Packages¶
A module is simply a Python file (.py) that contains code — variables, functions, classes — that can be imported and used in another Python file.¶
A file named math_utilis.py
In [ ]:
def add(a, b):
return a + b
def sub(a, b):
return a - b
In another file
In [35]:
import math_utils
print(math_utils.add(3, 5)) # Output: 8
8
A package is a folder that contains multiple modules. It must include a special file named __init__.py (even if it's empty).¶
Installing External Modules (Packages)
% pip
In [43]:
import requests,
res = requests.get("https://api.github.com")
print(res.status_code)
200
In [47]:
import requests
import pandas as pd
import io
url = "https://people.sc.fsu.edu/~jburkardt/data/csv/airtravel.csv"
# Step 1: Download the file
response = requests.get(url)
response.raise_for_status() # will raise an error if the download failed
# Step 2: Load CSV content into pandas
csv_text = response.text
df = pd.read_csv(io.StringIO(csv_text))
# Step 3: Show the DataFrame
print(df)
Month "1958" "1959" "1960" 0 JAN 340 360 417 1 FEB 318 342 391 2 MAR 362 406 419 3 APR 348 396 461 4 MAY 363 420 472 5 JUN 435 472 535 6 JUL 491 548 622 7 AUG 505 559 606 8 SEP 404 463 508 9 OCT 359 407 461 10 NOV 310 362 390 11 DEC 337 405 432
In [50]:
url = "https://raw.githubusercontent.com/bioconnector/workshops/master/data/airway_scaledcounts.csv"
df2 = pd.read_csv(url)
df2.head()
Out[50]:
| ensgene | SRR1039508 | SRR1039509 | SRR1039512 | SRR1039513 | SRR1039516 | SRR1039517 | SRR1039520 | SRR1039521 | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | ENSG00000000003 | 723.0 | 486.0 | 904.0 | 445.0 | 1170.0 | 1097.0 | 806.0 | 604.0 |
| 1 | ENSG00000000005 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | ENSG00000000419 | 467.0 | 523.0 | 616.0 | 371.0 | 582.0 | 781.0 | 417.0 | 509.0 |
| 3 | ENSG00000000457 | 347.0 | 258.0 | 364.0 | 237.0 | 318.0 | 447.0 | 330.0 | 324.0 |
| 4 | ENSG00000000460 | 96.0 | 81.0 | 73.0 | 66.0 | 118.0 | 94.0 | 102.0 | 74.0 |
In [ ]: