import numpy as np
import matplotlib.pyplot as plt
import random
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve, auc
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers, losses, callbacks

np.random.seed(42)
random.seed(42)

# Simple one-hot encoding utilities
BASES = ['A','C','G','T']
_base_to_idx = {b:i for i,b in enumerate(BASES)}

def one_hot_encode_seqs(seqs):
    """Encode a list of uppercase DNA sequences (A,C,G,T) to shape (N, L, 4)."""
    N = len(seqs)
    L = len(seqs[0])
    arr = np.zeros((N, L, 4), dtype=np.float32)
    for i,s in enumerate(seqs):
        for j,ch in enumerate(s):
            arr[i,j,_base_to_idx.get(ch,'A')] = 1.0
    return arr


def seqs_from_onehot(X):
    # convert back to seqs by argmax
    seqs = []
    for i in range(X.shape[0]):
        seq = ''.join(BASES[int(np.argmax(X[i,j]))] for j in range(X.shape[1]))
        seqs.append(seq)
    return seqs

# Synthetic data generator

def generate_synthetic_dataset(num_pos=2000, num_neg=2000, seq_len=101, motif='ACGTG', motif_prob=1.0, random_seed=42):
    """Generate synthetic sequences.
    - num_pos: number of positive examples
    - num_neg: number of negative examples
    - seq_len: length of each sequence
    - motif: motif string to implant in positives
    - motif_prob: probability that a positive contains the motif (allows noisy labels)
    Returns: (seqs, labels)
    """
    rng = np.random.RandomState(random_seed)
    def random_seq(L):
        return ''.join(rng.choice(BASES) for _ in range(L))

    pos_seqs = []
    for _ in range(num_pos):
        s = list(random_seq(seq_len))
        if rng.rand() <= motif_prob:
            # implant motif at random position so it fits
            k = len(motif)
            start = rng.randint(0, seq_len - k + 1)
            s[start:start+k] = list(motif)
        pos_seqs.append(''.join(s))

    neg_seqs = [random_seq(seq_len) for _ in range(num_neg)]

    seqs = pos_seqs + neg_seqs
    labels = np.array([1]*len(pos_seqs) + [0]*len(neg_seqs), dtype=np.int32)

    # shuffle
    perm = rng.permutation(len(seqs))
    seqs = [seqs[i] for i in perm]
    labels = labels[perm]
    return seqs, labels

# Quick test
seqs, labels = generate_synthetic_dataset(num_pos=10, num_neg=10)
print('Example positive seq:', seqs[0], 'label', labels[0])

Example positive seq: TCCCTGCTCACCGTGGGGGCGGTACCCGGGTAGATCGAAGCCCTAAATATCGAACGTGCCGTTATGCAACTCTCGTGACAAAACACCGTTCGCCCGTGAGG label 0

# Prepare dataset

SEQ_LEN = 101
MOTIF = 'ACGTG'

seqs, labels = generate_synthetic_dataset(num_pos=4000, num_neg=4000, seq_len=SEQ_LEN, motif=MOTIF, motif_prob=1.0)

# Train/val/test split
N = len(seqs)
train_end = int(0.8*N)
val_end = int(0.9*N)

train_seqs = seqs[:train_end]
train_y = labels[:train_end]
val_seqs = seqs[train_end:val_end]
val_y = labels[train_end:val_end]
test_seqs = seqs[val_end:]
test_y = labels[val_end:]

X_train = one_hot_encode_seqs(train_seqs)
X_val = one_hot_encode_seqs(val_seqs)
X_test = one_hot_encode_seqs(test_seqs)

print('Shapes:', X_train.shape, X_val.shape, X_test.shape)
print('Train positive fraction:', np.mean(train_y))

Shapes: (6400, 101, 4) (800, 101, 4) (800, 101, 4)
Train positive fraction: 0.49453125

def build_deepbind_like_model(input_length, num_filters=16, kernel_size=11, lr=1e-3):
    inp = layers.Input(shape=(input_length, 4))
    x = layers.Conv1D(filters=num_filters, kernel_size=kernel_size, padding='valid', activation='relu')(inp)
    x = layers.GlobalMaxPooling1D()(x)
    x = layers.Dense(32, activation='relu')(x)
    out = layers.Dense(1, activation='sigmoid')(x)
    model = models.Model(inputs=inp, outputs=out)
    model.compile(optimizer=optimizers.Adam(lr), loss=losses.BinaryCrossentropy(), metrics=[])
    return model

model = build_deepbind_like_model(SEQ_LEN, num_filters=16, kernel_size=len(MOTIF))
model.summary()

2026-03-16 13:07:54.642968: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1
2026-03-16 13:07:54.643000: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2026-03-16 13:07:54.643004: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2026-03-16 13:07:54.643037: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2026-03-16 13:07:54.643046: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)

Model: "functional"

┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃ Layer (type)                    ┃ Output Shape           ┃       Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
│ input_layer (InputLayer)        │ (None, 101, 4)         │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ conv1d (Conv1D)                 │ (None, 97, 16)         │           336 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ global_max_pooling1d            │ (None, 16)             │             0 │
│ (GlobalMaxPooling1D)            │                        │               │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense (Dense)                   │ (None, 32)             │           544 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_1 (Dense)                 │ (None, 1)              │            33 │
└─────────────────────────────────┴────────────────────────┴───────────────┘

 Total params: 913 (3.57 KB)

 Trainable params: 913 (3.57 KB)

 Non-trainable params: 0 (0.00 B)

EPOCHS = 10
BATCH_SIZE = 128

es = callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(X_train, train_y, validation_data=(X_val, val_y), epochs=EPOCHS, batch_size=BATCH_SIZE, callbacks=[es])

# Evaluate on test set
preds = model.predict(X_test).ravel()
roc = roc_auc_score(test_y, preds)
precision, recall, _ = precision_recall_curve(test_y, preds)
pr_auc = auc(recall, precision)
print(f'Test ROC AUC: {roc:.4f}, PR AUC: {pr_auc:.4f}')

# Plot training curves
plt.figure(figsize=(6,3))
plt.plot(history.history['loss'], label='train loss')
plt.plot(history.history['val_loss'], label='val loss')
plt.legend()
plt.title('Loss curves')
plt.show()

Epoch 1/10

2026-03-16 13:12:32.204019: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.

50/50 ━━━━━━━━━━━━━━━━━━━━ 1s 11ms/step - loss: 0.6900 - val_loss: 0.6821
Epoch 2/10
50/50 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step - loss: 0.6821 - val_loss: 0.6689
Epoch 3/10
50/50 ━━━━━━━━━━━━━━━━━━━━ 1s 10ms/step - loss: 0.6613 - val_loss: 0.6297
Epoch 4/10
50/50 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step - loss: 0.6187 - val_loss: 0.5682
Epoch 5/10
50/50 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step - loss: 0.5447 - val_loss: 0.4923
Epoch 6/10
50/50 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step - loss: 0.4765 - val_loss: 0.4326
Epoch 7/10
50/50 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step - loss: 0.4146 - val_loss: 0.4051
Epoch 8/10
50/50 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step - loss: 0.3919 - val_loss: 0.3862
Epoch 9/10
50/50 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step - loss: 0.3703 - val_loss: 0.3688
Epoch 10/10
50/50 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step - loss: 0.3574 - val_loss: 0.3569
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step 
Test ROC AUC: 0.9047, PR AUC: 0.8651

import heapq
from collections import Counter

def get_conv_layer_output(model, X, layer_index=1):
    # Return activations from conv layer for input X (no pooling)
    conv_model = models.Model(inputs=model.input, outputs=model.layers[layer_index].output)
    return conv_model.predict(X)

conv_out_train = get_conv_layer_output(model, X_train, layer_index=1)
print('Conv output shape (train):', conv_out_train.shape)

# For each filter, find windows with top activations across the dataset
KERNEL = model.layers[1].kernel_size[0]
NUM_TOP = max(50, int(0.01 * X_train.shape[0]))  # top 1% or at least 50

# We'll scan convolution outputs (before pooling) and record the positions

filter_kmers = {f: [] for f in range(conv_out_train.shape[-1])}

for i in range(X_train.shape[0]):
    # conv_out_train[i] shape: (L_out, num_filters)
    for f in range(conv_out_train.shape[-1]):
        # find the max activation position for filter f in this sequence
        # we'll collect top-N across sequences later
        pass

# More systematic approach: for each filter, keep a heap of top activations with (score, seq_index, pos)
heaps = {f: [] for f in range(conv_out_train.shape[-1])}

for i in range(X_train.shape[0]):
    # conv_out_train[i] has shape (L_out, num_filters)
    arr = conv_out_train[i]  # numpy array
    L_out = arr.shape[0]
    for pos in range(L_out):
        for f in range(arr.shape[1]):
            score = arr[pos, f]
            heap = heaps[f]
            if len(heap) < NUM_TOP:
                heapq.heappush(heap, (score, i, pos))
            else:
                if score > heap[0][0]:
                    heapq.heapreplace(heap, (score, i, pos))

# Extract k-mers
for f, heap in heaps.items():
    top = sorted(heap, reverse=True)
    kmers = []
    for (score, i, pos) in top:
        # map pos in conv output to original sequence coordinate
        # For 'valid' conv, pos corresponds to start index in original sequence
        start = pos
        kmer = train_seqs[i][start:start+KERNEL]
        kmers.append(kmer)
    filter_kmers[f] = kmers

# Quick check for filter 0
for f in range(min(4, len(filter_kmers))):
    print(f'Filter {f}, top kmers (first 10):', filter_kmers[f][:10])

200/200 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step
Conv output shape (train): (6400, 97, 16)
Filter 0, top kmers (first 10): ['AGCTG', 'AGCTG', 'AGCTG', 'AGCTG', 'AGCTG', 'AGCTG', 'AGCTG', 'AGCTG', 'AGCTG', 'AGCTG']
Filter 1, top kmers (first 10): ['AGGCC', 'AGGCC', 'AGGCC', 'AGGCC', 'AGGCC', 'AGGCC', 'AGGCC', 'AGGCC', 'AGGCC', 'AGGCC']
Filter 2, top kmers (first 10): ['TTAAT', 'TTAAT', 'TTAAT', 'TTAAT', 'TTAAT', 'TTAAT', 'TTAAT', 'TTAAT', 'TTAAT', 'TTAAT']
Filter 3, top kmers (first 10): ['AACGT', 'AACGT', 'AACGT', 'AACGT', 'AACGT', 'AACGT', 'AACGT', 'AACGT', 'AACGT', 'AACGT']

def build_pwm(kmers):
    k = len(kmers[0])
    pfm = np.zeros((k,4), dtype=float)
    for s in kmers:
        for i,ch in enumerate(s):
            pfm[i,_base_to_idx[ch]] += 1
    # add pseudocount
    pfm += 1e-6
    pwm = pfm / pfm.sum(axis=1, keepdims=True)
    return pwm

import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

def plot_pwm_heatmap(pwm, title="PWM Heatmap"):
    
    df = pd.DataFrame(pwm, columns=BASES)

    plt.figure(figsize=(max(6, pwm.shape[0]/2), 3))
    
    sns.heatmap(
        df.T,                 # transpose for nicer layout
        cmap="viridis",
        vmin=0,
        vmax=1,
        cbar=True
    )

    plt.xlabel("Position")
    plt.ylabel("Base")
    plt.title(title)
    plt.show()

for f in range(min(4, len(filter_kmers))):
    pwm = build_pwm(filter_kmers[f])
    plot_pwm_heatmap(pwm, title=f"Filter {f} PWM")

Homework 4: DeepBind-style 1D CNN for Motif Discovery (Synthetic Dataset)¶

(Due: Monday April 06, 2026 before class)¶

Structure¶

Submit the completed notebook and a short PDF report with your answers and interpretations. There are 6 Questions.¶

1. Setup: imports and utility functions¶

2. Synthetic dataset generation¶

3. Prepare data for Keras¶

4. Model starter code (1D CNN)¶

Question¶

5. Training¶

6. Motif extraction from convolutional filters¶

7. Build PWMs and plot simple sequence logos¶

Questions¶

8 – Advanced Modeling and Biological Extensions (25 pts)¶

Questions¶