Set-Up¶

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, label_binarize
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    roc_auc_score, average_precision_score
)
from sklearn.inspection import permutation_importance

import warnings
warnings.filterwarnings('ignore')
In [3]:
np.random.seed(42)

# Optional: SHAP (explainable AI)
try:
    import shap
    _HAS_SHAP = True
except Exception:
    _HAS_SHAP = False

# Optional: PyTorch for saliency (students can skip if not installed)
try:
    import torch
    import torch.nn as nn
    _HAS_TORCH = True
except Exception:
    _HAS_TORCH = False

Part 1: Load TCGA Pancancer Data¶

In [4]:
import pandas as pd

expr = pd.read_csv(
    "pancan_scaled_zeroone_rnaseq.tsv.gz",
    sep="\t",
    index_col=0
)

print(expr.shape)
expr.head()
(10459, 5000)
Out[4]:
RPS4Y1 XIST KRT5 AGR2 CEACAM5 KRT6A KRT14 CEACAM6 DDX3Y KDM5D ... FAM129A C8orf48 CDK5R1 FAM81A C13orf18 GDPD3 SMAGP C2orf85 POU5F1B CHST2
TCGA-02-0047-01 0.678296 0.289910 0.034230 0.0 0.000000 0.084731 0.031863 0.037709 0.746797 0.687833 ... 0.440610 0.428782 0.732819 0.634340 0.580662 0.294313 0.458134 0.478219 0.168263 0.638497
TCGA-02-0055-01 0.200633 0.654917 0.181993 0.0 0.000000 0.100606 0.050011 0.092586 0.103725 0.140642 ... 0.620658 0.363207 0.592269 0.602755 0.610192 0.374569 0.722420 0.271356 0.160465 0.602560
TCGA-02-2483-01 0.785980 0.140842 0.081082 0.0 0.000000 0.000000 0.000000 0.000000 0.730648 0.657189 ... 0.437658 0.471489 0.868774 0.471141 0.487212 0.385521 0.466642 0.784059 0.160797 0.557074
TCGA-02-2485-01 0.720258 0.122554 0.180042 0.0 0.000000 0.000000 0.000000 0.000000 0.720306 0.719855 ... 0.553306 0.373344 0.818608 0.691962 0.635023 0.430647 0.453690 0.364494 0.161363 0.607895
TCGA-02-2486-01 0.767127 0.210393 0.034017 0.0 0.061161 0.000000 0.053021 0.000000 0.739546 0.665684 ... 0.601268 0.379943 0.506839 0.684320 0.607821 0.320113 0.476190 0.122722 0.389544 0.698548

5 rows × 5000 columns

In [5]:
sample_ids = expr.index
len(sample_ids)
sample_ids[:5]
Out[5]:
Index(['TCGA-02-0047-01', 'TCGA-02-0055-01', 'TCGA-02-2483-01',
       'TCGA-02-2485-01', 'TCGA-02-2486-01'],
      dtype='object')
In [6]:
gene_names = expr.columns.tolist()
In [7]:
meta = pd.read_csv(
    "denseDataOnlyDownload.tsv",
    sep="\t",
    index_col=0
)
In [8]:
meta
Out[8]:
samples cancer type abbreviation age_at_initial_pathologic_diagnosis gender sample_type _primary_disease
sample
TCGA-V4-A9EE-01 TCGA-V4-A9EE-01 UVM 86.0 MALE Primary Tumor uveal melanoma
TCGA-VD-AA8N-01 TCGA-VD-AA8N-01 UVM 86.0 MALE Primary Tumor uveal melanoma
TCGA-V4-A9F5-01 TCGA-V4-A9F5-01 UVM 85.0 FEMALE Primary Tumor uveal melanoma
TCGA-V4-A9EU-01 TCGA-V4-A9EU-01 UVM 83.0 FEMALE Primary Tumor uveal melanoma
TCGA-VD-AA8T-01 TCGA-VD-AA8T-01 UVM 83.0 FEMALE Primary Tumor uveal melanoma
... ... ... ... ... ... ...
TCGA-DM-A286-01 TCGA-DM-A286-01 NaN NaN NaN NaN NaN
TCGA-EE-A2A6-01 TCGA-EE-A2A6-01 NaN NaN NaN NaN NaN
TCGA-HC-7741-01 TCGA-HC-7741-01 NaN NaN NaN NaN NaN
TCGA-HC-8212-01 TCGA-HC-8212-01 NaN NaN NaN NaN NaN
TCGA-NH-A6GC-06 TCGA-NH-A6GC-06 NaN NaN NaN NaN NaN

12839 rows × 6 columns

In [9]:
tab = meta["cancer type abbreviation"].value_counts(dropna=False)
len(tab)
Out[9]:
34
In [10]:
meta_id=meta.index
meta_id[:10]
Out[10]:
Index(['TCGA-V4-A9EE-01', 'TCGA-VD-AA8N-01', 'TCGA-V4-A9F5-01',
       'TCGA-V4-A9EU-01', 'TCGA-VD-AA8T-01', 'TCGA-YZ-A982-01',
       'TCGA-V4-A9EZ-01', 'TCGA-V4-A9F2-01', 'TCGA-V4-A9F7-01',
       'TCGA-VD-AA8R-01'],
      dtype='object', name='sample')
In [11]:
expr_ids = set(expr.index)
meta_ids = set(meta.index)

print("Only in expression:", len(expr_ids - meta_ids))
print("Only in phenotype:", len(meta_ids - expr_ids))
print("Matched:", len(expr_ids & meta_ids))
Only in expression: 0
Only in phenotype: 2380
Matched: 10459
In [12]:
meta_matched = meta.loc[meta.index.intersection(expr.index)]
meta_matched = meta_matched.loc[expr.index]
In [13]:
meta_matched.shape
Out[13]:
(10459, 6)
In [14]:
meta_matched.iloc[:10,]
Out[14]:
samples cancer type abbreviation age_at_initial_pathologic_diagnosis gender sample_type _primary_disease
TCGA-02-0047-01 TCGA-02-0047-01 GBM 78.0 MALE Primary Tumor glioblastoma multiforme
TCGA-02-0055-01 TCGA-02-0055-01 GBM 62.0 FEMALE Primary Tumor glioblastoma multiforme
TCGA-02-2483-01 TCGA-02-2483-01 GBM 43.0 MALE Primary Tumor glioblastoma multiforme
TCGA-02-2485-01 TCGA-02-2485-01 GBM 53.0 MALE Primary Tumor glioblastoma multiforme
TCGA-02-2486-01 TCGA-02-2486-01 GBM 64.0 MALE Primary Tumor glioblastoma multiforme
TCGA-04-1348-01 TCGA-04-1348-01 OV 44.0 FEMALE Primary Tumor ovarian serous cystadenocarcinoma
TCGA-04-1357-01 TCGA-04-1357-01 OV 52.0 FEMALE Primary Tumor ovarian serous cystadenocarcinoma
TCGA-04-1362-01 TCGA-04-1362-01 OV 59.0 FEMALE Primary Tumor ovarian serous cystadenocarcinoma
TCGA-04-1364-01 TCGA-04-1364-01 OV 61.0 FEMALE Primary Tumor ovarian serous cystadenocarcinoma
TCGA-04-1365-01 TCGA-04-1365-01 OV 87.0 FEMALE Primary Tumor ovarian serous cystadenocarcinoma
In [15]:
print("Samples x genes:", expr.shape)
print(meta_matched['cancer type abbreviation'].value_counts(dropna=False))
Samples x genes: (10459, 5000)
cancer type abbreviation
BRCA    1215
KIRC     606
LUAD     576
THCA     572
HNSC     566
LUSC     552
PRAD     550
LGG      529
SKCM     473
STAD     450
BLCA     426
LIHC     423
COAD     326
KIRP     323
OV       308
CESC     307
SARC     265
ESCA     196
UCEC     190
PCPG     187
PAAD     183
LAML     173
GBM      166
TGCT     139
THYM     122
READ     104
KICH      91
MESO      87
UVM       80
ACC       79
UCS       57
DLBC      48
CHOL      45
NaN       45
Name: count, dtype: int64
In [16]:
mask = meta_matched['cancer type abbreviation'].notna()
meta_clean = meta_matched.loc[mask]
expr_clean = expr.loc[mask]
meta_clean.shape
meta_clean.rename(
    columns={'cancer type abbreviation': 'cancer_type'},
    inplace=True
)
meta_clean.iloc[1:10,]
labels=meta_clean
In [17]:
expr_clean.shape
Out[17]:
(10414, 5000)

Part 2: Preprocessing¶

In [18]:
# Basic checks
print("Value range:", expr_clean.min().min(), expr_clean.max().max())
print("Missing:", expr_clean.isna().sum().sum())

# Standardize genes
scaler = StandardScaler()
X_scaled = scaler.fit_transform(expr_clean)

# Remove low-variance genes (bottom 20%)
gene_variances = np.var(X_scaled, axis=0)
threshold = np.percentile(gene_variances, 20)
keep_mask = gene_variances > threshold
X_filtered = X_scaled[:, keep_mask]
kept_genes = expr.columns[keep_mask]
print("Kept genes:", X_filtered.shape[1])
Value range: 0.0 1.0000000000000002
Missing: 0
Kept genes: 3941

Part 3: Dimension Reduction¶

In [19]:
pca = PCA(n_components=50, random_state=42)
X_pca = pca.fit_transform(X_filtered)

plt.figure(figsize=(6,4))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('# PCs')
plt.ylabel('Cumulative explained variance')
plt.grid(True)
plt.savefig("PCA_VA.png")
plt.show()
No description has been provided for this image
In [20]:
X = X_pca
y = meta_clean['cancer_type'].values


fig, ax = plt.subplots(figsize=(7,5))

for ct in np.unique(y):
    idx = y == ct
    ax.scatter(
        X_pca[idx, 0],
        X_pca[idx, 1],
        label=ct,
        s=12,
        alpha=0.6
    )

ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.set_title("PCA of Pan-Cancer RNA-seq")
ax.legend(
    bbox_to_anchor=(1.05, 1),
    loc="upper left",
    fontsize=8
)

fig.tight_layout()
plt.savefig("PCA_Pancancer.png", bbox_inches="tight")
plt.show()
No description has been provided for this image

The PCA projection of Pan-Cancer RNA-seq data reveals both distinct tissue-specific clusters and extensive overlap among epithelial cancers. Tumors such as gliomas, kidney cancers, thyroid carcinoma, and melanoma form well-separated clusters, reflecting strong lineage-specific transcriptional programs. In contrast, many epithelial cancers overlap substantially in the first two principal components, indicating shared molecular features and motivating the use of higher-dimensional representations and nonlinear classifiers for accurate cancer-type prediction.

In [21]:
import numpy as np
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(8,6))

for ct in np.unique(y):
    idx = y == ct
    x = X_pca[idx, 0]
    y_ = X_pca[idx, 1]

    # scatter points
    ax.scatter(x, y_, s=12, alpha=0.5, label=ct)

    # centroid
    cx, cy = x.mean(), y_.mean()

    # annotate at centroid
    ax.text(
        cx,
        cy,
        ct,
        fontsize=9,
        weight='bold',
        ha='center',
        va='center',
        bbox=dict(boxstyle="round,pad=0.2", fc="white", ec="black", alpha=0.7)
    )

ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.set_title("PCA of Pan-Cancer RNA-seq with Cluster Labels")
plt.tight_layout()
plt.savefig("PCA_Pancancer2.png", bbox_inches="tight")
plt.show()
No description has been provided for this image

Part 4: Train multiple classifiers and compare¶

In [22]:
X = X_pca
y = meta_clean['cancer_type'].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

Questions:¶

  1. Pick your top 3 favoriate machine learning models. Train them with the pancancer data.
  2. Report training accuracy with epochs for each model.
  3. Evaluate and compare the algorithms using the test dataset.
  4. Generate confusion matrix for each algorithm.
In [23]:
# Define models
models = {
'LogisticRegression': LogisticRegression(max_iter=2000, multi_class='ovr'),
'RandomForest': RandomForestClassifier(n_estimators=300, random_state=42),
'GradientBoosting': GradientBoostingClassifier(n_estimators=200, random_state=42),
'SVM-RBF': SVC(probability=True, kernel='rbf'),
'MLP': MLPClassifier(hidden_layer_sizes=(128,64), max_iter=500)
}


results = []
probas = {}

for name, clf in models.items():
    print(f"Training {name}...")

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    y_proba = clf.predict_proba(X_test)
    probas[name] = y_proba

    acc = accuracy_score(y_test, y_pred)

    # Binarize labels for multiclass AUC
    classes = clf.classes_
    y_test_bin = label_binarize(y_test, classes=classes)

    roc_auc = roc_auc_score(
        y_test_bin, y_proba,
        average='macro', multi_class='ovr'
    )
    pr_auc = average_precision_score(
        y_test_bin, y_proba,
        average='macro'
    )

    results.append({
        'model': name,
        'accuracy': acc,
        'roc_auc_macro': roc_auc,
        'pr_auc_macro': pr_auc
    })

    print(
        f"{name} | "
        f"Accuracy={acc:.4f} | "
        f"ROC-AUC={roc_auc:.4f} | "
        f"PR-AUC={pr_auc:.4f}"
    )

# Convert results to DataFrame
res_df = pd.DataFrame(results).set_index('model')
display(res_df)

# Plot comparison
res_df.plot.bar(figsize=(8,4))
plt.title('Model comparison: Accuracy, ROC-AUC (macro), PR-AUC (macro)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
Training LogisticRegression...
LogisticRegression | Accuracy=0.9395 | ROC-AUC=0.9976 | PR-AUC=0.9290
Training RandomForest...
RandomForest | Accuracy=0.9184 | ROC-AUC=0.9981 | PR-AUC=0.9225
Training GradientBoosting...
GradientBoosting | Accuracy=0.9078 | ROC-AUC=0.9913 | PR-AUC=0.8983
Training SVM-RBF...
SVM-RBF | Accuracy=0.9328 | ROC-AUC=0.9987 | PR-AUC=0.9379
Training MLP...
MLP | Accuracy=0.9361 | ROC-AUC=0.9978 | PR-AUC=0.9361
accuracy roc_auc_macro pr_auc_macro
model
LogisticRegression 0.939510 0.997619 0.929049
RandomForest 0.918387 0.998109 0.922470
GradientBoosting 0.907825 0.991258 0.898289
SVM-RBF 0.932789 0.998694 0.937868
MLP 0.936150 0.997843 0.936086
No description has been provided for this image

Question:¶

Plot the ROC and PR curves and report area under the curves (AUC) for each algorithms.

In [35]:
# ── Micro-Average ROC & PR Curves (all models on one plot) ──────────────────
from sklearn.metrics import roc_curve, precision_recall_curve, auc

fig, (ax_roc, ax_pr) = plt.subplots(1, 2, figsize=(12, 5))

blues = plt.cm.Blues(np.linspace(0.4, 0.95, len(probas)))

lines = []
labels = []

for (name, y_proba), color in zip(probas.items(), blues):
    y_test_bin = label_binarize(y_test, classes=class_labels)

    # Micro ROC
    fpr, tpr, _ = roc_curve(y_test_bin.ravel(), y_proba.ravel())
    roc_auc = auc(fpr, tpr)
    line, = ax_roc.plot(fpr, tpr, color=color)

    # Micro PR
    precision, recall, _ = precision_recall_curve(y_test_bin.ravel(), y_proba.ravel())
    pr_auc = auc(recall, precision)
    ax_pr.plot(recall, precision, color=color)

    lines.append(line)
    labels.append(f"{name} (ROC-AUC={roc_auc:.2f} | PR-AUC={pr_auc:.2f})")

# ROC formatting
ax_roc.plot([0, 1], [0, 1], 'k--', linewidth=0.8)
ax_roc.set_title("Micro-Average ROC Curve", fontweight="bold")
ax_roc.set_xlabel("False Positive Rate")
ax_roc.set_ylabel("True Positive Rate")

# PR formatting
ax_pr.set_title("Micro-Average PR Curve", fontweight="bold")
ax_pr.set_xlabel("Recall")
ax_pr.set_ylabel("Precision")

# Single shared legend above both plots
fig.legend(
    lines, labels,
    loc="upper center",
    ncol=len(probas),
    fontsize=9,
    frameon=True,
    bbox_to_anchor=(0.5, 1.02)
)

fig.suptitle("Micro-Average Curves — All Models", fontsize=13, fontweight="bold", y=1.08)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [36]:
# ── Macro-Average ROC & PR Curves (all models on one plot) ──────────────────
from sklearn.metrics import roc_curve, precision_recall_curve, auc
import numpy as np

fig, (ax_roc, ax_pr) = plt.subplots(1, 2, figsize=(12, 5))

lines = []
labels = []

for name, y_proba in probas.items():
    y_test_bin = label_binarize(y_test, classes=class_labels)
    n_classes = len(class_labels)

    # Macro ROC
    fpr_grid = np.linspace(0, 1, 300)
    tprs, roc_aucs = [], []
    for i in range(n_classes):
        fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_proba[:, i])
        tprs.append(np.interp(fpr_grid, fpr, tpr))
        roc_aucs.append(auc(fpr, tpr))
    mean_tpr = np.mean(tprs, axis=0)
    macro_roc_auc = np.mean(roc_aucs)
    line, = ax_roc.plot(fpr_grid, mean_tpr)

    # Macro PR
    recall_grid = np.linspace(0, 1, 300)
    precisions, pr_aucs = [], []
    for i in range(n_classes):
        precision, recall, _ = precision_recall_curve(y_test_bin[:, i], y_proba[:, i])
        precisions.append(np.interp(recall_grid, recall[::-1], precision[::-1]))
        pr_aucs.append(auc(recall, precision))
    mean_precision = np.mean(precisions, axis=0)
    macro_pr_auc = np.mean(pr_aucs)
    ax_pr.plot(recall_grid, mean_precision, color=line.get_color())

    # Append label once both AUCs are ready
    lines.append(line)
    labels.append(f"{name} (ROC-AUC={macro_roc_auc:.2f} | PR-AUC={macro_pr_auc:.2f})")

# ROC formatting
ax_roc.plot([0, 1], [0, 1], 'k--', linewidth=0.8)
ax_roc.set_title("Macro-Average ROC Curve", fontweight="bold")
ax_roc.set_xlabel("False Positive Rate")
ax_roc.set_ylabel("True Positive Rate")

# PR formatting
ax_pr.set_title("Macro-Average PR Curve", fontweight="bold")
ax_pr.set_xlabel("Recall")
ax_pr.set_ylabel("Precision")

# Single shared legend above both plots
fig.legend(
    lines, labels,
    loc="upper center",
    ncol=len(probas),
    fontsize=9,
    frameon=True,
    bbox_to_anchor=(0.5, 1.02)
)

fig.suptitle("Macro-Average Curves — All Models", fontsize=13, fontweight="bold", y=1.08)
plt.tight_layout()
plt.show()
No description has been provided for this image

Summary¶

These three classifiers were trained to predict cancer type from PCA-reduced gene expression. SVM-RBF achieves a slight higher Macro PR-AUC (PR-AUC:0.94) compared to logistic regression (PR-AUC:0.93) and MLP (PR-AUC:0.93). Confusion matrices showed predictions were usually correct with only minor confusion with cancers that are biologically related like the Kidney and GI cancers. It seems that gene expression provdes strong discriminative signal for cancer type classiciation and even linear models perform well. I think it is interesting that the simple linear model worked the best, and my guess is its becuase the tissue of origin drives a lot of the differences in the expression levels.

Abbreviations: https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/tcga-study-abbreviations

Epithelial cancers: Difficult to classify¶

BRCA (breast)

LUAD (lung adenocarcinoma)

LUSC (lung squamous)

COAD / READ (colon / rectal)

STAD (stomach)

ESCA (esophagus)

Moderate Difficult¶

PAAD (pancreatic)

HNSC (head & neck)

BLCA (bladder)

Easiest¶

LGG / GBM (brain tumors)

KIRC / KIRP / KICH (kidney cancers)

THCA (thyroid)

SKCM (melanoma)