Set-Up¶
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, label_binarize
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
accuracy_score, classification_report, confusion_matrix,
roc_auc_score, average_precision_score
)
from sklearn.inspection import permutation_importance
import warnings
warnings.filterwarnings('ignore')
np.random.seed(42)
# Optional: SHAP (explainable AI)
try:
import shap
_HAS_SHAP = True
except Exception:
_HAS_SHAP = False
# Optional: PyTorch for saliency (students can skip if not installed)
try:
import torch
import torch.nn as nn
_HAS_TORCH = True
except Exception:
_HAS_TORCH = False
Part 1: Load TCGA Pancancer Data¶
import pandas as pd
expr = pd.read_csv(
"pancan_scaled_zeroone_rnaseq.tsv.gz",
sep="\t",
index_col=0
)
print(expr.shape)
expr.head()
(10459, 5000)
| RPS4Y1 | XIST | KRT5 | AGR2 | CEACAM5 | KRT6A | KRT14 | CEACAM6 | DDX3Y | KDM5D | ... | FAM129A | C8orf48 | CDK5R1 | FAM81A | C13orf18 | GDPD3 | SMAGP | C2orf85 | POU5F1B | CHST2 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| TCGA-02-0047-01 | 0.678296 | 0.289910 | 0.034230 | 0.0 | 0.000000 | 0.084731 | 0.031863 | 0.037709 | 0.746797 | 0.687833 | ... | 0.440610 | 0.428782 | 0.732819 | 0.634340 | 0.580662 | 0.294313 | 0.458134 | 0.478219 | 0.168263 | 0.638497 |
| TCGA-02-0055-01 | 0.200633 | 0.654917 | 0.181993 | 0.0 | 0.000000 | 0.100606 | 0.050011 | 0.092586 | 0.103725 | 0.140642 | ... | 0.620658 | 0.363207 | 0.592269 | 0.602755 | 0.610192 | 0.374569 | 0.722420 | 0.271356 | 0.160465 | 0.602560 |
| TCGA-02-2483-01 | 0.785980 | 0.140842 | 0.081082 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.730648 | 0.657189 | ... | 0.437658 | 0.471489 | 0.868774 | 0.471141 | 0.487212 | 0.385521 | 0.466642 | 0.784059 | 0.160797 | 0.557074 |
| TCGA-02-2485-01 | 0.720258 | 0.122554 | 0.180042 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.720306 | 0.719855 | ... | 0.553306 | 0.373344 | 0.818608 | 0.691962 | 0.635023 | 0.430647 | 0.453690 | 0.364494 | 0.161363 | 0.607895 |
| TCGA-02-2486-01 | 0.767127 | 0.210393 | 0.034017 | 0.0 | 0.061161 | 0.000000 | 0.053021 | 0.000000 | 0.739546 | 0.665684 | ... | 0.601268 | 0.379943 | 0.506839 | 0.684320 | 0.607821 | 0.320113 | 0.476190 | 0.122722 | 0.389544 | 0.698548 |
5 rows × 5000 columns
sample_ids = expr.index
len(sample_ids)
sample_ids[:5]
Index(['TCGA-02-0047-01', 'TCGA-02-0055-01', 'TCGA-02-2483-01',
'TCGA-02-2485-01', 'TCGA-02-2486-01'],
dtype='object')
gene_names = expr.columns.tolist()
meta = pd.read_csv(
"denseDataOnlyDownload.tsv",
sep="\t",
index_col=0
)
meta
| samples | cancer type abbreviation | age_at_initial_pathologic_diagnosis | gender | sample_type | _primary_disease | |
|---|---|---|---|---|---|---|
| sample | ||||||
| TCGA-V4-A9EE-01 | TCGA-V4-A9EE-01 | UVM | 86.0 | MALE | Primary Tumor | uveal melanoma |
| TCGA-VD-AA8N-01 | TCGA-VD-AA8N-01 | UVM | 86.0 | MALE | Primary Tumor | uveal melanoma |
| TCGA-V4-A9F5-01 | TCGA-V4-A9F5-01 | UVM | 85.0 | FEMALE | Primary Tumor | uveal melanoma |
| TCGA-V4-A9EU-01 | TCGA-V4-A9EU-01 | UVM | 83.0 | FEMALE | Primary Tumor | uveal melanoma |
| TCGA-VD-AA8T-01 | TCGA-VD-AA8T-01 | UVM | 83.0 | FEMALE | Primary Tumor | uveal melanoma |
| ... | ... | ... | ... | ... | ... | ... |
| TCGA-DM-A286-01 | TCGA-DM-A286-01 | NaN | NaN | NaN | NaN | NaN |
| TCGA-EE-A2A6-01 | TCGA-EE-A2A6-01 | NaN | NaN | NaN | NaN | NaN |
| TCGA-HC-7741-01 | TCGA-HC-7741-01 | NaN | NaN | NaN | NaN | NaN |
| TCGA-HC-8212-01 | TCGA-HC-8212-01 | NaN | NaN | NaN | NaN | NaN |
| TCGA-NH-A6GC-06 | TCGA-NH-A6GC-06 | NaN | NaN | NaN | NaN | NaN |
12839 rows × 6 columns
tab = meta["cancer type abbreviation"].value_counts(dropna=False)
len(tab)
34
meta_id=meta.index
meta_id[:10]
Index(['TCGA-V4-A9EE-01', 'TCGA-VD-AA8N-01', 'TCGA-V4-A9F5-01',
'TCGA-V4-A9EU-01', 'TCGA-VD-AA8T-01', 'TCGA-YZ-A982-01',
'TCGA-V4-A9EZ-01', 'TCGA-V4-A9F2-01', 'TCGA-V4-A9F7-01',
'TCGA-VD-AA8R-01'],
dtype='object', name='sample')
expr_ids = set(expr.index)
meta_ids = set(meta.index)
print("Only in expression:", len(expr_ids - meta_ids))
print("Only in phenotype:", len(meta_ids - expr_ids))
print("Matched:", len(expr_ids & meta_ids))
Only in expression: 0 Only in phenotype: 2380 Matched: 10459
meta_matched = meta.loc[meta.index.intersection(expr.index)]
meta_matched = meta_matched.loc[expr.index]
meta_matched.shape
(10459, 6)
meta_matched.iloc[:10,]
| samples | cancer type abbreviation | age_at_initial_pathologic_diagnosis | gender | sample_type | _primary_disease | |
|---|---|---|---|---|---|---|
| TCGA-02-0047-01 | TCGA-02-0047-01 | GBM | 78.0 | MALE | Primary Tumor | glioblastoma multiforme |
| TCGA-02-0055-01 | TCGA-02-0055-01 | GBM | 62.0 | FEMALE | Primary Tumor | glioblastoma multiforme |
| TCGA-02-2483-01 | TCGA-02-2483-01 | GBM | 43.0 | MALE | Primary Tumor | glioblastoma multiforme |
| TCGA-02-2485-01 | TCGA-02-2485-01 | GBM | 53.0 | MALE | Primary Tumor | glioblastoma multiforme |
| TCGA-02-2486-01 | TCGA-02-2486-01 | GBM | 64.0 | MALE | Primary Tumor | glioblastoma multiforme |
| TCGA-04-1348-01 | TCGA-04-1348-01 | OV | 44.0 | FEMALE | Primary Tumor | ovarian serous cystadenocarcinoma |
| TCGA-04-1357-01 | TCGA-04-1357-01 | OV | 52.0 | FEMALE | Primary Tumor | ovarian serous cystadenocarcinoma |
| TCGA-04-1362-01 | TCGA-04-1362-01 | OV | 59.0 | FEMALE | Primary Tumor | ovarian serous cystadenocarcinoma |
| TCGA-04-1364-01 | TCGA-04-1364-01 | OV | 61.0 | FEMALE | Primary Tumor | ovarian serous cystadenocarcinoma |
| TCGA-04-1365-01 | TCGA-04-1365-01 | OV | 87.0 | FEMALE | Primary Tumor | ovarian serous cystadenocarcinoma |
print("Samples x genes:", expr.shape)
print(meta_matched['cancer type abbreviation'].value_counts(dropna=False))
Samples x genes: (10459, 5000) cancer type abbreviation BRCA 1215 KIRC 606 LUAD 576 THCA 572 HNSC 566 LUSC 552 PRAD 550 LGG 529 SKCM 473 STAD 450 BLCA 426 LIHC 423 COAD 326 KIRP 323 OV 308 CESC 307 SARC 265 ESCA 196 UCEC 190 PCPG 187 PAAD 183 LAML 173 GBM 166 TGCT 139 THYM 122 READ 104 KICH 91 MESO 87 UVM 80 ACC 79 UCS 57 DLBC 48 CHOL 45 NaN 45 Name: count, dtype: int64
mask = meta_matched['cancer type abbreviation'].notna()
meta_clean = meta_matched.loc[mask]
expr_clean = expr.loc[mask]
meta_clean.shape
meta_clean.rename(
columns={'cancer type abbreviation': 'cancer_type'},
inplace=True
)
meta_clean.iloc[1:10,]
labels=meta_clean
expr_clean.shape
(10414, 5000)
Part 2: Preprocessing¶
# Basic checks
print("Value range:", expr_clean.min().min(), expr_clean.max().max())
print("Missing:", expr_clean.isna().sum().sum())
# Standardize genes
scaler = StandardScaler()
X_scaled = scaler.fit_transform(expr_clean)
# Remove low-variance genes (bottom 20%)
gene_variances = np.var(X_scaled, axis=0)
threshold = np.percentile(gene_variances, 20)
keep_mask = gene_variances > threshold
X_filtered = X_scaled[:, keep_mask]
kept_genes = expr.columns[keep_mask]
print("Kept genes:", X_filtered.shape[1])
Value range: 0.0 1.0000000000000002 Missing: 0 Kept genes: 3941
Part 3: Dimension Reduction¶
pca = PCA(n_components=50, random_state=42)
X_pca = pca.fit_transform(X_filtered)
plt.figure(figsize=(6,4))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('# PCs')
plt.ylabel('Cumulative explained variance')
plt.grid(True)
plt.savefig("PCA_VA.png")
plt.show()
X = X_pca
y = meta_clean['cancer_type'].values
fig, ax = plt.subplots(figsize=(7,5))
for ct in np.unique(y):
idx = y == ct
ax.scatter(
X_pca[idx, 0],
X_pca[idx, 1],
label=ct,
s=12,
alpha=0.6
)
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.set_title("PCA of Pan-Cancer RNA-seq")
ax.legend(
bbox_to_anchor=(1.05, 1),
loc="upper left",
fontsize=8
)
fig.tight_layout()
plt.savefig("PCA_Pancancer.png", bbox_inches="tight")
plt.show()
The PCA projection of Pan-Cancer RNA-seq data reveals both distinct tissue-specific clusters and extensive overlap among epithelial cancers. Tumors such as gliomas, kidney cancers, thyroid carcinoma, and melanoma form well-separated clusters, reflecting strong lineage-specific transcriptional programs. In contrast, many epithelial cancers overlap substantially in the first two principal components, indicating shared molecular features and motivating the use of higher-dimensional representations and nonlinear classifiers for accurate cancer-type prediction.
import numpy as np
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(8,6))
for ct in np.unique(y):
idx = y == ct
x = X_pca[idx, 0]
y_ = X_pca[idx, 1]
# scatter points
ax.scatter(x, y_, s=12, alpha=0.5, label=ct)
# centroid
cx, cy = x.mean(), y_.mean()
# annotate at centroid
ax.text(
cx,
cy,
ct,
fontsize=9,
weight='bold',
ha='center',
va='center',
bbox=dict(boxstyle="round,pad=0.2", fc="white", ec="black", alpha=0.7)
)
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.set_title("PCA of Pan-Cancer RNA-seq with Cluster Labels")
plt.tight_layout()
plt.savefig("PCA_Pancancer2.png", bbox_inches="tight")
plt.show()
Part 4: Train multiple classifiers and compare¶
X = X_pca
y = meta_clean['cancer_type'].values
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, stratify=y, random_state=42
)
Questions:¶
- Pick your top 3 favoriate machine learning models. Train them with the pancancer data.
- Report training accuracy with epochs for each model.
- Evaluate and compare the algorithms using the test dataset.
- Generate confusion matrix for each algorithm.
# Define models
models = {
'LogisticRegression': LogisticRegression(max_iter=2000, multi_class='ovr'),
'RandomForest': RandomForestClassifier(n_estimators=300, random_state=42),
'GradientBoosting': GradientBoostingClassifier(n_estimators=200, random_state=42),
'SVM-RBF': SVC(probability=True, kernel='rbf'),
'MLP': MLPClassifier(hidden_layer_sizes=(128,64), max_iter=500)
}
results = []
probas = {}
for name, clf in models.items():
print(f"Training {name}...")
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)
probas[name] = y_proba
acc = accuracy_score(y_test, y_pred)
# Binarize labels for multiclass AUC
classes = clf.classes_
y_test_bin = label_binarize(y_test, classes=classes)
roc_auc = roc_auc_score(
y_test_bin, y_proba,
average='macro', multi_class='ovr'
)
pr_auc = average_precision_score(
y_test_bin, y_proba,
average='macro'
)
results.append({
'model': name,
'accuracy': acc,
'roc_auc_macro': roc_auc,
'pr_auc_macro': pr_auc
})
print(
f"{name} | "
f"Accuracy={acc:.4f} | "
f"ROC-AUC={roc_auc:.4f} | "
f"PR-AUC={pr_auc:.4f}"
)
# Convert results to DataFrame
res_df = pd.DataFrame(results).set_index('model')
display(res_df)
# Plot comparison
res_df.plot.bar(figsize=(8,4))
plt.title('Model comparison: Accuracy, ROC-AUC (macro), PR-AUC (macro)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
Training LogisticRegression... LogisticRegression | Accuracy=0.9395 | ROC-AUC=0.9976 | PR-AUC=0.9290 Training RandomForest... RandomForest | Accuracy=0.9184 | ROC-AUC=0.9981 | PR-AUC=0.9225 Training GradientBoosting... GradientBoosting | Accuracy=0.9078 | ROC-AUC=0.9913 | PR-AUC=0.8983 Training SVM-RBF... SVM-RBF | Accuracy=0.9328 | ROC-AUC=0.9987 | PR-AUC=0.9379 Training MLP... MLP | Accuracy=0.9361 | ROC-AUC=0.9978 | PR-AUC=0.9361
| accuracy | roc_auc_macro | pr_auc_macro | |
|---|---|---|---|
| model | |||
| LogisticRegression | 0.939510 | 0.997619 | 0.929049 |
| RandomForest | 0.918387 | 0.998109 | 0.922470 |
| GradientBoosting | 0.907825 | 0.991258 | 0.898289 |
| SVM-RBF | 0.932789 | 0.998694 | 0.937868 |
| MLP | 0.936150 | 0.997843 | 0.936086 |
Question:¶
Plot the ROC and PR curves and report area under the curves (AUC) for each algorithms.
# ── Micro-Average ROC & PR Curves (all models on one plot) ──────────────────
from sklearn.metrics import roc_curve, precision_recall_curve, auc
fig, (ax_roc, ax_pr) = plt.subplots(1, 2, figsize=(12, 5))
blues = plt.cm.Blues(np.linspace(0.4, 0.95, len(probas)))
lines = []
labels = []
for (name, y_proba), color in zip(probas.items(), blues):
y_test_bin = label_binarize(y_test, classes=class_labels)
# Micro ROC
fpr, tpr, _ = roc_curve(y_test_bin.ravel(), y_proba.ravel())
roc_auc = auc(fpr, tpr)
line, = ax_roc.plot(fpr, tpr, color=color)
# Micro PR
precision, recall, _ = precision_recall_curve(y_test_bin.ravel(), y_proba.ravel())
pr_auc = auc(recall, precision)
ax_pr.plot(recall, precision, color=color)
lines.append(line)
labels.append(f"{name} (ROC-AUC={roc_auc:.2f} | PR-AUC={pr_auc:.2f})")
# ROC formatting
ax_roc.plot([0, 1], [0, 1], 'k--', linewidth=0.8)
ax_roc.set_title("Micro-Average ROC Curve", fontweight="bold")
ax_roc.set_xlabel("False Positive Rate")
ax_roc.set_ylabel("True Positive Rate")
# PR formatting
ax_pr.set_title("Micro-Average PR Curve", fontweight="bold")
ax_pr.set_xlabel("Recall")
ax_pr.set_ylabel("Precision")
# Single shared legend above both plots
fig.legend(
lines, labels,
loc="upper center",
ncol=len(probas),
fontsize=9,
frameon=True,
bbox_to_anchor=(0.5, 1.02)
)
fig.suptitle("Micro-Average Curves — All Models", fontsize=13, fontweight="bold", y=1.08)
plt.tight_layout()
plt.show()
# ── Macro-Average ROC & PR Curves (all models on one plot) ──────────────────
from sklearn.metrics import roc_curve, precision_recall_curve, auc
import numpy as np
fig, (ax_roc, ax_pr) = plt.subplots(1, 2, figsize=(12, 5))
lines = []
labels = []
for name, y_proba in probas.items():
y_test_bin = label_binarize(y_test, classes=class_labels)
n_classes = len(class_labels)
# Macro ROC
fpr_grid = np.linspace(0, 1, 300)
tprs, roc_aucs = [], []
for i in range(n_classes):
fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_proba[:, i])
tprs.append(np.interp(fpr_grid, fpr, tpr))
roc_aucs.append(auc(fpr, tpr))
mean_tpr = np.mean(tprs, axis=0)
macro_roc_auc = np.mean(roc_aucs)
line, = ax_roc.plot(fpr_grid, mean_tpr)
# Macro PR
recall_grid = np.linspace(0, 1, 300)
precisions, pr_aucs = [], []
for i in range(n_classes):
precision, recall, _ = precision_recall_curve(y_test_bin[:, i], y_proba[:, i])
precisions.append(np.interp(recall_grid, recall[::-1], precision[::-1]))
pr_aucs.append(auc(recall, precision))
mean_precision = np.mean(precisions, axis=0)
macro_pr_auc = np.mean(pr_aucs)
ax_pr.plot(recall_grid, mean_precision, color=line.get_color())
# Append label once both AUCs are ready
lines.append(line)
labels.append(f"{name} (ROC-AUC={macro_roc_auc:.2f} | PR-AUC={macro_pr_auc:.2f})")
# ROC formatting
ax_roc.plot([0, 1], [0, 1], 'k--', linewidth=0.8)
ax_roc.set_title("Macro-Average ROC Curve", fontweight="bold")
ax_roc.set_xlabel("False Positive Rate")
ax_roc.set_ylabel("True Positive Rate")
# PR formatting
ax_pr.set_title("Macro-Average PR Curve", fontweight="bold")
ax_pr.set_xlabel("Recall")
ax_pr.set_ylabel("Precision")
# Single shared legend above both plots
fig.legend(
lines, labels,
loc="upper center",
ncol=len(probas),
fontsize=9,
frameon=True,
bbox_to_anchor=(0.5, 1.02)
)
fig.suptitle("Macro-Average Curves — All Models", fontsize=13, fontweight="bold", y=1.08)
plt.tight_layout()
plt.show()
Summary¶
These three classifiers were trained to predict cancer type from PCA-reduced gene expression. SVM-RBF achieves a slight higher Macro PR-AUC (PR-AUC:0.94) compared to logistic regression (PR-AUC:0.93) and MLP (PR-AUC:0.93). Confusion matrices showed predictions were usually correct with only minor confusion with cancers that are biologically related like the Kidney and GI cancers. It seems that gene expression provdes strong discriminative signal for cancer type classiciation and even linear models perform well. I think it is interesting that the simple linear model worked the best, and my guess is its becuase the tissue of origin drives a lot of the differences in the expression levels.
Abbreviations: https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/tcga-study-abbreviations
Epithelial cancers: Difficult to classify¶
BRCA (breast)
LUAD (lung adenocarcinoma)
LUSC (lung squamous)
COAD / READ (colon / rectal)
STAD (stomach)
ESCA (esophagus)
Moderate Difficult¶
PAAD (pancreatic)
HNSC (head & neck)
BLCA (bladder)
Easiest¶
LGG / GBM (brain tumors)
KIRC / KIRP / KICH (kidney cancers)
THCA (thyroid)
SKCM (melanoma)