import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, label_binarize
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    roc_auc_score, average_precision_score
)
from sklearn.inspection import permutation_importance

import warnings
warnings.filterwarnings('ignore')

np.random.seed(42)

# Optional: SHAP (explainable AI)
try:
    import shap
    _HAS_SHAP = True
except Exception:
    _HAS_SHAP = False

# Optional: PyTorch for saliency (you can skip if not installed)
try:
    import torch
    import torch.nn as nn
    _HAS_TORCH = True
except Exception:
    _HAS_TORCH = False

import os

current_directory = os.getcwd()
current_directory

'/Users/hoyen/Desktop/STAT718-2026/Homework/HW3'

new_path = '/Users/hoyen/Desktop/STAT718-2026/Homework/HW3' # Use the actual path here
os.chdir(new_path)
os.getcwd()

'/Users/hoyen/Desktop/STAT718-2026/Homework/HW3'

import pandas as pd

expr = pd.read_csv(
    "pancan_scaled_zeroone_rnaseq.tsv.gz",
    sep="\t",
    index_col=0
)

print(expr.shape)
expr.head()

(10459, 5000)

sample_ids = expr.index
len(sample_ids)
sample_ids[:5]

Index(['TCGA-02-0047-01', 'TCGA-02-0055-01', 'TCGA-02-2483-01',
       'TCGA-02-2485-01', 'TCGA-02-2486-01'],
      dtype='object')

gene_names = expr.columns.tolist()

meta = pd.read_csv(
    "denseDataOnlyDownload.tsv",
    sep="\t",
    index_col=0
)

meta

tab = meta["cancer type abbreviation"].value_counts(dropna=False)
len(tab)

34

meta_id=meta.index
meta_id[:10]

Index(['TCGA-V4-A9EE-01', 'TCGA-VD-AA8N-01', 'TCGA-V4-A9F5-01',
       'TCGA-V4-A9EU-01', 'TCGA-VD-AA8T-01', 'TCGA-YZ-A982-01',
       'TCGA-V4-A9EZ-01', 'TCGA-V4-A9F2-01', 'TCGA-V4-A9F7-01',
       'TCGA-VD-AA8R-01'],
      dtype='object', name='sample')

expr_ids = set(expr.index)
meta_ids = set(meta.index)

print("Only in expression:", len(expr_ids - meta_ids))
print("Only in phenotype:", len(meta_ids - expr_ids))
print("Matched:", len(expr_ids & meta_ids))

Only in expression: 0
Only in phenotype: 2380
Matched: 10459

meta_matched = meta.loc[meta.index.intersection(expr.index)]
meta_matched = meta_matched.loc[expr.index]

meta_matched.shape

(10459, 6)

meta_matched.iloc[:10,]

print("Samples x genes:", expr.shape)
print(meta_matched['cancer type abbreviation'].value_counts(dropna=False))

Samples x genes: (10459, 5000)
cancer type abbreviation
BRCA    1215
KIRC     606
LUAD     576
THCA     572
HNSC     566
LUSC     552
PRAD     550
LGG      529
SKCM     473
STAD     450
BLCA     426
LIHC     423
COAD     326
KIRP     323
OV       308
CESC     307
SARC     265
ESCA     196
UCEC     190
PCPG     187
PAAD     183
LAML     173
GBM      166
TGCT     139
THYM     122
READ     104
KICH      91
MESO      87
UVM       80
ACC       79
UCS       57
DLBC      48
CHOL      45
NaN       45
Name: count, dtype: int64

mask = meta_matched['cancer type abbreviation'].notna()
meta_clean = meta_matched.loc[mask]
expr_clean = expr.loc[mask]
meta_clean.shape

(10414, 6)

expr_clean.shape

(10414, 5000)

# Basic checks
print("Value range:", expr_clean.min().min(), expr_clean.max().max())
print("Missing:", expr_clean.isna().sum().sum())

# Standardize genes
scaler = StandardScaler()
X_scaled = scaler.fit_transform(expr_clean)

# Remove low-variance genes (bottom 20%)
gene_variances = np.var(X_scaled, axis=0)
threshold = np.percentile(gene_variances, 20)
keep_mask = gene_variances > threshold
X_filtered = X_scaled[:, keep_mask]
kept_genes = expr.columns[keep_mask]
print("Kept genes:", X_filtered.shape[1])

Value range: 0.0 1.0000000000000002
Missing: 0
Kept genes: 3941

pca = PCA(n_components=50, random_state=42)
X_pca = pca.fit_transform(X_filtered)

plt.figure(figsize=(6,4))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('# PCs')
plt.ylabel('Cumulative explained variance')
plt.grid(True)
plt.show()

X = X_pca
y = meta_clean['cancer type abbreviation'].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

import numpy as np
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(8,6))

for ct in np.unique(y):
    idx = y == ct
    x = X_pca[idx, 0]
    y_ = X_pca[idx, 1]

    # scatter points
    ax.scatter(x, y_, s=12, alpha=0.5, label=ct)

    # centroid
    cx, cy = x.mean(), y_.mean()

    # annotate at centroid
    ax.text(
        cx,
        cy,
        ct,
        fontsize=9,
        weight='bold',
        ha='center',
        va='center',
        bbox=dict(boxstyle="round,pad=0.2", fc="white", ec="black", alpha=0.7)
    )

ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.set_title("PCA of Pan-Cancer RNA-seq with Cluster Labels")
plt.tight_layout()
plt.show()

	RPS4Y1	XIST	KRT5	CEACAM5	KRT6A	KRT14	CEACAM6	DDX3Y	KDM5D	...	FAM129A	C8orf48	CDK5R1	FAM81A	C13orf18	GDPD3	SMAGP	C2orf85	POU5F1B	CHST2
TCGA-02-0047-01	0.678296	0.289910	0.034230	0.000000	0.084731	0.031863	0.037709	0.746797	0.687833	...	0.440610	0.428782	0.732819	0.634340	0.580662	0.294313	0.458134	0.478219	0.168263	0.638497
TCGA-02-0055-01	0.200633	0.654917	0.181993	0.000000	0.100606	0.050011	0.092586	0.103725	0.140642	...	0.620658	0.363207	0.592269	0.602755	0.610192	0.374569	0.722420	0.271356	0.160465	0.602560
TCGA-02-2483-01	0.785980	0.140842	0.081082	0.000000	0.000000	0.000000	0.000000	0.730648	0.657189	...	0.437658	0.471489	0.868774	0.471141	0.487212	0.385521	0.466642	0.784059	0.160797	0.557074
TCGA-02-2485-01	0.720258	0.122554	0.180042	0.000000	0.000000	0.000000	0.000000	0.720306	0.719855	...	0.553306	0.373344	0.818608	0.691962	0.635023	0.430647	0.453690	0.364494	0.161363	0.607895
TCGA-02-2486-01	0.767127	0.210393	0.034017	0.061161	0.000000	0.053021	0.000000	0.739546	0.665684	...	0.601268	0.379943	0.506839	0.684320	0.607821	0.320113	0.476190	0.122722	0.389544	0.698548

	samples	cancer type abbreviation	age_at_initial_pathologic_diagnosis	gender	sample_type	_primary_disease
TCGA-02-0047-01	TCGA-02-0047-01	GBM	78.0	MALE	Primary Tumor	glioblastoma multiforme
TCGA-02-0055-01	TCGA-02-0055-01	GBM	62.0	FEMALE	Primary Tumor	glioblastoma multiforme
TCGA-02-2483-01	TCGA-02-2483-01	GBM	43.0	MALE	Primary Tumor	glioblastoma multiforme
TCGA-02-2485-01	TCGA-02-2485-01	GBM	53.0	MALE	Primary Tumor	glioblastoma multiforme
TCGA-02-2486-01	TCGA-02-2486-01	GBM	64.0	MALE	Primary Tumor	glioblastoma multiforme
TCGA-04-1348-01	TCGA-04-1348-01	OV	44.0	FEMALE	Primary Tumor	ovarian serous cystadenocarcinoma
TCGA-04-1357-01	TCGA-04-1357-01	OV	52.0	FEMALE	Primary Tumor	ovarian serous cystadenocarcinoma
TCGA-04-1362-01	TCGA-04-1362-01	OV	59.0	FEMALE	Primary Tumor	ovarian serous cystadenocarcinoma
TCGA-04-1364-01	TCGA-04-1364-01	OV	61.0	FEMALE	Primary Tumor	ovarian serous cystadenocarcinoma
TCGA-04-1365-01	TCGA-04-1365-01	OV	87.0	FEMALE	Primary Tumor	ovarian serous cystadenocarcinoma

Homework 3 (Due Monday, March 02 before class)¶

Classification Using TCGA Pan-Cancer data¶

What makes Pan-Cancer data challenging and interesting?¶

Data used in this notebook¶

Biological meaning of labels¶

Learning objectives¶

Set-Up¶

Part 1: Load TCGA Pancancer Data¶

Part 2: Preprocessing¶

Part 3: Dimension Reduction¶

Question:¶

Part 4: Train multiple classifiers and compare¶

Questions:¶

	samples	cancer type abbreviation	age_at_initial_pathologic_diagnosis	gender	sample_type	_primary_disease
sample
TCGA-V4-A9EE-01	TCGA-V4-A9EE-01	UVM	86.0	MALE	Primary Tumor	uveal melanoma
TCGA-VD-AA8N-01	TCGA-VD-AA8N-01	UVM	86.0	MALE	Primary Tumor	uveal melanoma
TCGA-V4-A9F5-01	TCGA-V4-A9F5-01	UVM	85.0	FEMALE	Primary Tumor	uveal melanoma
TCGA-V4-A9EU-01	TCGA-V4-A9EU-01	UVM	83.0	FEMALE	Primary Tumor	uveal melanoma
TCGA-VD-AA8T-01	TCGA-VD-AA8T-01	UVM	83.0	FEMALE	Primary Tumor	uveal melanoma
...	...	...	...	...	...	...
TCGA-DM-A286-01	TCGA-DM-A286-01	NaN	NaN	NaN	NaN	NaN
TCGA-EE-A2A6-01	TCGA-EE-A2A6-01	NaN	NaN	NaN	NaN	NaN
TCGA-HC-7741-01	TCGA-HC-7741-01	NaN	NaN	NaN	NaN	NaN
TCGA-HC-8212-01	TCGA-HC-8212-01	NaN	NaN	NaN	NaN	NaN
TCGA-NH-A6GC-06	TCGA-NH-A6GC-06	NaN	NaN	NaN	NaN	NaN