In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
url1 = "https://raw.githubusercontent.com/PacktPublishing/Deep-Learning-for-Genomics-/main/Chapter03/lung/GSE37764.csv.zip"
url2 = "https://raw.githubusercontent.com/PacktPublishing/Deep-Learning-for-Genomics-/main/Chapter03/lung/GSE60052.csv.zip"
url3 = "https://raw.githubusercontent.com/PacktPublishing/Deep-Learning-for-Genomics-/main/Chapter03/lung/GSE87340.csv.zip"
url4 = "https://sbcb.inf.ufrgs.br/data/curda/dataset/GSE40419/GSE40419.csv.zip"
lung1 = pd.read_csv(url1, compression="zip")
lung2 = pd.read_csv(url2, compression="zip")
lung3 = pd.read_csv(url3, compression="zip")
lung4 = pd.read_csv(url4, compression="zip")
lung_1_4 = pd.concat([lung1, lung2, lung3, lung4], axis=0, ignore_index=True)
print(lung_1_4.head())
ID class ENSG00000000003 ENSG00000000005 ENSG00000000419 \ 0 P1N Normal 9.869563 3.359294 9.252274 1 P1T Tumor 11.957128 2.870308 9.673054 2 P3N Normal 9.573051 3.370413 9.177268 3 P3T Tumor 11.750409 2.870308 9.178781 4 P4N Normal 9.347072 3.703784 9.021748 ENSG00000000457 ENSG00000000460 ENSG00000000938 ENSG00000000971 \ 0 8.919576 7.394545 13.664387 13.268303 1 9.994453 8.393224 9.925747 14.117760 2 9.279816 7.376917 11.449583 13.214280 3 10.407126 8.580091 9.525149 12.777749 4 8.872918 7.587812 12.715550 13.486744 ENSG00000001036 ... ENSG00000285985 ENSG00000285986 ENSG00000285987 \ 0 10.681938 ... 2.870308 2.870308 2.870308 1 11.041954 ... 2.870308 2.870308 2.870308 2 10.657062 ... 2.870308 2.870308 2.870308 3 11.110023 ... 2.870308 2.870308 2.870308 4 10.544305 ... 2.870308 2.870308 2.870308 ENSG00000285988 ENSG00000285989 ENSG00000285990 ENSG00000285991 \ 0 2.870308 2.870308 2.870308 5.276223 1 2.870308 2.870308 2.870308 4.549299 2 2.870308 2.870308 2.870308 5.505797 3 3.698260 2.870308 2.870308 5.024206 4 2.870308 2.870308 2.870308 4.912281 ENSG00000285992 ENSG00000285993 ENSG00000285994 0 2.870308 2.870308 5.796883 1 3.561798 2.870308 5.341960 2 2.870308 3.369478 4.640682 3 2.870308 3.545341 4.818864 4 3.375706 2.870308 4.254559 [5 rows x 58737 columns]
In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
## Print the first 5 rows and 10 columns
print(lung_1_4.iloc[:,0:10].head())
ID class ENSG00000000003 ENSG00000000005 ENSG00000000419 \ 0 P1N Normal 9.869563 3.359294 9.252274 1 P1T Tumor 11.957128 2.870308 9.673054 2 P3N Normal 9.573051 3.370413 9.177268 3 P3T Tumor 11.750409 2.870308 9.178781 4 P4N Normal 9.347072 3.703784 9.021748 ENSG00000000457 ENSG00000000460 ENSG00000000938 ENSG00000000971 \ 0 8.919576 7.394545 13.664387 13.268303 1 9.994453 8.393224 9.925747 14.117760 2 9.279816 7.376917 11.449583 13.214280 3 10.407126 8.580091 9.525149 12.777749 4 8.872918 7.587812 12.715550 13.486744 ENSG00000001036 0 10.681938 1 11.041954 2 10.657062 3 11.110023 4 10.544305
In [3]:
# Data Preprocessing:
## Print the total number of missing values for each columns
print(lung_1_4.isna().sum())
## Print the total number of missing values for all gene expression columns combined together
print(lung_1_4.isna().sum().sum())
ID 0
class 0
ENSG00000000003 0
ENSG00000000005 0
ENSG00000000419 0
..
ENSG00000285990 0
ENSG00000285991 0
ENSG00000285992 0
ENSG00000285993 0
ENSG00000285994 0
Length: 58737, dtype: int64
0
In [4]:
# EDA
df_counts = lung_1_4['class'].value_counts().reset_index()
df_counts.columns = ['class_label', 'count'] # rename for clarity
print("\nClass counts:")
print(df_counts)
palette = sns.color_palette("tab10", n_colors=len(df_counts))
# Plot: horizontal barplot (class names on y)
plt.figure(figsize=(8, max(4, 0.4 * len(df_counts))))
sns.barplot(
x='count',
y='class_label',
hue='class_label', # assign hue
data=df_counts,
palette=palette,
orient='h',
legend=False # suppress legend
)
plt.xlabel("Number of samples")
plt.ylabel("Class")
plt.title("Sample counts per lung cancer class")
plt.tight_layout()
plt.savefig(
"lung_cancer_class_distribution.png",
dpi=300,
bbox_inches="tight"
)
plt.show()
Class counts: class_label count 0 Tumor 191 1 Normal 110 2 Normal 6 3 Tumor 6
In [5]:
### Correct Label
#print(set(lung_1_4['class']))
lung_1_4['class'] = lung_1_4['class'].replace(' Normal', 'Normal')
lung_1_4['class'] = lung_1_4['class'].replace(' Tumor', 'Tumor')
print(set(lung_1_4['class']))
{'Normal', 'Tumor'}
In [6]:
## plotting the distribution of samples corresponding to each lung cancer type
df_counts = lung_1_4['class'].value_counts().reset_index()
df_counts.columns = ['class_label', 'count'] # rename for clarity
print("\nClass counts:")
print(df_counts)
## visualize the classes after fixing the columns
plt.figure(figsize=(8, max(4, 0.4 * len(df_counts))))
sns.barplot(
x='count',
y='class_label',
hue='class_label', # assign hue
data=df_counts,
palette=palette,
orient='h',
legend=False # suppress legend
)
plt.xlabel("Number of samples")
plt.ylabel("Class")
plt.title("Sample counts per lung cancer class")
plt.tight_layout()
plt.savefig(
"lung_cancer_class_distribution2.png",
dpi=300,
bbox_inches="tight"
)
plt.show()
Class counts: class_label count 0 Tumor 197 1 Normal 116
/var/folders/kp/4rq4gv392szf4h4dx523w7s4_rl2wj/T/ipykernel_93647/1735670681.py:9: UserWarning: The palette list has more values (4) than needed (2), which may not be intended. sns.barplot(
In [7]:
classes = lung_1_4['class'].unique().tolist()
print(classes)
['Normal', 'Tumor']
In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np
# Separate features and labels
X = lung_1_4.drop(['class', 'ID'], axis=1)
y = lung_1_4['class']
# Encode labels (binary)
y_bin = (y == "Tumor").astype(int) # adjust label name if needed
# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Train logistic regression
lr = LogisticRegression(max_iter=2000)
lr.fit(X_scaled, y_bin)
# Extract coefficients
coef = lr.coef_[0]
# Create dataframe of gene importance
gene_importance = pd.DataFrame({
'gene': X.columns,
'coefficient': coef,
'abs_coefficient': np.abs(coef)
})
# Rank genes by importance
gene_importance = gene_importance.sort_values(
by='abs_coefficient', ascending=False
)
# Top 20 important genes
gene_importance.head(20)
Out[8]:
| gene | coefficient | abs_coefficient | |
|---|---|---|---|
| 10429 | ENSG00000160181 | 0.023452 | 0.023452 |
| 32688 | ENSG00000235142 | 0.018991 | 0.018991 |
| 4770 | ENSG00000118017 | 0.018199 | 0.018199 |
| 15216 | ENSG00000182333 | 0.018118 | 0.018118 |
| 5672 | ENSG00000125571 | 0.018036 | 0.018036 |
| 52027 | ENSG00000272620 | 0.018017 | 0.018017 |
| 15841 | ENSG00000184937 | -0.017902 | 0.017902 |
| 11301 | ENSG00000164266 | 0.016838 | 0.016838 |
| 14753 | ENSG00000179914 | -0.016679 | 0.016679 |
| 39312 | ENSG00000250726 | 0.016361 | 0.016361 |
| 2335 | ENSG00000100557 | 0.016244 | 0.016244 |
| 9123 | ENSG00000148702 | 0.016112 | 0.016112 |
| 36082 | ENSG00000241612 | -0.015629 | 0.015629 |
| 7937 | ENSG00000140297 | 0.015604 | 0.015604 |
| 3673 | ENSG00000109511 | 0.015490 | 0.015490 |
| 23070 | ENSG00000215183 | 0.015463 | 0.015463 |
| 52836 | ENSG00000274092 | -0.015353 | 0.015353 |
| 25628 | ENSG00000224821 | 0.015304 | 0.015304 |
| 57335 | ENSG00000283544 | 0.015304 | 0.015304 |
| 7223 | ENSG00000136149 | -0.014910 | 0.014910 |
In [9]:
import seaborn as sns
import matplotlib.pyplot as plt
top_n = 20
top_genes = gene_importance.head(top_n)
plt.figure(figsize=(8,6))
sns.barplot(
x='coefficient',
y='gene',
data=top_genes,
palette='coolwarm'
)
plt.axvline(0, color='black', linewidth=1)
plt.title("Top 20 important genes (Logistic Regression)")
plt.xlabel("Coefficient value")
plt.ylabel("Gene")
plt.tight_layout()
plt.show()
/var/folders/kp/4rq4gv392szf4h4dx523w7s4_rl2wj/T/ipykernel_93647/46062173.py:8: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. sns.barplot(
In [15]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
# X: samples x genes (DataFrame), y: binary labels (Series)
# Example feature transform: log1p
log1p = FunctionTransformer(np.log1p, validate=False)
pipe = Pipeline([
('log', log1p),
('scaler', StandardScaler()), # center & scale
('select', SelectKBest(f_classif, k=500)), # use top 500 genes
('clf', LogisticRegression(penalty='l2', C=1.0, max_iter=1000, solver='saga',
class_weight='balanced', random_state=0))
])
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
aucs = cross_val_score(pipe, X, y, cv=cv, scoring='roc_auc', n_jobs=-1)
print("AUROC (5-fold):", np.mean(aucs), np.std(aucs))
/opt/anaconda3/lib/python3.13/site-packages/sklearn/linear_model/_sag.py:348: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn( /opt/anaconda3/lib/python3.13/site-packages/sklearn/linear_model/_sag.py:348: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn( /opt/anaconda3/lib/python3.13/site-packages/sklearn/linear_model/_sag.py:348: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn( /opt/anaconda3/lib/python3.13/site-packages/sklearn/linear_model/_sag.py:348: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn( Exception ignored in: <function ResourceTracker.__del__ at 0x1066398a0> Traceback (most recent call last): File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 84, in __del__ File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 93, in _stop File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 118, in _stop_locked ChildProcessError: [Errno 10] No child processes
AUROC (5-fold): 0.9971460423634337 0.0033471561212566527
In [13]:
print(aucs)
[1. 0.99456522 1. 0.99888517 0.99331104]