In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report

url1 = "https://raw.githubusercontent.com/PacktPublishing/Deep-Learning-for-Genomics-/main/Chapter03/lung/GSE37764.csv.zip"
url2 = "https://raw.githubusercontent.com/PacktPublishing/Deep-Learning-for-Genomics-/main/Chapter03/lung/GSE60052.csv.zip"
url3 = "https://raw.githubusercontent.com/PacktPublishing/Deep-Learning-for-Genomics-/main/Chapter03/lung/GSE87340.csv.zip"
url4 = "https://sbcb.inf.ufrgs.br/data/curda/dataset/GSE40419/GSE40419.csv.zip"
lung1 = pd.read_csv(url1, compression="zip")
lung2 = pd.read_csv(url2, compression="zip")
lung3 = pd.read_csv(url3, compression="zip")
lung4 = pd.read_csv(url4, compression="zip")

lung_1_4 = pd.concat([lung1, lung2, lung3, lung4], axis=0, ignore_index=True)


print(lung_1_4.head())
    ID   class  ENSG00000000003  ENSG00000000005  ENSG00000000419  \
0  P1N  Normal         9.869563         3.359294         9.252274   
1  P1T   Tumor        11.957128         2.870308         9.673054   
2  P3N  Normal         9.573051         3.370413         9.177268   
3  P3T   Tumor        11.750409         2.870308         9.178781   
4  P4N  Normal         9.347072         3.703784         9.021748   

   ENSG00000000457  ENSG00000000460  ENSG00000000938  ENSG00000000971  \
0         8.919576         7.394545        13.664387        13.268303   
1         9.994453         8.393224         9.925747        14.117760   
2         9.279816         7.376917        11.449583        13.214280   
3        10.407126         8.580091         9.525149        12.777749   
4         8.872918         7.587812        12.715550        13.486744   

   ENSG00000001036  ...  ENSG00000285985  ENSG00000285986  ENSG00000285987  \
0        10.681938  ...         2.870308         2.870308         2.870308   
1        11.041954  ...         2.870308         2.870308         2.870308   
2        10.657062  ...         2.870308         2.870308         2.870308   
3        11.110023  ...         2.870308         2.870308         2.870308   
4        10.544305  ...         2.870308         2.870308         2.870308   

   ENSG00000285988  ENSG00000285989  ENSG00000285990  ENSG00000285991  \
0         2.870308         2.870308         2.870308         5.276223   
1         2.870308         2.870308         2.870308         4.549299   
2         2.870308         2.870308         2.870308         5.505797   
3         3.698260         2.870308         2.870308         5.024206   
4         2.870308         2.870308         2.870308         4.912281   

   ENSG00000285992  ENSG00000285993  ENSG00000285994  
0         2.870308         2.870308         5.796883  
1         3.561798         2.870308         5.341960  
2         2.870308         3.369478         4.640682  
3         2.870308         3.545341         4.818864  
4         3.375706         2.870308         4.254559  

[5 rows x 58737 columns]
In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report

## Print the first 5 rows and 10 columns
print(lung_1_4.iloc[:,0:10].head())
    ID   class  ENSG00000000003  ENSG00000000005  ENSG00000000419  \
0  P1N  Normal         9.869563         3.359294         9.252274   
1  P1T   Tumor        11.957128         2.870308         9.673054   
2  P3N  Normal         9.573051         3.370413         9.177268   
3  P3T   Tumor        11.750409         2.870308         9.178781   
4  P4N  Normal         9.347072         3.703784         9.021748   

   ENSG00000000457  ENSG00000000460  ENSG00000000938  ENSG00000000971  \
0         8.919576         7.394545        13.664387        13.268303   
1         9.994453         8.393224         9.925747        14.117760   
2         9.279816         7.376917        11.449583        13.214280   
3        10.407126         8.580091         9.525149        12.777749   
4         8.872918         7.587812        12.715550        13.486744   

   ENSG00000001036  
0        10.681938  
1        11.041954  
2        10.657062  
3        11.110023  
4        10.544305  
In [3]:
# Data Preprocessing:

## Print the total number of missing values for each columns
print(lung_1_4.isna().sum())

## Print the total number of missing values for all gene expression columns combined together
print(lung_1_4.isna().sum().sum())
ID                 0
class              0
ENSG00000000003    0
ENSG00000000005    0
ENSG00000000419    0
                  ..
ENSG00000285990    0
ENSG00000285991    0
ENSG00000285992    0
ENSG00000285993    0
ENSG00000285994    0
Length: 58737, dtype: int64
0
In [4]:
# EDA

df_counts = lung_1_4['class'].value_counts().reset_index()
df_counts.columns = ['class_label', 'count']   # rename for clarity
print("\nClass counts:")
print(df_counts)

palette = sns.color_palette("tab10", n_colors=len(df_counts))


# Plot: horizontal barplot (class names on y)
plt.figure(figsize=(8, max(4, 0.4 * len(df_counts))))
sns.barplot(
    x='count',
    y='class_label',
    hue='class_label',      # assign hue
    data=df_counts,
    palette=palette,
    orient='h',
    legend=False            # suppress legend
)

plt.xlabel("Number of samples")
plt.ylabel("Class")
plt.title("Sample counts per lung cancer class")
plt.tight_layout()
plt.savefig(
    "lung_cancer_class_distribution.png",
    dpi=300,
    bbox_inches="tight"
)
plt.show()
Class counts:
  class_label  count
0       Tumor    191
1      Normal    110
2      Normal      6
3       Tumor      6
No description has been provided for this image
In [5]:
### Correct Label 
#print(set(lung_1_4['class']))
lung_1_4['class'] = lung_1_4['class'].replace(' Normal', 'Normal')
lung_1_4['class'] = lung_1_4['class'].replace(' Tumor', 'Tumor')
print(set(lung_1_4['class']))
{'Normal', 'Tumor'}
In [6]:
## plotting the distribution of samples corresponding to each lung cancer type
df_counts = lung_1_4['class'].value_counts().reset_index()
df_counts.columns = ['class_label', 'count']   # rename for clarity
print("\nClass counts:")
print(df_counts)

## visualize the classes after fixing the columns
plt.figure(figsize=(8, max(4, 0.4 * len(df_counts))))
sns.barplot(
    x='count',
    y='class_label',
    hue='class_label',      # assign hue
    data=df_counts,
    palette=palette,
    orient='h',
    legend=False            # suppress legend
)

plt.xlabel("Number of samples")
plt.ylabel("Class")
plt.title("Sample counts per lung cancer class")
plt.tight_layout()
plt.savefig(
    "lung_cancer_class_distribution2.png",
    dpi=300,
    bbox_inches="tight"
)
plt.show()
Class counts:
  class_label  count
0       Tumor    197
1      Normal    116
/var/folders/kp/4rq4gv392szf4h4dx523w7s4_rl2wj/T/ipykernel_93647/1735670681.py:9: UserWarning: The palette list has more values (4) than needed (2), which may not be intended.
  sns.barplot(
No description has been provided for this image
In [7]:
classes = lung_1_4['class'].unique().tolist()
print(classes)
['Normal', 'Tumor']
In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np

# Separate features and labels
X = lung_1_4.drop(['class', 'ID'], axis=1)
y = lung_1_4['class']

# Encode labels (binary)
y_bin = (y == "Tumor").astype(int)   # adjust label name if needed

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train logistic regression
lr = LogisticRegression(max_iter=2000)
lr.fit(X_scaled, y_bin)

# Extract coefficients
coef = lr.coef_[0]

# Create dataframe of gene importance
gene_importance = pd.DataFrame({
    'gene': X.columns,
    'coefficient': coef,
    'abs_coefficient': np.abs(coef)
})

# Rank genes by importance
gene_importance = gene_importance.sort_values(
    by='abs_coefficient', ascending=False
)

# Top 20 important genes
gene_importance.head(20)
Out[8]:
gene coefficient abs_coefficient
10429 ENSG00000160181 0.023452 0.023452
32688 ENSG00000235142 0.018991 0.018991
4770 ENSG00000118017 0.018199 0.018199
15216 ENSG00000182333 0.018118 0.018118
5672 ENSG00000125571 0.018036 0.018036
52027 ENSG00000272620 0.018017 0.018017
15841 ENSG00000184937 -0.017902 0.017902
11301 ENSG00000164266 0.016838 0.016838
14753 ENSG00000179914 -0.016679 0.016679
39312 ENSG00000250726 0.016361 0.016361
2335 ENSG00000100557 0.016244 0.016244
9123 ENSG00000148702 0.016112 0.016112
36082 ENSG00000241612 -0.015629 0.015629
7937 ENSG00000140297 0.015604 0.015604
3673 ENSG00000109511 0.015490 0.015490
23070 ENSG00000215183 0.015463 0.015463
52836 ENSG00000274092 -0.015353 0.015353
25628 ENSG00000224821 0.015304 0.015304
57335 ENSG00000283544 0.015304 0.015304
7223 ENSG00000136149 -0.014910 0.014910
In [9]:
import seaborn as sns
import matplotlib.pyplot as plt

top_n = 20
top_genes = gene_importance.head(top_n)

plt.figure(figsize=(8,6))
sns.barplot(
    x='coefficient',
    y='gene',
    data=top_genes,
    palette='coolwarm'
)
plt.axvline(0, color='black', linewidth=1)
plt.title("Top 20 important genes (Logistic Regression)")
plt.xlabel("Coefficient value")
plt.ylabel("Gene")
plt.tight_layout()
plt.show()
/var/folders/kp/4rq4gv392szf4h4dx523w7s4_rl2wj/T/ipykernel_93647/46062173.py:8: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(
No description has been provided for this image
In [15]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold

# X: samples x genes (DataFrame), y: binary labels (Series)
# Example feature transform: log1p
log1p = FunctionTransformer(np.log1p, validate=False)

pipe = Pipeline([
    ('log', log1p),
    ('scaler', StandardScaler()),                  # center & scale
    ('select', SelectKBest(f_classif, k=500)),     # use top 500 genes
    ('clf', LogisticRegression(penalty='l2', C=1.0, max_iter=1000, solver='saga',
                               class_weight='balanced', random_state=0))
])

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
aucs = cross_val_score(pipe, X, y, cv=cv, scoring='roc_auc', n_jobs=-1)
print("AUROC (5-fold):", np.mean(aucs), np.std(aucs))
/opt/anaconda3/lib/python3.13/site-packages/sklearn/linear_model/_sag.py:348: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
/opt/anaconda3/lib/python3.13/site-packages/sklearn/linear_model/_sag.py:348: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
/opt/anaconda3/lib/python3.13/site-packages/sklearn/linear_model/_sag.py:348: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
/opt/anaconda3/lib/python3.13/site-packages/sklearn/linear_model/_sag.py:348: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
Exception ignored in: <function ResourceTracker.__del__ at 0x1066398a0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 84, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 93, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 118, in _stop_locked
ChildProcessError: [Errno 10] No child processes
AUROC (5-fold): 0.9971460423634337 0.0033471561212566527
In [13]:
print(aucs)
[1.         0.99456522 1.         0.99888517 0.99331104]