import numpy as np
import pandas as pd
import os
os.getcwd()

'/Users/hoyen/Desktop/STAT718-2026/PythonCode/PythonIntro'

# Build the full path of the input CSV.  os.path.join inserts the path
# separator only when it is needed, so `dr` works with or without a
# trailing slash (plain string concatenation silently required one).
dr = '/Users/hoyen/Desktop/STAT718-2026/PythonCode/PythonIntro/'
filename = 'df3.csv'
file = os.path.join(dr, filename)
print(file)

/Users/hoyen/Desktop/STAT718-2026/PythonCode/PythonIntro/df3.csv

# Load the CSV into a DataFrame: header=0 takes the column names from the
# first row, index_col=0 uses the first column as the row index.
dat=pd.read_csv(file, header=0, index_col=0)

dat.head()  # preview the first rows

type(dat)  # check the type of the object returned by read_csv

pandas.core.frame.DataFrame

print(dat['ENSG00000000419'])

ensgene
SRR1039508    467.0
SRR1039509    523.0
SRR1039512    616.0
SRR1039513    371.0
SRR1039516    582.0
SRR1039517    781.0
SRR1039520    417.0
SRR1039521    509.0
Name: ENSG00000000419, dtype: float64

print(dat.ENSG00000000419)  # attribute-style column access; each column of a pandas DataFrame is a Series

ensgene
SRR1039508    467.0
SRR1039509    523.0
SRR1039512    616.0
SRR1039513    371.0
SRR1039516    582.0
SRR1039517    781.0
SRR1039520    417.0
SRR1039521    509.0
Name: ENSG00000000419, dtype: float64

dat.loc[:, "ENSG00000000457"] # select one gene across all samples

ensgene
SRR1039508    347.0
SRR1039509    258.0
SRR1039512    364.0
SRR1039513    237.0
SRR1039516    318.0
SRR1039517    447.0
SRR1039520    330.0
SRR1039521    324.0
Name: ENSG00000000457, dtype: float64

dat.loc["SRR1039509"]  # one sample across all genes

ENSG00000000003     486.0
ENSG00000000005       0.0
ENSG00000000419     523.0
ENSG00000000457     258.0
ENSG00000000460      81.0
ENSG00000000938       0.0
ENSG00000000971    3916.0
ENSG00000001036    1714.0
ENSG00000001084     372.0
ENSG00000001167     295.0
Name: SRR1039509, dtype: float64

dat.loc[["SRR1039512", "SRR1039521"], ["ENSG00000000005", "ENSG00000000457"]] ## select multiple genes and samples

dat["ENSG00000000003"] < 1000

ensgene
SRR1039508     True
SRR1039509     True
SRR1039512     True
SRR1039513     True
SRR1039516    False
SRR1039517    False
SRR1039520     True
SRR1039521     True
Name: ENSG00000000003, dtype: bool

dat.loc[dat["ENSG00000000003"] < 1000] ## Conditional selection with boolean filtering

dat.iloc[:5,:5] # the first 5 rows and 5 columns

# Introduce a missing value at row position 1, column position 0
# (assigning None instead of np.nan gives the same NaN in this float column).
dat.iloc[1, 0] = np.nan
dat.head()

dat.notna().loc["SRR1039509"]

ENSG00000000003    False
ENSG00000000005     True
ENSG00000000419     True
ENSG00000000457     True
ENSG00000000460     True
ENSG00000000938     True
ENSG00000000971     True
ENSG00000001036     True
ENSG00000001084     True
ENSG00000001167     True
Name: SRR1039509, dtype: bool

dat.loc["SRR1039509"].dropna()  # drop the genes that are NaN for this one sample; this returns a pandas Series

ENSG00000000005       0.0
ENSG00000000419     523.0
ENSG00000000457     258.0
ENSG00000000460      81.0
ENSG00000000938       0.0
ENSG00000000971    3916.0
ENSG00000001036    1714.0
ENSG00000001084     372.0
ENSG00000001167     295.0
Name: SRR1039509, dtype: float64

# Row-wise dropna (the default axis): every row containing a missing value is
# removed, so the entire SRR1039509 sample is dropped because of one gene!
dat2 = dat.dropna(axis="index")
dat2

# Column-wise dropna: every column containing a missing value is removed, so
# the entire gene ENSG00000000003 is dropped because of one sample!
dat3 = dat.dropna(axis="columns")
dat3

# Finer-grained control: thresh is the minimum number of non-null values a
# row/column must have in order to be kept.
dat4 = dat.dropna(axis="index", thresh=9)
dat4

dat.mean(axis=0)

ENSG00000000003     821.285714
ENSG00000000005       0.000000
ENSG00000000419     533.250000
ENSG00000000457     328.125000
ENSG00000000460      88.000000
ENSG00000000938       0.375000
ENSG00000000971    5953.250000
ENSG00000001036    2056.375000
ENSG00000001084     666.875000
ENSG00000001167     438.000000
dtype: float64

# Impute every missing entry with the mean of its own column (gene); the
# column means are computed over the non-missing samples only.
dat_filled = dat.fillna(value=dat.mean(axis="index"))
dat_filled

## Detecting missing values and calculating the missing percentage

# Import the gene expression data of a study directly from its URL; the first
# column of the file becomes the row index.
url = "https://raw.githubusercontent.com/bioconnector/workshops/master/data/airway_scaledcounts.csv"
df2 = pd.read_csv(url, index_col=0)
df2.head()
# Transpose so that samples are in the rows and genes are in the columns.
df3 = df2.transpose()
df3.head()

# Boolean mask with the same shape as df3: True wherever a value is missing.
missing_mask = df3.isnull()
missing_mask

# Summing booleans counts the True entries, which gives the number of
# missing values in each row (sample).
sample_missing = missing_mask.sum(axis="columns")
sample_missing

SRR1039508    0
SRR1039509    0
SRR1039512    0
SRR1039513    0
SRR1039516    0
SRR1039517    0
SRR1039520    0
SRR1039521    0
dtype: int64

def zero_inflation_rate(x):
    """
    Compute the zero-inflation rate of a single gene.

    Parameters
    ----------
    x : pd.Series
        Expression values of one gene, one entry per sample.

    Returns
    -------
    float
        Fraction of the non-missing values that are exactly zero, or NaN
        when there is no non-missing value at all.
    """
    observed = x.dropna()           # missing values do not count toward the rate
    n_observed = observed.size
    if n_observed == 0:
        return np.nan               # nothing observed: the rate is undefined
    n_zeros = observed.eq(0).sum()
    return n_zeros / n_observed

zir_apply = df3.apply(zero_inflation_rate, axis=0)  # axis=0: call the function once per column (gene), collapsing the rows
zir_apply.shape  # one rate per gene

(38694,)

# Vectorized equivalent: number of zeros per gene divided by the number of
# non-missing values per gene.
zir_vec = df3.eq(0).sum(axis="index").div(df3.notna().sum(axis="index"))
np.allclose(zir_apply, zir_vec, equal_nan=True)  # double check that both approaches agree

True

import matplotlib.pyplot as plt

# Histogram of the per-gene zero-inflation rates.
plt.figure()
plt.hist(zir_apply, bins=30)
plt.gca().set(
    xlabel="Zero-inflation rate",
    ylabel="Number of genes",
    title="Histogram of Zero-inflation Rates per Gene",
)
plt.show()

# Keep all samples, but only the genes whose zero-inflation rate is below 0.8.
df4 = df3.loc[:, zir_apply<0.8]
df4.shape  # the filter leaves 22,627 genes

(8, 22627)

import anndata as ad

counts_A = pd.read_csv("adata1_counts.csv.gz", index_col=0, compression="gzip")  # read the gzip-compressed count table of batch A
counts_A.shape

(2000, 300)

counts_B = pd.read_csv("adata2_counts.csv.gz", index_col=0, compression="gzip")  # second count table, read the same way (presumably batch B)
counts_B.shape

(2000, 300)

counts_A.head()

# Cell metadata for each of the two data sets and the gene metadata; the
# first column of every file is used as the row index.
obs_A = pd.read_csv("adata1_cell_metadata.csv", index_col=0)
obs_B= pd.read_csv("adata2_cell_metadata.csv", index_col=0)
var = pd.read_csv("gene_metadata.csv", index_col=0)
obs_A.shape

(300, 2)

obs_A.head()

obs_A["batch"].value_counts()

batch
A    300
Name: count, dtype: int64

obs_A["condition"].value_counts()

condition
ctrl     150
treat    150
Name: count, dtype: int64

var.head()

counts=pd.concat([counts_A, counts_B], axis=1, join="outer")  # axis=1: add the samples of the second table as new columns; join="outer": keep all genes
counts.shape

(2000, 600)

obs=pd.concat([obs_A, obs_B], axis=0, join="outer")  # axis=0: stack the metadata rows of both batches; join="outer": keep all columns
obs.shape

(600, 2)

obs.head()

pd.crosstab(obs["batch"], obs["condition"])

#### Annotated data object for a single-cell experiment in Scanpy
# counts has genes in rows and cells in columns; it is transposed so that
# the AnnData observations are the cells and the variables are the genes.
adata = ad.AnnData(
    X=counts.T,
    obs=obs.loc[counts.columns],   # align cells: metadata rows in the order of the columns of counts
    var=var.loc[counts.index]       # align genes: metadata rows in the order of the rows of counts
)
adata

AnnData object with n_obs × n_vars = 600 × 2000
    obs: 'batch', 'condition'

df4.head()

#### Calculate the mean of each gene across samples
gene_mean=df4.mean(axis=0)  # axis=0 collapses the rows (samples), leaving one mean per gene
gene_mean.head()

ensgene
ENSG00000000003    779.375
ENSG00000000419    533.250
ENSG00000000457    328.125
ENSG00000000460     88.000
ENSG00000000938      0.375
dtype: float64

obs.head()

batch = obs["batch"]  # batch label of every cell, as a Series indexed like obs
batch.shape

(600,)

# Mean expression of every gene within each batch.
# DataFrame.groupby(..., axis=1) is deprecated in pandas, so the table is
# transposed (cells in rows), grouped by the batch label of each cell, and
# the result is transposed back to genes x batches.
gene_batch_mean = counts.T.groupby(batch).mean().T
gene_batch_mean.head()

/var/folders/kp/4rq4gv392szf4h4dx523w7s4_rl2wj/T/ipykernel_98485/2968416828.py:1: FutureWarning: DataFrame.groupby with axis=1 is deprecated. Do `frame.T.groupby(...)` without axis instead.
  gene_batch_mean=counts.groupby(batch, axis=1).mean()

	A_cell0	A_cell1	A_cell2	A_cell3	A_cell4	A_cell5	A_cell6	A_cell7	A_cell8	A_cell9	...	A_cell290	A_cell291	A_cell292	A_cell293	A_cell294	A_cell295	A_cell296	A_cell297	A_cell298	A_cell299
Gene0	1	3	4	1	2	0	0	1	0	3	...	3	4	3	3	1	1	5	1	2	1
Gene1	4	2	13	0	3	2	3	2	0	1	...	2	1	1	11	0	2	0	2	4	0
Gene2	3	2	2	0	7	1	3	5	0	1	...	1	4	2	2	0	2	1	0	0	4
Gene3	0	2	1	0	2	2	3	1	2	1	...	3	1	1	7	0	3	0	0	4	6
Gene4	2	4	7	3	2	3	1	1	1	1	...	0	5	7	5	1	8	5	0	6	0

batch	A	B
Gene0	1.980000	3.260000
Gene1	3.253333	4.166667
Gene2	2.003333	3.056667
Gene3	1.753333	2.373333
Gene4	3.340000	4.923333

Importing Data and Manipulating Data with Pandas¶

Indexer: loc and iloc¶

Label-based indexing with .loc¶

Integer-position-based indexing with .iloc¶

Handling missing values: isnull(), notnull(), dropna(), fillna()¶

A function to calculate zero-inflation rate per gene¶

Apply¶

Vectorized version of the zero-inflation rate per gene¶

Subsetting genes with zero-inflation rate < 0.8¶

Merging datasets: concat and append¶

Merging single-cell RNA-seq experiments from two batches¶

Merging the two cell metadata tables¶

Aggregating data in Pandas¶

	ENSG00000000003	ENSG00000000005	ENSG00000000419	ENSG00000000457	ENSG00000000460	ENSG00000000938	ENSG00000000971	ENSG00000001036	ENSG00000001084	ENSG00000001167
ensgene
SRR1039508	723.0	0.0	467.0	347.0	96.0	0.0	3413.0	2328.0	670.0	426.0
SRR1039509	486.0	0.0	523.0	258.0	81.0	0.0	3916.0	1714.0	372.0	295.0
SRR1039512	904.0	0.0	616.0	364.0	73.0	1.0	6000.0	2640.0	692.0	531.0
SRR1039513	445.0	0.0	371.0	237.0	66.0	0.0	4308.0	1381.0	448.0	178.0
SRR1039516	1170.0	0.0	582.0	318.0	118.0	2.0	6424.0	2165.0	917.0	740.0

	ENSG00000000003	ENSG00000000005	ENSG00000000419	ENSG00000000457	ENSG00000000460	ENSG00000000938	ENSG00000000971	ENSG00000001036	ENSG00000001084	ENSG00000001167
ensgene
SRR1039508	723.000000	0.0	467.0	347.0	96.0	0.0	3413.0	2328.0	670.0	426.0
SRR1039509	821.285714	0.0	523.0	258.0	81.0	0.0	3916.0	1714.0	372.0	295.0
SRR1039512	904.000000	0.0	616.0	364.0	73.0	1.0	6000.0	2640.0	692.0	531.0
SRR1039513	445.000000	0.0	371.0	237.0	66.0	0.0	4308.0	1381.0	448.0	178.0
SRR1039516	1170.000000	0.0	582.0	318.0	118.0	2.0	6424.0	2165.0	917.0	740.0
SRR1039517	1097.000000	0.0	781.0	447.0	94.0	0.0	10723.0	2262.0	807.0	651.0
SRR1039520	806.000000	0.0	417.0	330.0	102.0	0.0	5039.0	2175.0	744.0	414.0
SRR1039521	604.000000	0.0	509.0	324.0	74.0	0.0	7803.0	1786.0	685.0	269.0

ensgene	ENSG00000000003	ENSG00000000005	ENSG00000000419	ENSG00000000457	ENSG00000000460	ENSG00000000938	ENSG00000000971	ENSG00000001036	ENSG00000001084	ENSG00000001167	...	ENSG00000283107	ENSG00000283110	ENSG00000283111	ENSG00000283113	ENSG00000283114	ENSG00000283115	ENSG00000283116	ENSG00000283119	ENSG00000283120	ENSG00000283123
SRR1039508	False	False	False	False	False	False	False	False	False	False	...	False	False	False	False	False	False	False	False	False	False
SRR1039509	False	False	False	False	False	False	False	False	False	False	...	False	False	False	False	False	False	False	False	False	False
SRR1039512	False	False	False	False	False	False	False	False	False	False	...	False	False	False	False	False	False	False	False	False	False
SRR1039513	False	False	False	False	False	False	False	False	False	False	...	False	False	False	False	False	False	False	False	False	False
SRR1039516	False	False	False	False	False	False	False	False	False	False	...	False	False	False	False	False	False	False	False	False	False
SRR1039517	False	False	False	False	False	False	False	False	False	False	...	False	False	False	False	False	False	False	False	False	False
SRR1039520	False	False	False	False	False	False	False	False	False	False	...	False	False	False	False	False	False	False	False	False	False
SRR1039521	False	False	False	False	False	False	False	False	False	False	...	False	False	False	False	False	False	False	False	False	False

	batch	condition
cell_id
A_cell0	A	ctrl
A_cell1	A	treat
A_cell2	A	ctrl
A_cell3	A	treat
A_cell4	A	ctrl