Homework: Introductory GWAS Analysis¶

Due Date: Feb 16, 2026¶

Goal: Run a reproducible GWAS analysis on simulated genotype data, evaluate basic QC, perform a per-SNP association test with covariates, visualize results, and write a short interpretation.

Learning objectives

  • Understand genotype simulation and basic quality control (minor allele frequency, MAF, missingness).
  • Run PCA to capture population structure and include PCs as covariates.
  • Perform per-SNP linear regression GWAS with covariates.

Instructions

  • Run each cell in order in JupyterLab. The notebook will create a gwas_outputs/ folder with CSVs and PNGs.
  • Modify parameters (e.g., n_samples, n_snps, or number of causal SNPs) if you want to explore effects on power, but stick to the approximate time budget.

Grading rubric (100 pts)

  • Notebook runs without errors and outputs saved: 40 pts
  • Correct GWAS implementation and inclusion of covariates: 25 pts
  • QC, PCA, and reasonable plotting: 15 pts
  • Short written interpretation, discussing results & limitations: 20 pts

Good luck — ask if you get stuck!

In [2]:
# Setup and imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.decomposition import PCA
import os

np.random.seed(2025)
os.makedirs('gwas_outputs', exist_ok=True)
print('Environment ready. Outputs will be written to gwas_outputs/')
Environment ready. Outputs will be written to gwas_outputs/

Part 1: Generate Synthetic SNP Genotype Data¶

In [4]:
# Generate synthetic genotype data
n_samples = 50
n_snps = 500
samples = [f"S{i+1:02d}" for i in range(n_samples)]
pop_labels = np.array(['A'] * (n_samples//2) + ['B'] * (n_samples - n_samples//2))

baseline_af = np.random.uniform(0.05, 0.5, size=n_snps)
popB_shift = np.random.normal(loc=0.0, scale=0.08, size=n_snps)
af_A = np.clip(baseline_af, 0.001, 0.999)
af_B = np.clip(baseline_af + popB_shift, 0.001, 0.999)

genotypes = np.full((n_samples, n_snps), np.nan)
for v in range(n_snps):
    pA = af_A[v]
    pB = af_B[v]
    probsA = [ (1-pA)**2, 2*pA*(1-pA), pA**2 ]
    probsB = [ (1-pB)**2, 2*pB*(1-pB), pB**2 ]
    genotypes[:n_samples//2, v] = np.random.choice([0,1,2], size=n_samples//2, p=probsA)
    genotypes[n_samples//2:, v] = np.random.choice([0,1,2], size=n_samples - n_samples//2, p=probsB)
genotypes[0:5, 0:5]
Out[4]:
array([[0., 0., 1., 0., 1.],
       [0., 1., 1., 0., 2.],
       [0., 1., 1., 0., 0.],
       [0., 2., 0., 0., 1.],
       [0., 1., 2., 0., 0.]])
In [5]:
# Simulate covariates: age and sex (binary)
age = np.random.normal(50, 10, size=n_samples).astype(int)
sex = np.random.binomial(1, 0.5, size=n_samples)

# Create phenotype: a polygenic trait with 10 causal SNPs
causal_idx = np.random.choice(n_snps, size=10, replace=False)
beta_snps = np.zeros(n_snps)
beta_snps[causal_idx] = np.random.normal(0.3, 0.1, size=10)

# Genetic effect
# mean-impute genotypes for calculation
G_imp = np.where(np.isnan(genotypes), np.nanmean(genotypes, axis=0), genotypes)
polygenic = G_imp.dot(beta_snps)
In [6]:
# Covariate effects
beta_age = 0.02
beta_sex = -0.1
beta_pop = 0.4

# Environmental noise
env = np.random.normal(0, 1.0, size=n_samples)
## convert population label to numeric 
pop_numeric = np.array([0 if x == 'A' else 1 for x in pop_labels])

y = 0.5 + polygenic + beta_age * age + beta_sex * sex + beta_pop * pop_numeric + env
In [7]:
df_meta = pd.DataFrame({'sample': samples, 'age': age, 'sex': sex, 'pop': pop_labels, 'y': y}).set_index('sample')
df_meta.to_csv('gwas_outputs/phenotypes.csv')
print('Phenotypes saved: gwas_outputs/phenotypes.csv')

# ~2% missing
mask_missing = np.random.rand(n_samples, n_snps) < 0.02
genotypes[mask_missing] = np.nan

snps_ids = [f'rs{100000+i}' for i in range(n_snps)]
df_geno = pd.DataFrame(genotypes, index=samples, columns=snps_ids)

print('Generated genotype matrix shape:', df_geno.shape)
df_geno.head()
Phenotypes saved: gwas_outputs/phenotypes.csv
Generated genotype matrix shape: (50, 500)
Out[7]:
rs100000 rs100001 rs100002 rs100003 rs100004 rs100005 rs100006 rs100007 rs100008 rs100009 ... rs100490 rs100491 rs100492 rs100493 rs100494 rs100495 rs100496 rs100497 rs100498 rs100499
S01 0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 1.0 1.0 ... 0.0 1.0 NaN 2.0 0.0 2.0 0.0 1.0 0.0 0.0
S02 0.0 1.0 1.0 0.0 2.0 0.0 1.0 1.0 2.0 0.0 ... 0.0 0.0 0.0 0.0 2.0 1.0 0.0 1.0 0.0 0.0
S03 0.0 1.0 1.0 0.0 0.0 0.0 0.0 1.0 1.0 0.0 ... 1.0 0.0 1.0 1.0 0.0 2.0 0.0 0.0 0.0 0.0
S04 0.0 2.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 2.0 ... 0.0 1.0 0.0 0.0 0.0 1.0 0.0 2.0 0.0 1.0
S05 0.0 1.0 2.0 0.0 0.0 0.0 2.0 2.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1.0 1.0 1.0 0.0 0.0

5 rows × 500 columns

Questions¶

  1. What does the variable `pop_labels` represent in the simulation code above? How does it affect the genotype data being generated?
  2. How is the phenotype (y) generated in the simulation above? What are the important factors that contribute to the value of y?
  3. What is the structure of the genotype matrix (`df_geno`)? What does each row represent? What about the columns? What do the values in the cells mean?
  4. How are the genotypes generated in the code above? Explain what Hardy–Weinberg equilibrium is and how it relates to how the genotypes are generated in the simulation.

Answer to Question 1: pop_labels holds the population labels for the two groups. It affects the genotype data as follows: for group "A", the alternate-allele frequency for SNP $v$ (denoted $p_v^{(A)}$) is drawn as $p_v^{(A)} \sim \text{Unif}(0.05, 0.5)$, while for group "B" it is shifted by Gaussian noise, $p_v^{(B)} = p_v^{(A)} + \epsilon_v$ with $\epsilon_v \sim \mathcal{N}(0, 0.08^2)$, and both frequencies are clipped to $[0.001, 0.999]$.

Answer to Question 2: The phenotype $(y)$ is generated as $$y_i = 0.5 + \sum_{v=1}^{500}G_{i,v}\beta_v + \beta_{age}\,\text{age}_i + \beta_{sex}\,\text{sex}_i + \beta_{pop}\,\mathbb{1}\left(\text{pop}_i = B\right) + \text{env}_i$$ where $G_{i,v} \in \{0,1,2\}$ denotes the genotype of individual $i$ at SNP $v$, encoded as the number of alternate-allele copies, and $\text{sex}_i \in \{0,1\}$ is the binary sex covariate. Also, $$\begin{aligned}\beta_v &= \begin{cases} \text{a draw from } \mathcal{N}(0.3, 0.1^2), & \text{if } v \text{ is one of the 10 causal SNPs}, \\ 0, & \text{else,} \end{cases} \\ \beta_{age} &= 0.02, \\ \beta_{sex} &= -0.1, \\ \beta_{pop} &= 0.4, \\ \text{env}_i &\sim \mathcal{N}(0,1).\end{aligned}$$ So population membership has a large impact on $y$ (coefficient $0.4$), sex has a moderate impact (coefficient $-0.1$), and since the $\beta_v$ for the causal SNPs are drawn from $\mathcal{N}(0.3, 0.1^2)$, the sum of the causal SNP effects can be quite large.
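As a sanity check on this formula, here is a minimal sketch that assembles $y_i$ term by term for one individual, using made-up toy numbers (3 SNPs and hand-picked covariate values, not the notebook's simulated data) and zero environmental noise so the result is deterministic:

```python
import numpy as np

# Toy example (hypothetical values, not from the simulation above):
# 3 SNPs, of which SNP index 1 is causal with effect 0.3
G_i = np.array([0, 2, 1])          # genotypes for individual i
beta = np.array([0.0, 0.3, 0.0])   # per-SNP effects
age_i, sex_i, pop_i = 50, 1, 1     # covariates (pop_i = 1 means population B)
env_i = 0.0                        # noise set to 0 for a deterministic check

y_i = 0.5 + G_i @ beta + 0.02 * age_i + (-0.1) * sex_i + 0.4 * pop_i + env_i
print(round(y_i, 2))  # 0.5 + 0.6 + 1.0 - 0.1 + 0.4 = 2.4
```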

Answer to Question 3: The matrix has $50$ rows representing the $50$ samples and $500$ columns representing the $500$ SNPs. Entry $(i, v)$ records whether sample $i$ carries $0$, $1$, or $2$ copies of the alternate allele at SNP $v$.

Answer to Question 4: In the code above, genotypes are simulated assuming Hardy–Weinberg equilibrium (HWE) within each population. Under HWE, if the alternate-allele frequency of a variant is $p$, the expected genotype frequencies in a large, randomly mating population with no selection, mutation, or migration are: $$\begin{aligned} (1-p)^2 &\quad \text{for genotype 0 (homozygous reference)}, \\ 2p(1-p) &\quad \text{for genotype 1 (heterozygous)}, \\ p^2 &\quad \text{for genotype 2 (homozygous alternate)}. \end{aligned}$$ These probabilities (the `probsA`/`probsB` vectors in the code) are used to randomly sample diploid genotypes for each individual at each variant.
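A quick sketch verifying that these HWE genotype probabilities are proper (non-negative and summing to 1) for any allele frequency, mirroring the probability vectors used in the simulation:

```python
import numpy as np

def hwe_probs(p):
    """Expected genotype probabilities (0, 1, 2 alt-allele copies) under HWE
    for alternate-allele frequency p, matching the simulation code above."""
    return np.array([(1 - p) ** 2, 2 * p * (1 - p), p ** 2])

for p in [0.05, 0.25, 0.5]:
    probs = hwe_probs(p)
    assert np.all(probs >= 0) and np.isclose(probs.sum(), 1.0)

print(hwe_probs(0.25))  # probabilities 0.5625, 0.375, 0.0625
```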

In [8]:
import numpy as np

def compute_maf(genotypes):
    """
    Compute minor allele frequency (MAF) for a single SNP.

    Parameters
    ----------
    genotypes : array-like
        Genotypes coded as 0, 1, 2 (alt allele count).
        Missing values can be np.nan.

    Returns
    -------
    maf : float
        Minor allele frequency, or np.nan if no valid genotypes.
    """
    g = np.asarray(genotypes, dtype=float)
    g = g[~np.isnan(g)]  # remove missing

    if g.size == 0:
        return np.nan

    # total number of alternate alleles
    alt_allele_count = g.sum()
    # total number of alleles
    total_alleles = 2 * g.size

    af = alt_allele_count / total_alleles
    maf = min(af, 1 - af)

    return maf
In [9]:
maf = df_geno.apply(compute_maf, axis=0)
maf
Out[9]:
rs100000    0.061224
rs100001    0.377551
rs100002    0.340000
rs100003    0.180000
rs100004    0.153061
              ...   
rs100495    0.360000
rs100496    0.071429
rs100497    0.438776
rs100498    0.010417
rs100499    0.132653
Length: 500, dtype: float64

Questions:¶

  1. What is an allele? Explain how allele frequencies are calculated. What is minor allele frequency (MAF)?
  2. Generate a plot for the MAF distribution.

Answer to Question 1: An allele is one of the alternative versions of a DNA sequence found at a specific location (called a locus) on a chromosome. For example, if a gene influences eye color, one allele might be associated with brown eyes and another with blue eyes. Since humans are diploid (we have two copies of each autosome), each person inherits:

  • one allele from their mother, and
  • one allele from their father,

so each person carries two alleles per locus (except for loci on the sex chromosomes in males). The frequency of an allele is the number of copies of that allele observed in the sample divided by the total number of alleles (twice the number of genotyped individuals). The minor allele frequency (MAF) is the frequency of the less common of the two alleles, i.e. $\min(p, 1-p)$.
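For example, with the 0/1/2 alt-allele coding used in this notebook, the alternate-allele frequency is the total alt-allele count divided by twice the number of genotyped individuals. A toy illustration (hand-picked genotypes, not the simulated data):

```python
import numpy as np

g = np.array([0, 0, 1, 2])     # 4 individuals -> 8 alleles total
alt_count = g.sum()            # 3 alternate alleles
af = alt_count / (2 * g.size)  # 3 / 8 = 0.375
maf = min(af, 1 - af)          # alt allele is already the minor one here
print(af, maf)  # 0.375 0.375
```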

In [10]:
maf.dropna().plot.hist(bins=30)
plt.xlabel("MAF")
plt.title("MAF distribution")
plt.show()
[Figure: histogram of the MAF distribution across SNPs]
In [11]:
# 1) Compute missingness for each SNP and save as gwas_outputs/variant_qc.csv
missingness = df_geno.isna().mean(axis=0)
missingness.name = "prop_missing"
print(missingness.head())
qc = pd.DataFrame({
    "MAF": maf,
    "missingness": missingness
})
qc.to_csv("gwas_outputs/variant_qc.csv")
rs100000    0.02
rs100001    0.02
rs100002    0.00
rs100003    0.00
rs100004    0.02
Name: prop_missing, dtype: float64
In [12]:
# 2) Mean-impute genotypes and run PCA on a subset of SNPs to get PCs (save to gwas_outputs/pcs.csv)
# mean impute means nas get replaced by the mean of all non-na entries in the column
G_imp = df_geno.fillna(df_geno.mean(axis=0))

# pca wants centered
G_centered = G_imp - G_imp.mean(axis=0)

# to get the subset, we want to choose the SNPs that dont have too low of MAF or too high of missingness
good_snps = maf[(maf > 0.05) & (missingness < 0.05)].index

G_centered = G_centered.loc[:,good_snps]

print(G_centered.shape)

# run PCA
pca = PCA(n_components=4)
pcs = pca.fit_transform(G_centered.values)

pc_df = pd.DataFrame(pcs, index=G_centered.index, columns=['PC1','PC2','PC3','PC4'])
print(pc_df.head())

pc_df.to_csv("gwas_outputs/pcs.csv")
(50, 434)
          PC1       PC2       PC3       PC4
S01  2.883814  1.777985  3.227693 -1.363845
S02  2.232474 -0.799068  0.523152  1.308283
S03  3.039008  4.965095 -0.791743  0.538847
S04  1.287036 -1.235998 -1.985315 -1.384421
S05  2.072885 -1.522972 -4.730467 -2.029128
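One diagnostic not shown above is how much variance each PC explains, available as `PCA.explained_variance_ratio_` in scikit-learn. A self-contained sketch on toy two-cluster data (synthetic stand-in for two populations, not the notebook's genotype matrix):

```python
import numpy as np
from sklearn.decomposition import PCA

rng = np.random.default_rng(0)
# Toy data: two clusters offset along one feature, mimicking two populations
cluster_a = rng.normal(0.0, 1.0, size=(25, 10))
cluster_b = rng.normal(0.0, 1.0, size=(25, 10))
cluster_b[:, 0] += 5.0             # strong separation on feature 0
X = np.vstack([cluster_a, cluster_b])
X = X - X.mean(axis=0)             # center, as done for the genotype matrix

pca = PCA(n_components=4).fit(X)
# PC1 should dominate because of the cluster split
print(pca.explained_variance_ratio_)
```

When one PC captures a group split like this, including it as a covariate in the association model helps absorb that structure.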
In [13]:
# 3) Plot PC1 vs PC2 colored by population and save as gwas_outputs/pca.png

plt.figure(figsize=(6, 5))
for label in df_meta['pop'].unique():
    idx = df_meta['pop'] == label
    plt.scatter(
        pc_df.loc[idx, 'PC1'],
        pc_df.loc[idx, 'PC2'],
        label=label,
        s=30,
        alpha=0.8
    )

plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('PCA: PC1 vs PC2')
plt.legend(title='Population',  loc='lower right')
plt.tight_layout()
plt.show()
[Figure: scatter plot of PC1 vs PC2, colored by population]

Questions¶

  1. Report the first 5 rows and first 5 columns of the object `qc`.
In [14]:
qc.iloc[0:5, 0:5]
Out[14]:
MAF missingness
rs100000 0.061224 0.02
rs100001 0.377551 0.02
rs100002 0.340000 0.00
rs100003 0.180000 0.00
rs100004 0.153061 0.02
In [15]:
### Impute Missing Values and Center genotype for PCA analysis
G_imp = df_geno.fillna(df_geno.mean(axis=0))
G_centered = G_imp - G_imp.mean(axis=0)
In [16]:
# PCA on mean-imputed genotypes (for population structure)
# here PCA is run on all SNPs; a random subset could be used for speed
pca = PCA(n_components=4)
pcs = pca.fit_transform(G_centered.values)

pc_df = pd.DataFrame(pcs, index=samples, columns=['PC1','PC2','PC3','PC4'])
pc_df.head()
Out[16]:
PC1 PC2 PC3 PC4
S01 3.022323 2.958171 -2.997434 2.000000
S02 2.579011 0.195340 -1.161110 -1.865826
S03 3.639002 4.192559 2.081473 0.728862
S04 1.436168 -1.578011 1.235982 0.333851
S05 2.060543 -4.638060 2.448404 2.724931
In [17]:
df_meta.head()
Out[17]:
age sex pop y
sample
S01 51 1 A 3.786828
S02 34 0 A 2.322089
S03 56 0 A 1.878572
S04 36 1 A 2.316062
S05 51 0 A 2.656425
In [18]:
plt.figure(figsize=(6, 5))
for label in df_meta['pop'].unique():
    idx = df_meta['pop'] == label
    plt.scatter(
        pc_df.loc[idx, 'PC1'],
        pc_df.loc[idx, 'PC2'],
        label=label,
        s=30,
        alpha=0.8
    )

plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('PCA: PC1 vs PC2')
plt.legend(title='Population',  loc='lower right')
plt.tight_layout()
plt.show()
[Figure: scatter plot of PC1 vs PC2, colored by population]

Questions¶

  1. Explain what is PCA (principal component analysis) in simple terms and explain what does the PCA plot above demonstrate?

Answer to Question 1: In a 2-dimensional example, each point is one individual (or sample), and the axes represent variables (for example, SNP1 and SNP2). Often the data cloud is tilted, so the variation is not aligned with the original axes. PCA rotates the coordinate system so that the new first axis (PC1) points in the direction where the data vary the most, the second axis (PC2) points in the next largest direction of variation, and each new axis is perpendicular (orthogonal) to the others.

In the plot above, the two populations separate along PC1, which suggests that population membership is the primary source of genetic variation in this dataset.
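The per-SNP association test itself (the step the rubric's "GWAS implementation" item refers to) is not shown in the cells above. A minimal sketch of per-SNP linear regression with a covariate, on small self-generated toy data (the variable names and sizes here are illustrative, not the notebook's):

```python
import numpy as np
from scipy import stats

rng = np.random.default_rng(1)
n, m = 50, 5                       # samples, SNPs (toy sizes)
G = rng.integers(0, 3, size=(n, m)).astype(float)
age = rng.normal(50, 10, size=n)
y = 0.8 * G[:, 0] + 0.02 * age + rng.normal(0, 1, size=n)  # SNP 0 is causal

results = []
for v in range(m):
    # Design matrix: intercept, genotype, covariate(s)
    X = np.column_stack([np.ones(n), G[:, v], age])
    beta, _, _, _ = np.linalg.lstsq(X, y, rcond=None)
    resid = y - X @ beta
    dof = n - X.shape[1]
    sigma2 = resid @ resid / dof
    # Standard error of the genotype coefficient (index 1 in X)
    se = np.sqrt(sigma2 * np.linalg.inv(X.T @ X)[1, 1])
    t = beta[1] / se
    p = 2 * stats.t.sf(abs(t), dof)
    results.append((v, beta[1], p))

for v, b, p in results:
    print(f"SNP {v}: beta={b:+.3f}, p={p:.3g}")
```

In a full analysis, the same loop would run over all SNPs with sex and the leading PCs added as extra columns of the design matrix, and the resulting p-values would feed the Manhattan and QQ plots.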

Notes & Next steps¶

  • This assignment uses simulated data.
  • For real GWAS, use PLINK, Hail, or SAIGE for large-scale analyses.
In [ ]: