# Setup and imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.decomposition import PCA
import os

np.random.seed(2025)
os.makedirs('gwas_outputs', exist_ok=True)
print('Environment ready. Outputs will be written to gwas_outputs/')

Environment ready. Outputs will be written to gwas_outputs/

# Setup and imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.decomposition import PCA
import os

np.random.seed(2025)
os.makedirs('gwas_outputs', exist_ok=True)
print('Environment ready. Outputs will be written to gwas_outputs/')

Environment ready. Outputs will be written to gwas_outputs/

# Generate synthetic genotype data
n_samples = 50
n_snps = 500
samples = [f"S{i+1:02d}" for i in range(n_samples)]
pop_labels = np.array(['A'] * (n_samples//2) + ['B'] * (n_samples - n_samples//2))

baseline_af = np.random.uniform(0.05, 0.5, size=n_snps)
popB_shift = np.random.normal(loc=0.0, scale=0.08, size=n_snps)
af_A = np.clip(baseline_af, 0.001, 0.999)
af_B = np.clip(baseline_af + popB_shift, 0.001, 0.999)

genotypes = np.full((n_samples, n_snps), np.nan)
for v in range(n_snps):
    pA = af_A[v]
    pB = af_B[v]
    probsA = [ (1-pA)**2, 2*pA*(1-pA), pA**2 ]
    probsB = [ (1-pB)**2, 2*pB*(1-pB), pB**2 ]
    genotypes[:n_samples//2, v] = np.random.choice([0,1,2], size=n_samples//2, p=probsA)
    genotypes[n_samples//2:, v] = np.random.choice([0,1,2], size=n_samples - n_samples//2, p=probsB)
genotypes[0:5, 0:5]

array([[0., 2., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [2., 0., 0., 0., 1.],
       [2., 1., 1., 1., 0.],
       [2., 0., 0., 0., 1.]])

# Simulate covariates: age and sex (binary)
age = np.random.normal(50, 10, size=n_samples).astype(int)
sex = np.random.binomial(1, 0.5, size=n_samples)

# Create phenotype: a polygenic trait with 10 causal SNPs
causal_idx = np.random.choice(n_snps, size=10, replace=False)
beta_snps = np.zeros(n_snps)
beta_snps[causal_idx] = np.random.normal(0.3, 0.1, size=10)

# Genetic effect
# mean-impute genotypes for calculation
G_imp = np.where(np.isnan(genotypes), np.nanmean(genotypes, axis=0), genotypes)
polygenic = G_imp.dot(beta_snps)

# Covariate effects
beta_age = 0.02
beta_sex = -0.1
beta_pop = 0.4

# Environmental noise
env = np.random.normal(0, 1.0, size=n_samples)
## convert population label to numeric 
pop_numeric = np.array([0 if x == 'A' else 1 for x in pop_labels])

y = 0.5 + polygenic + beta_age * age + beta_sex * sex + beta_pop * pop_numeric + env

df_meta = pd.DataFrame({'sample': samples, 'age': age, 'sex': sex, 'pop': pop_labels, 'y': y}).set_index('sample')
df_meta.to_csv('gwas_outputs/phenotypes.csv')
print('Phenotypes saved: gwas_outputs/phenotypes.csv')

# ~2% missing
mask_missing = np.random.rand(n_samples, n_snps) < 0.02
genotypes[mask_missing] = np.nan

snps_ids = [f'rs{100000+i}' for i in range(n_snps)]
df_geno = pd.DataFrame(genotypes, index=samples, columns=snps_ids)

print('Generated genotype matrix shape:', df_geno.shape)
df_geno.head()

Phenotypes saved: gwas_outputs/phenotypes.csv
Generated genotype matrix shape: (50, 500)

import numpy as np

def compute_maf(genotypes):
    """
    Compute minor allele frequency (MAF) for a single SNP.

    Parameters
    ----------
    genotypes : array-like
        Genotypes coded as 0, 1, 2 (alt allele count).
        Missing values can be np.nan.

    Returns
    -------
    maf : float
        Minor allele frequency, or np.nan if no valid genotypes.
    """
    g = np.asarray(genotypes, dtype=float)
    g = g[~np.isnan(g)]  # remove missing

    if g.size == 0:
        return np.nan

    # total number of alternate alleles
    alt_allele_count = g.sum()
    # total number of alleles
    total_alleles = 2 * g.size

    af = alt_allele_count / total_alleles
    maf = min(af, 1 - af)

    return maf

maf = df_geno.apply(compute_maf, axis=0)
maf

rs100000    0.061224
rs100001    0.377551
rs100002    0.340000
rs100003    0.180000
rs100004    0.153061
              ...   
rs100495    0.360000
rs100496    0.071429
rs100497    0.438776
rs100498    0.010417
rs100499    0.132653
Length: 500, dtype: float64

# TASK: Compute QC metrics and run PCA
# 1) Compute missingness for each SNP and save as gwas_outputs/variant_qc.csv
# 2) Mean-impute genotypes and run PCA on a subset of SNPs to get PCs (save to gwas_outputs/pcs.csv)
# 3) Plot PC1 vs PC2 colored by population and save as gwas_outputs/pca.png
#
# --- TODO: Computing missingness for each snp ---
### YOUR CODE HERE ###

# Save QC (once implemented)
# qc = pd.DataFrame({'MAF': maf, 'missingness': missingness})
# qc[1:10, 1:10]

### Impute Missing Values and Center genotype for PCA analysis
G_imp = df_geno.fillna(df_geno.mean(axis=0))
G_centered = G_imp - G_imp.mean(axis=0)

# PCA on mean-imputed genotypes (for population structure)
# perform PCA on a random subset of SNPs for speed
pca = PCA(n_components=4)
pcs = pca.fit_transform(G_centered.values)

pc_df = pd.DataFrame(pcs, index=samples, columns=['PC1','PC2','PC3','PC4'])
pc_df.head()

df_meta.head()

plt.figure(figsize=(6, 5))
for label in df_meta['pop'].unique():
    idx = df_meta['pop'] == label
    plt.scatter(
        pc_df.loc[idx, 'PC1'],
        pc_df.loc[idx, 'PC2'],
        label=label,
        s=30,
        alpha=0.8
    )

plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('PCA: PC1 vs PC2')
plt.legend(title='Population',  loc='lower right')
plt.tight_layout()
plt.show()

	PC1	PC2	PC3	PC4
S01	-1.888524	-3.009312	0.707643	0.659718
S02	-2.375150	4.459011	-3.793163	1.617335
S03	-5.630537	-2.389041	-3.736545	-2.093097
S04	-1.966514	0.735002	-4.722171	0.654252
S05	-2.190321	0.636415	3.232525	-0.872838

	age	sex	pop	y
sample
S01	45	0	A	1.592461
S02	45	1	A	3.078932
S03	48	1	A	0.203718
S04	45	1	A	2.332965
S05	54	0	A	2.585327

Homework: Introductory GWAS Analysis¶

Due Date: Feb 16, 2026¶

Part 1:¶

Questions¶

Questions:¶

Questions¶

Questions¶

Notes & Next steps¶

	rs100000	rs100001	rs100002	rs100003	rs100004	rs100005	rs100006	rs100007	rs100008	rs100009	...	rs100490	rs100491	rs100492	rs100493	rs100494	rs100495	rs100496	rs100497	rs100498	rs100499
S01	0.0	2.0	0.0	0.0	0.0	0.0	2.0	1.0	0.0	0.0	...	0.0	2.0	1.0	0.0	NaN	1.0	0.0	1.0	NaN	1.0
S02	1.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	...	0.0	1.0	1.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0
S03	NaN	0.0	0.0	0.0	1.0	1.0	1.0	0.0	2.0	1.0	...	0.0	0.0	1.0	0.0	2.0	1.0	0.0	1.0	1.0	NaN
S04	2.0	1.0	1.0	1.0	0.0	0.0	2.0	NaN	1.0	0.0	...	1.0	1.0	1.0	1.0	0.0	0.0	1.0	0.0	0.0	0.0
S05	2.0	0.0	0.0	0.0	1.0	1.0	0.0	1.0	0.0	0.0	...	0.0	1.0	1.0	0.0	1.0	1.0	1.0	1.0	1.0	0.0

	rs100000	rs100001	rs100002	rs100003	rs100004	rs100005	rs100006	rs100007	rs100008	rs100009	...	rs100490	rs100491	rs100492	rs100493	rs100494	rs100495	rs100496	rs100497	rs100498	rs100499
S01	0.0	2.0	0.0	0.0	0.0	0.0	2.0	1.0	0.0	0.0	...	0.0	2.0	1.0	0.0	NaN	1.0	0.0	1.0	NaN	1.0
S02	1.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	...	0.0	1.0	1.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0
S03	NaN	0.0	0.0	0.0	1.0	1.0	1.0	0.0	2.0	1.0	...	0.0	0.0	1.0	0.0	2.0	1.0	0.0	1.0	1.0	NaN
S04	2.0	1.0	1.0	1.0	0.0	0.0	2.0	NaN	1.0	0.0	...	1.0	1.0	1.0	1.0	0.0	0.0	1.0	0.0	0.0	0.0
S05	2.0	0.0	0.0	0.0	1.0	1.0	0.0	1.0	0.0	0.0	...	0.0	1.0	1.0	0.0	1.0	1.0	1.0	1.0	1.0	0.0

	rs100000	rs100001	rs100002	rs100003	rs100004	rs100005	rs100006	rs100007	rs100008	rs100009	...	rs100490	rs100491	rs100492	rs100493	rs100494	rs100495	rs100496	rs100497	rs100498	rs100499
S01	0.0	2.0	0.0	0.0	0.0	0.0	2.0	1.0	0.0	0.0	...	0.0	2.0	1.0	0.0	NaN	1.0	0.0	1.0	NaN	1.0
S02	1.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	...	0.0	1.0	1.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0
S03	NaN	0.0	0.0	0.0	1.0	1.0	1.0	0.0	2.0	1.0	...	0.0	0.0	1.0	0.0	2.0	1.0	0.0	1.0	1.0	NaN
S04	2.0	1.0	1.0	1.0	0.0	0.0	2.0	NaN	1.0	0.0	...	1.0	1.0	1.0	1.0	0.0	0.0	1.0	0.0	0.0	0.0
S05	2.0	0.0	0.0	0.0	1.0	1.0	0.0	1.0	0.0	0.0	...	0.0	1.0	1.0	0.0	1.0	1.0	1.0	1.0	1.0	0.0