Tutorial on PCA¶

import os
import sys
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import torch

import snputils as su
from snputils.processing import PCA

1. Load SNP Data¶

Load the SNP data from a VCF file and initialize it for PCA.

snpobj = su.load_dataset("1kgp", chromosomes=21, sample_ids=["HG00096", "HG00097", "HG00099", "HG00100"], sum_strands=False)

2. PCA with Scikit-Learn Backend¶

Perform PCA using PCA with sklearn backend.

# Initialize PCA with sklearn backend and 2 principal components
pca = PCA(backend="sklearn", n_components=2)

# Perform PCA on the SNP data
components = pca.fit_transform(snpobj)

# Create DataFrame for the PCA components
df = pd.DataFrame({
    "Principal Component 1": components[:, 0],
    "Principal Component 2": components[:, 1]
})
df.head()

# Visualize the PCA components in a scatter plot
plt.figure(figsize=(10, 8))
sns.scatterplot(data=df, x="Principal Component 1", y="Principal Component 2", linewidth=0, alpha=0.5)
plt.grid()
plt.xlabel("Principal Component 1", fontsize=20)
plt.ylabel("Principal Component 2", fontsize=20)
plt.tight_layout()
plt.show()

../_images/9185eed022c83dcd90fa46edea50908228eeb0dcf05a4768d592bfd5dfbda308.png

fit() and then transform()¶

pca = PCA(backend="sklearn", n_components=2)
pca.fit(snpobj)
components = pca.transform(snpobj)

df = pd.DataFrame({
    "Principal Component 1": components[:,0],
    "Principal Component 2": components[:,1],
})

plt.figure(figsize=(10, 8))
sns.scatterplot(data=df, x="Principal Component 1", y="Principal Component 2", linewidth=0, alpha=0.5)
plt.grid()
plt.xlabel("Principal Component 1", fontsize=20)
plt.ylabel("Principal Component 2", fontsize=20)
plt.tight_layout()
plt.show()

../_images/7da417cd3395e42ff5bc85717ca1a34eb18ce82903b6f9a14c47f0f7ba74f74c.png

3. Separate fit() and transform() Functions¶

Use separate fit() and transform() methods to apply PCA on the same data.

# Initialize PCA and fit the model
pca = PCA(backend="sklearn", n_components=2)
pca.fit(snpobj)

# Transform the SNP data using the fitted model
components = pca.transform(snpobj)

# Visualize the PCA results
df = pd.DataFrame({
    "Principal Component 1": components[:, 0],
    "Principal Component 2": components[:, 1]
})
plt.figure(figsize=(10, 8))
sns.scatterplot(data=df, x="Principal Component 1", y="Principal Component 2", linewidth=0, alpha=0.5)
plt.grid()
plt.xlabel("Principal Component 1", fontsize=20)
plt.ylabel("Principal Component 2", fontsize=20)
plt.tight_layout()
plt.show()

4. Separate Strands into Independent Samples¶

Separate maternal and paternal strands and treat them as independent samples.

# Perform PCA with separate strands
pca = PCA(backend="sklearn", n_components=2)
components = pca.fit_transform(snpobj, average_strands=False)
print("Shape of PCA components with separate strands:", components.shape)

# Create DataFrame and plot
df = pd.DataFrame({
    "Principal Component 1": components[:, 0],
    "Principal Component 2": components[:, 1]
})
plt.figure(figsize=(10, 8))
sns.scatterplot(data=df, x="Principal Component 1", y="Principal Component 2", linewidth=0, alpha=0.5)
plt.grid()
plt.xlabel("Principal Component 1", fontsize=20)
plt.ylabel("Principal Component 2", fontsize=20)
plt.title("PCA (Separate Strands)", fontsize=30)
plt.tight_layout()
plt.show()

Shape of PCA components with separate strands: (8, 2)

../_images/ea3b4c13f186e68ed7cb6277ca1a252107ce95130dac5cf7edd13d0260c3a1d7.png

5. Subset of Samples¶

Perform PCA on a subset of samples to optimize performance.

# Perform PCA on a subset of samples
pca = PCA(backend="sklearn", n_components=2)
components = pca.fit_transform(snpobj, samples_subset=100)
print("Shape of PCA components with subset of samples:", components.shape)

# Plot results
df = pd.DataFrame({
    "Principal Component 1": components[:, 0],
    "Principal Component 2": components[:, 1]
})
plt.figure(figsize=(10, 8))
sns.scatterplot(data=df, x="Principal Component 1", y="Principal Component 2", linewidth=0, alpha=0.5)
plt.grid()
plt.xlabel("Principal Component 1", fontsize=20)
plt.ylabel("Principal Component 2", fontsize=20)
plt.title("PCA (Subset of Samples)", fontsize=30)
plt.tight_layout()
plt.show()

Shape of PCA components with subset of samples: (4, 2)

../_images/a1eb3fb4807209feb083e825940737d4eb071baa26cbb938925a02ed6dafa6ee.png

6. Subset of SNPs¶

Perform PCA on a specific subset of SNPs.

# Perform PCA on a subset of SNPs
pca = PCA(backend="sklearn", n_components=2)
components = pca.fit_transform(snpobj, snps_subset=500)
print("Shape of PCA components with subset of SNPs:", components.shape)

# Create DataFrame and plot
df = pd.DataFrame({
    "Principal Component 1": components[:, 0],
    "Principal Component 2": components[:, 1]
})
plt.figure(figsize=(10, 8))
sns.scatterplot(data=df, x="Principal Component 1", y="Principal Component 2", linewidth=0, alpha=0.5)
plt.grid()
plt.xlabel("Principal Component 1", fontsize=20)
plt.ylabel("Principal Component 2", fontsize=20)
plt.title("PCA (Subset of SNPs)", fontsize=30)
plt.tight_layout()
plt.show()

Shape of PCA components with subset of SNPs: (4, 2)

../_images/b4474c51231399c1ed358ae688cd4e94ed4704b06466b7fd52e931a33ce66da1.png

7. Subset of Both Samples and SNPs¶

Perform PCA on both a subset of samples and a subset of SNPs.

# Perform PCA on subset of samples and subset of SNPs
pca = PCA(backend="sklearn", n_components=2)
components = pca.fit_transform(snpobj, snps_subset=500, samples_subset=50)
print("Shape of PCA components with subset of samples and SNPs:", components.shape)

# Plot the results
df = pd.DataFrame({
    "Principal Component 1": components[:, 0],
    "Principal Component 2": components[:, 1]
})
plt.figure(figsize=(10, 8))
sns.scatterplot(data=df, x="Principal Component 1", y="Principal Component 2", linewidth=0, alpha=0.5)
plt.grid()
plt.xlabel("Principal Component 1", fontsize=20)
plt.ylabel("Principal Component 2", fontsize=20)
plt.title("PCA (Subset of Samples and SNPs)", fontsize=30)
plt.tight_layout()
plt.show()

Shape of PCA components with subset of samples and SNPs: (4, 2)

../_images/acd462cff6bb7a07817d95d0069906c80ec0801e9b90ae598903f1c3d7b219a3.png

8. GPU-Optimized: TorchPCA with PCA¶

If CUDA is available, use the pytorch backend for efficient computation on GPU. Use subsets of data when running on GPU to avoid memory overflow.

# Choose backend based on device availability
backend = "pytorch" if torch.cuda.is_available() else "sklearn"
print(f"Using {backend} backend")

# Initialize PCA with 2 components
pca = PCA(backend=backend, n_components=2)

# Fit and transform with a subset of samples
pca.fit(snpobj, snps_subset=500)
components = pca.transform(snpobj, snps_subset=500)

# Plot the PCA results
df = pd.DataFrame({
    "Principal Component 1": components[:, 0],
    "Principal Component 2": components[:, 1]
})
plt.figure(figsize=(10, 8))
sns.scatterplot(data=df, x="Principal Component 1", y="Principal Component 2", linewidth=0, alpha=0.5)
plt.grid()
plt.xlabel("Principal Component 1", fontsize=20)
plt.ylabel("Principal Component 2", fontsize=20)
plt.title("PCA (Torch Backend)", fontsize=30)
plt.tight_layout()
plt.show()

Using pytorch backend
Converting data to PyTorch tensor on device cpu
Converting data to PyTorch tensor on device cpu

../_images/654b8a8e7d926462a8971e9f7767659acd3b42b8390eb2e968df626a1e0a55a5.png

9. Low-Rank PCA¶

Use low-rank approximation for large datasets.

# Perform low-rank PCA with `sklearn` backend for large datasets
pca = PCA(backend="sklearn", n_components=2, fitting="lowrank")
pca.fit(snpobj)
components = pca.transform(snpobj, samples_subset=100)

10. Separate Strands with Subset of SNPs¶

Separate maternal and paternal strands with a subset of SNPs.

# Perform PCA with separate strands and a subset of SNPs
pca = PCA(backend="sklearn", n_components=2)
components = pca.fit_transform(snpobj, average_strands=False, snps_subset=200)
print("Shape of PCA components with separate strands and subset of SNPs:", components.shape)

# Create DataFrame and plot
df = pd.DataFrame({
    "Principal Component 1": components[:, 0],
    "Principal Component 2": components[:, 1]
})
plt.figure(figsize=(10, 8))
sns.scatterplot(data=df, x="Principal Component 1", y="Principal Component 2", linewidth=0, alpha=0.5)
plt.grid()
plt.xlabel("Principal Component 1", fontsize=20)
plt.ylabel("Principal Component 2", fontsize=20)
plt.title("PCA (Separate Strands, Subset of SNPs)", fontsize=30)
plt.tight_layout()
plt.show()

Shape of PCA components with separate strands and subset of SNPs: (8, 2)

../_images/911f9e74b026105e639f8560ee1bb93a314733ff8808c271018ab3ad52ab0f39.png