Tutorial on SNPObj Functionalities

import logging
import os
import sys
import numpy as np

dir = os.path.abspath('../')
if not dir in sys.path: sys.path.append(dir)

from snputils.snp.io.read.vcf import VCFReader, VCFReaderPolars

logging.basicConfig(stream=sys.stdout, level=logging.INFO)

1. Reading VCF Files into SNPObject

We can read VCF files into a SNPObject using either the standard VCFReader or the VCFReaderPolars. The Polars-based reader is optimized for speed but may consume more memory.

# Define the path to the VCF file
query_path = '../data/vcf/subset.vcf'

# Read VCF into SNPObject with the standard reader
reader = VCFReader(query_path)
snpobj = reader.read(sum_strands=False)

print("Attributes of the SNPObject:", snpobj.keys())

# Read VCF into SNPObject with the Polars-based reader
reader_polars = VCFReaderPolars(query_path)
snpobj_polars = reader_polars.read()
INFO:snputils.snp.io.read.vcf:Reading ../data/vcf/subset.vcf
INFO:snputils.snp.io.read.vcf:Finished reading ../data/vcf/subset.vcf
Attributes of the SNPObject: ['calldata_gt', 'samples', 'variants_ref', 'variants_alt', 'variants_chrom', 'variants_filter_pass', 'variants_id', 'variants_pos', 'variants_qual', 'calldata_lai', 'ancestry_map']
INFO:snputils.snp.io.read.vcf:Reading ../data/vcf/subset.vcf
INFO:snputils.snp.io.read.vcf:Finished reading ../data/vcf/subset.vcf

2. Accessing and Modifying Attributes

You can access and modify attributes using either attribute syntax or dictionary-like access.

# Accessing the original sample IDs
print("Original sample IDs:", snpobj.samples)

# Updating sample IDs using attribute access
snpobj.samples = ['sample_1', 'sample_2', 'sample_3', 'sample_4']
print("Updated sample IDs (attribute access):", snpobj.samples)

# Updating sample IDs using dictionary-like access
snpobj['samples'] = ['sample_A', 'sample_B', 'sample_C', 'sample_D']
print("Updated sample IDs (dictionary access):", snpobj['samples'])
Original sample IDs: ['HG00096' 'HG00097' 'HG00099' 'HG00100']
Updated sample IDs (attribute access): ['sample_1' 'sample_2' 'sample_3' 'sample_4']
Updated sample IDs (dictionary access): ['sample_A' 'sample_B' 'sample_C' 'sample_D']

3. Retrieving Counts and Unique Chromosomes

Number of Samples and SNPs

# Retrieving the number of samples
n_samples = snpobj.n_samples
print("Number of samples:", n_samples)

# Retrieving the number of SNPs
n_snps = snpobj.n_snps
print("Number of SNPs:", n_snps)
Number of samples: 4
Number of SNPs: 976599

Unique Chromosome Names

print("Unique chromosomes:", snpobj.unique_chrom)
Unique chromosomes: ['21']

4. Chromosome Nomenclature and Renaming

Chromosome nomenclature can vary between datasets. The SNPObject provides methods to detect and convert chromosome formats.

Detecting Chromosome Format

chrom_format = snpobj.detect_chromosome_format()
print("Detected chromosome format:", chrom_format)
Detected chromosome format: plain

Converting Chromosome Format

You can convert the chromosome format to match a standard or another SNPObject.

# Convert from 'chr' format to 'plain' format
snpobj_plain = snpobj.convert_chromosome_format(from_format='plain', to_format='chr', inplace=False)
print("Chromosomes after conversion to chr format:", snpobj_plain.unique_chrom)
Chromosomes after conversion to chr format: ['chr21']

Matching Chromosome Format with Another SNPObject

# Assume snpobj_reference is another SNPObject with a different chromosome format
snpobj_reference = VCFReader(query_path).read()
snpobj_matched = snpobj.match_chromosome_format(snpobj_reference, inplace=False)
print("Chromosomes after matching format:", snpobj_matched.unique_chrom)
INFO:snputils.snp.io.read.vcf:Reading ../data/vcf/subset.vcf
INFO:snputils.snp.io.read.vcf:Finished reading ../data/vcf/subset.vcf
Chromosomes after matching format: ['21']

Renaming Chromosomes with Custom Patterns

The rename_chrom method allows for custom renaming using regex patterns.

# Custom renaming using regex
snpobj_renamed = snpobj.rename_chrom(to_replace={'^chr': ''}, regex=True, inplace=False)
print("Chromosomes after custom renaming:", snpobj_renamed.unique_chrom)
Chromosomes after custom renaming: ['21']

5. Managing Missing Data

The rename_missings method replaces missing values in the calldata_gt attribute.

print("Unique genotype values before renaming missings:", np.unique(snpobj['calldata_gt']))

# Replace missing values (-1) with '.'
snpobj_no_missing = snpobj.rename_missings(before='.', after=-1, inplace=False)
print("Unique genotype values after renaming missings:", np.unique(snpobj_no_missing['calldata_gt']))
Unique genotype values before renaming missings: [0 1]
Unique genotype values after renaming missings: [0 1]

6. Filtering SNPs

The filter_variants method filters SNPs based on chromosomes, positions, or indexes.

Filtering by Chromosome and Position

print("Number of SNPs before filtering:", snpobj.n_snps)

# Include SNPs on chromosome '21' between positions 500,000 and 600,000
filtered_snpobj = snpobj.filter_variants(chrom='21', pos=range(500000, 600000), include=True, inplace=False)

print("Number of SNPs after filtering:", filtered_snpobj.n_snps)
Number of SNPs before filtering: 976599
Number of SNPs after filtering: 0

Excluding SNPs by Chromosome

print("Unique chromosomes before filtering:", snpobj.unique_chrom)

# Exclude SNPs on chromosome '21'
snpobj_excluded = snpobj.filter_variants(chrom='21', include=False, inplace=False)

print("Unique chromosomes after filtering:", snpobj_excluded.unique_chrom)
Unique chromosomes before filtering: ['21']
Unique chromosomes after filtering: []

Filtering by Indexes

print("Number of SNPs before filtering by indexes:", snpobj.n_snps)

# Include SNPs at indexes 0, 2, and 4
snpobj_by_index = snpobj.filter_variants(indexes=[0, 2, 4], include=True, inplace=False)

print("Number of SNPs after filtering by indexes:", snpobj_by_index.n_snps)
Number of SNPs before filtering by indexes: 976599
Number of SNPs after filtering by indexes: 3

7. Filtering Samples

The filter_samples method filters samples based on names or indexes.

Excluding Samples by Name

print("Samples before filtering:", snpobj.samples)

# Exclude samples 'sample_C' and 'sample_D'
snpobj_filtered_samples = snpobj.filter_samples(samples=['sample_C', 'sample_D'], include=False, inplace=False)

print("Samples after filtering:", snpobj_filtered_samples.samples)
Samples before filtering: ['sample_A' 'sample_B' 'sample_C' 'sample_D']
Samples after filtering: ['sample_A' 'sample_B']

Excluding Samples by Indexes

print("Samples before filtering:", snpobj.samples)

# Exclude samples at indexes 0 and 3
snpobj_filtered_indexes = snpobj.filter_samples(indexes=[0, 3], include=False, inplace=False)

print("Samples after filtering:", snpobj_filtered_indexes.samples)
Samples before filtering: ['sample_A' 'sample_B' 'sample_C' 'sample_D']
Samples after filtering: ['sample_B' 'sample_C']

8. Subsetting to Common Variants and Markers

Subsetting to Common Variants

The subset_to_common_variants method subsets the SNPObject to variants common with another SNPObject.

# Read another SNPObject for comparison
snpobj2 = VCFReader(query_path).read()

# Subset to common variants based on position
snpobj_common_variants = snpobj.subset_to_common_variants(snpobj2, index_by='pos', inplace=False)

print("Number of SNPs after subsetting to common variants:", snpobj_common_variants.n_snps)
INFO:snputils.snp.io.read.vcf:Reading ../data/vcf/subset.vcf
INFO:snputils.snp.io.read.vcf:Finished reading ../data/vcf/subset.vcf
Number of SNPs after subsetting to common variants: 976599

Subsetting to Common Markers

The subset_to_common_markers method subsets to markers that have the same chrom, pos, ref, and alt.

snpobj_common_markers = snpobj.subset_to_common_markers(snpobj2, inplace=False)

print("Number of SNPs after subsetting to common markers:", snpobj_common_markers.n_snps)
Number of SNPs after subsetting to common markers: 976599

9. Data Cleaning and Quality Control

Removing Strand-Ambiguous SNPs

The remove_strand_ambiguous_variants method removes SNPs that are strand-ambiguous.

print("Number of SNPs before removing ambiguous SNPs:", snpobj.n_snps)

snpobj_no_ambiguous = snpobj.remove_strand_ambiguous_variants(inplace=False)

print("Number of SNPs after removing ambiguous SNPs:", snpobj_no_ambiguous.n_snps)
Number of SNPs before removing ambiguous SNPs: 976599
INFO:snputils.snp.genobj.snpobj:35183 ambiguities of A-T type.
INFO:snputils.snp.genobj.snpobj:35105 ambiguities of T-A type.
INFO:snputils.snp.genobj.snpobj:39334 ambiguities of C-G type.
INFO:snputils.snp.genobj.snpobj:38992 ambiguities of G-C type.
Number of SNPs after removing ambiguous SNPs: 827985

Correcting SNP Flips

snpobj_corrected = snpobj.correct_flipped_variants(snpobj2, check_complement=True, index_by='pos', inplace=False)

print("SNP flips corrected.")
INFO:snputils.snp.genobj.snpobj:Matching reference alleles (ref=ref'): 976599, Matching alternate alleles (alt=alt'): 976599.
INFO:snputils.snp.genobj.snpobj:Number of ambiguous alleles (ref=alt): 0.
INFO:snputils.snp.genobj.snpobj:Correcting 148614 variant flips...
SNP flips corrected.

Removing Mismatching SNPs

The remove_mismatching_variants method removes SNPs where ref or alt alleles do not match between two SNPObjects.

print("Number of SNPs before removing mismatches:", snpobj.n_snps)

snpobj_no_mismatches = snpobj.remove_mismatching_variants(snpobj2, index_by='pos', inplace=False)

print("Number of SNPs after removing mismatches:", snpobj_no_mismatches.n_snps)
Number of SNPs before removing mismatches: 976599
Number of SNPs after removing mismatches: 0

10. Shuffling Variants

The shuffle_variants method randomly shuffles the order of variants.

print("First 5 variant positions before shuffling:", snpobj['variants_pos'][:5])

snpobj_shuffled = snpobj.shuffle_variants(inplace=False)

print("First 5 variant positions after shuffling:", snpobj_shuffled['variants_pos'][:5])
First 5 variant positions before shuffling: [5033871 5033884 5033887 5035658 5038298]
First 5 variant positions after shuffling: [40064132 19950752 21721370 46148286 36956322]

11. Handling Empty Data Entries

The set_empty_to_missing method replaces empty strings with missing values ‘.’.

# Introduce some empty strings in variants_ref
snpobj['variants_ref'][0] = ''

print("Variants_ref before handling empty entries:", snpobj['variants_ref'][:5])

snpobj_handled = snpobj.set_empty_to_missing(inplace=False)

print("Variants_ref after handling empty entries:", snpobj_handled['variants_ref'][:5])
Variants_ref before handling empty entries: ['' 'G' 'G' 'C' 'A']
Variants_ref after handling empty entries: ['.' 'G' 'G' 'C' 'A']

12. Saving SNPObject to Files

The save method saves the SNPObject to a file, with the format determined by the file extension.

Saving as VCF

# Define the path to the VCF file
query_path = '../data/vcf/subset.vcf'

# Read VCF into SNPObject with the standard reader
reader = VCFReader(query_path)
snpobj = reader.read(sum_strands=False)
INFO:snputils.snp.io.read.vcf:Reading ../data/vcf/subset.vcf
INFO:snputils.snp.io.read.vcf:Finished reading ../data/vcf/subset.vcf
# Define the path to save the VCF file
output_vcf_path1 = '../data/output.vcf'
output_vcf_path2 = '../data/output.unphased'

# Save the SNPObject as a VCF file (Option 1)
snpobj.save(output_vcf_path1)
print(f"SNPObject saved to {output_vcf_path1}")

# Save the SNPObject as a VCF file (Option 2)
snpobj.save_vcf(output_vcf_path2)
print(f"SNPObject saved to {output_vcf_path2}")
SNPObject saved to ../data/output.vcf
SNPObject saved to ../data/output.unphased
/home/miriam/.local/lib/python3.10/site-packages/joblib/externals/loky/backend/fork_exec.py:38: RuntimeWarning: Using fork() can cause Polars to deadlock in the child process.
In addition, using fork() with Python in general is a recipe for mysterious
deadlocks and crashes.

The most likely reason you are seeing this error is because you are using the
multiprocessing module on Linux, which uses fork() by default. This will be
fixed in Python 3.14. Until then, you want to use the "spawn" context instead.

See https://docs.pola.rs/user-guide/misc/multiprocessing/ for details.

If you really know what your doing, you can silence this warning with the warning module
or by setting POLARS_ALLOW_FORKING_THREAD=1.

  pid = os.fork()

Saving as PGEN

# Define the path to save the BED file
output_pgen_path1 = '../data/output.pgen'
output_pgen_path2 = '../data/output.unphased'

# Save the SNPObject as a PGEN file (option 1)
snpobj.save(output_pgen_path1)
print(f"SNPObject saved to {output_pgen_path1}")

# Save the SNPObject as a PGEN file (option 2)
snpobj.save_pgen(output_pgen_path2)
print(f"SNPObject saved to {output_pgen_path2}")
INFO:snputils.snp.io.write.pgen:Writing to ../data/output.pvar
INFO:snputils.snp.io.write.pgen:Writing ../data/output.psam
INFO:snputils.snp.io.write.pgen:Writing to ../data/output.pgen
SNPObject saved to ../data/output.pgen
INFO:snputils.snp.io.write.pgen:Writing to ../data/output.unphased.pvar
INFO:snputils.snp.io.write.pgen:Writing ../data/output.unphased.psam
INFO:snputils.snp.io.write.pgen:Writing to ../data/output.unphased.pgen
SNPObject saved to ../data/output.unphased

Saving as BED

# Define the path to save the BED file
output_bed_path1 = '../data/output.bed'
output_bed_path2 = '../data/output.unphased'

# Save the SNPObject as a BED file (option 1)
snpobj.save(output_bed_path1)
print(f"SNPObject saved to {output_bed_path1}")

# Save the SNPObject as a BED file (option 2)
snpobj.save_bed(output_bed_path2)
print(f"SNPObject saved to {output_bed_path2}")
INFO:snputils.snp.io.write.bed:Writing .bed file: ../data/output.bed
INFO:snputils.snp.io.write.bed:Finished writing .bed file: ../data/output.bed
INFO:snputils.snp.io.write.bed:Writing .fam file: ../data/output
INFO:snputils.snp.io.write.bed:Finished writing .fam file: ../data/output
INFO:snputils.snp.io.write.bed:Writing .bim file: ../data/output
WARNING:snputils.snp.io.write.bed:The .bim file is being saved with 0 cM values.
INFO:snputils.snp.io.write.bed:Finished writing .bim file: ../data/output
SNPObject saved to ../data/output.bed
INFO:snputils.snp.io.write.bed:Writing .bed file: ../data/output.bed
INFO:snputils.snp.io.write.bed:Finished writing .bed file: ../data/output.bed
INFO:snputils.snp.io.write.bed:Writing .fam file: ../data/output
INFO:snputils.snp.io.write.bed:Finished writing .fam file: ../data/output
INFO:snputils.snp.io.write.bed:Writing .bim file: ../data/output
WARNING:snputils.snp.io.write.bed:The .bim file is being saved with 0 cM values.
INFO:snputils.snp.io.write.bed:Finished writing .bim file: ../data/output
SNPObject saved to ../data/output.unphased

Saving as Pickle

# Define the path to save the pickle file
output_pkl_path1 = '../data/output.pkl'
output_pkl_path2 = '../data/output.unphased'

# Save the SNPObject as a pickle file (option 1)
snpobj.save(output_pkl_path1)
print(f"SNPObject saved to {output_pkl_path1}")

# Save the SNPObject as a pickle file (option 2)
snpobj.save_pickle(output_pkl_path2)
print(f"SNPObject saved to {output_pkl_path2}")
SNPObject saved to ../data/output.pkl
SNPObject saved to ../data/output.unphased