GRG Demo¶
This notebook demonstrates current snputils + pygrgl workflows:
Build a toy GRG in memory
Save/load GRG with
snputilsConvert
GRGObject->SNPObjectWrite/read PGEN from converted SNP data
Optional: VCF -> GRG conversion via
grg construct
from pathlib import Path
import tempfile
import shutil
import numpy as np
import pygrgl as pyg
import snputils
from snputils.snp.genobj.grgobj import GRGObject
from snputils.snp.io import GRGWriter
from snputils.snp.io.read.functional import read_grg
from snputils.snp.io.read.grg import GRGReader
from snputils.snp.io.read.pgen import PGENReader
from snputils.snp.io.write.grg_from_vcf import vcf_to_grg
from snputils.snp.io.write.pgen import PGENWriter
print('snputils:', snputils.__version__)
print('grg CLI available:', shutil.which('grg') is not None)
print('grapp CLI available:', shutil.which('grapp') is not None)
snputils: 2.78
grg CLI available: True
grapp CLI available: False
1) Build a toy GRG in memory¶
def build_toy_grg() -> pyg.MutableGRG:
# 3 diploid individuals => 6 haplotype sample nodes
grg = pyg.MutableGRG(6, 2, True)
root = grg.make_node()
left = grg.make_node()
right = grg.make_node()
grg.connect(root, left)
grg.connect(root, right)
for sample in [0, 1, 2]:
grg.connect(left, sample)
for sample in [3, 4, 5]:
grg.connect(right, sample)
grg.add_mutation(pyg.Mutation(100.0, 'G', 'A', 0.0), root)
grg.add_mutation(pyg.Mutation(110.0, 'T', 'C', 0.0), left)
grg.add_mutation(pyg.Mutation(120.0, 'C', 'G', 0.0), right)
grg.add_mutation(pyg.Mutation(130.0, 'A', 'T', 0.0), 0)
grg.add_mutation(pyg.Mutation(140.0, 'G', 'T', 0.0), 5)
return grg
toy_grg = build_toy_grg()
toy_obj = GRGObject(calldata_gt=toy_grg, mutable=True)
print('samples (haplotypes):', toy_grg.num_samples)
print('samples (diploid):', toy_obj.n_samples())
print('mutations:', toy_obj.n_snps())
print('allele freq:', toy_obj.allele_freq())
samples (haplotypes): 6
samples (diploid): 3
mutations: 5
allele freq: [1. 0.5 0.5 0.16666667 0.16666667]
2) Save and reload GRG with snputils¶
tmpdir = Path(tempfile.mkdtemp(prefix='snputils_grg_demo_'))
grg_path = tmpdir / 'toy.grg'
GRGWriter(toy_grg, str(grg_path)).write()
loaded_via_class = GRGReader(grg_path).read(mutable=False)
loaded_via_func = read_grg(grg_path, mutable=False)
print('tmpdir:', tmpdir)
print('grg_path exists:', grg_path.exists())
print('class reader -> mutable:', loaded_via_class.mutable, 'snps:', loaded_via_class.n_snps())
print('functional read_grg -> mutable:', loaded_via_func.mutable, 'snps:', loaded_via_func.n_snps())
tmpdir: /var/folders/wt/pt9z1wnd52v6pjpg9y6z42nm0000gn/T/snputils_grg_demo_76vbeu9s
grg_path exists: True
class reader -> mutable: False snps: 5
functional read_grg -> mutable: False snps: 5
3) Convert GRGObject to SNPObject¶
snp_phased = loaded_via_func.to_snpobject(chrom='22', sum_strands=False)
snp_summed = loaded_via_func.to_snpobject(chrom='22', sum_strands=True)
print('phased calldata_gt shape:', snp_phased.calldata_gt.shape)
print('summed calldata_gt shape:', snp_summed.calldata_gt.shape)
print('samples:', snp_phased.samples.tolist())
print('variant IDs:', snp_phased.variants_id.tolist())
# Sanity check: summed view should match phased sum over strands
np.testing.assert_array_equal(snp_summed.calldata_gt, snp_phased.calldata_gt.sum(axis=2))
print('summed == phased.sum(axis=2): OK')
phased calldata_gt shape: (5, 3, 2)
summed calldata_gt shape: (5, 3)
samples: ['sample_0', 'sample_1', 'sample_2']
variant IDs: ['22:100', '22:110', '22:120', '22:130', '22:140']
summed == phased.sum(axis=2): OK
4) PGEN roundtrip from converted SNPObject¶
pgen_prefix = tmpdir / 'toy_from_grg'
PGENWriter(snp_phased, str(pgen_prefix)).write(vzs=False, rename_missing_values=False)
snp_roundtrip = PGENReader(pgen_prefix).read(sum_strands=False)
print('pgen prefix:', pgen_prefix)
print('roundtrip calldata_gt shape:', snp_roundtrip.calldata_gt.shape)
np.testing.assert_array_equal(snp_roundtrip.calldata_gt, snp_phased.calldata_gt)
print('roundtrip genotypes: OK')
pgen prefix: /var/folders/wt/pt9z1wnd52v6pjpg9y6z42nm0000gn/T/snputils_grg_demo_76vbeu9s/toy_from_grg
roundtrip calldata_gt shape: (5, 3, 2)
roundtrip genotypes: OK
5) Optional: VCF -> GRG with vcf_to_grg()¶
This example uses a temporary tiny VCF and force=True.
For tiny synthetic VCFs, GRG construction may produce 0 mapped mutations depending on GRG construction heuristics;
the goal here is to demonstrate the API call and output file generation.
if shutil.which('grg') is None:
print('grg CLI not available; skipping VCF -> GRG example')
else:
tiny_vcf = tmpdir / 'tiny.vcf'
samples = [f's{i}' for i in range(12)]
header = '\t'.join(['#CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT', *samples])
gt_row = '\t'.join((['0|0'] * 6) + (['1|1'] * 6))
tiny_vcf.write_text(
'##fileformat=VCFv4.2\n'
'##contig=<ID=1>\n'
'##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n'
+ header + '\n'
+ f'1\t100\trs1\tA\tG\t.\tPASS\t.\tGT\t{gt_row}\n'
)
tiny_grg = tmpdir / 'tiny_from_vcf.grg'
vcf_to_grg(tiny_vcf, parts=1, jobs=1, trees=1, force=True, out_file=str(tiny_grg))
tiny_loaded = pyg.load_immutable_grg(str(tiny_grg), True)
print('tiny_grg exists:', tiny_grg.exists())
print('tiny_grg samples:', tiny_loaded.num_samples)
print('tiny_grg mutations:', tiny_loaded.num_mutations)
tiny_grg exists: True
tiny_grg samples: 24
tiny_grg mutations: 0
Notes¶
GRGObject.to_snpobject()currently materializes a dense genotype matrix, so memory scales withn_snps * n_samples.Once converted to
SNPObject, all regular SNP workflows insnputilsapply (writing PGEN/VCF, plotting, etc.).