GRG Demo

This notebook demonstrates current snputils + pygrgl workflows:

  • Build a toy GRG in memory

  • Save/load GRG with snputils

  • Convert GRGObject -> SNPObject

  • Write/read PGEN from converted SNP data

  • Optional: VCF -> GRG conversion via grg construct

from pathlib import Path
import tempfile
import shutil

import numpy as np
import pygrgl as pyg

import snputils
from snputils.snp.genobj.grgobj import GRGObject
from snputils.snp.io import GRGWriter
from snputils.snp.io.read.functional import read_grg
from snputils.snp.io.read.grg import GRGReader
from snputils.snp.io.read.pgen import PGENReader
from snputils.snp.io.write.grg_from_vcf import vcf_to_grg
from snputils.snp.io.write.pgen import PGENWriter

print('snputils:', snputils.__version__)
print('grg CLI available:', shutil.which('grg') is not None)
print('grapp CLI available:', shutil.which('grapp') is not None)
snputils: 2.78
grg CLI available: True
grapp CLI available: False

1) Build a toy GRG in memory

def build_toy_grg() -> pyg.MutableGRG:
    # 3 diploid individuals => 6 haplotype sample nodes
    grg = pyg.MutableGRG(6, 2, True)
    root = grg.make_node()
    left = grg.make_node()
    right = grg.make_node()

    grg.connect(root, left)
    grg.connect(root, right)
    for sample in [0, 1, 2]:
        grg.connect(left, sample)
    for sample in [3, 4, 5]:
        grg.connect(right, sample)

    grg.add_mutation(pyg.Mutation(100.0, 'G', 'A', 0.0), root)
    grg.add_mutation(pyg.Mutation(110.0, 'T', 'C', 0.0), left)
    grg.add_mutation(pyg.Mutation(120.0, 'C', 'G', 0.0), right)
    grg.add_mutation(pyg.Mutation(130.0, 'A', 'T', 0.0), 0)
    grg.add_mutation(pyg.Mutation(140.0, 'G', 'T', 0.0), 5)
    return grg

toy_grg = build_toy_grg()
toy_obj = GRGObject(calldata_gt=toy_grg, mutable=True)

print('samples (haplotypes):', toy_grg.num_samples)
print('samples (diploid):', toy_obj.n_samples())
print('mutations:', toy_obj.n_snps())
print('allele freq:', toy_obj.allele_freq())
samples (haplotypes): 6
samples (diploid): 3
mutations: 5
allele freq: [1.         0.5        0.5        0.16666667 0.16666667]

2) Save and reload GRG with snputils

tmpdir = Path(tempfile.mkdtemp(prefix='snputils_grg_demo_'))
grg_path = tmpdir / 'toy.grg'
GRGWriter(toy_grg, str(grg_path)).write()

loaded_via_class = GRGReader(grg_path).read(mutable=False)
loaded_via_func = read_grg(grg_path, mutable=False)

print('tmpdir:', tmpdir)
print('grg_path exists:', grg_path.exists())
print('class reader -> mutable:', loaded_via_class.mutable, 'snps:', loaded_via_class.n_snps())
print('functional read_grg -> mutable:', loaded_via_func.mutable, 'snps:', loaded_via_func.n_snps())
tmpdir: /var/folders/wt/pt9z1wnd52v6pjpg9y6z42nm0000gn/T/snputils_grg_demo_76vbeu9s
grg_path exists: True
class reader -> mutable: False snps: 5
functional read_grg -> mutable: False snps: 5

3) Convert GRGObject to SNPObject

snp_phased = loaded_via_func.to_snpobject(chrom='22', sum_strands=False)
snp_summed = loaded_via_func.to_snpobject(chrom='22', sum_strands=True)

print('phased calldata_gt shape:', snp_phased.calldata_gt.shape)
print('summed calldata_gt shape:', snp_summed.calldata_gt.shape)
print('samples:', snp_phased.samples.tolist())
print('variant IDs:', snp_phased.variants_id.tolist())

# Sanity check: summed view should match phased sum over strands
np.testing.assert_array_equal(snp_summed.calldata_gt, snp_phased.calldata_gt.sum(axis=2))
print('summed == phased.sum(axis=2): OK')
phased calldata_gt shape: (5, 3, 2)
summed calldata_gt shape: (5, 3)
samples: ['sample_0', 'sample_1', 'sample_2']
variant IDs: ['22:100', '22:110', '22:120', '22:130', '22:140']
summed == phased.sum(axis=2): OK

4) PGEN roundtrip from converted SNPObject

pgen_prefix = tmpdir / 'toy_from_grg'
PGENWriter(snp_phased, str(pgen_prefix)).write(vzs=False, rename_missing_values=False)
snp_roundtrip = PGENReader(pgen_prefix).read(sum_strands=False)

print('pgen prefix:', pgen_prefix)
print('roundtrip calldata_gt shape:', snp_roundtrip.calldata_gt.shape)
np.testing.assert_array_equal(snp_roundtrip.calldata_gt, snp_phased.calldata_gt)
print('roundtrip genotypes: OK')
pgen prefix: /var/folders/wt/pt9z1wnd52v6pjpg9y6z42nm0000gn/T/snputils_grg_demo_76vbeu9s/toy_from_grg
roundtrip calldata_gt shape: (5, 3, 2)
roundtrip genotypes: OK

5) Optional: VCF -> GRG with vcf_to_grg()

This example uses a temporary tiny VCF and force=True. For tiny synthetic VCFs, GRG construction may produce 0 mapped mutations depending on GRG construction heuristics; the goal here is to demonstrate the API call and output file generation.

if shutil.which('grg') is None:
    print('grg CLI not available; skipping VCF -> GRG example')
else:
    tiny_vcf = tmpdir / 'tiny.vcf'
    samples = [f's{i}' for i in range(12)]
    header = '\t'.join(['#CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT', *samples])
    gt_row = '\t'.join((['0|0'] * 6) + (['1|1'] * 6))
    tiny_vcf.write_text(
        '##fileformat=VCFv4.2\n'
        '##contig=<ID=1>\n'
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n'
        + header + '\n'
        + f'1\t100\trs1\tA\tG\t.\tPASS\t.\tGT\t{gt_row}\n'
    )

    tiny_grg = tmpdir / 'tiny_from_vcf.grg'
    vcf_to_grg(tiny_vcf, parts=1, jobs=1, trees=1, force=True, out_file=str(tiny_grg))

    tiny_loaded = pyg.load_immutable_grg(str(tiny_grg), True)
    print('tiny_grg exists:', tiny_grg.exists())
    print('tiny_grg samples:', tiny_loaded.num_samples)
    print('tiny_grg mutations:', tiny_loaded.num_mutations)
tiny_grg exists: True
tiny_grg samples: 24
tiny_grg mutations: 0

Notes

  • GRGObject.to_snpobject() currently materializes a dense genotype matrix, so memory scales with n_snps * n_samples.

  • Once converted to SNPObject, all regular SNP workflows in snputils apply (writing PGEN/VCF, plotting, etc.).