snputils.simulation.simulator_cli

CLI wrapper for OnlineSimulator.

Example

python3 /home/users/geleta/snputils/snputils/simulation/simulator_cli.py --vcf /oak/stanford/projects/tobias/rita/1000G/ref_final_beagle_phased_1kg_hgdp_sgdp_chr22_hg19.vcf.gz --metadata /oak/stanford/projects/tobias/rita/1000G/reference_panel_metadata_v2.tsv --genetic-map /oak/stanford/projects/tobias/rita/1000G/allchrs.b37.gmap --chromosome 22 --window-size 1000 --store-latlon-as-nvec --batch-size 512 --num-generations 64 --n-batches 10 --output-dir /tmp

  1"""
  2CLI wrapper for OnlineSimulator.
  3
  4Example
  5-------
  6python3 /home/users/geleta/snputils/snputils/simulation/simulator_cli.py \
  7    --vcf /oak/stanford/projects/tobias/rita/1000G/ref_final_beagle_phased_1kg_hgdp_sgdp_chr22_hg19.vcf.gz \
  8    --metadata /oak/stanford/projects/tobias/rita/1000G/reference_panel_metadata_v2.tsv \
  9    --genetic-map /oak/stanford/projects/tobias/rita/1000G/allchrs.b37.gmap \
 10    --chromosome 22 \
 11    --window-size 1000 \
 12    --store-latlon-as-nvec \
 13    --batch-size 512 \
 14    --num-generations 64 \
 15    --n-batches 10 \
 16    --output-dir /tmp
 17"""
 18
 19import sys
 20import argparse
 21import logging
 22from pathlib import Path
 23import numpy as np
 24import pandas as pd
 25
 26from snputils.snp.io.read.vcf import VCFReaderPolars
 27from snputils.simulation.simulator import OnlineSimulator
 28
 29logging.basicConfig(level=logging.INFO,
 30                    format="%(asctime)s%(levelname)-8s%(message)s",
 31                    datefmt="%Y-%m-%d %H:%M:%S")
 32log = logging.getLogger("simulator_cli")
 33    
 34def parse_sim_args() -> argparse.Namespace:
 35    """Parse command-line flags for the simulator CLI."""
 36    p = argparse.ArgumentParser(
 37        prog="simulator_cli",
 38        description="Batch-simulate admixed haplotypes with OnlineSimulator.",
 39        formatter_class=argparse.ArgumentDefaultsHelpFormatter
 40    )
 41
 42    # Required I/O
 43    p.add_argument("--vcf", required=True,
 44                   help="Path to the phased VCF/VCF-gz file.")
 45    p.add_argument("--metadata", required=True,
 46                   help="TSV/CSV file with at least Sample / Population / Latitude / Longitude.")
 47    p.add_argument("--output-dir", required=True,
 48                   help="Directory in which to save the simulated batches.")
 49    # Optional genetic map
 50    p.add_argument("--genetic-map", default=None,
 51                   help="Genetic map table with columns: chrom, pos, cM.")
 52    p.add_argument("--chromosome", type=int, default=None,
 53                   help="If provided, restrict genetic map rows to this chromosome id.")
 54
 55    # Simulator hyper-parameters
 56    p.add_argument("--window-size", type=int, default=1000,
 57                   help="#SNPs per window.")
 58    p.add_argument("--store-latlon-as-nvec", action="store_true",
 59                   help="Convert lat/lon to unit n-vectors (x,y,z).")
 60    p.add_argument("--make-haploid", action="store_true",
 61                   help="Flatten diploid genotypes into haplotypes.")
 62    p.add_argument("--device", default="cpu",
 63                   help="torch device string, e.g. 'cuda:0'.")
 64
 65    # Admixture parameters
 66    p.add_argument("--batch-size", type=int, default=256,
 67                   help="#simulated haplotypes per batch.")
 68    p.add_argument("--num-generations", type=int, default=10,
 69                   help="Upper bound on random generations since admixture.")
 70    p.add_argument("--n-batches", type=int, default=1,
 71                   help="#separate batches to generate & save.")
 72
 73    # Misc
 74    p.add_argument("-v", "--verbose", action="store_true",
 75                   help="Print additional debugging info.")
 76
 77    args = p.parse_args()
 78
 79    if args.verbose:
 80        log.setLevel(logging.DEBUG)
 81
 82    return args
 83
 84def main():
 85    args = parse_sim_args()
 86
 87    # 1) Sanity checks / output directory
 88    out_dir = Path(args.output_dir).expanduser()
 89    out_dir.mkdir(parents=True, exist_ok=True)
 90    log.info("Output directory: %s", out_dir.resolve())
 91
 92    # 2) Read inputs
 93    log.info("Reading VCF …")
 94    vcf_reader = VCFReaderPolars(args.vcf)
 95    vcf_data   = vcf_reader.read()
 96
 97    log.info("Reading metadata table …")
 98    meta = pd.read_csv(args.metadata, sep=None, engine="python")
 99    # Subset to single-ancestry samples if that column exists
100    if "Single_Ancestry" in meta.columns:
101        meta = meta[meta.Single_Ancestry == True]
102
103    # Keep only the required columns
104    cols_needed = ["Sample", "Population", "Latitude", "Longitude"]
105    missing = [c for c in cols_needed if c not in meta.columns]
106    if missing:
107        log.error("Metadata is missing columns: %s", ", ".join(missing))
108        sys.exit(1)
109    meta = meta[cols_needed]
110
111    # Genetic map (optional)
112    genetic_map = None
113    if args.genetic_map:
114        log.info("Reading genetic map …")
115        gm = pd.read_csv(args.genetic_map, sep=None, engine="python",
116                         names=["chm", "pos", "cM"])
117        if args.chromosome is not None:
118            gm = gm[gm.chm == args.chromosome]
119        genetic_map = gm
120
121    # 3) Instantiate simulator
122    log.info("Initialising OnlineSimulator …")
123    simulator = OnlineSimulator(
124        vcf_data           = vcf_data,
125        meta               = meta,
126        genetic_map        = genetic_map,
127        make_haploid       = args.make_haploid,
128        window_size        = args.window_size,
129        store_latlon_as_nvec = args.store_latlon_as_nvec,
130    )
131
132    # 4) Run batches
133    log.info("Generating %d batch(es)…", args.n_batches)
134    for b in range(1, args.n_batches + 1):
135        snps, labels_d, labels_c, cp = simulator.simulate(
136            batch_size        = args.batch_size,
137            num_generation_max= args.num_generations,
138            pool_method       = "mode",
139            device            = args.device
140        )
141
142        # 5) Save
143        out_path = out_dir / f"batch_{b:04d}.npz"
144        np.savez_compressed(
145            out_path,
146            snps      = snps.cpu().numpy(),
147            labels_d  = (labels_d.cpu().numpy()
148                         if labels_d is not None else np.empty(0)),
149            labels_c  = (labels_c.cpu().numpy()
150                         if labels_c is not None else np.empty(0)),
151            cp        = (cp.cpu().numpy()
152                         if cp is not None else np.empty(0)),
153        )
154        log.info("Saved %s", out_path.name)
155
156    log.info("[✓] All done. %d files written to %s", args.n_batches, out_dir)
157
158
159if __name__ == "__main__":
160    try:
161        main()
162    except Exception as exc:
163        log.exception("Fatal error: %s", exc)
164        sys.exit(1)
log = <Logger simulator_cli (INFO)>
def parse_sim_args() -> argparse.Namespace:
35def parse_sim_args() -> argparse.Namespace:
36    """Parse command-line flags for the simulator CLI."""
37    p = argparse.ArgumentParser(
38        prog="simulator_cli",
39        description="Batch-simulate admixed haplotypes with OnlineSimulator.",
40        formatter_class=argparse.ArgumentDefaultsHelpFormatter
41    )
42
43    # Required I/O
44    p.add_argument("--vcf", required=True,
45                   help="Path to the phased VCF/VCF-gz file.")
46    p.add_argument("--metadata", required=True,
47                   help="TSV/CSV file with at least Sample / Population / Latitude / Longitude.")
48    p.add_argument("--output-dir", required=True,
49                   help="Directory in which to save the simulated batches.")
50    # Optional genetic map
51    p.add_argument("--genetic-map", default=None,
52                   help="Genetic map table with columns: chrom, pos, cM.")
53    p.add_argument("--chromosome", type=int, default=None,
54                   help="If provided, restrict genetic map rows to this chromosome id.")
55
56    # Simulator hyper-parameters
57    p.add_argument("--window-size", type=int, default=1000,
58                   help="#SNPs per window.")
59    p.add_argument("--store-latlon-as-nvec", action="store_true",
60                   help="Convert lat/lon to unit n-vectors (x,y,z).")
61    p.add_argument("--make-haploid", action="store_true",
62                   help="Flatten diploid genotypes into haplotypes.")
63    p.add_argument("--device", default="cpu",
64                   help="torch device string, e.g. 'cuda:0'.")
65
66    # Admixture parameters
67    p.add_argument("--batch-size", type=int, default=256,
68                   help="#simulated haplotypes per batch.")
69    p.add_argument("--num-generations", type=int, default=10,
70                   help="Upper bound on random generations since admixture.")
71    p.add_argument("--n-batches", type=int, default=1,
72                   help="#separate batches to generate & save.")
73
74    # Misc
75    p.add_argument("-v", "--verbose", action="store_true",
76                   help="Print additional debugging info.")
77
78    args = p.parse_args()
79
80    if args.verbose:
81        log.setLevel(logging.DEBUG)
82
83    return args

Parse command-line flags for the simulator CLI.

def main():
 85def main():
 86    args = parse_sim_args()
 87
 88    # 1) Sanity checks / output directory
 89    out_dir = Path(args.output_dir).expanduser()
 90    out_dir.mkdir(parents=True, exist_ok=True)
 91    log.info("Output directory: %s", out_dir.resolve())
 92
 93    # 2) Read inputs
 94    log.info("Reading VCF …")
 95    vcf_reader = VCFReaderPolars(args.vcf)
 96    vcf_data   = vcf_reader.read()
 97
 98    log.info("Reading metadata table …")
 99    meta = pd.read_csv(args.metadata, sep=None, engine="python")
100    # Subset to single-ancestry samples if that column exists
101    if "Single_Ancestry" in meta.columns:
102        meta = meta[meta.Single_Ancestry == True]
103
104    # Keep only the required columns
105    cols_needed = ["Sample", "Population", "Latitude", "Longitude"]
106    missing = [c for c in cols_needed if c not in meta.columns]
107    if missing:
108        log.error("Metadata is missing columns: %s", ", ".join(missing))
109        sys.exit(1)
110    meta = meta[cols_needed]
111
112    # Genetic map (optional)
113    genetic_map = None
114    if args.genetic_map:
115        log.info("Reading genetic map …")
116        gm = pd.read_csv(args.genetic_map, sep=None, engine="python",
117                         names=["chm", "pos", "cM"])
118        if args.chromosome is not None:
119            gm = gm[gm.chm == args.chromosome]
120        genetic_map = gm
121
122    # 3) Instantiate simulator
123    log.info("Initialising OnlineSimulator …")
124    simulator = OnlineSimulator(
125        vcf_data           = vcf_data,
126        meta               = meta,
127        genetic_map        = genetic_map,
128        make_haploid       = args.make_haploid,
129        window_size        = args.window_size,
130        store_latlon_as_nvec = args.store_latlon_as_nvec,
131    )
132
133    # 4) Run batches
134    log.info("Generating %d batch(es)…", args.n_batches)
135    for b in range(1, args.n_batches + 1):
136        snps, labels_d, labels_c, cp = simulator.simulate(
137            batch_size        = args.batch_size,
138            num_generation_max= args.num_generations,
139            pool_method       = "mode",
140            device            = args.device
141        )
142
143        # 5) Save
144        out_path = out_dir / f"batch_{b:04d}.npz"
145        np.savez_compressed(
146            out_path,
147            snps      = snps.cpu().numpy(),
148            labels_d  = (labels_d.cpu().numpy()
149                         if labels_d is not None else np.empty(0)),
150            labels_c  = (labels_c.cpu().numpy()
151                         if labels_c is not None else np.empty(0)),
152            cp        = (cp.cpu().numpy()
153                         if cp is not None else np.empty(0)),
154        )
155        log.info("Saved %s", out_path.name)
156
157    log.info("[✓] All done. %d files written to %s", args.n_batches, out_dir)