snputils.simulation.simulator_cli
CLI wrapper for OnlineSimulator.
Example
python3 /home/users/geleta/snputils/snputils/simulation/simulator_cli.py --vcf /oak/stanford/projects/tobias/rita/1000G/ref_final_beagle_phased_1kg_hgdp_sgdp_chr22_hg19.vcf.gz --metadata /oak/stanford/projects/tobias/rita/1000G/reference_panel_metadata_v2.tsv --genetic-map /oak/stanford/projects/tobias/rita/1000G/allchrs.b37.gmap --chromosome 22 --window-size 1000 --store-latlon-as-nvec --batch-size 512 --num-generations 64 --n-batches 10 --output-dir /tmp
1""" 2CLI wrapper for OnlineSimulator. 3 4Example 5------- 6python3 /home/users/geleta/snputils/snputils/simulation/simulator_cli.py \ 7 --vcf /oak/stanford/projects/tobias/rita/1000G/ref_final_beagle_phased_1kg_hgdp_sgdp_chr22_hg19.vcf.gz \ 8 --metadata /oak/stanford/projects/tobias/rita/1000G/reference_panel_metadata_v2.tsv \ 9 --genetic-map /oak/stanford/projects/tobias/rita/1000G/allchrs.b37.gmap \ 10 --chromosome 22 \ 11 --window-size 1000 \ 12 --store-latlon-as-nvec \ 13 --batch-size 512 \ 14 --num-generations 64 \ 15 --n-batches 10 \ 16 --output-dir /tmp 17""" 18 19import sys 20import argparse 21import logging 22from pathlib import Path 23import numpy as np 24import pandas as pd 25 26from snputils.snp.io.read.vcf import VCFReaderPolars 27from snputils.simulation.simulator import OnlineSimulator 28 29logging.basicConfig(level=logging.INFO, 30 format="%(asctime)s │ %(levelname)-8s │ %(message)s", 31 datefmt="%Y-%m-%d %H:%M:%S") 32log = logging.getLogger("simulator_cli") 33 34def parse_sim_args() -> argparse.Namespace: 35 """Parse command-line flags for the simulator CLI.""" 36 p = argparse.ArgumentParser( 37 prog="simulator_cli", 38 description="Batch-simulate admixed haplotypes with OnlineSimulator.", 39 formatter_class=argparse.ArgumentDefaultsHelpFormatter 40 ) 41 42 # Required I/O 43 p.add_argument("--vcf", required=True, 44 help="Path to the phased VCF/VCF-gz file.") 45 p.add_argument("--metadata", required=True, 46 help="TSV/CSV file with at least Sample / Population / Latitude / Longitude.") 47 p.add_argument("--output-dir", required=True, 48 help="Directory in which to save the simulated batches.") 49 # Optional genetic map 50 p.add_argument("--genetic-map", default=None, 51 help="Genetic map table with columns: chrom, pos, cM.") 52 p.add_argument("--chromosome", type=int, default=None, 53 help="If provided, restrict genetic map rows to this chromosome id.") 54 55 # Simulator hyper-parameters 56 p.add_argument("--window-size", type=int, default=1000, 57 help="#SNPs per window.") 58 p.add_argument("--store-latlon-as-nvec", action="store_true", 59 help="Convert lat/lon to unit n-vectors (x,y,z).") 60 p.add_argument("--make-haploid", action="store_true", 61 help="Flatten diploid genotypes into haplotypes.") 62 p.add_argument("--device", default="cpu", 63 help="torch device string, e.g. 'cuda:0'.") 64 65 # Admixture parameters 66 p.add_argument("--batch-size", type=int, default=256, 67 help="#simulated haplotypes per batch.") 68 p.add_argument("--num-generations", type=int, default=10, 69 help="Upper bound on random generations since admixture.") 70 p.add_argument("--n-batches", type=int, default=1, 71 help="#separate batches to generate & save.") 72 73 # Misc 74 p.add_argument("-v", "--verbose", action="store_true", 75 help="Print additional debugging info.") 76 77 args = p.parse_args() 78 79 if args.verbose: 80 log.setLevel(logging.DEBUG) 81 82 return args 83 84def main(): 85 args = parse_sim_args() 86 87 # 1) Sanity checks / output directory 88 out_dir = Path(args.output_dir).expanduser() 89 out_dir.mkdir(parents=True, exist_ok=True) 90 log.info("Output directory: %s", out_dir.resolve()) 91 92 # 2) Read inputs 93 log.info("Reading VCF …") 94 vcf_reader = VCFReaderPolars(args.vcf) 95 vcf_data = vcf_reader.read() 96 97 log.info("Reading metadata table …") 98 meta = pd.read_csv(args.metadata, sep=None, engine="python") 99 # Subset to single-ancestry samples if that column exists 100 if "Single_Ancestry" in meta.columns: 101 meta = meta[meta.Single_Ancestry == True] 102 103 # Keep only the required columns 104 cols_needed = ["Sample", "Population", "Latitude", "Longitude"] 105 missing = [c for c in cols_needed if c not in meta.columns] 106 if missing: 107 log.error("Metadata is missing columns: %s", ", ".join(missing)) 108 sys.exit(1) 109 meta = meta[cols_needed] 110 111 # Genetic map (optional) 112 genetic_map = None 113 if args.genetic_map: 114 log.info("Reading genetic map …") 115 gm = pd.read_csv(args.genetic_map, sep=None, engine="python", 116 names=["chm", "pos", "cM"]) 117 if args.chromosome is not None: 118 gm = gm[gm.chm == args.chromosome] 119 genetic_map = gm 120 121 # 3) Instantiate simulator 122 log.info("Initialising OnlineSimulator …") 123 simulator = OnlineSimulator( 124 vcf_data = vcf_data, 125 meta = meta, 126 genetic_map = genetic_map, 127 make_haploid = args.make_haploid, 128 window_size = args.window_size, 129 store_latlon_as_nvec = args.store_latlon_as_nvec, 130 ) 131 132 # 4) Run batches 133 log.info("Generating %d batch(es)…", args.n_batches) 134 for b in range(1, args.n_batches + 1): 135 snps, labels_d, labels_c, cp = simulator.simulate( 136 batch_size = args.batch_size, 137 num_generation_max= args.num_generations, 138 pool_method = "mode", 139 device = args.device 140 ) 141 142 # 5) Save 143 out_path = out_dir / f"batch_{b:04d}.npz" 144 np.savez_compressed( 145 out_path, 146 snps = snps.cpu().numpy(), 147 labels_d = (labels_d.cpu().numpy() 148 if labels_d is not None else np.empty(0)), 149 labels_c = (labels_c.cpu().numpy() 150 if labels_c is not None else np.empty(0)), 151 cp = (cp.cpu().numpy() 152 if cp is not None else np.empty(0)), 153 ) 154 log.info("Saved %s", out_path.name) 155 156 log.info("[✓] All done. %d files written to %s", args.n_batches, out_dir) 157 158 159if __name__ == "__main__": 160 try: 161 main() 162 except Exception as exc: 163 log.exception("Fatal error: %s", exc) 164 sys.exit(1)
log =
<Logger simulator_cli (INFO)>
def
parse_sim_args() -> argparse.Namespace:
35def parse_sim_args() -> argparse.Namespace: 36 """Parse command-line flags for the simulator CLI.""" 37 p = argparse.ArgumentParser( 38 prog="simulator_cli", 39 description="Batch-simulate admixed haplotypes with OnlineSimulator.", 40 formatter_class=argparse.ArgumentDefaultsHelpFormatter 41 ) 42 43 # Required I/O 44 p.add_argument("--vcf", required=True, 45 help="Path to the phased VCF/VCF-gz file.") 46 p.add_argument("--metadata", required=True, 47 help="TSV/CSV file with at least Sample / Population / Latitude / Longitude.") 48 p.add_argument("--output-dir", required=True, 49 help="Directory in which to save the simulated batches.") 50 # Optional genetic map 51 p.add_argument("--genetic-map", default=None, 52 help="Genetic map table with columns: chrom, pos, cM.") 53 p.add_argument("--chromosome", type=int, default=None, 54 help="If provided, restrict genetic map rows to this chromosome id.") 55 56 # Simulator hyper-parameters 57 p.add_argument("--window-size", type=int, default=1000, 58 help="#SNPs per window.") 59 p.add_argument("--store-latlon-as-nvec", action="store_true", 60 help="Convert lat/lon to unit n-vectors (x,y,z).") 61 p.add_argument("--make-haploid", action="store_true", 62 help="Flatten diploid genotypes into haplotypes.") 63 p.add_argument("--device", default="cpu", 64 help="torch device string, e.g. 'cuda:0'.") 65 66 # Admixture parameters 67 p.add_argument("--batch-size", type=int, default=256, 68 help="#simulated haplotypes per batch.") 69 p.add_argument("--num-generations", type=int, default=10, 70 help="Upper bound on random generations since admixture.") 71 p.add_argument("--n-batches", type=int, default=1, 72 help="#separate batches to generate & save.") 73 74 # Misc 75 p.add_argument("-v", "--verbose", action="store_true", 76 help="Print additional debugging info.") 77 78 args = p.parse_args() 79 80 if args.verbose: 81 log.setLevel(logging.DEBUG) 82 83 return args
Parse command-line flags for the simulator CLI.
def
main():
85def main(): 86 args = parse_sim_args() 87 88 # 1) Sanity checks / output directory 89 out_dir = Path(args.output_dir).expanduser() 90 out_dir.mkdir(parents=True, exist_ok=True) 91 log.info("Output directory: %s", out_dir.resolve()) 92 93 # 2) Read inputs 94 log.info("Reading VCF …") 95 vcf_reader = VCFReaderPolars(args.vcf) 96 vcf_data = vcf_reader.read() 97 98 log.info("Reading metadata table …") 99 meta = pd.read_csv(args.metadata, sep=None, engine="python") 100 # Subset to single-ancestry samples if that column exists 101 if "Single_Ancestry" in meta.columns: 102 meta = meta[meta.Single_Ancestry == True] 103 104 # Keep only the required columns 105 cols_needed = ["Sample", "Population", "Latitude", "Longitude"] 106 missing = [c for c in cols_needed if c not in meta.columns] 107 if missing: 108 log.error("Metadata is missing columns: %s", ", ".join(missing)) 109 sys.exit(1) 110 meta = meta[cols_needed] 111 112 # Genetic map (optional) 113 genetic_map = None 114 if args.genetic_map: 115 log.info("Reading genetic map …") 116 gm = pd.read_csv(args.genetic_map, sep=None, engine="python", 117 names=["chm", "pos", "cM"]) 118 if args.chromosome is not None: 119 gm = gm[gm.chm == args.chromosome] 120 genetic_map = gm 121 122 # 3) Instantiate simulator 123 log.info("Initialising OnlineSimulator …") 124 simulator = OnlineSimulator( 125 vcf_data = vcf_data, 126 meta = meta, 127 genetic_map = genetic_map, 128 make_haploid = args.make_haploid, 129 window_size = args.window_size, 130 store_latlon_as_nvec = args.store_latlon_as_nvec, 131 ) 132 133 # 4) Run batches 134 log.info("Generating %d batch(es)…", args.n_batches) 135 for b in range(1, args.n_batches + 1): 136 snps, labels_d, labels_c, cp = simulator.simulate( 137 batch_size = args.batch_size, 138 num_generation_max= args.num_generations, 139 pool_method = "mode", 140 device = args.device 141 ) 142 143 # 5) Save 144 out_path = out_dir / f"batch_{b:04d}.npz" 145 np.savez_compressed( 146 out_path, 147 snps = snps.cpu().numpy(), 148 labels_d = (labels_d.cpu().numpy() 149 if labels_d is not None else np.empty(0)), 150 labels_c = (labels_c.cpu().numpy() 151 if labels_c is not None else np.empty(0)), 152 cp = (cp.cpu().numpy() 153 if cp is not None else np.empty(0)), 154 ) 155 log.info("Saved %s", out_path.name) 156 157 log.info("[✓] All done. %d files written to %s", args.n_batches, out_dir)