Source code for snputils.ibd.io.read.anc_ibd

import logging
from pathlib import Path
from typing import Optional, Sequence, Union

import polars as pl
import numpy as np

from snputils.ibd.genobj.ibdobj import IBDObject
from snputils.ibd.io.read.base import IBDBaseReader


log = logging.getLogger(__name__)


[docs] class AncIBDReader(IBDBaseReader): """ Reads IBD data from ancIBD outputs (TSV), accepting a file (`ch_all.tsv` or `ch*.tsv`) or a directory. """
[docs] def read( self, path: Optional[Union[str, Path]] = None, include_segment_types: Optional[Sequence[str]] = ("IBD1", "IBD2"), ) -> IBDObject: """ Read ancIBD outputs and convert to `IBDObject`. Inputs accepted: - A single TSV (optionally gzipped), e.g. `ch_all.tsv[.gz]` or `ch{CHR}.tsv[.gz]`. - A directory containing per-chromosome TSVs or `ch_all.tsv`. Column schema (tab-separated with header): iid1, iid2, ch, Start, End, length, StartM, EndM, lengthM, StartBP, EndBP, segment_type Notes: - Haplotype indices are not provided by ancIBD; set to -1. - Positions in IBDObject use base-pair StartBP/EndBP. - Length uses centiMorgan as `lengthM * 100`. Args: path (str or Path, optional): Override input path. Defaults to `self.file`. include_segment_types (sequence of str, optional): Filter by `segment_type` (e.g., IBD1, IBD2). None to disable. Returns: **IBDObject**: An IBDObject instance. """ p = Path(path) if path is not None else Path(self.file) log.info(f"Reading ancIBD from {p}") files: list[Path] if p.is_dir(): # Prefer combined file if present, else gather per-chromosome files combined = p / "ch_all.tsv" combined_gz = p / "ch_all.tsv.gz" if combined.exists(): files = [combined] elif combined_gz.exists(): files = [combined_gz] else: files = sorted(list(p.glob("ch*.tsv")) + list(p.glob("ch*.tsv.gz"))) if not files: raise FileNotFoundError("No ancIBD output files found in directory.") else: files = [p] frames = [] schema_overrides = { "iid1": pl.Utf8, "iid2": pl.Utf8, "ch": pl.Utf8, "Start": pl.Int64, "End": pl.Int64, "length": pl.Int64, # marker span; not used "StartM": pl.Float64, "EndM": pl.Float64, "lengthM": pl.Float64, "StartBP": pl.Int64, "EndBP": pl.Int64, "segment_type": pl.Utf8, } for f in files: frame = pl.read_csv(str(f), separator="\t", has_header=True, schema_overrides=schema_overrides) frames.append(frame) df = pl.concat(frames, how="vertical") if len(frames) > 1 else frames[0] if include_segment_types is not None: df = df.filter(pl.col("segment_type").is_in(list(include_segment_types))) # Map columns to IBDObject schema sample_id_1 = df["iid1"].to_numpy() sample_id_2 = df["iid2"].to_numpy() chrom = df["ch"].to_numpy() start_bp = df["StartBP"].to_numpy() end_bp = df["EndBP"].to_numpy() length_cm = (df["lengthM"] * 100.0).to_numpy() # ancIBD doesn't include haplotype indices; set to -1 hap1 = np.full(sample_id_1.shape[0], -1, dtype=np.int8) hap2 = np.full(sample_id_2.shape[0], -1, dtype=np.int8) ibdobj = IBDObject( sample_id_1=sample_id_1, haplotype_id_1=hap1, sample_id_2=sample_id_2, haplotype_id_2=hap2, chrom=chrom, start=start_bp, end=end_bp, length_cm=length_cm, segment_type=df["segment_type"].to_numpy(), ) log.info(f"Finished reading ancIBD from {p}") return ibdobj