snputils.ibd

View Source

1from .genobj.ibdobj import IBDObject
2from .io import read_ibd, HapIBDReader, AncIBDReader, IBDReader
3
4__all__ = ['IBDObject', 'read_ibd', 'HapIBDReader', 'AncIBDReader', 'IBDReader']

def read_ibd( file: str | pathlib.Path, **kwargs) -> IBDObject: View Source

 8def read_ibd(file: Union[str, Path], **kwargs) -> IBDObject:
 9    """
10    Automatically detect the IBD data file format from the file's extension and read it into an `IBDObject`.
11
12    Supported formats:
13    - Hap-IBD (no standard extension; defaults to tab-delimited columns without header).
14    - ancIBD (template only).
15
16    Args:
17        file (str or pathlib.Path): Path to the file to be read.
18        **kwargs: Additional arguments passed to the reader method.
19    """
20    from snputils.ibd.io.read.auto import IBDReader
21
22    return IBDReader(file).read(**kwargs)

Automatically detect the IBD data file format from the given path (the file's extension, or the contents of a directory) and read it into an IBDObject.

Supported formats:

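Hap-IBD (typically *.ibd or *.ibd.gz; headerless columns, whitespace-delimited by default; also used as the fallback when the format is not recognized).
ancIBD (tab-separated output with a header; a *.tsv / *.tsv.gz file, or a directory containing ch_all.tsv or ch*.tsv files).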

Arguments:

file (str or pathlib.Path): Path to the file to be read.
**kwargs: Additional arguments passed to the reader method.

class HapIBDReader(snputils.ibd.io.read.base.IBDBaseReader): View Source

 18class HapIBDReader(IBDBaseReader):
 19    """
 20    Reads an IBD file in Hap-IBD format and processes it into an `IBDObject`.
 21    """
 22
 23    def read(self, separator: Optional[str] = None) -> IBDObject:
 24        """
 25        Read a Hap-IBD file into an `IBDObject`.
 26
 27        The Hap-IBD format is a delimited text without a header with columns:
 28        sample_id_1, haplotype_id_1, sample_id_2, haplotype_id_2, chromosome, start, end, length_cm
 29
 30        Notes:
 31        - Haplotype identifiers are 1-based and take values in {1, 2}.
 32
 33        Args:
 34            separator (str, optional): Field delimiter. If None, whitespace (any number of spaces or tabs) is assumed.
 35
 36        Returns:
 37            **IBDObject**: An IBDObject instance.
 38        """
 39        log.info(f"Reading {self.file}")
 40
 41        # Column names for Hap-IBD files (no header present in input)
 42        col_names = [
 43            'sample_id_1', 'haplotype_id_1', 'sample_id_2', 'haplotype_id_2',
 44            'chrom', 'start', 'end', 'length_cm'
 45        ]
 46
 47        # Detect gzip by extension
 48        is_gz = str(self.file).endswith('.gz')
 49
 50        # If separator is None, treat as whitespace-delimited (any spaces or tabs)
 51        if separator is None:
 52            # Polars doesn't support regex separators; normalize whitespace to single tabs before parsing
 53            if is_gz:
 54                with gzip.open(self.file, 'rt') as f:
 55                    lines = [re.sub(r"\s+", "\t", line.strip()) for line in f if line.strip()]
 56            else:
 57                with open(self.file, 'r') as f:
 58                    lines = [re.sub(r"\s+", "\t", line.strip()) for line in f if line.strip()]
 59
 60            data = StringIO("\n".join(lines))
 61            df = pl.read_csv(
 62                source=data,
 63                has_header=False,
 64                separator='\t',
 65                new_columns=col_names,
 66                schema_overrides={
 67                    'sample_id_1': pl.Utf8,
 68                    'haplotype_id_1': pl.Int8,
 69                    'sample_id_2': pl.Utf8,
 70                    'haplotype_id_2': pl.Int8,
 71                    'chrom': pl.Utf8,
 72                    'start': pl.Int64,
 73                    'end': pl.Int64,
 74                    'length_cm': pl.Float64,
 75                },
 76            )
 77        else:
 78            df = pl.read_csv(
 79                source=str(self.file),
 80                has_header=False,
 81                separator=separator,
 82                new_columns=col_names,
 83                schema_overrides={
 84                    'sample_id_1': pl.Utf8,
 85                    'haplotype_id_1': pl.Int8,
 86                    'sample_id_2': pl.Utf8,
 87                    'haplotype_id_2': pl.Int8,
 88                    'chrom': pl.Utf8,
 89                    'start': pl.Int64,
 90                    'end': pl.Int64,
 91                    'length_cm': pl.Float64,
 92                },
 93            )
 94
 95        ibdobj = IBDObject(
 96            sample_id_1=df['sample_id_1'].to_numpy(),
 97            haplotype_id_1=df['haplotype_id_1'].to_numpy(),
 98            sample_id_2=df['sample_id_2'].to_numpy(),
 99            haplotype_id_2=df['haplotype_id_2'].to_numpy(),
100            chrom=df['chrom'].to_numpy(),
101            start=df['start'].to_numpy(),
102            end=df['end'].to_numpy(),
103            length_cm=df['length_cm'].to_numpy(),
104            segment_type=np.array(["IBD1"] * df.height),  # hap-IBD does not distinguish; treat as IBD1
105        )
106
107        log.info(f"Finished reading {self.file}")
108
109        return ibdobj

Reads an IBD file in Hap-IBD format and processes it into an IBDObject.

def read( self, separator: str | None = None) -> IBDObject: View Source

 23    def read(self, separator: Optional[str] = None) -> IBDObject:
 24        """
 25        Read a Hap-IBD file into an `IBDObject`.
 26
 27        The Hap-IBD format is a delimited text without a header with columns:
 28        sample_id_1, haplotype_id_1, sample_id_2, haplotype_id_2, chromosome, start, end, length_cm
 29
 30        Notes:
 31        - Haplotype identifiers are 1-based and take values in {1, 2}.
 32
 33        Args:
 34            separator (str, optional): Field delimiter. If None, whitespace (any number of spaces or tabs) is assumed.
 35
 36        Returns:
 37            **IBDObject**: An IBDObject instance.
 38        """
 39        log.info(f"Reading {self.file}")
 40
 41        # Column names for Hap-IBD files (no header present in input)
 42        col_names = [
 43            'sample_id_1', 'haplotype_id_1', 'sample_id_2', 'haplotype_id_2',
 44            'chrom', 'start', 'end', 'length_cm'
 45        ]
 46
 47        # Detect gzip by extension
 48        is_gz = str(self.file).endswith('.gz')
 49
 50        # If separator is None, treat as whitespace-delimited (any spaces or tabs)
 51        if separator is None:
 52            # Polars doesn't support regex separators; normalize whitespace to single tabs before parsing
 53            if is_gz:
 54                with gzip.open(self.file, 'rt') as f:
 55                    lines = [re.sub(r"\s+", "\t", line.strip()) for line in f if line.strip()]
 56            else:
 57                with open(self.file, 'r') as f:
 58                    lines = [re.sub(r"\s+", "\t", line.strip()) for line in f if line.strip()]
 59
 60            data = StringIO("\n".join(lines))
 61            df = pl.read_csv(
 62                source=data,
 63                has_header=False,
 64                separator='\t',
 65                new_columns=col_names,
 66                schema_overrides={
 67                    'sample_id_1': pl.Utf8,
 68                    'haplotype_id_1': pl.Int8,
 69                    'sample_id_2': pl.Utf8,
 70                    'haplotype_id_2': pl.Int8,
 71                    'chrom': pl.Utf8,
 72                    'start': pl.Int64,
 73                    'end': pl.Int64,
 74                    'length_cm': pl.Float64,
 75                },
 76            )
 77        else:
 78            df = pl.read_csv(
 79                source=str(self.file),
 80                has_header=False,
 81                separator=separator,
 82                new_columns=col_names,
 83                schema_overrides={
 84                    'sample_id_1': pl.Utf8,
 85                    'haplotype_id_1': pl.Int8,
 86                    'sample_id_2': pl.Utf8,
 87                    'haplotype_id_2': pl.Int8,
 88                    'chrom': pl.Utf8,
 89                    'start': pl.Int64,
 90                    'end': pl.Int64,
 91                    'length_cm': pl.Float64,
 92                },
 93            )
 94
 95        ibdobj = IBDObject(
 96            sample_id_1=df['sample_id_1'].to_numpy(),
 97            haplotype_id_1=df['haplotype_id_1'].to_numpy(),
 98            sample_id_2=df['sample_id_2'].to_numpy(),
 99            haplotype_id_2=df['haplotype_id_2'].to_numpy(),
100            chrom=df['chrom'].to_numpy(),
101            start=df['start'].to_numpy(),
102            end=df['end'].to_numpy(),
103            length_cm=df['length_cm'].to_numpy(),
104            segment_type=np.array(["IBD1"] * df.height),  # hap-IBD does not distinguish; treat as IBD1
105        )
106
107        log.info(f"Finished reading {self.file}")
108
109        return ibdobj

Read a Hap-IBD file into an IBDObject.

The Hap-IBD format is delimited text with no header row and the following columns, in order: sample_id_1, haplotype_id_1, sample_id_2, haplotype_id_2, chromosome, start, end, length_cm

Notes:

Haplotype identifiers are 1-based and take values in {1, 2}.

Arguments:

separator (str, optional): Field delimiter. If None, whitespace (any number of spaces or tabs) is assumed.

Returns:

IBDObject: An IBDObject instance.

class AncIBDReader(snputils.ibd.io.read.base.IBDBaseReader): View Source

 17class AncIBDReader(IBDBaseReader):
 18    """
 19    Reads IBD data from ancIBD outputs (TSV), accepting a file (`ch_all.tsv` or `ch*.tsv`) or a directory.
 20    """
 21
 22    def read(
 23        self,
 24        path: Optional[Union[str, Path]] = None,
 25        include_segment_types: Optional[Sequence[str]] = ("IBD1", "IBD2"),
 26    ) -> IBDObject:
 27        """
 28        Read ancIBD outputs and convert to `IBDObject`.
 29
 30        Inputs accepted:
 31        - A single TSV (optionally gzipped), e.g. `ch_all.tsv[.gz]` or `ch{CHR}.tsv[.gz]`.
 32        - A directory containing per-chromosome TSVs or `ch_all.tsv`.
 33
 34        Column schema (tab-separated with header):
 35        iid1, iid2, ch, Start, End, length, StartM, EndM, lengthM, StartBP, EndBP, segment_type
 36
 37        Notes:
 38        - Haplotype indices are not provided by ancIBD; set to -1.
 39        - Positions in IBDObject use base-pair StartBP/EndBP.
 40        - Length uses centiMorgan as `lengthM * 100`.
 41
 42        Args:
 43            path (str or Path, optional): Override input path. Defaults to `self.file`.
 44            include_segment_types (sequence of str, optional): Filter by `segment_type` (e.g., IBD1, IBD2). None to disable.
 45
 46        Returns:
 47            **IBDObject**: An IBDObject instance.
 48        """
 49        p = Path(path) if path is not None else Path(self.file)
 50        log.info(f"Reading ancIBD from {p}")
 51
 52        files: list[Path]
 53        if p.is_dir():
 54            # Prefer combined file if present, else gather per-chromosome files
 55            combined = p / "ch_all.tsv"
 56            combined_gz = p / "ch_all.tsv.gz"
 57            if combined.exists():
 58                files = [combined]
 59            elif combined_gz.exists():
 60                files = [combined_gz]
 61            else:
 62                files = sorted(list(p.glob("ch*.tsv")) + list(p.glob("ch*.tsv.gz")))
 63                if not files:
 64                    raise FileNotFoundError("No ancIBD output files found in directory.")
 65        else:
 66            files = [p]
 67
 68        frames = []
 69        schema_overrides = {
 70            "iid1": pl.Utf8,
 71            "iid2": pl.Utf8,
 72            "ch": pl.Utf8,
 73            "Start": pl.Int64,
 74            "End": pl.Int64,
 75            "length": pl.Int64,  # marker span; not used
 76            "StartM": pl.Float64,
 77            "EndM": pl.Float64,
 78            "lengthM": pl.Float64,
 79            "StartBP": pl.Int64,
 80            "EndBP": pl.Int64,
 81            "segment_type": pl.Utf8,
 82        }
 83
 84        for f in files:
 85            frame = pl.read_csv(str(f), separator="\t", has_header=True, schema_overrides=schema_overrides)
 86            frames.append(frame)
 87
 88        df = pl.concat(frames, how="vertical") if len(frames) > 1 else frames[0]
 89
 90        if include_segment_types is not None:
 91            df = df.filter(pl.col("segment_type").is_in(list(include_segment_types)))
 92
 93        # Map columns to IBDObject schema
 94        sample_id_1 = df["iid1"].to_numpy()
 95        sample_id_2 = df["iid2"].to_numpy()
 96        chrom = df["ch"].to_numpy()
 97        start_bp = df["StartBP"].to_numpy()
 98        end_bp = df["EndBP"].to_numpy()
 99        length_cm = (df["lengthM"] * 100.0).to_numpy()
100
101        # ancIBD doesn't include haplotype indices; set to -1
102        hap1 = np.full(sample_id_1.shape[0], -1, dtype=np.int8)
103        hap2 = np.full(sample_id_2.shape[0], -1, dtype=np.int8)
104
105        ibdobj = IBDObject(
106            sample_id_1=sample_id_1,
107            haplotype_id_1=hap1,
108            sample_id_2=sample_id_2,
109            haplotype_id_2=hap2,
110            chrom=chrom,
111            start=start_bp,
112            end=end_bp,
113            length_cm=length_cm,
114            segment_type=df["segment_type"].to_numpy(),
115        )
116
117        log.info(f"Finished reading ancIBD from {p}")
118        return ibdobj

Reads IBD data from ancIBD outputs (TSV), accepting a file (ch_all.tsv or ch*.tsv) or a directory.

def read( self, path: str | pathlib.Path | None = None, include_segment_types: Sequence[str] | None = ('IBD1', 'IBD2')) -> IBDObject: View Source

 22    def read(
 23        self,
 24        path: Optional[Union[str, Path]] = None,
 25        include_segment_types: Optional[Sequence[str]] = ("IBD1", "IBD2"),
 26    ) -> IBDObject:
 27        """
 28        Read ancIBD outputs and convert to `IBDObject`.
 29
 30        Inputs accepted:
 31        - A single TSV (optionally gzipped), e.g. `ch_all.tsv[.gz]` or `ch{CHR}.tsv[.gz]`.
 32        - A directory containing per-chromosome TSVs or `ch_all.tsv`.
 33
 34        Column schema (tab-separated with header):
 35        iid1, iid2, ch, Start, End, length, StartM, EndM, lengthM, StartBP, EndBP, segment_type
 36
 37        Notes:
 38        - Haplotype indices are not provided by ancIBD; set to -1.
 39        - Positions in IBDObject use base-pair StartBP/EndBP.
 40        - Length uses centiMorgan as `lengthM * 100`.
 41
 42        Args:
 43            path (str or Path, optional): Override input path. Defaults to `self.file`.
 44            include_segment_types (sequence of str, optional): Filter by `segment_type` (e.g., IBD1, IBD2). None to disable.
 45
 46        Returns:
 47            **IBDObject**: An IBDObject instance.
 48        """
 49        p = Path(path) if path is not None else Path(self.file)
 50        log.info(f"Reading ancIBD from {p}")
 51
 52        files: list[Path]
 53        if p.is_dir():
 54            # Prefer combined file if present, else gather per-chromosome files
 55            combined = p / "ch_all.tsv"
 56            combined_gz = p / "ch_all.tsv.gz"
 57            if combined.exists():
 58                files = [combined]
 59            elif combined_gz.exists():
 60                files = [combined_gz]
 61            else:
 62                files = sorted(list(p.glob("ch*.tsv")) + list(p.glob("ch*.tsv.gz")))
 63                if not files:
 64                    raise FileNotFoundError("No ancIBD output files found in directory.")
 65        else:
 66            files = [p]
 67
 68        frames = []
 69        schema_overrides = {
 70            "iid1": pl.Utf8,
 71            "iid2": pl.Utf8,
 72            "ch": pl.Utf8,
 73            "Start": pl.Int64,
 74            "End": pl.Int64,
 75            "length": pl.Int64,  # marker span; not used
 76            "StartM": pl.Float64,
 77            "EndM": pl.Float64,
 78            "lengthM": pl.Float64,
 79            "StartBP": pl.Int64,
 80            "EndBP": pl.Int64,
 81            "segment_type": pl.Utf8,
 82        }
 83
 84        for f in files:
 85            frame = pl.read_csv(str(f), separator="\t", has_header=True, schema_overrides=schema_overrides)
 86            frames.append(frame)
 87
 88        df = pl.concat(frames, how="vertical") if len(frames) > 1 else frames[0]
 89
 90        if include_segment_types is not None:
 91            df = df.filter(pl.col("segment_type").is_in(list(include_segment_types)))
 92
 93        # Map columns to IBDObject schema
 94        sample_id_1 = df["iid1"].to_numpy()
 95        sample_id_2 = df["iid2"].to_numpy()
 96        chrom = df["ch"].to_numpy()
 97        start_bp = df["StartBP"].to_numpy()
 98        end_bp = df["EndBP"].to_numpy()
 99        length_cm = (df["lengthM"] * 100.0).to_numpy()
100
101        # ancIBD doesn't include haplotype indices; set to -1
102        hap1 = np.full(sample_id_1.shape[0], -1, dtype=np.int8)
103        hap2 = np.full(sample_id_2.shape[0], -1, dtype=np.int8)
104
105        ibdobj = IBDObject(
106            sample_id_1=sample_id_1,
107            haplotype_id_1=hap1,
108            sample_id_2=sample_id_2,
109            haplotype_id_2=hap2,
110            chrom=chrom,
111            start=start_bp,
112            end=end_bp,
113            length_cm=length_cm,
114            segment_type=df["segment_type"].to_numpy(),
115        )
116
117        log.info(f"Finished reading ancIBD from {p}")
118        return ibdobj

Read ancIBD outputs and convert to IBDObject.

Inputs accepted:

A single TSV (optionally gzipped), e.g. ch_all.tsv[.gz] or ch{CHR}.tsv[.gz].
A directory containing per-chromosome TSVs or ch_all.tsv.

Column schema (tab-separated with header): iid1, iid2, ch, Start, End, length, StartM, EndM, lengthM, StartBP, EndBP, segment_type

Notes:

Haplotype indices are not provided by ancIBD; set to -1.
Positions in IBDObject use base-pair StartBP/EndBP.
Segment length is reported in centiMorgans, computed as lengthM * 100.

Arguments:

path (str or Path, optional): Override input path. Defaults to self.file.
include_segment_types (sequence of str, optional): Filter by segment_type (e.g., IBD1, IBD2). None to disable.

Returns:

IBDObject: An IBDObject instance.

class IBDReader: View Source

 8class IBDReader:
 9    def __new__(
10        cls,
11        file: Union[str, Path]
12    ) -> object:
13        """
14        A factory class that attempts to detect the IBD file format and returns the corresponding reader.
15
16        Supported detections:
17        - Hap-IBD: *.ibd or *.ibd.gz (headerless, 8 columns)
18        - ancIBD: directories with `ch_all.tsv`/`ch*.tsv` or files *.tsv / *.tsv.gz with ancIBD schema
19        """
20        file = Path(file)
21        suffixes = [s.lower() for s in file.suffixes]
22
23        # Directory-based detection for ancIBD
24        if file.is_dir():
25            if (file / 'ch_all.tsv').exists() or (file / 'ch_all.tsv.gz').exists():
26                from snputils.ibd.io.read.anc_ibd import AncIBDReader
27                return AncIBDReader(file)
28            has_chr_files = list(file.glob('ch*.tsv')) or list(file.glob('ch*.tsv.gz'))
29            if has_chr_files:
30                from snputils.ibd.io.read.anc_ibd import AncIBDReader
31                return AncIBDReader(file)
32            # Fallback to HapIBD if nothing matches
33            from snputils.ibd.io.read.hap_ibd import HapIBDReader
34            return HapIBDReader(file)
35
36        # File-based detection
37        if suffixes[-2:] == ['.ibd', '.gz'] or suffixes[-1:] == ['.ibd']:
38            from snputils.ibd.io.read.hap_ibd import HapIBDReader
39            return HapIBDReader(file)
40        if suffixes[-2:] == ['.tsv', '.gz'] or suffixes[-1:] == ['.tsv']:
41            from snputils.ibd.io.read.anc_ibd import AncIBDReader
42            return AncIBDReader(file)
43
44        # Default to HapIBDReader (most tools use .ibd[.gz])
45        from snputils.ibd.io.read.hap_ibd import HapIBDReader
46        return HapIBDReader(file)

IBDReader(file: 'Union[str, Path]') View Source

 9    def __new__(
10        cls,
11        file: Union[str, Path]
12    ) -> object:
13        """
14        A factory class that attempts to detect the IBD file format and returns the corresponding reader.
15
16        Supported detections:
17        - Hap-IBD: *.ibd or *.ibd.gz (headerless, 8 columns)
18        - ancIBD: directories with `ch_all.tsv`/`ch*.tsv` or files *.tsv / *.tsv.gz with ancIBD schema
19        """
20        file = Path(file)
21        suffixes = [s.lower() for s in file.suffixes]
22
23        # Directory-based detection for ancIBD
24        if file.is_dir():
25            if (file / 'ch_all.tsv').exists() or (file / 'ch_all.tsv.gz').exists():
26                from snputils.ibd.io.read.anc_ibd import AncIBDReader
27                return AncIBDReader(file)
28            has_chr_files = list(file.glob('ch*.tsv')) or list(file.glob('ch*.tsv.gz'))
29            if has_chr_files:
30                from snputils.ibd.io.read.anc_ibd import AncIBDReader
31                return AncIBDReader(file)
32            # Fallback to HapIBD if nothing matches
33            from snputils.ibd.io.read.hap_ibd import HapIBDReader
34            return HapIBDReader(file)
35
36        # File-based detection
37        if suffixes[-2:] == ['.ibd', '.gz'] or suffixes[-1:] == ['.ibd']:
38            from snputils.ibd.io.read.hap_ibd import HapIBDReader
39            return HapIBDReader(file)
40        if suffixes[-2:] == ['.tsv', '.gz'] or suffixes[-1:] == ['.tsv']:
41            from snputils.ibd.io.read.anc_ibd import AncIBDReader
42            return AncIBDReader(file)
43
44        # Default to HapIBDReader (most tools use .ibd[.gz])
45        from snputils.ibd.io.read.hap_ibd import HapIBDReader
46        return HapIBDReader(file)

A factory class that attempts to detect the IBD file format and returns the corresponding reader.

Supported detections:

Hap-IBD: *.ibd or *.ibd.gz (headerless, 8 columns)
ancIBD: directories with ch_all.tsv/ch*.tsv or files *.tsv / *.tsv.gz with ancIBD schema