snputils

 1from importlib import import_module
 2from importlib.metadata import PackageNotFoundError, version
 3from typing import Dict, Tuple
 4
 5try:
 6    __version__ = version("snputils")
 7except PackageNotFoundError:
 8    __version__ = "unknown"
 9
10_LAZY_ATTRS: Dict[str, Tuple[str, str]] = {
11    "SNPObject": (".snp", "SNPObject"),
12    "GRGObject": (".snp", "GRGObject"),
13    "SNPReader": (".snp", "SNPReader"),
14    "BEDReader": (".snp", "BEDReader"),
15    "GRGReader": (".snp", "GRGReader"),
16    "GRGWriter": (".snp", "GRGWriter"),
17    "PGENReader": (".snp", "PGENReader"),
18    "VCFReader": (".snp", "VCFReader"),
19    "BEDWriter": (".snp", "BEDWriter"),
20    "PGENWriter": (".snp", "PGENWriter"),
21    "VCFWriter": (".snp", "VCFWriter"),
22    "read_snp": (".snp", "read_snp"),
23    "read_bed": (".snp", "read_bed"),
24    "read_pgen": (".snp", "read_pgen"),
25    "read_vcf": (".snp", "read_vcf"),
26    "read_grg": (".snp", "read_grg"),
27    "LocalAncestryObject": (".ancestry", "LocalAncestryObject"),
28    "GlobalAncestryObject": (".ancestry", "GlobalAncestryObject"),
29    "MSPReader": (".ancestry", "MSPReader"),
30    "MSPWriter": (".ancestry", "MSPWriter"),
31    "AdmixtureMappingVCFWriter": (".ancestry", "AdmixtureMappingVCFWriter"),
32    "AdmixtureReader": (".ancestry", "AdmixtureReader"),
33    "AdmixtureWriter": (".ancestry", "AdmixtureWriter"),
34    "read_lai": (".ancestry", "read_lai"),
35    "read_msp": (".ancestry", "read_msp"),
36    "read_adm": (".ancestry", "read_adm"),
37    "read_admixture": (".ancestry", "read_admixture"),
38    "IBDObject": (".ibd", "IBDObject"),
39    "read_ibd": (".ibd", "read_ibd"),
40    "HapIBDReader": (".ibd", "HapIBDReader"),
41    "AncIBDReader": (".ibd", "AncIBDReader"),
42    "IBDReader": (".ibd", "IBDReader"),
43    "MultiPhenotypeObject": (".phenotype", "MultiPhenotypeObject"),
44    "PhenotypeObject": (".phenotype", "PhenotypeObject"),
45    "MultiPhenReader": (".phenotype", "MultiPhenReader"),
46    "PhenotypeReader": (".phenotype", "PhenotypeReader"),
47    "load_dataset": (".datasets", "load_dataset"),
48    "viz": (".visualization", ""),
49}
50
51__all__ = list(_LAZY_ATTRS.keys())
52
53
def __getattr__(name):
    """
    Module-level attribute hook that resolves lazily exported names.

    Looks `name` up in `_LAZY_ATTRS`, imports the associated submodule relative
    to this package, and returns either the requested attribute or, when the
    table entry has an empty attribute name, the submodule itself. The resolved
    value is stored in the module globals so later lookups bypass this hook.

    Raises:
        AttributeError: If `name` is not listed in `_LAZY_ATTRS`.
    """
    if name not in _LAZY_ATTRS:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    module_name, attr_name = _LAZY_ATTRS[name]
    module = import_module(module_name, package=__name__)
    if attr_name:
        value = getattr(module, attr_name)
    else:
        value = module

    # Cache the resolved object on the module namespace.
    globals()[name] = value
    return value
64
65
def __dir__():
    """Return the sorted union of the current module globals and the names in `__all__`."""
    names = set(globals())
    names.update(__all__)
    return sorted(names)
class SNPObject:
  21class SNPObject:
  22    """
  23    A class for Single Nucleotide Polymorphism (SNP) data, with optional support for 
  24    SNP-level Local Ancestry Information (LAI).
  25    """
  26    def __init__(
  27        self,
  28        calldata_gt: Optional[np.ndarray] = None,
  29        samples: Optional[np.ndarray] = None,
  30        variants_ref: Optional[np.ndarray] = None,
  31        variants_alt: Optional[np.ndarray] = None,
  32        variants_chrom: Optional[np.ndarray] = None,
  33        variants_filter_pass: Optional[np.ndarray] = None,
  34        variants_id: Optional[np.ndarray] = None,
  35        variants_pos: Optional[np.ndarray] = None,
  36        variants_qual: Optional[np.ndarray] = None,
  37        calldata_lai: Optional[np.ndarray] = None,
  38        ancestry_map: Optional[Dict[str, str]] = None
  39    ) -> None:
  40        """
  41        Args:
  42            calldata_gt (array, optional): 
  43                An array containing genotype data for each sample. This array can be either 2D with shape 
  44                `(n_snps, n_samples)` if the paternal and maternal strands are summed, or 3D with shape 
  45                `(n_snps, n_samples, 2)` if the strands are kept separate.
  46            samples (array of shape (n_samples,), optional): 
  47                An array containing unique sample identifiers.
  48            variants_ref (array of shape (n_snps,), optional): 
  49                An array containing the reference allele for each SNP.
  50            variants_alt (array of shape (n_snps,), optional): 
  51                An array containing the alternate allele for each SNP.
  52            variants_chrom (array of shape (n_snps,), optional): 
  53                An array containing the chromosome for each SNP.
  54            variants_filter_pass (array of shape (n_snps,), optional): 
  55                An array indicating whether each SNP passed control checks.
  56            variants_id (array of shape (n_snps,), optional): 
  57                An array containing unique identifiers (IDs) for each SNP.
  58            variants_pos (array of shape (n_snps,), optional): 
  59                An array containing the chromosomal positions for each SNP.
  60            variants_qual (array of shape (n_snps,), optional): 
  61                An array containing the Phred-scaled quality score for each SNP.
  62            calldata_lai (array, optional): 
  63                An array containing the ancestry for each SNP. This array can be either 2D with shape
  64                `(n_snps, n_samples*2)`, or 3D with shape (n_snps, n_samples, 2).
  65            ancestry_map (dict of str to str, optional): 
  66                A dictionary mapping ancestry codes to region names.
  67        """
  68        self.__calldata_gt = calldata_gt
  69        self.__samples = samples
  70        self.__variants_ref = variants_ref
  71        self.__variants_alt = variants_alt
  72        self.__variants_chrom = variants_chrom
  73        self.__variants_filter_pass = variants_filter_pass
  74        self.__variants_id = variants_id
  75        self.__variants_pos = variants_pos
  76        self.__variants_qual = variants_qual
  77        self.__calldata_lai = calldata_lai
  78        self.__ancestry_map = ancestry_map
  79
  80        self._sanity_check()
  81
  82    def __getitem__(self, key: str) -> Any:
  83        """
  84        To access an attribute of the class using the square bracket notation,
  85        similar to a dictionary.
  86        """
  87        try:
  88            return getattr(self, key)
  89        except:
  90            raise KeyError(f'Invalid key: {key}.')
  91
  92    def __setitem__(self, key: str, value: Any):
  93        """
  94        To set an attribute of the class using the square bracket notation,
  95        similar to a dictionary.
  96        """
  97        try:
  98            setattr(self, key, value)
  99        except:
 100            raise KeyError(f'Invalid key: {key}.')
 101
 102    @property
 103    def calldata_gt(self) -> np.ndarray:
 104        """
 105        Retrieve `calldata_gt`.
 106
 107        Returns:
 108            **array:** 
 109                An array containing genotype data for each sample. This array can be either 2D with shape 
 110                `(n_snps, n_samples)` if the paternal and maternal strands are summed, or 3D with shape 
 111                `(n_snps, n_samples, 2)` if the strands are kept separate.
 112        """
 113        return self.__calldata_gt
 114
 115    @calldata_gt.setter
 116    def calldata_gt(self, x: np.ndarray):
 117        """
 118        Update `calldata_gt`.
 119        """
 120        self.__calldata_gt = x
 121
 122    @property
 123    def samples(self) -> Optional[np.ndarray]:
 124        """
 125        Retrieve `samples`.
 126
 127        Returns:
 128            **array of shape (n_samples,):** 
 129                An array containing unique sample identifiers.
 130        """
 131        return self.__samples
 132
 133    @samples.setter
 134    def samples(self, x: Union[List, np.ndarray]):
 135        """
 136        Update `samples`.
 137        """
 138        self.__samples = np.asarray(x)
 139
 140    @property
 141    def variants_ref(self) -> Optional[np.ndarray]:
 142        """
 143        Retrieve `variants_ref`.
 144
 145        Returns:
 146            **array of shape (n_snps,):** An array containing the reference allele for each SNP.
 147        """
 148        return self.__variants_ref
 149
 150    @variants_ref.setter
 151    def variants_ref(self, x: np.ndarray):
 152        """
 153        Update `variants_ref`.
 154        """
 155        self.__variants_ref = x
 156
 157    @property
 158    def variants_alt(self) -> Optional[np.ndarray]:
 159        """
 160        Retrieve `variants_alt`.
 161
 162        Returns:
 163            **array of shape (n_snps,):** An array containing the alternate allele for each SNP.
 164        """
 165        return self.__variants_alt
 166
 167    @variants_alt.setter
 168    def variants_alt(self, x: np.ndarray):
 169        """
 170        Update `variants_alt`.
 171        """
 172        self.__variants_alt = x
 173
 174    @property
 175    def variants_chrom(self) -> Optional[np.ndarray]:
 176        """
 177        Retrieve `variants_chrom`.
 178
 179        Returns:
 180            **array of shape (n_snps,):** An array containing the chromosome for each SNP.
 181        """
 182        return self.__variants_chrom
 183
 184    @variants_chrom.setter
 185    def variants_chrom(self, x: np.ndarray):
 186        """
 187        Update `variants_chrom`.
 188        """
 189        self.__variants_chrom = x
 190
 191    @property
 192    def variants_filter_pass(self) -> Optional[np.ndarray]:
 193        """
 194        Retrieve `variants_filter_pass`.
 195
 196        Returns:
 197            **array of shape (n_snps,):** An array indicating whether each SNP passed control checks.
 198        """
 199        return self.__variants_filter_pass
 200
 201    @variants_filter_pass.setter
 202    def variants_filter_pass(self, x: np.ndarray):
 203        """
 204        Update `variants_filter_pass`.
 205        """
 206        self.__variants_filter_pass = x
 207
 208    @property
 209    def variants_id(self) -> Optional[np.ndarray]:
 210        """
 211        Retrieve `variants_id`.
 212
 213        Returns:
 214            **array of shape (n_snps,):** An array containing unique identifiers (IDs) for each SNP.
 215        """
 216        return self.__variants_id
 217
 218    @variants_id.setter
 219    def variants_id(self, x: np.ndarray):
 220        """
 221        Update `variants_id`.
 222        """
 223        self.__variants_id = x
 224
 225    @property
 226    def variants_pos(self) -> Optional[np.ndarray]:
 227        """
 228        Retrieve `variants_pos`.
 229
 230        Returns:
 231            **array of shape (n_snps,):** An array containing the chromosomal positions for each SNP.
 232        """
 233        return self.__variants_pos
 234
 235    @variants_pos.setter
 236    def variants_pos(self, x: np.ndarray):
 237        """
 238        Update `variants_pos`.
 239        """
 240        self.__variants_pos = x
 241
 242    @property
 243    def variants_qual(self) -> Optional[np.ndarray]:
 244        """
 245        Retrieve `variants_qual`.
 246
 247        Returns:
 248            **array of shape (n_snps,):** An array containing the Phred-scaled quality score for each SNP.
 249        """
 250        return self.__variants_qual
 251
 252    @variants_qual.setter
 253    def variants_qual(self, x: np.ndarray):
 254        """
 255        Update `variants_qual`.
 256        """
 257        self.__variants_qual = x
 258
 259    @property
 260    def calldata_lai(self) -> Optional[np.ndarray]:
 261        """
 262        Retrieve `calldata_lai`.
 263
 264        Returns:
 265            **array:** 
 266                An array containing the ancestry for each SNP. This array can be either 2D with shape
 267                `(n_snps, n_samples*2)`, or 3D with shape (n_snps, n_samples, 2).
 268        """
 269        return self.__calldata_lai
 270
 271    @calldata_lai.setter
 272    def calldata_lai(self, x: np.ndarray):
 273        """
 274        Update `calldata_lai`.
 275        """
 276        self.__calldata_lai = x
 277
 278    @property
 279    def ancestry_map(self) -> Optional[Dict[str, str]]:
 280        """
 281        Retrieve `ancestry_map`.
 282
 283        Returns:
 284            **dict of str to str:** A dictionary mapping ancestry codes to region names.
 285        """
 286        return self.__ancestry_map
 287
 288    @ancestry_map.setter
 289    def ancestry_map(self, x):
 290        """
 291        Update `ancestry_map`.
 292        """
 293        self.__ancestry_map = x
 294
 295    @property
 296    def n_samples(self) -> int:
 297        """
 298        Retrieve `n_samples`.
 299
 300        Returns:
 301            **int:** The total number of samples.
 302        """
 303        if self.__samples is not None:
 304            return len(self.__samples)
 305        elif self.__calldata_gt is not None:
 306            return self.__calldata_gt.shape[1]
 307        elif self.__calldata_lai is not None:
 308            if self.__calldata_lai.ndim == 2:
 309                return self.__calldata_lai.shape[1] // 2
 310            elif self.__calldata_lai.ndim == 3:
 311                return self.__calldata_lai.shape[1]
 312        else:
 313            raise ValueError("Unable to determine the total number of samples: no relevant data is available.")
 314
 315    @property
 316    def n_snps(self) -> int:
 317        """
 318        Retrieve `n_snps`.
 319
 320        Returns:
 321            **int:** The total number of SNPs.
 322        """
 323        # List of attributes that can indicate the number of SNPs
 324        potential_attributes = [
 325            self.__calldata_gt,
 326            self.__variants_ref,
 327            self.__variants_alt,
 328            self.__variants_chrom,
 329            self.__variants_filter_pass,
 330            self.__variants_id,
 331            self.__variants_pos,
 332            self.__variants_qual,
 333            self.__calldata_lai
 334        ]
 335
 336        # Check each attribute for its first dimension, which corresponds to `n_snps`
 337        for attr in potential_attributes:
 338            if attr is not None:
 339                return attr.shape[0]
 340
 341        raise ValueError("Unable to determine the total number of SNPs: no relevant data is available.")
 342
 343    @property
 344    def n_chrom(self) -> Optional[int]:
 345        """
 346        Retrieve `n_chrom`.
 347
 348        Returns:
 349            **int:** The total number of unique chromosomes in `variants_chrom`.
 350        """
 351        if self.variants_chrom is None:
 352            warnings.warn("Chromosome data `variants_chrom` is None.")
 353            return None
 354
 355        return len(self.unique_chrom)
 356
 357    @property
 358    def n_ancestries(self) -> int:
 359        """
 360        Retrieve `n_ancestries`.
 361
 362        Returns:
 363            **int:** The total number of unique ancestries.
 364        """
 365        if self.__calldata_lai is not None:
 366            return len(np.unique(self.__calldata_lai))
 367        else:
 368            raise ValueError("Unable to determine the total number of ancestries: no relevant data is available.")
 369
 370    @property
 371    def unique_chrom(self) -> Optional[np.ndarray]:
 372        """
 373        Retrieve `unique_chrom`.
 374
 375        Returns:
 376            **array:** The unique chromosome names in `variants_chrom`, preserving their order of appearance.
 377        """
 378        if self.variants_chrom is None:
 379            warnings.warn("Chromosome data `variants_chrom` is None.")
 380            return None
 381
 382        # Identify unique chromosome names and their first indexes of occurrence
 383        _, idx = np.unique(self.variants_chrom, return_index=True)
 384        # Return chromosome names sorted by their first occurrence to maintain original order
 385        return self.variants_chrom[np.sort(idx)]
 386
 387    @property
 388    def are_strands_summed(self) -> bool:
 389        """
 390        Retrieve `are_strands_summed`.
 391        
 392        Returns:
 393            **bool:** 
 394                True if the maternal and paternal strands have been summed together, which is indicated by 
 395                `calldata_gt` having shape `(n_samples, n_snps)`. False if the strands are stored separately, 
 396                indicated by `calldata_gt` having shape `(n_samples, n_snps, 2)`.
 397        """
 398        if self.calldata_gt is None:
 399            warnings.warn("Genotype data `calldata_gt` is None.")
 400            return None
 401        
 402        return self.calldata_gt.ndim == 2
 403
 404    def copy(self) -> SNPObject:
 405        """
 406        Create and return a copy of `self`.
 407
 408        Returns:
 409            **SNPObject:** 
 410                A new instance of the current object.
 411        """
 412        return copy.deepcopy(self)
 413
 414    def keys(self) -> List[str]:
 415        """
 416        Retrieve a list of public attribute names for `self`.
 417
 418        Returns:
 419            **list of str:** 
 420                A list of attribute names, with internal name-mangling removed, 
 421                for easier reference to public attributes in the instance.
 422        """
 423        return [attr.replace('_SNPObject__', '') for attr in vars(self)]
 424
 425    def allele_freq(
 426        self,
 427        sample_labels: Optional[Sequence[Any]] = None,
 428        ancestry: Optional[Union[str, int]] = None,
 429        laiobj: Optional["LocalAncestryObject"] = None,
 430        return_counts: bool = False,
 431        as_dataframe: bool = False,
 432    ) -> Any:
 433        """
 434        Compute per-SNP alternate allele frequencies from `calldata_gt`.
 435
 436        Args:
 437            sample_labels (sequence, optional):
 438                Population label per sample. If None, computes cohort-level frequencies.
 439            ancestry (str or int, optional):
 440                If provided, compute ancestry-masked frequencies using SNP-level LAI.
 441            laiobj (LocalAncestryObject, optional):
 442                Optional LAI object used when `self.calldata_lai` is not set.
 443            return_counts (bool, default=False):
 444                If True, also return called-allele counts with the same shape as frequencies.
 445            as_dataframe (bool, default=False):
 446                If True, return pandas DataFrame output.
 447
 448        Returns:
 449            Frequencies as a NumPy array (or DataFrame if `as_dataframe=True`).
 450            If `return_counts=True`, returns `(freq, counts)`.
 451        """
 452        if self.calldata_gt is None:
 453            raise ValueError("Genotype data `calldata_gt` is None.")
 454
 455        gt = np.asarray(self.calldata_gt)
 456        if gt.ndim not in (2, 3):
 457            raise ValueError("'calldata_gt' must be 2D or 3D array")
 458
 459        n_samples = gt.shape[1]
 460
 461        grouped_output = sample_labels is not None
 462        if sample_labels is None:
 463            labels = np.repeat("__all__", n_samples)
 464        else:
 465            labels = np.asarray(sample_labels)
 466            if labels.ndim != 1:
 467                labels = labels.ravel()
 468            if labels.shape[0] != n_samples:
 469                raise ValueError(
 470                    "'sample_labels' must have length equal to the number of samples in `calldata_gt`."
 471                )
 472
 473        calldata_lai = None
 474        if ancestry is not None:
 475            if self.calldata_lai is not None:
 476                calldata_lai = self.calldata_lai
 477            elif laiobj is not None:
 478                try:
 479                    converted_lai = laiobj.convert_to_snp_level(snpobject=self, lai_format="3D")
 480                    calldata_lai = getattr(converted_lai, "calldata_lai", None)
 481                except Exception:
 482                    calldata_lai = None
 483
 484            if calldata_lai is None:
 485                raise ValueError(
 486                    "Ancestry-specific masking requires SNP-level LAI "
 487                    "(provide a LocalAncestryObject via 'laiobj' or ensure 'self.calldata_lai' is set)."
 488                )
 489
 490        afs, counts, pops = aggregate_pop_allele_freq(
 491            calldata_gt=gt,
 492            sample_labels=labels,
 493            ancestry=ancestry,
 494            calldata_lai=calldata_lai,
 495        )
 496
 497        if grouped_output:
 498            freq_out = afs
 499            count_out = counts
 500            if as_dataframe:
 501                import pandas as pd
 502
 503                freq_out = pd.DataFrame(afs, columns=pops)
 504                count_out = pd.DataFrame(counts, columns=pops)
 505        else:
 506            freq_out = afs[:, 0]
 507            count_out = counts[:, 0]
 508            if as_dataframe:
 509                import pandas as pd
 510
 511                freq_out = pd.DataFrame({"allele_freq": freq_out})
 512                count_out = pd.DataFrame({"called_alleles": count_out})
 513
 514        if return_counts:
 515            return freq_out, count_out
 516        return freq_out
 517
 518    def sum_strands(self, inplace: bool = False) -> Optional['SNPObject']:
 519        """
 520        Sum paternal and maternal strands.
 521
 522        Args:
 523            inplace (bool, default=False): 
 524                If True, modifies `self` in place. If False, returns a new `SNPObject` with the variants 
 525                filtered. Default is False.
 526
 527        Returns:
 528            **Optional[SNPObject]:** 
 529                A new `SNPObject` with summed strands if `inplace=False`. 
 530                If `inplace=True`, modifies `self` in place and returns None.
 531        """
 532        if self.calldata_gt is None:
 533            warnings.warn("Genotype data `calldata_gt` is None.")
 534            return None if not inplace else self
 535
 536        if self.are_strands_summed:
 537            warnings.warn("Genotype data `calldata_gt` is already summed.")
 538            return self if inplace else self.copy()
 539        
 540        if inplace:
 541            self.calldata_gt = self.calldata_gt.sum(axis=2, dtype=np.int8)
 542            return self
 543        else:
 544            snpobj = self.copy()
 545            snpobj.calldata_gt = self.calldata_gt.sum(axis=2, dtype=np.int8)
 546            return snpobj
 547
 548    def filter_variants(
 549            self, 
 550            chrom: Optional[Union[str, Sequence[str], np.ndarray, None]] = None, 
 551            pos: Optional[Union[int, Sequence[int], np.ndarray, None]] = None, 
 552            indexes: Optional[Union[int, Sequence[int], np.ndarray, None]] = None, 
 553            include: bool = True, 
 554            inplace: bool = False
 555        ) -> Optional['SNPObject']:
 556        """
 557        Filter variants based on specified chromosome names, variant positions, or variant indexes.
 558
 559        This method updates the `calldata_gt`, `variants_ref`, `variants_alt`, 
 560        `variants_chrom`, `variants_filter_pass`, `variants_id`, `variants_pos`,  
 561        `variants_qual`, and `lai` attributes to include or exclude the specified variants. The filtering 
 562        criteria can be based on chromosome names, variant positions, or indexes. If multiple 
 563        criteria are provided, their union is used for filtering. The order of the variants is preserved.
 564        
 565        Negative indexes are supported and follow 
 566        [NumPy's indexing conventions](https://numpy.org/doc/stable/user/basics.indexing.html).
 567
 568        Args:
 569            chrom (str or array_like of str, optional): 
 570                Chromosome(s) to filter variants by. Can be a single chromosome as a string or a sequence 
 571                of chromosomes. If both `chrom` and `pos` are provided, they must either have matching lengths 
 572                (pairing each chromosome with a position) or `chrom` should be a single value that applies to 
 573                all positions in `pos`. Default is None. 
 574            pos (int or array_like of int, optional): 
 575                Position(s) to filter variants by. Can be a single position as an integer or a sequence of positions. 
 576                If `chrom` is also provided, `pos` should either match `chrom` in length or `chrom` should be a 
 577                single value. Default is None.
 578            indexes (int or array_like of int, optional): 
 579                Index(es) of the variants to include or exclude. Can be a single index or a sequence
 580                of indexes. Negative indexes are supported. Default is None.
 581            include (bool, default=True): 
 582                If True, includes only the specified variants. If False, excludes the specified
 583                variants. Default is True.
 584            inplace (bool, default=False): 
 585                If True, modifies `self` in place. If False, returns a new `SNPObject` with the variants 
 586                filtered. Default is False.
 587
 588        Returns:
 589            **Optional[SNPObject]:** 
 590                A new `SNPObject` with the specified variants filtered if `inplace=False`. 
 591                If `inplace=True`, modifies `self` in place and returns None.
 592        """
 593        if chrom is None and pos is None and indexes is None:
 594            raise ValueError("At least one of 'chrom', 'pos', or 'indexes' must be provided.")
 595
 596        n_snps = self.n_snps
 597
 598        # Convert inputs to arrays for consistency
 599        chrom = np.atleast_1d(chrom) if chrom is not None else None
 600        pos = np.atleast_1d(pos) if pos is not None else None
 601        indexes = np.atleast_1d(indexes) if indexes is not None else None
 602
 603        # Validate chrom and pos lengths if both are provided
 604        if chrom is not None and pos is not None:
 605            if len(chrom) != len(pos) and len(chrom) > 1:
 606                raise ValueError(
 607                    "When both 'chrom' and 'pos' are provided, they must either be of the same length "
 608                    "or 'chrom' must be a single value."
 609                )
 610
 611        # Create a mask for chromosome and position filtering
 612        mask_combined = np.zeros(n_snps, dtype=bool)
 613        if chrom is not None and pos is not None:
 614            if len(chrom) == 1:
 615                # Apply single chromosome to all positions in `pos`
 616                mask_combined = (self['variants_chrom'] == chrom[0]) & np.isin(self['variants_pos'], pos)
 617            else:
 618                # Vectorized pair matching for chrom and pos
 619                query_pairs = np.array(
 620                    list(zip(chrom, pos)),
 621                    dtype=[
 622                        ('chrom', self['variants_chrom'].dtype),
 623                        ('pos', self['variants_pos'].dtype)
 624                    ]
 625                )
 626                data_pairs = np.array(
 627                    list(zip(self['variants_chrom'], self['variants_pos'])),
 628                    dtype=[
 629                        ('chrom', self['variants_chrom'].dtype),
 630                        ('pos', self['variants_pos'].dtype)
 631                    ]
 632                )
 633                mask_combined = np.isin(data_pairs, query_pairs)
 634
 635        elif chrom is not None:
 636            # Only chromosome filtering
 637            mask_combined = np.isin(self['variants_chrom'], chrom)
 638        elif pos is not None:
 639            # Only position filtering
 640            mask_combined = np.isin(self['variants_pos'], pos)
 641
 642        # Create mask based on indexes if provided
 643        if indexes is not None:
 644            # Validate indexes, allowing negative indexes
 645            out_of_bounds_indexes = indexes[(indexes < -n_snps) | (indexes >= n_snps)]
 646            if out_of_bounds_indexes.size > 0:
 647                raise ValueError(f"One or more sample indexes are out of bounds.")
 648
 649            # Handle negative indexes and check for out-of-bounds indexes
 650            adjusted_indexes = np.mod(indexes, n_snps)
 651
 652            # Create mask for specified indexes
 653            mask_indexes = np.zeros(n_snps, dtype=bool)
 654            mask_indexes[adjusted_indexes] = True
 655
 656            # Combine with `chrom` and `pos` mask using logical OR (union of all specified criteria)
 657            mask_combined = mask_combined | mask_indexes
 658
 659        # Invert mask if `include` is False
 660        if not include:
 661            mask_combined = ~mask_combined
 662
 663        # Define keys to filter
 664        keys = [
 665            'calldata_gt', 'variants_ref', 'variants_alt', 'variants_chrom', 'variants_filter_pass', 
 666            'variants_id', 'variants_pos', 'variants_qual', 'calldata_lai'
 667        ]
 668
 669        # Apply filtering based on inplace parameter
 670        if inplace:
 671            for key in keys:
 672                if self[key] is not None:
 673                    if self[key].ndim > 1:
 674                        self[key] = np.asarray(self[key])[mask_combined, ...]
 675                    else:
 676                        self[key] = np.asarray(self[key])[mask_combined]
 677
 678            return None
 679        else:
 680            # Create A new `SNPObject` with filtered data
 681            snpobj = self.copy()
 682            for key in keys:
 683                if snpobj[key] is not None:
 684                    if snpobj[key].ndim > 1:
 685                        snpobj[key] = np.asarray(snpobj[key])[mask_combined, ...]
 686                    else:
 687                        snpobj[key] = np.asarray(snpobj[key])[mask_combined]
 688
 689            return snpobj
 690
 691    def filter_samples(
 692            self, 
 693            samples: Optional[Union[str, Sequence[str], np.ndarray, None]] = None,
 694            indexes: Optional[Union[int, Sequence[int], np.ndarray, None]] = None,
 695            include: bool = True,
 696            reorder: bool = False,
 697            inplace: bool = False
 698        ) -> Optional['SNPObject']:
 699        """
 700        Filter samples based on specified names or indexes.
 701
 702        This method updates the `samples` and `calldata_gt` attributes to include or exclude the specified 
 703        samples. The order of the samples is preserved. Set `reorder=True` to match the ordering of the
 704        provided `samples` and/or `indexes` lists when including.
 705
 706        If both samples and indexes are provided, any sample matching either a name in samples or an index in 
 707        indexes will be included or excluded.
 708
 709        This method allows inclusion or exclusion of specific samples by their names or 
 710        indexes. When both sample names and indexes are provided, the union of the specified samples 
 711        is used. Negative indexes are supported and follow 
 712        [NumPy's indexing conventions](https://numpy.org/doc/stable/user/basics.indexing.html).
 713
 714        Args:
 715            samples (str or array_like of str, optional): 
 716                 Name(s) of the samples to include or exclude. Can be a single sample name or a
 717                 sequence of sample names. Default is None.
 718            indexes (int or array_like of int, optional):
 719                Index(es) of the samples to include or exclude. Can be a single index or a sequence
 720                of indexes. Negative indexes are supported. Default is None.
 721            include (bool, default=True): 
 722                If True, includes only the specified samples. If False, excludes the specified
 723                samples. Default is True.
 724            inplace (bool, default=False): 
 725                If True, modifies `self` in place. If False, returns a new `SNPObject` with the samples 
 726                filtered. Default is False.
 727
 728        Returns:
 729            **Optional[SNPObject]:** 
 730                A new `SNPObject` with the specified samples filtered if `inplace=False`. 
 731                If `inplace=True`, modifies `self` in place and returns None.
 732        """
 733        if samples is None and indexes is None:
 734            raise ValueError("At least one of 'samples' or 'indexes' must be provided.")
 735
 736        n_samples = self.n_samples
 737        sample_names = np.array(self['samples'])
 738
 739        # Create mask based on sample names
 740        if samples is not None:
 741            samples = np.asarray(samples).ravel()
 742            mask_samples = np.isin(sample_names, samples)
 743            missing_samples = samples[~np.isin(samples, sample_names)]
 744            if missing_samples.size > 0:
 745                raise ValueError(f"The following specified samples were not found: {missing_samples.tolist()}")
 746        else:
 747            mask_samples = np.zeros(n_samples, dtype=bool)
 748
 749        # Create mask based on sample indexes
 750        if indexes is not None:
 751            indexes = np.asarray(indexes).ravel()
 752
 753            # Validate indexes, allowing negative indexes
 754            out_of_bounds_indexes = indexes[(indexes < -n_samples) | (indexes >= n_samples)]
 755            if out_of_bounds_indexes.size > 0:
 756                raise ValueError(f"One or more sample indexes are out of bounds.")
 757            
 758            # Handle negative indexes
 759            adjusted_indexes = np.mod(indexes, n_samples)
 760
 761            mask_indexes = np.zeros(n_samples, dtype=bool)
 762            mask_indexes[adjusted_indexes] = True
 763        else:
 764            mask_indexes = np.zeros(n_samples, dtype=bool)
 765
 766        # Combine masks using logical OR (union of samples)
 767        mask_combined = mask_samples | mask_indexes
 768
 769        if not include:
 770            mask_combined = ~mask_combined
 771
 772        # If requested, compute an ordering of selected samples that follows the provided lists.
 773        ordered_indices = None
 774        if include and reorder:
 775            sel_indices = np.where(mask_combined)[0]
 776            ordered_list: List[int] = []
 777            added = np.zeros(n_samples, dtype=bool)
 778
 779            # Prioritize the order in `samples`
 780            if samples is not None:
 781                name_to_idx = {name: idx for idx, name in enumerate(sample_names)}
 782                for s in samples:
 783                    idx = name_to_idx.get(s)
 784                    if idx is not None and mask_combined[idx] and not added[idx]:
 785                        ordered_list.append(idx)
 786                        added[idx] = True
 787
 788            # Then respect the order in `indexes`
 789            if indexes is not None:
 790                adj_idx = np.mod(np.atleast_1d(indexes), n_samples)
 791                for idx in adj_idx:
 792                    if mask_combined[idx] and not added[idx]:
 793                        ordered_list.append(int(idx))
 794                        added[idx] = True
 795
 796            # Finally, append any remaining selected samples in their original order
 797            for idx in sel_indices:
 798                if not added[idx]:
 799                    ordered_list.append(int(idx))
 800
 801            ordered_indices = np.asarray(ordered_list, dtype=int)
 802
 803        # Define keys to filter
 804        keys = ['samples', 'calldata_gt', 'calldata_lai']
 805
 806        # Apply filtering based on inplace parameter
 807        if inplace:
 808            for key in keys:
 809                if self[key] is not None:
 810                    arr = np.asarray(self[key])
 811                    if ordered_indices is not None:
 812                        if key == 'calldata_lai' and arr.ndim == 2:
 813                            # Haplotype-aware reordering for 2D LAI (n_snps, n_samples*2)
 814                            hap_idx = np.concatenate([2*ordered_indices, 2*ordered_indices + 1])
 815                            self[key] = arr[:, hap_idx]
 816                        elif arr.ndim > 1:
 817                            self[key] = arr[:, ordered_indices, ...]
 818                        else:
 819                            self[key] = arr[ordered_indices]
 820                    else:
 821                        if arr.ndim > 1:
 822                            self[key] = arr[:, mask_combined, ...]
 823                        else:
 824                            self[key] = arr[mask_combined]
 825
 826            return None
 827        else:
 828            # Create A new `SNPObject` with filtered data
 829            snpobj = self.copy()
 830            for key in keys:
 831                if snpobj[key] is not None:
 832                    arr = np.asarray(snpobj[key])
 833                    if ordered_indices is not None:
 834                        if key == 'calldata_lai' and arr.ndim == 2:
 835                            hap_idx = np.concatenate([2*ordered_indices, 2*ordered_indices + 1])
 836                            snpobj[key] = arr[:, hap_idx]
 837                        elif arr.ndim > 1:
 838                            snpobj[key] = arr[:, ordered_indices, ...]
 839                        else:
 840                            snpobj[key] = arr[ordered_indices]
 841                    else:
 842                        if arr.ndim > 1:
 843                            snpobj[key] = arr[:, mask_combined, ...]
 844                        else:
 845                            snpobj[key] = arr[mask_combined]
 846            return snpobj
 847
 848    def detect_chromosome_format(self) -> str:
 849        """
 850        Detect the chromosome naming convention in `variants_chrom` based on the prefix 
 851        of the first chromosome identifier in `unique_chrom`.
 852        
 853        **Recognized formats:**
 854
 855        - `'chr'`: Format with 'chr' prefix, e.g., 'chr1', 'chr2', ..., 'chrX', 'chrY', 'chrM'.
 856        - `'chm'`: Format with 'chm' prefix, e.g., 'chm1', 'chm2', ..., 'chmX', 'chmY', 'chmM'.
 857        - `'chrom'`: Format with 'chrom' prefix, e.g., 'chrom1', 'chrom2', ..., 'chromX', 'chromY', 'chromM'.
 858        - `'plain'`: Plain format without a prefix, e.g., '1', '2', ..., 'X', 'Y', 'M'.
 859        
 860        If the format does not match any recognized pattern, `'Unknown format'` is returned.
 861
 862        Returns:
 863            **str:** 
 864                A string indicating the detected chromosome format (`'chr'`, `'chm'`, `'chrom'`, or `'plain'`).
 865                If no recognized format is matched, returns `'Unknown format'`.
 866        """
 867        # Select the first unique chromosome identifier for format detection
 868        chromosome_str = self.unique_chrom[0]
 869
 870        # Define regular expressions to match each recognized chromosome format
 871        patterns = {
 872            'chr': r'^chr(\d+|X|Y|M)$',    # Matches 'chr' prefixed format
 873            'chm': r'^chm(\d+|X|Y|M)$',    # Matches 'chm' prefixed format
 874            'chrom': r'^chrom(\d+|X|Y|M)$', # Matches 'chrom' prefixed format
 875            'plain': r'^(\d+|X|Y|M)$'       # Matches plain format without prefix
 876        }
 877
 878        # Iterate through the patterns to identify the chromosome format
 879        for prefix, pattern in patterns.items():
 880            if re.match(pattern, chromosome_str):
 881                return prefix  # Return the recognized format prefix
 882
 883        # If no pattern matches, return 'Unknown format'
 884        return 'Unknown format'
 885
 886    def convert_chromosome_format(
 887        self, 
 888        from_format: str, 
 889        to_format: str, 
 890        inplace: bool = False
 891    ) -> Optional['SNPObject']:
 892        """
 893        Convert the chromosome format from one naming convention to another in `variants_chrom`.
 894
 895        **Supported formats:**
 896
 897        - `'chr'`: Format with 'chr' prefix, e.g., 'chr1', 'chr2', ..., 'chrX', 'chrY', 'chrM'.
 898        - `'chm'`: Format with 'chm' prefix, e.g., 'chm1', 'chm2', ..., 'chmX', 'chmY', 'chmM'.
 899        - `'chrom'`: Format with 'chrom' prefix, e.g., 'chrom1', 'chrom2', ..., 'chromX', 'chromY', 'chromM'.
 900        - `'plain'`: Plain format without a prefix, e.g., '1', '2', ..., 'X', 'Y', 'M'.
 901
 902        Args:
 903            from_format (str): 
 904                The current chromosome format. Acceptable values are `'chr'`, `'chm'`, `'chrom'`, or `'plain'`.
 905            to_format (str): 
 906                The target format for chromosome data conversion. Acceptable values match `from_format` options.
 907            inplace (bool, default=False): 
 908                If True, modifies `self` in place. If False, returns a new `SNPObject` with the converted format.
 909                Default is False.
 910
 911        Returns:
 912            **Optional[SNPObject]:** A new `SNPObject` with the converted chromosome format if `inplace=False`. 
 913            If `inplace=True`, modifies `self` in place and returns None.
 914        """
 915        # Define the list of standard chromosome identifiers
 916        chrom_list = [*map(str, range(1, 23)), 'X', 'Y', 'M']  # M for mitochondrial chromosomes
 917
 918        # Format mappings for different chromosome naming conventions
 919        format_mappings = {
 920            'chr': [f'chr{i}' for i in chrom_list],
 921            'chm': [f'chm{i}' for i in chrom_list],
 922            'chrom': [f'chrom{i}' for i in chrom_list],
 923            'plain': chrom_list,
 924        }
 925
 926        # Verify that from_format and to_format are valid naming conventions
 927        if from_format not in format_mappings or to_format not in format_mappings:
 928            raise ValueError(f"Invalid format: {from_format} or {to_format}. Must be one of {list(format_mappings.keys())}.")
 929
 930        # Convert chromosomes to string for consistent comparison
 931        variants_chrom = self['variants_chrom'].astype(str)
 932
 933        # Verify that all chromosomes in the object follow the specified `from_format`
 934        expected_chroms = set(format_mappings[from_format])
 935        mismatched_chroms = set(variants_chrom) - expected_chroms
 936
 937        if mismatched_chroms:
 938            raise ValueError(f"The following chromosomes do not match the `from_format` '{from_format}': {mismatched_chroms}.")
 939
 940        # Create conditions for selecting based on current `from_format` names
 941        conditions = [variants_chrom == chrom for chrom in format_mappings[from_format]]
 942
 943        # Rename chromosomes based on inplace flag
 944        if inplace:
 945            self['variants_chrom'] = np.select(conditions, format_mappings[to_format], default='unknown')
 946            return None
 947        else:
 948            snpobject = self.copy()
 949            snpobject['variants_chrom'] = np.select(conditions, format_mappings[to_format], default='unknown')
 950            return snpobject
 951
 952    def match_chromosome_format(self, snpobj: 'SNPObject', inplace: bool = False) -> Optional['SNPObject']:
 953        """
 954        Convert the chromosome format in `variants_chrom` from `self` to match the format of a reference `snpobj`.
 955
 956        **Recognized formats:**
 957
 958        - `'chr'`: Format with 'chr' prefix, e.g., 'chr1', 'chr2', ..., 'chrX', 'chrY', 'chrM'.
 959        - `'chm'`: Format with 'chm' prefix, e.g., 'chm1', 'chm2', ..., 'chmX', 'chmY', 'chmM'.
 960        - `'chrom'`: Format with 'chrom' prefix, e.g., 'chrom1', 'chrom2', ..., 'chromX', 'chromY', 'chromM'.
 961        - `'plain'`: Plain format without a prefix, e.g., '1', '2', ..., 'X', 'Y', 'M'.
 962
 963        Args:
 964            snpobj (SNPObject): 
 965                The reference SNPObject whose chromosome format will be matched.
 966            inplace (bool, default=False): 
 967                If True, modifies `self` in place. If False, returns a new `SNPObject` with the 
 968                chromosome format matching that of `snpobj`. Default is False.
 969
 970        Returns:
 971            **Optional[SNPObject]:** 
 972                A new `SNPObject` with matched chromosome format if `inplace=False`. 
 973                If `inplace=True`, modifies `self` in place and returns None.
 974        """
 975        # Detect the chromosome naming format of the current SNPObject
 976        fmt1 = self.detect_chromosome_format()
 977        if fmt1 == 'Unknown format':
 978            raise ValueError("The chromosome format of the current SNPObject is unrecognized.")
 979        
 980        # Detect the chromosome naming format of the reference SNPObject
 981        fmt2 = snpobj.detect_chromosome_format()
 982        if fmt2 == 'Unknown format':
 983            raise ValueError("The chromosome format of the reference SNPObject is unrecognized.")
 984
 985        # Convert the current SNPObject's chromosome format to match the reference format
 986        return self.convert_chromosome_format(fmt1, fmt2, inplace=inplace)
 987
 988    def rename_chrom(
 989        self,
 990        to_replace: Union[Dict[str, str], str, List[str]] = {'^([0-9]+)$': r'chr\1', r'^chr([0-9]+)$': r'\1'},
 991        value: Optional[Union[str, List[str]]] = None,
 992        regex: bool = True,
 993        inplace: bool = False
 994    ) -> Optional['SNPObject']:
 995        """
 996        Replace chromosome values in `variants_chrom` using patterns or exact matches.
 997
 998        This method allows flexible chromosome replacements, using regex or exact matches, useful 
 999        for non-standard chromosome formats. For standard conversions (e.g., 'chr1' to '1'), 
1000        consider `convert_chromosome_format`.
1001
1002        Args:
1003            to_replace (dict, str, or list of str): 
1004                Pattern(s) or exact value(s) to be replaced in chromosome names. Default behavior 
1005                transforms `<chrom_num>` to `chr<chrom_num>` or vice versa. Non-matching values 
1006                remain unchanged.
1007                - If str or list of str: Matches will be replaced with `value`.
1008                - If regex (bool), then any regex matches will be replaced with `value`.
1009                - If dict: Keys defines values to replace, with corresponding replacements as values.
1010            value (str or list of str, optional): 
1011                Replacement value(s) if `to_replace` is a string or list. Ignored if `to_replace` 
1012                is a dictionary.
1013            regex (bool, default=True): 
1014                If True, interprets `to_replace` keys as regex patterns.
1015            inplace (bool, default=False): 
1016                If True, modifies `self` in place. If False, returns a new `SNPObject` with the chromosomes
1017                renamed. Default is False.
1018
1019        Returns:
1020            **Optional[SNPObject]:** A new `SNPObject` with the renamed chromosome format if `inplace=False`. 
1021            If `inplace=True`, modifies `self` in place and returns None.
1022        """
1023        # Standardize input format: convert `to_replace` and `value` to a dictionary if needed
1024        if isinstance(to_replace, (str, int)):
1025            to_replace = [to_replace]
1026        if isinstance(value, (str, int)):
1027            value = [value]
1028        if isinstance(to_replace, list) and isinstance(value, list):
1029            dictionary = dict(zip(to_replace, value))
1030        elif isinstance(to_replace, dict) and value is None:
1031            dictionary = to_replace
1032        else:
1033            raise ValueError(
1034            "Invalid input: `to_replace` and `value` must be compatible types (both str, list of str, or dict)."
1035        )
1036
1037        # Vectorized function for replacing values in chromosome array
1038        vec_replace_values = np.vectorize(self._match_to_replace)
1039
1040        # Rename chromosomes based on inplace flag
1041        if inplace:
1042            self.variants_chrom = vec_replace_values(self.variants_chrom, dictionary, regex)
1043            return None
1044        else:
1045            snpobj = self.copy()
1046            snpobj.variants_chrom = vec_replace_values(self.variants_chrom, dictionary, regex)
1047            return snpobj
1048
1049    def rename_missings(
1050        self, 
1051        before: Union[int, float, str] = -1, 
1052        after: Union[int, float, str] = '.', 
1053        inplace: bool = False
1054    ) -> Optional['SNPObject']:
1055        """
1056        Replace missing values in the `calldata_gt` attribute.
1057
1058        This method identifies missing values in 'calldata_gt' and replaces them with a specified 
1059        value. By default, it replaces occurrences of `-1` (often used to signify missing data) with `'.'`.
1060
1061        Args:
1062            before (int, float, or str, default=-1): 
1063                The current representation of missing values in `calldata_gt`. Common values might be -1, '.', or NaN.
1064                Default is -1.
1065            after (int, float, or str, default='.'): 
1066                The value that will replace `before`. Default is '.'.
1067            inplace (bool, default=False): 
1068                If True, modifies `self` in place. If False, returns a new `SNPObject` with the applied 
1069                replacements. Default is False.
1070
1071        Returns:
1072            **Optional[SNPObject]:** 
1073                A new `SNPObject` with the renamed missing values if `inplace=False`. 
1074                If `inplace=True`, modifies `self` in place and returns None.
1075        """
1076        # Rename missing values in the `calldata_gt` attribute based on inplace flag
1077        if inplace:
1078            self['calldata_gt'] = np.where(self['calldata_gt'] == before, after, self['calldata_gt'])
1079            return None
1080        else:
1081            snpobj = self.copy()
1082            snpobj['calldata_gt'] = np.where(snpobj['calldata_gt'] == before, after, snpobj['calldata_gt'])
1083            return snpobj
1084
1085    def get_common_variants_intersection(
1086        self, 
1087        snpobj: 'SNPObject', 
1088        index_by: str = 'pos'
1089    ) -> Tuple[List[str], np.ndarray, np.ndarray]:
1090        """
1091        Identify common variants between `self` and the `snpobj` instance based on the specified `index_by` criterion, 
1092        which may match based on chromosome and position (`variants_chrom`, `variants_pos`), ID (`variants_id`), or both.
1093
1094        This method returns the identifiers of common variants and their corresponding indices in both objects.
1095
1096        Args:
1097            snpobj (SNPObject): 
1098                The reference SNPObject to compare against.
1099            index_by (str, default='pos'): 
1100                Criteria for matching variants. Options:
1101                - `'pos'`: Matches by chromosome and position (`variants_chrom`, `variants_pos`), e.g., 'chr1-12345'.
1102                - `'id'`: Matches by variant ID alone (`variants_id`), e.g., 'rs123'.
1103                - `'pos+id'`: Matches by chromosome, position, and ID (`variants_chrom`, `variants_pos`, `variants_id`), e.g., 'chr1-12345-rs123'.
1104                Default is 'pos'.
1105
1106        Returns:
1107            Tuple containing:
1108            - **list of str:** A list of common variant identifiers (as strings).
1109            - **array:** An array of indices in `self` where common variants are located.
1110            - **array:** An array of indices in `snpobj` where common variants are located.
1111        """
1112        # Create unique identifiers for each variant in both SNPObjects based on the specified criterion
1113        if index_by == 'pos':
1114            query_identifiers = [f"{chrom}-{pos}" for chrom, pos in zip(self['variants_chrom'], self['variants_pos'])]
1115            reference_identifiers = [f"{chrom}-{pos}" for chrom, pos in zip(snpobj['variants_chrom'], snpobj['variants_pos'])]
1116        elif index_by == 'id':
1117            query_identifiers = self['variants_id'].tolist()
1118            reference_identifiers = snpobj['variants_id'].tolist()
1119        elif index_by == 'pos+id':
1120            query_identifiers = [
1121                f"{chrom}-{pos}-{ids}" for chrom, pos, ids in zip(self['variants_chrom'], self['variants_pos'], self['variants_id'])
1122            ]
1123            reference_identifiers = [
1124                f"{chrom}-{pos}-{ids}" for chrom, pos, ids in zip(snpobj['variants_chrom'], snpobj['variants_pos'], snpobj['variants_id'])
1125            ]
1126        else:
1127            raise ValueError("`index_by` must be one of 'pos', 'id', or 'pos+id'.")
1128
1129        # Convert to sets for intersection
1130        common_ids = set(query_identifiers).intersection(reference_identifiers)
1131
1132        # Collect indices for common identifiers
1133        query_idx = [i for i, id in enumerate(query_identifiers) if id in common_ids]
1134        reference_idx = [i for i, id in enumerate(reference_identifiers) if id in common_ids]
1135
1136        return list(common_ids), np.array(query_idx), np.array(reference_idx)
1137
1138    def get_common_markers_intersection(
1139        self, 
1140        snpobj: 'SNPObject'
1141    ) -> Tuple[List[str], np.ndarray, np.ndarray]:
1142        """
1143        Identify common markers between between `self` and the `snpobj` instance. Common markers are identified 
1144        based on matching chromosome (`variants_chrom`), position (`variants_pos`), reference (`variants_ref`), 
1145        and alternate (`variants_alt`) alleles.
1146
1147        This method returns the identifiers of common markers and their corresponding indices in both objects.
1148        
1149        Args:
1150            snpobj (SNPObject): 
1151                The reference SNPObject to compare against.
1152        
1153        Returns:
1154            Tuple containing:
1155            - **list of str:** A list of common variant identifiers (as strings).
1156            - **array:** An array of indices in `self` where common variants are located.
1157            - **array:** An array of indices in `snpobj` where common variants are located.
1158        """
1159        # Generate unique identifiers based on chrom, pos, ref, and alt alleles
1160        query_identifiers = [
1161            f"{chrom}-{pos}-{ref}-{alt}" for chrom, pos, ref, alt in 
1162            zip(self['variants_chrom'], self['variants_pos'], self['variants_ref'], self['variants_alt'])
1163        ]
1164        reference_identifiers = [
1165            f"{chrom}-{pos}-{ref}-{alt}" for chrom, pos, ref, alt in 
1166            zip(snpobj['variants_chrom'], snpobj['variants_pos'], snpobj['variants_ref'], snpobj['variants_alt'])
1167        ]
1168
1169        # Convert to sets for intersection
1170        common_ids = set(query_identifiers).intersection(reference_identifiers)
1171
1172        # Collect indices for common identifiers in both SNPObjects
1173        query_idx = [i for i, id in enumerate(query_identifiers) if id in common_ids]
1174        reference_idx = [i for i, id in enumerate(reference_identifiers) if id in common_ids]
1175
1176        return list(common_ids), np.array(query_idx), np.array(reference_idx)
1177
1178    def subset_to_common_variants(
1179        self, 
1180        snpobj: 'SNPObject', 
1181        index_by: str = 'pos', 
1182        common_variants_intersection: Optional[Tuple[np.ndarray, np.ndarray]] = None,
1183        inplace: bool = False
1184    ) -> Optional['SNPObject']:
1185        """
1186        Subset `self` to include only the common variants with a reference `snpobj` based on 
1187        the specified `index_by` criterion, which may match based on chromosome and position 
1188        (`variants_chrom`, `variants_pos`), ID (`variants_id`), or both.
1189        
1190        Args:
1191            snpobj (SNPObject): 
1192                The reference SNPObject to compare against.
1193            index_by (str, default='pos'): 
1194                Criteria for matching variants. Options:
1195                - `'pos'`: Matches by chromosome and position (`variants_chrom`, `variants_pos`), e.g., 'chr1-12345'.
1196                - `'id'`: Matches by variant ID alone (`variants_id`), e.g., 'rs123'.
1197                - `'pos+id'`: Matches by chromosome, position, and ID (`variants_chrom`, `variants_pos`, `variants_id`), e.g., 'chr1-12345-rs123'.
1198                Default is 'pos'.
1199            common_variants_intersection (Tuple[np.ndarray, np.ndarray], optional): 
1200                Precomputed indices of common variants between `self` and `snpobj`. If None, intersection is 
1201                computed within the function.
1202            inplace (bool, default=False): 
1203                If True, modifies `self` in place. If False, returns a new `SNPObject` with the common variants
1204                subsetted. Default is False.
1205
1206        Returns:
1207            **Optional[SNPObject]:** 
1208                A new `SNPObject` with the common variants subsetted if `inplace=False`. 
1209                If `inplace=True`, modifies `self` in place and returns None.
1210        """
1211        # Get indices of common variants if not provided
1212        if common_variants_intersection is None:
1213            _, query_idx, _ = self.get_common_variants_intersection(snpobj, index_by=index_by)
1214        else:
1215            query_idx, _ = common_variants_intersection
1216
1217        # Use filter_variants method with the identified indices, applying `inplace` as specified
1218        return self.filter_variants(indexes=query_idx, include=True, inplace=inplace)
1219
1220    def subset_to_common_markers(
1221        self, 
1222        snpobj: 'SNPObject', 
1223        common_markers_intersection: Optional[Tuple[np.ndarray, np.ndarray]] = None,
1224        inplace: bool = False
1225    ) -> Optional['SNPObject']:
1226        """
1227        Subset `self` to include only the common markers with a reference `snpobj`. Common markers are identified 
1228        based on matching chromosome (`variants_chrom`), position (`variants_pos`), reference (`variants_ref`), 
1229        and alternate (`variants_alt`) alleles.
1230
1231        Args:
1232            snpobj (SNPObject): 
1233                The reference SNPObject to compare against.
1234            common_markers_intersection (tuple of arrays, optional): 
1235                Precomputed indices of common markers between `self` and `snpobj`. If None, intersection is 
1236                computed within the function.
1237            inplace (bool, default=False): 
1238                If True, modifies `self` in place. If False, returns a new `SNPObject` with the common markers
1239                subsetted. Default is False.
1240
1241        Returns:
1242            **Optional[SNPObject]:** 
1243                A new `SNPObject` with the common markers subsetted if `inplace=False`. 
1244                If `inplace=True`, modifies `self` in place and returns None.
1245        """
1246        # Get indices of common markers if not provided
1247        if common_markers_intersection is None:
1248            _, query_idx, _ = self.get_common_markers_intersection(snpobj)
1249        else:
1250            query_idx, _ = common_markers_intersection
1251
1252        # Use filter_variants method with the identified indices, applying `inplace` as specified
1253        return self.filter_variants(indexes=query_idx, include=True, inplace=inplace)
1254
1255    def merge(
1256            self, 
1257            snpobj: 'SNPObject', 
1258            force_samples: bool = False, 
1259            prefix: str = '2', 
1260            inplace: bool = False
1261        ) -> Optional['SNPObject']:
1262        """
1263        Merge `self` with `snpobj` along the sample axis.
1264
1265        This method expects both SNPObjects to contain the same set of SNPs in the same order, 
1266        then combines their genotype (`calldata_gt`) and LAI (`calldata_lai`) arrays by 
1267        concatenating the sample dimension. Samples from `snpobj` are appended to those in `self`.
1268
1269        Args:
1270            snpobj (SNPObject): 
1271                The SNPObject to merge samples with.
1272            force_samples (bool, default=False): 
1273                If True, duplicate sample names are resolved by prepending the `prefix` to duplicate sample names in 
1274                `snpobj`. Otherwise, merging fails when duplicate sample names are found. Default is False.
1275            prefix (str, default='2'): 
1276                A string prepended to duplicate sample names in `snpobj` when `force_samples=True`. 
1277                Duplicates are renamed from `<sample_name>` to `<prefix>:<sample_name>`. For instance, 
1278                if `prefix='2'` and there is a conflict with a sample called "sample_1", it becomes "2:sample_1".
1279            inplace (bool, default=False): 
1280                If True, modifies `self` in place. If False, returns a new `SNPObject` with the merged samples. 
1281                Default is False.
1282
1283        Returns:
1284            **Optional[SNPObject]**: A new SNPObject containing the merged sample data.
1285        """
1286        # Merge calldata_gt if present and compatible
1287        if self.calldata_gt is not None and snpobj.calldata_gt is not None:
1288            if self.calldata_gt.shape[0] != snpobj.calldata_gt.shape[0]:
1289                raise ValueError(
1290                    f"Cannot merge SNPObjects: Mismatch in the number of SNPs in `calldata_gt`.\n"
1291                    f"`self.calldata_gt` has {self.calldata_gt.shape[0]} SNPs, "
1292                    f"while `snpobj.calldata_gt` has {snpobj.calldata_gt.shape[0]} SNPs."
1293                )
1294            if self.are_strands_summed and not snpobj.are_strands_summed:
1295                raise ValueError(
1296                    "Cannot merge SNPObjects: `self` has summed strands, but `snpobj` does not.\n"
1297                    "Ensure both objects have the same genotype summation state before merging."
1298                )
1299            if not self.are_strands_summed and snpobj.are_strands_summed:
1300                raise ValueError(
1301                    "Cannot merge SNPObjects: `snpobj` has summed strands, but `self` does not.\n"
1302                    "Ensure both objects have the same genotype summation state before merging."
1303                )
1304            calldata_gt = np.concatenate([self.calldata_gt, snpobj.calldata_gt], axis=1)
1305        else:
1306            calldata_gt = None
1307
1308        # Merge samples if present and compatible, handling duplicates if `force_samples=True`
1309        if self.samples is not None and snpobj.samples is not None:
1310            overlapping_samples = set(self.samples).intersection(set(snpobj.samples))
1311            if overlapping_samples:
1312                if not force_samples:
1313                    raise ValueError(
1314                        f"Cannot merge SNPObjects: Found overlapping sample names {overlapping_samples}.\n"
1315                        "Samples must be strictly non-overlapping. To allow merging with renaming, set `force_samples=True`."
1316                    )
1317                else:
1318                    # Rename duplicate samples by prepending the file index
1319                    renamed_samples = [f"{prefix}:{sample}" if sample in overlapping_samples else sample for sample in snpobj.samples]
1320                    samples = np.concatenate([self.samples, renamed_samples], axis=0)
1321            else:
1322                samples = np.concatenate([self.samples, snpobj.samples], axis=0)
1323        else:
1324            samples = None
1325
1326        # Merge LAI data if present and compatible
1327        if self.calldata_lai is not None and snpobj.calldata_lai is not None:
1328            if self.calldata_lai.ndim != snpobj.calldata_lai.ndim:
1329                raise ValueError(
1330                    f"Cannot merge SNPObjects: Mismatch in `calldata_lai` dimensions.\n"
1331                    f"`self.calldata_lai` has {self.calldata_lai.ndim} dimensions, "
1332                    f"while `snpobj.calldata_lai` has {snpobj.calldata_lai.ndim} dimensions."
1333                )
1334            if self.calldata_lai.shape[0] != snpobj.calldata_lai.shape[0]:
1335                raise ValueError(
1336                    f"Cannot merge SNPObjects: Mismatch in the number of SNPs in `calldata_lai`.\n"
1337                    f"`self.calldata_lai` has {self.calldata_lai.shape[0]} SNPs, "
1338                    f"while `snpobj.calldata_lai` has {snpobj.calldata_lai.shape[0]} SNPs."
1339                )
1340            calldata_lai = np.concatenate([self.calldata_lai, snpobj.calldata_lai], axis=1)
1341        else:
1342            calldata_lai = None
1343
1344        if inplace:
1345            self.calldata_gt = calldata_gt
1346            self.calldata_lai = calldata_lai
1347            self.samples = samples
1348            return self
1349
1350        # Create and return a new SNPObject containing the merged samples
1351        return SNPObject(
1352            calldata_gt=calldata_gt,
1353            samples=samples,
1354            variants_ref=self.variants_ref,
1355            variants_alt=self.variants_alt,
1356            variants_chrom=self.variants_chrom,
1357            variants_filter_pass=self.variants_filter_pass,
1358            variants_id=self.variants_id,
1359            variants_pos=self.variants_pos,
1360            variants_qual=self.variants_qual,
1361            calldata_lai=calldata_lai,
1362            ancestry_map=self.ancestry_map
1363        )
1364    
1365    def concat(
1366        self,
1367        snpobj: 'SNPObject', 
1368        inplace: bool = False
1369    ) -> Optional['SNPObject']:
1370        """
1371        Concatenate self with snpobj along the SNP axis.
1372
1373        This method expects both SNPObjects to contain the same set of samples in the same order, 
1374        and that the chromosome(s) in snpobj follow (i.e. have higher numeric identifiers than) 
1375        those in self.
1376
1377        Args:
1378            snpobj (SNPObject):
1379                The SNPObject to concatenate SNPs with.
1380            inplace (bool, default=False):
1381                If True, modifies `self` in place. If False, returns a new `SNPObject` with the concatenated SNPs. 
1382                Default is False.
1383        
1384        Returns:
1385            **Optional[SNPObject]**: A new SNPObject containing the concatenated SNP data.
1386        """
1387        # Merge calldata_gt if present and compatible
1388        if self.calldata_gt is not None and snpobj.calldata_gt is not None:
1389            if self.calldata_gt.shape[1] != snpobj.calldata_gt.shape[1]:
1390                raise ValueError(
1391                    f"Cannot merge SNPObjects: Mismatch in the number of samples in `calldata_gt`.\n"
1392                    f"`self.calldata_gt` has {self.calldata_gt.shape[1]} samples, "
1393                    f"while `snpobj.calldata_gt` has {snpobj.calldata_gt.shape[1]} samples."
1394                )
1395            if self.are_strands_summed and not snpobj.are_strands_summed:
1396                raise ValueError(
1397                    "Cannot merge SNPObjects: `self` has summed strands, but `snpobj` does not.\n"
1398                    "Ensure both objects have the same genotype summation state before merging."
1399                )
1400            if not self.are_strands_summed and snpobj.are_strands_summed:
1401                raise ValueError(
1402                    "Cannot merge SNPObjects: `snpobj` has summed strands, but `self` does not.\n"
1403                    "Ensure both objects have the same genotype summation state before merging."
1404                )
1405            calldata_gt = np.concatenate([self.calldata_gt, snpobj.calldata_gt], axis=0)
1406        else:
1407            calldata_gt = None
1408
1409        # Merge SNP-related attributes if present
1410        attributes = [
1411            'variants_ref', 'variants_alt', 'variants_chrom', 'variants_filter_pass', 'variants_id', 'variants_pos', 'variants_qual'
1412        ]
1413        merged_attrs = {}
1414        for attr in attributes:
1415            self_attr = getattr(self, attr, None)
1416            obj_attr = getattr(snpobj, attr, None)
1417
1418            # Concatenate if both present
1419            if self_attr is not None and obj_attr is not None:
1420                merged_attrs[attr] = np.concatenate([self_attr, obj_attr], axis=0)
1421            else:
1422                # If either is None, store None
1423                merged_attrs[attr] = None
1424
1425        # Merge LAI data if present and compatible
1426        if self.calldata_lai is not None and snpobj.calldata_lai is not None:
1427            if self.calldata_lai.ndim != snpobj.calldata_lai.ndim:
1428                raise ValueError(
1429                    f"Cannot merge SNPObjects: Mismatch in `calldata_lai` dimensions.\n"
1430                    f"`self.calldata_lai` has {self.calldata_lai.ndim} dimensions, "
1431                    f"while `snpobj.calldata_lai` has {snpobj.calldata_lai.ndim} dimensions."
1432                )
1433            if self.calldata_lai.shape[1] != snpobj.calldata_lai.shape[1]:
1434                raise ValueError(
1435                    f"Cannot merge SNPObjects: Mismatch in the number of samples in `calldata_lai`.\n"
1436                    f"`self.calldata_lai` has {self.calldata_lai.shape[1]} samples, "
1437                    f"while `snpobj.calldata_lai` has {snpobj.calldata_lai.shape[1]} samples."
1438                )
1439            calldata_lai = np.concatenate([self.calldata_lai, snpobj.calldata_lai], axis=0)
1440        else:
1441            calldata_lai = None
1442        
1443        if inplace:
1444            self.calldata_gt = calldata_gt
1445            self.calldata_lai = calldata_lai
1446            for attr in attributes:
1447                self[attr] = merged_attrs[attr]
1448            return self
1449        
1450        # Create and return a new SNPObject containing the concatenated SNPs
1451        return SNPObject(
1452            calldata_gt=calldata_gt,
1453            calldata_lai=calldata_lai,
1454            samples=self.samples,
1455            variants_ref=merged_attrs['variants_ref'],
1456            variants_alt=merged_attrs['variants_alt'],
1457            variants_chrom=merged_attrs['variants_chrom'],
1458            variants_id=merged_attrs['variants_id'],
1459            variants_pos=merged_attrs['variants_pos'],
1460            variants_qual=merged_attrs['variants_qual'],
1461            variants_filter_pass=merged_attrs['variants_filter_pass'],
1462            ancestry_map=self.ancestry_map
1463        )
1464
1465    def remove_strand_ambiguous_variants(self, inplace: bool = False) -> Optional['SNPObject']:
1466        """
1467        A strand-ambiguous variant has reference (`variants_ref`) and alternate (`variants_alt`) alleles 
1468        in the pairs A/T, T/A, C/G, or G/C, where both alleles are complementary and thus indistinguishable 
1469        in terms of strand orientation.
1470
1471        Args:
1472            inplace (bool, default=False): 
1473                If True, modifies `self` in place. If False, returns a new `SNPObject` with the 
1474                strand-ambiguous variants removed. Default is False.
1475
1476        Returns:
1477            **Optional[SNPObject]:** A new `SNPObject` with non-ambiguous variants only if `inplace=False`. 
1478            If `inplace=True`, modifies `self` in place and returns None.
1479        """
1480        # Identify strand-ambiguous SNPs using vectorized comparisons
1481        is_AT = (self['variants_ref'] == 'A') & (self['variants_alt'] == 'T')
1482        is_TA = (self['variants_ref'] == 'T') & (self['variants_alt'] == 'A')
1483        is_CG = (self['variants_ref'] == 'C') & (self['variants_alt'] == 'G')
1484        is_GC = (self['variants_ref'] == 'G') & (self['variants_alt'] == 'C')
1485
1486        # Create a combined mask for all ambiguous variants
1487        ambiguous_mask = is_AT | is_TA | is_CG | is_GC
1488        non_ambiguous_idx = np.where(~ambiguous_mask)[0]
1489
1490        # Count each type of ambiguity using numpy's sum on boolean arrays
1491        A_T_count = np.sum(is_AT)
1492        T_A_count = np.sum(is_TA)
1493        C_G_count = np.sum(is_CG)
1494        G_C_count = np.sum(is_GC)
1495
1496        # Log the counts of each type of strand-ambiguous variants
1497        total_ambiguous = A_T_count + T_A_count + C_G_count + G_C_count
1498        log.info(f'{A_T_count} ambiguities of A-T type.')
1499        log.info(f'{T_A_count} ambiguities of T-A type.')
1500        log.info(f'{C_G_count} ambiguities of C-G type.')
1501        log.info(f'{G_C_count} ambiguities of G-C type.')
1502
1503        # Filter out ambiguous variants and keep non-ambiguous ones
1504        log.debug(f'Removing {total_ambiguous} strand-ambiguous variants...')
1505        return self.filter_variants(indexes=non_ambiguous_idx, include=True, inplace=inplace)
1506
1507    def correct_flipped_variants(
1508        self, 
1509        snpobj: 'SNPObject', 
1510        check_complement: bool = True, 
1511        index_by: str = 'pos', 
1512        common_variants_intersection: Optional[Tuple[np.ndarray, np.ndarray]] = None,
1513        log_stats: bool = True,
1514        inplace: bool = False
1515    ) -> Optional['SNPObject']:
1516        """
1517        Correct flipped variants between between `self` and a reference `snpobj`, where reference (`variants_ref`) 
1518        and alternate (`variants_alt`) alleles are swapped.
1519
1520        **Flip Detection Based on `check_complement`:**
1521
1522        - If `check_complement=False`, only direct allele swaps are considered:
1523            1. **Direct Swap:** `self.variants_ref == snpobj.variants_alt` and `self.variants_alt == snpobj.variants_ref`.
1524
1525        - If `check_complement=True`, both direct and complementary swaps are considered, with four possible cases:
1526            1. **Direct Swap:** `self.variants_ref == snpobj.variants_alt` and `self.variants_alt == snpobj.variants_ref`.
1527            2. **Complement Swap of Ref:** `complement(self.variants_ref) == snpobj.variants_alt` and `self.variants_alt == snpobj.variants_ref`.
1528            3. **Complement Swap of Alt:** `self.variants_ref == snpobj.variants_alt` and `complement(self.variants_alt) == snpobj.variants_ref`.
1529            4. **Complement Swap of both Ref and Alt:** `complement(self.variants_ref) == snpobj.variants_alt` and `complement(self.variants_alt) == snpobj.variants_ref`.
1530
1531        **Note:** Variants where `self.variants_ref == self.variants_alt` are ignored as they are ambiguous.
1532
1533        **Correction Process:** 
1534        - Swaps `variants_ref` and `variants_alt` alleles in `self` to align with `snpobj`.
1535        - Flips `calldata_gt` values (0 becomes 1, and 1 becomes 0) to match the updated allele configuration.
1536
1537        Args:
1538            snpobj (SNPObject): 
1539                The reference SNPObject to compare against.
1540            check_complement (bool, default=True): 
1541                If True, also checks for complementary base pairs (A/T, T/A, C/G, and G/C) when identifying swapped variants.
1542                Default is True.
1543            index_by (str, default='pos'): 
1544                Criteria for matching variants. Options:
1545                - `'pos'`: Matches by chromosome and position (`variants_chrom`, `variants_pos`), e.g., 'chr1-12345'.
1546                - `'id'`: Matches by variant ID alone (`variants_id`), e.g., 'rs123'.
1547                - `'pos+id'`: Matches by chromosome, position, and ID (`variants_chrom`, `variants_pos`, `variants_id`), e.g., 'chr1-12345-rs123'.
1548                Default is 'pos'.
1549            common_variants_intersection (tuple of arrays, optional): 
1550                Precomputed indices of common variants between `self` and `snpobj`. If None, intersection is 
1551                computed within the function.
1552            log_stats (bool, default=True): 
1553                If True, logs statistical information about matching and ambiguous alleles. Default is True.
1554            inplace (bool, default=False): 
1555                If True, modifies `self` in place. If False, returns a new `SNPObject` with corrected 
1556                flips. Default is False.
1557
1558        Returns:
1559            **Optional[SNPObject]**: 
1560                A new `SNPObject` with corrected flips if `inplace=False`. 
1561                If `inplace=True`, modifies `self` in place and returns None.
1562        """
1563        # Define complement mappings for nucleotides
1564        complement_map = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
1565
1566        # Helper function to get the complement of a base
1567        def get_complement(base: str) -> str:
1568            return complement_map.get(base, base)
1569
1570        # Get common variant indices if not provided
1571        if common_variants_intersection != None:
1572            query_idx, reference_idx = common_variants_intersection
1573        else:
1574            _, query_idx, reference_idx = self.get_common_variants_intersection(snpobj, index_by=index_by)
1575
1576        # Log statistics on matching alleles if enabled
1577        if log_stats:
1578            matching_ref = np.sum(self['variants_ref'][query_idx] == snpobj['variants_ref'][reference_idx])
1579            matching_alt = np.sum(self['variants_alt'][query_idx] == snpobj['variants_alt'][reference_idx])
1580            ambiguous = np.sum(self['variants_ref'][query_idx] == self['variants_alt'][query_idx])
1581            log.info(f"Matching reference alleles (ref=ref'): {matching_ref}, Matching alternate alleles (alt=alt'): {matching_alt}.")
1582            log.info(f"Number of ambiguous alleles (ref=alt): {ambiguous}.")
1583
1584        # Identify indices where `ref` and `alt` alleles are swapped
1585        if not check_complement:
1586            # Simple exact match for swapped alleles
1587            swapped_ref = (self['variants_ref'][query_idx] == snpobj['variants_alt'][reference_idx])
1588            swapped_alt = (self['variants_alt'][query_idx] == snpobj['variants_ref'][reference_idx])
1589        else:
1590            # Check for swapped or complementary-swapped alleles
1591            swapped_ref = (
1592                (self['variants_ref'][query_idx] == snpobj['variants_alt'][reference_idx]) |
1593                (np.vectorize(get_complement)(self['variants_ref'][query_idx]) == snpobj['variants_alt'][reference_idx])
1594            )
1595            swapped_alt = (
1596                (self['variants_alt'][query_idx] == snpobj['variants_ref'][reference_idx]) |
1597                (np.vectorize(get_complement)(self['variants_alt'][query_idx]) == snpobj['variants_ref'][reference_idx])
1598            )
1599
1600        # Filter out ambiguous variants where `ref` and `alt` alleles match (ref=alt)
1601        not_ambiguous = (self['variants_ref'][query_idx] != self['variants_alt'][query_idx])
1602
1603        # Indices in `self` of flipped variants
1604        flip_idx_query = query_idx[swapped_ref & swapped_alt & not_ambiguous]
1605
1606        # Correct the identified variant flips
1607        if len(flip_idx_query) > 0:
1608            log.info(f'Correcting {len(flip_idx_query)} variant flips...')
1609
1610            temp_alts = self['variants_alt'][flip_idx_query]
1611            temp_refs = self['variants_ref'][flip_idx_query]
1612
1613            # Correct the variant flips based on whether the operation is in-place or not
1614            if inplace:
1615                self['variants_alt'][flip_idx_query] = temp_refs
1616                self['variants_ref'][flip_idx_query] = temp_alts
1617                self['calldata_gt'][flip_idx_query] = 1 - self['calldata_gt'][flip_idx_query]
1618                return None
1619            else:
1620                snpobj = self.copy()
1621                snpobj['variants_alt'][flip_idx_query] = temp_refs
1622                snpobj['variants_ref'][flip_idx_query] = temp_alts
1623                snpobj['calldata_gt'][flip_idx_query] = 1 - snpobj['calldata_gt'][flip_idx_query]
1624                return snpobj
1625        else:
1626            log.info('No variant flips found to correct.')
1627            return self if not inplace else None
1628
1629    def remove_mismatching_variants(
1630        self, 
1631        snpobj: 'SNPObject', 
1632        index_by: str = 'pos', 
1633        common_variants_intersection: Optional[Tuple[np.ndarray, np.ndarray]] = None,
1634        inplace: bool = False
1635    ) -> Optional['SNPObject']:
1636        """
1637        Remove variants from `self`, where reference (`variants_ref`) and/or alternate (`variants_alt`) alleles 
1638        do not match with a reference `snpobj`.
1639
1640        Args:
1641            snpobj (SNPObject): 
1642                The reference SNPObject to compare against.
1643            index_by (str, default='pos'): 
1644                Criteria for matching variants. Options:
1645                - `'pos'`: Matches by chromosome and position (`variants_chrom`, `variants_pos`), e.g., 'chr1-12345'.
1646                - `'id'`: Matches by variant ID alone (`variants_id`), e.g., 'rs123'.
1647                - `'pos+id'`: Matches by chromosome, position, and ID (`variants_chrom`, `variants_pos`, `variants_id`), e.g., 'chr1-12345-rs123'.
1648                Default is 'pos'.
1649            common_variants_intersection (tuple of arrays, optional): 
1650                Precomputed indices of common variants between `self` and the reference `snpobj`.
1651                If None, the intersection is computed within the function.
1652            inplace (bool, default=False): 
1653                If True, modifies `self` in place. If False, returns a new `SNPObject` without 
1654                mismatching variants. Default is False.
1655
1656        Returns:
1657            **Optional[SNPObject]:** 
1658                A new `SNPObject` without mismatching variants if `inplace=False`. 
1659                If `inplace=True`, modifies `self` in place and returns None.
1660        """
1661        # Get common variant indices if not provided
1662        if common_variants_intersection is not None:
1663            query_idx, reference_idx = common_variants_intersection
1664        else:
1665            _, query_idx, reference_idx = self.get_common_variants_intersection(snpobj, index_by=index_by)
1666
1667        # Vectorized comparison of `ref` and `alt` alleles
1668        ref_mismatch = self['variants_ref'][query_idx] != snpobj['variants_ref'][reference_idx]
1669        alt_mismatch = self['variants_alt'][query_idx] != snpobj['variants_alt'][reference_idx]
1670        mismatch_mask = ref_mismatch | alt_mismatch
1671
1672        # Identify indices in `self` of mismatching variants
1673        mismatch_idx = query_idx[mismatch_mask]
1674
1675        # Compute total number of variant mismatches
1676        total_mismatches = np.sum(mismatch_mask)
1677
1678        # Filter out mismatching variants
1679        log.debug(f'Removing {total_mismatches} mismatching variants...')
1680        return self.filter_variants(indexes=mismatch_idx, include=True, inplace=inplace)
1681
1682    def shuffle_variants(self, inplace: bool = False) -> Optional['SNPObject']:
1683        """
1684        Randomly shuffle the positions of variants in the SNPObject, ensuring that all associated 
1685        data (e.g., `calldata_gt` and variant-specific attributes) remain aligned.
1686
1687        Args:
1688            inplace (bool, default=False): 
1689                If True, modifies `self` in place. If False, returns a new `SNPObject` with 
1690                shuffled variants. Default is False.
1691
1692        Returns:
1693            **Optional[SNPObject]:** 
1694                A new `SNPObject` without shuffled variant positions if `inplace=False`. 
1695                If `inplace=True`, modifies `self` in place and returns None.
1696        """
1697        # Generate a random permutation index for shuffling variant positions
1698        shuffle_index = np.random.permutation(self.n_snps)
1699
1700        # Apply shuffling to all relevant attributes using the class's dictionary-like interface
1701        if inplace:
1702            for key in self.keys():
1703                if self[key] is not None:
1704                    if key == 'calldata_gt':
1705                        # `calldata_gt`` has a different shape, so it's shuffled along axis 0
1706                        self[key] = self[key][shuffle_index, ...]
1707                    elif 'variant' in key:
1708                        # snpobj attributes are 1D arrays
1709                        self[key] = np.asarray(self[key])[shuffle_index]
1710            return None
1711        else:
1712            shuffled_snpobj = self.copy()
1713            for key in shuffled_snpobj.keys():
1714                if shuffled_snpobj[key] is not None:
1715                    if key == 'calldata_gt':
1716                        shuffled_snpobj[key] = shuffled_snpobj[key][shuffle_index, ...]
1717                    elif 'variant' in key:
1718                        shuffled_snpobj[key] = np.asarray(shuffled_snpobj[key])[shuffle_index]
1719            return shuffled_snpobj
1720
1721    def set_empty_to_missing(self, inplace: bool = False) -> Optional['SNPObject']:
1722        """
1723        Replace empty strings `''` with missing values `'.'` in attributes of `self`.
1724
1725        Args:
1726            inplace (bool, default=False): 
1727                If True, modifies `self` in place. If False, returns a new `SNPObject` with empty 
1728                strings `''` replaced by missing values `'.'`. Default is False.
1729
1730        Returns:
1731            **Optional[SNPObject]:** 
1732                A new `SNPObject` with empty strings replaced if `inplace=False`. 
1733                If `inplace=True`, modifies `self` in place and returns None.
1734        """
1735        if inplace:
1736            if self.variants_alt is not None:
1737                self.variants_alt[self.variants_alt == ''] = '.'
1738            if self.variants_ref is not None:
1739                self.variants_ref[self.variants_ref == ''] = '.'
1740            if self.variants_qual is not None:
1741                self.variants_qual = self.variants_qual.astype(str)
1742                self.variants_qual[(self.variants_qual == '') | (self.variants_qual == 'nan')] = '.'
1743            if self.variants_chrom is not None:
1744                self.variants_chrom = self.variants_chrom.astype(str)
1745                self.variants_chrom[self.variants_chrom == ''] = '.'
1746            if self.variants_filter_pass is not None:
1747                self.variants_filter_pass[self.variants_filter_pass == ''] = '.'
1748            if self.variants_id is not None:
1749                self.variants_id[self.variants_id == ''] = '.'
1750            return self
1751        else:
1752            snpobj = self.copy()
1753            if snpobj.variants_alt is not None:
1754                snpobj.variants_alt[snpobj.variants_alt == ''] = '.'
1755            if snpobj.variants_ref is not None:
1756                snpobj.variants_ref[snpobj.variants_ref == ''] = '.'
1757            if snpobj.variants_qual is not None:
1758                snpobj.variants_qual = snpobj.variants_qual.astype(str)
1759                snpobj.variants_qual[(snpobj.variants_qual == '') | (snpobj.variants_qual == 'nan')] = '.'
1760            if snpobj.variants_chrom is not None:
1761                snpobj.variants_chrom[snpobj.variants_chrom == ''] = '.'
1762            if snpobj.variants_filter_pass is not None:
1763                snpobj.variants_filter_pass[snpobj.variants_filter_pass == ''] = '.'
1764            if snpobj.variants_id is not None:
1765                snpobj.variants_id[snpobj.variants_id == ''] = '.'
1766            return snpobj
1767
1768    def convert_to_window_level(
1769        self,
1770        window_size: Optional[int] = None,
1771        physical_pos: Optional[np.ndarray] = None,
1772        chromosomes: Optional[np.ndarray] = None,
1773        window_sizes: Optional[np.ndarray] = None,
1774        laiobj: Optional['LocalAncestryObject'] = None
1775    ) -> 'LocalAncestryObject':
1776        """
1777        Aggregate the `calldata_lai` attribute into genomic windows within a 
1778        `snputils.ancestry.genobj.LocalAncestryObject`.
1779
1780        **Options for defining windows (in order of precedence):**
1781
1782        1. **Fixed window size**:
1783        - Use `window_size` to specify how many SNPs go into each window. The last window on each 
1784        chromosome may be larger if SNPs are not evenly divisible by the size.
1785
1786        2. **Custom start and end positions**:
1787        - Provide `physical_pos` (2D array of shape (n_windows, 2)) as the [start, end] base-pair 
1788         coordinates for each window. 
1789        - If `chromosomes` is not provided and `self` has exactly one chromosome, all windows are 
1790        assumed to belong to that chromosome. 
1791        - If multiple chromosomes exist but `chromosomes` is missing, an error will be raised.
1792        - Optionally, provide `window_sizes` to store the SNP count per-window.
1793
1794        3. **Matching existing windows**:
1795        - Reuse window definitions (`physical_pos`, `chromosomes`, `window_sizes`) from an existing `laiobj`.
1796
1797        Args:
1798            window_size (int, optional): 
1799                Number of SNPs in each window if defining fixed-size windows. If the total number of 
1800                SNPs in a chromosome is not evenly divisible by the window size, the last window on that 
1801                chromosome will include all remaining SNPs and therefore be larger than the specified size.
1802            physical_pos (array of shape (n_windows, 2), optional): 
1803                A 2D array containing the start and end physical positions for each window.
1804            chromosomes (array of shape (n_windows,), optional): 
1805                An array with chromosome numbers corresponding to each genomic window.
1806            window_sizes (array of shape (n_windows,), optional): 
1807                An array specifying the number of SNPs in each genomic window.
1808            laiobj (LocalAncestryObject, optional): 
1809                A reference `LocalAncestryObject` from which to copy existing window definitions.
1810
1811        Returns:
1812            **LocalAncestryObject:** 
1813                A LocalAncestryObject containing window-level ancestry data.
1814        """
1815        from snputils.ancestry.genobj.local import LocalAncestryObject
1816
1817        if window_size is None and physical_pos is None and laiobj is None:
1818            raise ValueError("One of `window_size`, `physical_pos`, or `laiobj` must be provided.")
1819        
1820        # Fixed window size
1821        if window_size is not None:
1822            physical_pos = []   # Boundaries [start, end] of each window
1823            chromosomes = []    # Chromosome for each window
1824            window_sizes = []   # Number of SNPs for each window
1825            for chrom in self.unique_chrom:
1826                # Extract indices corresponding to this chromosome
1827                mask_chrom = (self.variants_chrom == chrom)
1828                # Subset to this chromosome
1829                pos_chrom = self.variants_pos[mask_chrom]
1830                # Number of SNPs for this chromosome
1831                n_snps_chrom = pos_chrom.size
1832                
1833                # Initialize the start of the first window with the position of the first SNP
1834                current_start = self.variants_pos[0]
1835
1836                # Number of full windows with exactly `window_size` SNPs
1837                n_full_windows = n_snps_chrom // window_size
1838
1839                # Build all but the last window
1840                for i in range(n_full_windows-1):
1841                    current_end = self.variants_pos[(i+1) * window_size - 1]
1842                    physical_pos.append([current_start, current_end])
1843                    chromosomes.append(chrom)
1844                    window_sizes.append(window_size)
1845                    current_start = self.variants_pos[(i+1) * window_size]
1846                
1847                # Build the last window
1848                current_end = self.variants_pos[-1]
1849                physical_pos.append([current_start, current_end])
1850                chromosomes.append(chrom)
1851                window_sizes.append(n_snps_chrom - ((n_full_windows - 1) * window_size))
1852                
1853            physical_pos = np.array(physical_pos)
1854            chromosomes = np.array(chromosomes)
1855            window_sizes = np.array(window_sizes)
1856        
1857        # Custom start and end positions
1858        elif physical_pos is not None:
1859            # Check if there is exactly one chromosome
1860            if chromosomes is None:
1861                unique_chrom = self.unique_chrom
1862                if len(unique_chrom) == 1:
1863                    # We assume all windows belong to this single chromosome
1864                    single_chrom = unique_chrom[0]
1865                    chromosomes = np.array([single_chrom] * physical_pos.shape[0])
1866                else:
1867                    raise ValueError("Multiple chromosomes detected, but `chromosomes` was not provided.")
1868
1869        # Match existing windows to a reference laiobj
1870        elif laiobj is not None:
1871            physical_pos = laiobj.physical_pos
1872            chromosomes = laiobj.chromosomes
1873            window_sizes = laiobj.window_sizes
1874
1875        # Allocate an output LAI array
1876        n_windows = physical_pos.shape[0]
1877        n_samples = self.n_samples
1878        if self.calldata_lai.ndim == 3:
1879            lai = np.zeros((n_windows, n_samples, 2))
1880        else:
1881            lai = np.zeros((n_windows, n_samples*2))
1882
1883        # For each window, find the relevant SNPs and compute the mode of the ancestries
1884        for i, ((start, end), chrom) in enumerate(zip(physical_pos, chromosomes)):
1885            snps_mask = (
1886                (self.variants_chrom == chrom) &
1887                (self.variants_pos >= start) &
1888                (self.variants_pos <= end)
1889            )
1890            if np.any(snps_mask):
1891                lai_mask = self.calldata_lai[snps_mask, ...]
1892                mode_ancestries = mode(lai_mask, axis=0, nan_policy='omit').mode
1893                lai[i] = mode_ancestries
1894            else:
1895                lai[i] = np.nan
1896
1897        # Generate haplotype labels, e.g. "Sample1.0", "Sample1.1"
1898        haplotypes = [f"{sample}.{i}" for sample in self.samples for i in range(2)]
1899
1900        # If original data was (n_snps, n_samples, 2), flatten to (n_windows, n_samples*2)
1901        if self.calldata_lai.ndim == 3:
1902            lai = lai.reshape(n_windows, -1)
1903
1904        # Aggregate into a LocalAncestryObject
1905        return LocalAncestryObject(
1906            haplotypes=haplotypes,
1907            lai=lai,
1908            samples=self.samples,
1909            ancestry_map=self.ancestry_map,
1910            window_sizes=window_sizes,
1911            physical_pos=physical_pos,
1912            chromosomes=chromosomes
1913        )
1914
1915    def save(self, file: Union[str, Path]) -> None:
1916        """
1917        Save the data stored in `self` to a specified file.
1918
1919        The format of the saved file is determined by the file extension provided in the `file` 
1920        argument. 
1921        
1922        **Supported formats:**
1923        
1924        - `.bed`: Binary PED (Plink) format.
1925        - `.pgen`: Plink2 binary genotype format.
1926        - `.vcf`: Variant Call Format.
1927        - `.pkl`: Pickle format for saving `self` in serialized form.
1928
1929        Args:
1930            file (str or pathlib.Path): 
1931                Path to the file where the data will be saved. The extension of the file determines the save format. 
1932                Supported extensions: `.bed`, `.pgen`, `.vcf`, `.pkl`.
1933        """
1934        ext = Path(file).suffix.lower()
1935        if ext == '.bed':
1936            self.save_bed(file)
1937        elif ext == '.pgen':
1938            self.save_pgen(file)
1939        elif ext == '.vcf':
1940            self.save_vcf(file)
1941        elif ext == '.pkl':
1942            self.save_pickle(file)
1943        else:
1944            raise ValueError(f"Unsupported file extension: {ext}")
1945
1946    def save_bed(self, file: Union[str, Path]) -> None:
1947        """
1948        Save the data stored in `self` to a `.bed` file.
1949
1950        Args:
1951            file (str or pathlib.Path): 
1952                Path to the file where the data will be saved. It should end with `.bed`. 
1953                If the provided path does not have this extension, it will be appended.
1954        """
1955        from snputils.snp.io.write.bed import BEDWriter
1956        writer = BEDWriter(snpobj=self, filename=file)
1957        writer.write()
1958
1959    def save_pgen(self, file: Union[str, Path]) -> None:
1960        """
1961        Save the data stored in `self` to a `.pgen` file.
1962
1963        Args:
1964            file (str or pathlib.Path): 
1965                Path to the file where the data will be saved. It should end with `.pgen`. 
1966                If the provided path does not have this extension, it will be appended.
1967        """
1968        from snputils.snp.io.write.pgen import PGENWriter
1969        writer = PGENWriter(snpobj=self, filename=file)
1970        writer.write()
1971
1972    def save_vcf(self, file: Union[str, Path]) -> None:
1973        """
1974        Save the data stored in `self` to a `.vcf` file.
1975
1976        Args:
1977            file (str or pathlib.Path): 
1978                Path to the file where the data will be saved. It should end with `.vcf`. 
1979                If the provided path does not have this extension, it will be appended.
1980        """
1981        from snputils.snp.io.write.vcf import VCFWriter
1982        writer = VCFWriter(snpobj=self, filename=file)
1983        writer.write()
1984
1985    def save_pickle(self, file: Union[str, Path]) -> None:
1986        """
1987        Save `self` in serialized form to a `.pkl` file.
1988
1989        Args:
1990            file (str or pathlib.Path): 
1991                Path to the file where the data will be saved. It should end with `.pkl`. 
1992                If the provided path does not have this extension, it will be appended.
1993        """
1994        import pickle
1995        with open(file, 'wb') as file:
1996            pickle.dump(self, file)
1997
1998    @staticmethod
1999    def _match_to_replace(val: Union[str, int, float], dictionary: Dict[Any, Any], regex: bool = True) -> Union[str, int, float]:
2000        """
2001        Find a matching key in the provided dictionary for the given value `val`
2002        and replace it with the corresponding value.
2003
2004        Args:
2005            val (str, int, or float): 
2006                The value to be matched and potentially replaced.
2007            dictionary (Dict): 
2008                A dictionary containing keys and values for matching and replacement.
2009                The keys should match the data type of `val`.
2010            regex (bool): 
2011                If True, interprets keys in `dictionary` as regular expressions.
2012                Default is True.
2013
2014        Returns:
2015            str, int, or float: 
2016                The replacement value from `dictionary` if a match is found; otherwise, the original `val`.
2017        """
2018        if regex:
2019            # Use regular expression matching to find replacements
2020            for key, value in dictionary.items():
2021                if isinstance(key, str):
2022                    match = re.match(key, val)
2023                    if match:
2024                        # Replace using the first matching regex pattern
2025                        return re.sub(key, value, val)
2026            # Return the original value if no regex match is found
2027            return val
2028        else:
2029            # Return the value for `val` if present in `dictionary`; otherwise, return `val`
2030            return dictionary.get(val, val)
2031
2032    @staticmethod
2033    def _get_chromosome_number(chrom_string: str) -> Union[int, str]:
2034        """
2035        Extracts the chromosome number from the given chromosome string.
2036
2037        Args:
2038            chrom_string (str): 
2039                The chromosome identifier.
2040
2041        Returns:
2042            int or str: 
2043                The numeric representation of the chromosome if detected. 
2044                Returns 10001 for 'X' or 'chrX', 10002 for 'Y' or 'chrY', 
2045                and the original `chrom_string` if unrecognized.
2046        """
2047        if chrom_string.isdigit():
2048            return int(chrom_string)
2049        else:
2050            chrom_num = re.search(r'\d+', chrom_string)
2051            if chrom_num:
2052                return int(chrom_num.group())
2053            elif chrom_string.lower() in ['x', 'chrx']:
2054                return 10001
2055            elif chrom_string.lower() in ['y', 'chry']:
2056                return 10002
2057            else:
2058                log.warning(f"Chromosome nomenclature not standard. Chromosome: {chrom_string}")
2059                return chrom_string
2060
2061    def _sanity_check(self) -> None:
2062        """
2063        Perform sanity checks to ensure LAI and ancestry map consistency.
2064
2065        This method checks that all unique ancestries in the LAI data are represented 
2066        in the ancestry map if it is provided.
2067        """
2068        if self.__calldata_lai is not None and self.__ancestry_map is not None:
2069            unique_ancestries = np.unique(self.__calldata_lai)
2070            missing_ancestries = [anc for anc in unique_ancestries if str(anc) not in self.__ancestry_map]
2071            if missing_ancestries:
2072                warnings.warn(f"Missing ancestries in ancestry_map: {missing_ancestries}")

A class for Single Nucleotide Polymorphism (SNP) data, with optional support for SNP-level Local Ancestry Information (LAI).

SNPObject( calldata_gt: numpy.ndarray | None = None, samples: numpy.ndarray | None = None, variants_ref: numpy.ndarray | None = None, variants_alt: numpy.ndarray | None = None, variants_chrom: numpy.ndarray | None = None, variants_filter_pass: numpy.ndarray | None = None, variants_id: numpy.ndarray | None = None, variants_pos: numpy.ndarray | None = None, variants_qual: numpy.ndarray | None = None, calldata_lai: numpy.ndarray | None = None, ancestry_map: Dict[str, str] | None = None)
26    def __init__(
27        self,
28        calldata_gt: Optional[np.ndarray] = None,
29        samples: Optional[np.ndarray] = None,
30        variants_ref: Optional[np.ndarray] = None,
31        variants_alt: Optional[np.ndarray] = None,
32        variants_chrom: Optional[np.ndarray] = None,
33        variants_filter_pass: Optional[np.ndarray] = None,
34        variants_id: Optional[np.ndarray] = None,
35        variants_pos: Optional[np.ndarray] = None,
36        variants_qual: Optional[np.ndarray] = None,
37        calldata_lai: Optional[np.ndarray] = None,
38        ancestry_map: Optional[Dict[str, str]] = None
39    ) -> None:
40        """
41        Args:
42            calldata_gt (array, optional): 
43                An array containing genotype data for each sample. This array can be either 2D with shape 
44                `(n_snps, n_samples)` if the paternal and maternal strands are summed, or 3D with shape 
45                `(n_snps, n_samples, 2)` if the strands are kept separate.
46            samples (array of shape (n_samples,), optional): 
47                An array containing unique sample identifiers.
48            variants_ref (array of shape (n_snps,), optional): 
49                An array containing the reference allele for each SNP.
50            variants_alt (array of shape (n_snps,), optional): 
51                An array containing the alternate allele for each SNP.
52            variants_chrom (array of shape (n_snps,), optional): 
53                An array containing the chromosome for each SNP.
54            variants_filter_pass (array of shape (n_snps,), optional): 
55                An array indicating whether each SNP passed control checks.
56            variants_id (array of shape (n_snps,), optional): 
57                An array containing unique identifiers (IDs) for each SNP.
58            variants_pos (array of shape (n_snps,), optional): 
59                An array containing the chromosomal positions for each SNP.
60            variants_qual (array of shape (n_snps,), optional): 
61                An array containing the Phred-scaled quality score for each SNP.
62            calldata_lai (array, optional): 
63                An array containing the ancestry for each SNP. This array can be either 2D with shape
64                `(n_snps, n_samples*2)`, or 3D with shape (n_snps, n_samples, 2).
65            ancestry_map (dict of str to str, optional): 
66                A dictionary mapping ancestry codes to region names.
67        """
68        self.__calldata_gt = calldata_gt
69        self.__samples = samples
70        self.__variants_ref = variants_ref
71        self.__variants_alt = variants_alt
72        self.__variants_chrom = variants_chrom
73        self.__variants_filter_pass = variants_filter_pass
74        self.__variants_id = variants_id
75        self.__variants_pos = variants_pos
76        self.__variants_qual = variants_qual
77        self.__calldata_lai = calldata_lai
78        self.__ancestry_map = ancestry_map
79
80        self._sanity_check()
Arguments:
  • calldata_gt (array, optional): An array containing genotype data for each sample. This array can be either 2D with shape (n_snps, n_samples) if the paternal and maternal strands are summed, or 3D with shape (n_snps, n_samples, 2) if the strands are kept separate.
  • samples (array of shape (n_samples,), optional): An array containing unique sample identifiers.
  • variants_ref (array of shape (n_snps,), optional): An array containing the reference allele for each SNP.
  • variants_alt (array of shape (n_snps,), optional): An array containing the alternate allele for each SNP.
  • variants_chrom (array of shape (n_snps,), optional): An array containing the chromosome for each SNP.
  • variants_filter_pass (array of shape (n_snps,), optional): An array indicating whether each SNP passed control checks.
  • variants_id (array of shape (n_snps,), optional): An array containing unique identifiers (IDs) for each SNP.
  • variants_pos (array of shape (n_snps,), optional): An array containing the chromosomal positions for each SNP.
  • variants_qual (array of shape (n_snps,), optional): An array containing the Phred-scaled quality score for each SNP.
  • calldata_lai (array, optional): An array containing the ancestry for each SNP. This array can be either 2D with shape (n_snps, n_samples*2), or 3D with shape (n_snps, n_samples, 2).
  • ancestry_map (dict of str to str, optional): A dictionary mapping ancestry codes to region names.
calldata_gt: numpy.ndarray
102    @property
103    def calldata_gt(self) -> np.ndarray:
104        """
105        Retrieve `calldata_gt`.
106
107        Returns:
108            **array:** 
109                An array containing genotype data for each sample. This array can be either 2D with shape 
110                `(n_snps, n_samples)` if the paternal and maternal strands are summed, or 3D with shape 
111                `(n_snps, n_samples, 2)` if the strands are kept separate.
112        """
113        return self.__calldata_gt

Retrieve calldata_gt.

Returns:

array: An array containing genotype data for each sample. This array can be either 2D with shape (n_snps, n_samples) if the paternal and maternal strands are summed, or 3D with shape (n_snps, n_samples, 2) if the strands are kept separate.

samples: numpy.ndarray | None
122    @property
123    def samples(self) -> Optional[np.ndarray]:
124        """
125        Retrieve `samples`.
126
127        Returns:
128            **array of shape (n_samples,):** 
129                An array containing unique sample identifiers.
130        """
131        return self.__samples

Retrieve samples.

Returns:

array of shape (n_samples,): An array containing unique sample identifiers.

variants_ref: numpy.ndarray | None
140    @property
141    def variants_ref(self) -> Optional[np.ndarray]:
142        """
143        Retrieve `variants_ref`.
144
145        Returns:
146            **array of shape (n_snps,):** An array containing the reference allele for each SNP.
147        """
148        return self.__variants_ref

Retrieve variants_ref.

Returns:

array of shape (n_snps,): An array containing the reference allele for each SNP.

variants_alt: numpy.ndarray | None
157    @property
158    def variants_alt(self) -> Optional[np.ndarray]:
159        """
160        Retrieve `variants_alt`.
161
162        Returns:
163            **array of shape (n_snps,):** An array containing the alternate allele for each SNP.
164        """
165        return self.__variants_alt

Retrieve variants_alt.

Returns:

array of shape (n_snps,): An array containing the alternate allele for each SNP.

variants_chrom: numpy.ndarray | None
174    @property
175    def variants_chrom(self) -> Optional[np.ndarray]:
176        """
177        Retrieve `variants_chrom`.
178
179        Returns:
180            **array of shape (n_snps,):** An array containing the chromosome for each SNP.
181        """
182        return self.__variants_chrom

Retrieve variants_chrom.

Returns:

array of shape (n_snps,): An array containing the chromosome for each SNP.

variants_filter_pass: numpy.ndarray | None
191    @property
192    def variants_filter_pass(self) -> Optional[np.ndarray]:
193        """
194        Retrieve `variants_filter_pass`.
195
196        Returns:
197            **array of shape (n_snps,):** An array indicating whether each SNP passed control checks.
198        """
199        return self.__variants_filter_pass

Retrieve variants_filter_pass.

Returns:

array of shape (n_snps,): An array indicating whether each SNP passed control checks.

variants_id: numpy.ndarray | None
208    @property
209    def variants_id(self) -> Optional[np.ndarray]:
210        """
211        Retrieve `variants_id`.
212
213        Returns:
214            **array of shape (n_snps,):** An array containing unique identifiers (IDs) for each SNP.
215        """
216        return self.__variants_id

Retrieve variants_id.

Returns:

array of shape (n_snps,): An array containing unique identifiers (IDs) for each SNP.

variants_pos: numpy.ndarray | None
225    @property
226    def variants_pos(self) -> Optional[np.ndarray]:
227        """
228        Retrieve `variants_pos`.
229
230        Returns:
231            **array of shape (n_snps,):** An array containing the chromosomal positions for each SNP.
232        """
233        return self.__variants_pos

Retrieve variants_pos.

Returns:

array of shape (n_snps,): An array containing the chromosomal positions for each SNP.

variants_qual: numpy.ndarray | None
242    @property
243    def variants_qual(self) -> Optional[np.ndarray]:
244        """
245        Retrieve `variants_qual`.
246
247        Returns:
248            **array of shape (n_snps,):** An array containing the Phred-scaled quality score for each SNP.
249        """
250        return self.__variants_qual

Retrieve variants_qual.

Returns:

array of shape (n_snps,): An array containing the Phred-scaled quality score for each SNP.

calldata_lai: numpy.ndarray | None
259    @property
260    def calldata_lai(self) -> Optional[np.ndarray]:
261        """
262        Retrieve `calldata_lai`.
263
264        Returns:
265            **array:** 
266                An array containing the ancestry for each SNP. This array can be either 2D with shape
267                `(n_snps, n_samples*2)`, or 3D with shape (n_snps, n_samples, 2).
268        """
269        return self.__calldata_lai

Retrieve calldata_lai.

Returns:

array: An array containing the ancestry for each SNP. This array can be either 2D with shape (n_snps, n_samples*2), or 3D with shape (n_snps, n_samples, 2).

ancestry_map: Dict[str, str] | None
278    @property
279    def ancestry_map(self) -> Optional[Dict[str, str]]:
280        """
281        Retrieve `ancestry_map`.
282
283        Returns:
284            **dict of str to str:** A dictionary mapping ancestry codes to region names.
285        """
286        return self.__ancestry_map

Retrieve ancestry_map.

Returns:

dict of str to str: A dictionary mapping ancestry codes to region names.

n_samples: int
295    @property
296    def n_samples(self) -> int:
297        """
298        Retrieve `n_samples`.
299
300        Returns:
301            **int:** The total number of samples.
302        """
303        if self.__samples is not None:
304            return len(self.__samples)
305        elif self.__calldata_gt is not None:
306            return self.__calldata_gt.shape[1]
307        elif self.__calldata_lai is not None:
308            if self.__calldata_lai.ndim == 2:
309                return self.__calldata_lai.shape[1] // 2
310            elif self.__calldata_lai.ndim == 3:
311                return self.__calldata_lai.shape[1]
312        else:
313            raise ValueError("Unable to determine the total number of samples: no relevant data is available.")

Retrieve n_samples.

Returns:

int: The total number of samples.

n_snps: int
315    @property
316    def n_snps(self) -> int:
317        """
318        Retrieve `n_snps`.
319
320        Returns:
321            **int:** The total number of SNPs.
322        """
323        # List of attributes that can indicate the number of SNPs
324        potential_attributes = [
325            self.__calldata_gt,
326            self.__variants_ref,
327            self.__variants_alt,
328            self.__variants_chrom,
329            self.__variants_filter_pass,
330            self.__variants_id,
331            self.__variants_pos,
332            self.__variants_qual,
333            self.__calldata_lai
334        ]
335
336        # Check each attribute for its first dimension, which corresponds to `n_snps`
337        for attr in potential_attributes:
338            if attr is not None:
339                return attr.shape[0]
340
341        raise ValueError("Unable to determine the total number of SNPs: no relevant data is available.")

Retrieve n_snps.

Returns:

int: The total number of SNPs.

n_chrom: int | None
343    @property
344    def n_chrom(self) -> Optional[int]:
345        """
346        Retrieve `n_chrom`.
347
348        Returns:
349            **int:** The total number of unique chromosomes in `variants_chrom`.
350        """
351        if self.variants_chrom is None:
352            warnings.warn("Chromosome data `variants_chrom` is None.")
353            return None
354
355        return len(self.unique_chrom)

Retrieve n_chrom.

Returns:

int: The total number of unique chromosomes in variants_chrom.

n_ancestries: int
357    @property
358    def n_ancestries(self) -> int:
359        """
360        Retrieve `n_ancestries`.
361
362        Returns:
363            **int:** The total number of unique ancestries.
364        """
365        if self.__calldata_lai is not None:
366            return len(np.unique(self.__calldata_lai))
367        else:
368            raise ValueError("Unable to determine the total number of ancestries: no relevant data is available.")

Retrieve n_ancestries.

Returns:

int: The total number of unique ancestries.

unique_chrom: numpy.ndarray | None
370    @property
371    def unique_chrom(self) -> Optional[np.ndarray]:
372        """
373        Retrieve `unique_chrom`.
374
375        Returns:
376            **array:** The unique chromosome names in `variants_chrom`, preserving their order of appearance.
377        """
378        if self.variants_chrom is None:
379            warnings.warn("Chromosome data `variants_chrom` is None.")
380            return None
381
382        # Identify unique chromosome names and their first indexes of occurrence
383        _, idx = np.unique(self.variants_chrom, return_index=True)
384        # Return chromosome names sorted by their first occurrence to maintain original order
385        return self.variants_chrom[np.sort(idx)]

Retrieve unique_chrom.

Returns:

array: The unique chromosome names in variants_chrom, preserving their order of appearance.

are_strands_summed: bool
387    @property
388    def are_strands_summed(self) -> bool:
389        """
390        Retrieve `are_strands_summed`.
391        
392        Returns:
393            **bool:** 
394                True if the maternal and paternal strands have been summed together, which is indicated by 
395                `calldata_gt` having shape `(n_samples, n_snps)`. False if the strands are stored separately, 
396                indicated by `calldata_gt` having shape `(n_samples, n_snps, 2)`.
397        """
398        if self.calldata_gt is None:
399            warnings.warn("Genotype data `calldata_gt` is None.")
400            return None
401        
402        return self.calldata_gt.ndim == 2

Retrieve are_strands_summed.

Returns:

bool: True if the maternal and paternal strands have been summed together, which is indicated by calldata_gt having shape (n_snps, n_samples). False if the strands are stored separately, indicated by calldata_gt having shape (n_snps, n_samples, 2). If calldata_gt is None, a warning is issued and None is returned.

def copy(self) -> SNPObject:
404    def copy(self) -> SNPObject:
405        """
406        Create and return a copy of `self`.
407
408        Returns:
409            **SNPObject:** 
410                A new instance of the current object.
411        """
412        return copy.deepcopy(self)

Create and return a copy of self.

Returns:

SNPObject: A new instance of the current object.

def keys(self) -> List[str]:
414    def keys(self) -> List[str]:
415        """
416        Retrieve a list of public attribute names for `self`.
417
418        Returns:
419            **list of str:** 
420                A list of attribute names, with internal name-mangling removed, 
421                for easier reference to public attributes in the instance.
422        """
423        return [attr.replace('_SNPObject__', '') for attr in vars(self)]

Retrieve a list of public attribute names for self.

Returns:

list of str: A list of attribute names, with internal name-mangling removed, for easier reference to public attributes in the instance.

def allele_freq( self, sample_labels: Sequence[Any] | None = None, ancestry: str | int | None = None, laiobj: LocalAncestryObject | None = None, return_counts: bool = False, as_dataframe: bool = False) -> Any:
425    def allele_freq(
426        self,
427        sample_labels: Optional[Sequence[Any]] = None,
428        ancestry: Optional[Union[str, int]] = None,
429        laiobj: Optional["LocalAncestryObject"] = None,
430        return_counts: bool = False,
431        as_dataframe: bool = False,
432    ) -> Any:
433        """
434        Compute per-SNP alternate allele frequencies from `calldata_gt`.
435
436        Args:
437            sample_labels (sequence, optional):
438                Population label per sample. If None, computes cohort-level frequencies.
439            ancestry (str or int, optional):
440                If provided, compute ancestry-masked frequencies using SNP-level LAI.
441            laiobj (LocalAncestryObject, optional):
442                Optional LAI object used when `self.calldata_lai` is not set.
443            return_counts (bool, default=False):
444                If True, also return called-allele counts with the same shape as frequencies.
445            as_dataframe (bool, default=False):
446                If True, return pandas DataFrame output.
447
448        Returns:
449            Frequencies as a NumPy array (or DataFrame if `as_dataframe=True`).
450            If `return_counts=True`, returns `(freq, counts)`.
451        """
452        if self.calldata_gt is None:
453            raise ValueError("Genotype data `calldata_gt` is None.")
454
455        gt = np.asarray(self.calldata_gt)
456        if gt.ndim not in (2, 3):
457            raise ValueError("'calldata_gt' must be 2D or 3D array")
458
459        n_samples = gt.shape[1]
460
461        grouped_output = sample_labels is not None
462        if sample_labels is None:
463            labels = np.repeat("__all__", n_samples)
464        else:
465            labels = np.asarray(sample_labels)
466            if labels.ndim != 1:
467                labels = labels.ravel()
468            if labels.shape[0] != n_samples:
469                raise ValueError(
470                    "'sample_labels' must have length equal to the number of samples in `calldata_gt`."
471                )
472
473        calldata_lai = None
474        if ancestry is not None:
475            if self.calldata_lai is not None:
476                calldata_lai = self.calldata_lai
477            elif laiobj is not None:
478                try:
479                    converted_lai = laiobj.convert_to_snp_level(snpobject=self, lai_format="3D")
480                    calldata_lai = getattr(converted_lai, "calldata_lai", None)
481                except Exception:
482                    calldata_lai = None
483
484            if calldata_lai is None:
485                raise ValueError(
486                    "Ancestry-specific masking requires SNP-level LAI "
487                    "(provide a LocalAncestryObject via 'laiobj' or ensure 'self.calldata_lai' is set)."
488                )
489
490        afs, counts, pops = aggregate_pop_allele_freq(
491            calldata_gt=gt,
492            sample_labels=labels,
493            ancestry=ancestry,
494            calldata_lai=calldata_lai,
495        )
496
497        if grouped_output:
498            freq_out = afs
499            count_out = counts
500            if as_dataframe:
501                import pandas as pd
502
503                freq_out = pd.DataFrame(afs, columns=pops)
504                count_out = pd.DataFrame(counts, columns=pops)
505        else:
506            freq_out = afs[:, 0]
507            count_out = counts[:, 0]
508            if as_dataframe:
509                import pandas as pd
510
511                freq_out = pd.DataFrame({"allele_freq": freq_out})
512                count_out = pd.DataFrame({"called_alleles": count_out})
513
514        if return_counts:
515            return freq_out, count_out
516        return freq_out

Compute per-SNP alternate allele frequencies from calldata_gt.

Arguments:
  • sample_labels (sequence, optional): Population label per sample. If None, computes cohort-level frequencies.
  • ancestry (str or int, optional): If provided, compute ancestry-masked frequencies using SNP-level LAI.
  • laiobj (LocalAncestryObject, optional): Optional LAI object used when self.calldata_lai is not set.
  • return_counts (bool, default=False): If True, also return called-allele counts with the same shape as frequencies.
  • as_dataframe (bool, default=False): If True, return pandas DataFrame output.
Returns:

Frequencies as a NumPy array (or DataFrame if as_dataframe=True). If return_counts=True, returns (freq, counts).

def sum_strands( self, inplace: bool = False) -> SNPObject | None:
518    def sum_strands(self, inplace: bool = False) -> Optional['SNPObject']:
519        """
520        Sum paternal and maternal strands.
521
522        Args:
523            inplace (bool, default=False): 
524                If True, modifies `self` in place. If False, returns a new `SNPObject` with the variants 
525                filtered. Default is False.
526
527        Returns:
528            **Optional[SNPObject]:** 
529                A new `SNPObject` with summed strands if `inplace=False`. 
530                If `inplace=True`, modifies `self` in place and returns None.
531        """
532        if self.calldata_gt is None:
533            warnings.warn("Genotype data `calldata_gt` is None.")
534            return None if not inplace else self
535
536        if self.are_strands_summed:
537            warnings.warn("Genotype data `calldata_gt` is already summed.")
538            return self if inplace else self.copy()
539        
540        if inplace:
541            self.calldata_gt = self.calldata_gt.sum(axis=2, dtype=np.int8)
542            return self
543        else:
544            snpobj = self.copy()
545            snpobj.calldata_gt = self.calldata_gt.sum(axis=2, dtype=np.int8)
546            return snpobj

Sum paternal and maternal strands.

Arguments:
  • inplace (bool, default=False): If True, modifies self in place. If False, returns a new SNPObject with the strands summed. Default is False.
Returns:

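Optional[SNPObject]: A new SNPObject with summed strands if inplace=False. If inplace=True, modifies self in place and returns self. If calldata_gt is None, a warning is issued and the result is self when inplace=True and None when inplace=False.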

def filter_variants( self, chrom: str | Sequence[str] | numpy.ndarray | None = None, pos: int | Sequence[int] | numpy.ndarray | None = None, indexes: int | Sequence[int] | numpy.ndarray | None = None, include: bool = True, inplace: bool = False) -> SNPObject | None:
548    def filter_variants(
549            self, 
550            chrom: Optional[Union[str, Sequence[str], np.ndarray, None]] = None, 
551            pos: Optional[Union[int, Sequence[int], np.ndarray, None]] = None, 
552            indexes: Optional[Union[int, Sequence[int], np.ndarray, None]] = None, 
553            include: bool = True, 
554            inplace: bool = False
555        ) -> Optional['SNPObject']:
556        """
557        Filter variants based on specified chromosome names, variant positions, or variant indexes.
558
559        This method updates the `calldata_gt`, `variants_ref`, `variants_alt`, 
560        `variants_chrom`, `variants_filter_pass`, `variants_id`, `variants_pos`,  
561        `variants_qual`, and `lai` attributes to include or exclude the specified variants. The filtering 
562        criteria can be based on chromosome names, variant positions, or indexes. If multiple 
563        criteria are provided, their union is used for filtering. The order of the variants is preserved.
564        
565        Negative indexes are supported and follow 
566        [NumPy's indexing conventions](https://numpy.org/doc/stable/user/basics.indexing.html).
567
568        Args:
569            chrom (str or array_like of str, optional): 
570                Chromosome(s) to filter variants by. Can be a single chromosome as a string or a sequence 
571                of chromosomes. If both `chrom` and `pos` are provided, they must either have matching lengths 
572                (pairing each chromosome with a position) or `chrom` should be a single value that applies to 
573                all positions in `pos`. Default is None. 
574            pos (int or array_like of int, optional): 
575                Position(s) to filter variants by. Can be a single position as an integer or a sequence of positions. 
576                If `chrom` is also provided, `pos` should either match `chrom` in length or `chrom` should be a 
577                single value. Default is None.
578            indexes (int or array_like of int, optional): 
579                Index(es) of the variants to include or exclude. Can be a single index or a sequence
580                of indexes. Negative indexes are supported. Default is None.
581            include (bool, default=True): 
582                If True, includes only the specified variants. If False, excludes the specified
583                variants. Default is True.
584            inplace (bool, default=False): 
585                If True, modifies `self` in place. If False, returns a new `SNPObject` with the variants 
586                filtered. Default is False.
587
588        Returns:
589            **Optional[SNPObject]:** 
590                A new `SNPObject` with the specified variants filtered if `inplace=False`. 
591                If `inplace=True`, modifies `self` in place and returns None.
592        """
593        if chrom is None and pos is None and indexes is None:
594            raise ValueError("At least one of 'chrom', 'pos', or 'indexes' must be provided.")
595
596        n_snps = self.n_snps
597
598        # Convert inputs to arrays for consistency
599        chrom = np.atleast_1d(chrom) if chrom is not None else None
600        pos = np.atleast_1d(pos) if pos is not None else None
601        indexes = np.atleast_1d(indexes) if indexes is not None else None
602
603        # Validate chrom and pos lengths if both are provided
604        if chrom is not None and pos is not None:
605            if len(chrom) != len(pos) and len(chrom) > 1:
606                raise ValueError(
607                    "When both 'chrom' and 'pos' are provided, they must either be of the same length "
608                    "or 'chrom' must be a single value."
609                )
610
611        # Create a mask for chromosome and position filtering
612        mask_combined = np.zeros(n_snps, dtype=bool)
613        if chrom is not None and pos is not None:
614            if len(chrom) == 1:
615                # Apply single chromosome to all positions in `pos`
616                mask_combined = (self['variants_chrom'] == chrom[0]) & np.isin(self['variants_pos'], pos)
617            else:
618                # Vectorized pair matching for chrom and pos
619                query_pairs = np.array(
620                    list(zip(chrom, pos)),
621                    dtype=[
622                        ('chrom', self['variants_chrom'].dtype),
623                        ('pos', self['variants_pos'].dtype)
624                    ]
625                )
626                data_pairs = np.array(
627                    list(zip(self['variants_chrom'], self['variants_pos'])),
628                    dtype=[
629                        ('chrom', self['variants_chrom'].dtype),
630                        ('pos', self['variants_pos'].dtype)
631                    ]
632                )
633                mask_combined = np.isin(data_pairs, query_pairs)
634
635        elif chrom is not None:
636            # Only chromosome filtering
637            mask_combined = np.isin(self['variants_chrom'], chrom)
638        elif pos is not None:
639            # Only position filtering
640            mask_combined = np.isin(self['variants_pos'], pos)
641
642        # Create mask based on indexes if provided
643        if indexes is not None:
644            # Validate indexes, allowing negative indexes
645            out_of_bounds_indexes = indexes[(indexes < -n_snps) | (indexes >= n_snps)]
646            if out_of_bounds_indexes.size > 0:
647                raise ValueError(f"One or more sample indexes are out of bounds.")
648
649            # Handle negative indexes and check for out-of-bounds indexes
650            adjusted_indexes = np.mod(indexes, n_snps)
651
652            # Create mask for specified indexes
653            mask_indexes = np.zeros(n_snps, dtype=bool)
654            mask_indexes[adjusted_indexes] = True
655
656            # Combine with `chrom` and `pos` mask using logical OR (union of all specified criteria)
657            mask_combined = mask_combined | mask_indexes
658
659        # Invert mask if `include` is False
660        if not include:
661            mask_combined = ~mask_combined
662
663        # Define keys to filter
664        keys = [
665            'calldata_gt', 'variants_ref', 'variants_alt', 'variants_chrom', 'variants_filter_pass', 
666            'variants_id', 'variants_pos', 'variants_qual', 'calldata_lai'
667        ]
668
669        # Apply filtering based on inplace parameter
670        if inplace:
671            for key in keys:
672                if self[key] is not None:
673                    if self[key].ndim > 1:
674                        self[key] = np.asarray(self[key])[mask_combined, ...]
675                    else:
676                        self[key] = np.asarray(self[key])[mask_combined]
677
678            return None
679        else:
680            # Create A new `SNPObject` with filtered data
681            snpobj = self.copy()
682            for key in keys:
683                if snpobj[key] is not None:
684                    if snpobj[key].ndim > 1:
685                        snpobj[key] = np.asarray(snpobj[key])[mask_combined, ...]
686                    else:
687                        snpobj[key] = np.asarray(snpobj[key])[mask_combined]
688
689            return snpobj

Filter variants based on specified chromosome names, variant positions, or variant indexes.

This method updates the calldata_gt, variants_ref, variants_alt, variants_chrom, variants_filter_pass, variants_id, variants_pos,
variants_qual, and calldata_lai attributes to include or exclude the specified variants. The filtering criteria can be based on chromosome names, variant positions, or indexes. If multiple criteria are provided, their union is used for filtering. The order of the variants is preserved.

Negative indexes are supported and follow NumPy's indexing conventions.

Arguments:
  • chrom (str or array_like of str, optional): Chromosome(s) to filter variants by. Can be a single chromosome as a string or a sequence of chromosomes. If both chrom and pos are provided, they must either have matching lengths (pairing each chromosome with a position) or chrom should be a single value that applies to all positions in pos. Default is None.
  • pos (int or array_like of int, optional): Position(s) to filter variants by. Can be a single position as an integer or a sequence of positions. If chrom is also provided, pos should either match chrom in length or chrom should be a single value. Default is None.
  • indexes (int or array_like of int, optional): Index(es) of the variants to include or exclude. Can be a single index or a sequence of indexes. Negative indexes are supported. Default is None.
  • include (bool, default=True): If True, includes only the specified variants. If False, excludes the specified variants. Default is True.
  • inplace (bool, default=False): If True, modifies self in place. If False, returns a new SNPObject with the variants filtered. Default is False.
Returns:

Optional[SNPObject]: A new SNPObject with the specified variants filtered if inplace=False. If inplace=True, modifies self in place and returns None.

def filter_samples( self, samples: str | Sequence[str] | numpy.ndarray | None = None, indexes: int | Sequence[int] | numpy.ndarray | None = None, include: bool = True, reorder: bool = False, inplace: bool = False) -> SNPObject | None:
691    def filter_samples(
692            self, 
693            samples: Optional[Union[str, Sequence[str], np.ndarray, None]] = None,
694            indexes: Optional[Union[int, Sequence[int], np.ndarray, None]] = None,
695            include: bool = True,
696            reorder: bool = False,
697            inplace: bool = False
698        ) -> Optional['SNPObject']:
699        """
700        Filter samples based on specified names or indexes.
701
702        This method updates the `samples` and `calldata_gt` attributes to include or exclude the specified 
703        samples. The order of the samples is preserved. Set `reorder=True` to match the ordering of the
704        provided `samples` and/or `indexes` lists when including.
705
706        If both samples and indexes are provided, any sample matching either a name in samples or an index in 
707        indexes will be included or excluded.
708
709        This method allows inclusion or exclusion of specific samples by their names or 
710        indexes. When both sample names and indexes are provided, the union of the specified samples 
711        is used. Negative indexes are supported and follow 
712        [NumPy's indexing conventions](https://numpy.org/doc/stable/user/basics.indexing.html).
713
714        Args:
715            samples (str or array_like of str, optional): 
716                 Name(s) of the samples to include or exclude. Can be a single sample name or a
717                 sequence of sample names. Default is None.
718            indexes (int or array_like of int, optional):
719                Index(es) of the samples to include or exclude. Can be a single index or a sequence
720                of indexes. Negative indexes are supported. Default is None.
721            include (bool, default=True): 
722                If True, includes only the specified samples. If False, excludes the specified
723                samples. Default is True.
724            inplace (bool, default=False): 
725                If True, modifies `self` in place. If False, returns a new `SNPObject` with the samples 
726                filtered. Default is False.
727
728        Returns:
729            **Optional[SNPObject]:** 
730                A new `SNPObject` with the specified samples filtered if `inplace=False`. 
731                If `inplace=True`, modifies `self` in place and returns None.
732        """
733        if samples is None and indexes is None:
734            raise ValueError("At least one of 'samples' or 'indexes' must be provided.")
735
736        n_samples = self.n_samples
737        sample_names = np.array(self['samples'])
738
739        # Create mask based on sample names
740        if samples is not None:
741            samples = np.asarray(samples).ravel()
742            mask_samples = np.isin(sample_names, samples)
743            missing_samples = samples[~np.isin(samples, sample_names)]
744            if missing_samples.size > 0:
745                raise ValueError(f"The following specified samples were not found: {missing_samples.tolist()}")
746        else:
747            mask_samples = np.zeros(n_samples, dtype=bool)
748
749        # Create mask based on sample indexes
750        if indexes is not None:
751            indexes = np.asarray(indexes).ravel()
752
753            # Validate indexes, allowing negative indexes
754            out_of_bounds_indexes = indexes[(indexes < -n_samples) | (indexes >= n_samples)]
755            if out_of_bounds_indexes.size > 0:
756                raise ValueError(f"One or more sample indexes are out of bounds.")
757            
758            # Handle negative indexes
759            adjusted_indexes = np.mod(indexes, n_samples)
760
761            mask_indexes = np.zeros(n_samples, dtype=bool)
762            mask_indexes[adjusted_indexes] = True
763        else:
764            mask_indexes = np.zeros(n_samples, dtype=bool)
765
766        # Combine masks using logical OR (union of samples)
767        mask_combined = mask_samples | mask_indexes
768
769        if not include:
770            mask_combined = ~mask_combined
771
772        # If requested, compute an ordering of selected samples that follows the provided lists.
773        ordered_indices = None
774        if include and reorder:
775            sel_indices = np.where(mask_combined)[0]
776            ordered_list: List[int] = []
777            added = np.zeros(n_samples, dtype=bool)
778
779            # Prioritize the order in `samples`
780            if samples is not None:
781                name_to_idx = {name: idx for idx, name in enumerate(sample_names)}
782                for s in samples:
783                    idx = name_to_idx.get(s)
784                    if idx is not None and mask_combined[idx] and not added[idx]:
785                        ordered_list.append(idx)
786                        added[idx] = True
787
788            # Then respect the order in `indexes`
789            if indexes is not None:
790                adj_idx = np.mod(np.atleast_1d(indexes), n_samples)
791                for idx in adj_idx:
792                    if mask_combined[idx] and not added[idx]:
793                        ordered_list.append(int(idx))
794                        added[idx] = True
795
796            # Finally, append any remaining selected samples in their original order
797            for idx in sel_indices:
798                if not added[idx]:
799                    ordered_list.append(int(idx))
800
801            ordered_indices = np.asarray(ordered_list, dtype=int)
802
803        # Define keys to filter
804        keys = ['samples', 'calldata_gt', 'calldata_lai']
805
806        # Apply filtering based on inplace parameter
807        if inplace:
808            for key in keys:
809                if self[key] is not None:
810                    arr = np.asarray(self[key])
811                    if ordered_indices is not None:
812                        if key == 'calldata_lai' and arr.ndim == 2:
813                            # Haplotype-aware reordering for 2D LAI (n_snps, n_samples*2)
814                            hap_idx = np.concatenate([2*ordered_indices, 2*ordered_indices + 1])
815                            self[key] = arr[:, hap_idx]
816                        elif arr.ndim > 1:
817                            self[key] = arr[:, ordered_indices, ...]
818                        else:
819                            self[key] = arr[ordered_indices]
820                    else:
821                        if arr.ndim > 1:
822                            self[key] = arr[:, mask_combined, ...]
823                        else:
824                            self[key] = arr[mask_combined]
825
826            return None
827        else:
828            # Create A new `SNPObject` with filtered data
829            snpobj = self.copy()
830            for key in keys:
831                if snpobj[key] is not None:
832                    arr = np.asarray(snpobj[key])
833                    if ordered_indices is not None:
834                        if key == 'calldata_lai' and arr.ndim == 2:
835                            hap_idx = np.concatenate([2*ordered_indices, 2*ordered_indices + 1])
836                            snpobj[key] = arr[:, hap_idx]
837                        elif arr.ndim > 1:
838                            snpobj[key] = arr[:, ordered_indices, ...]
839                        else:
840                            snpobj[key] = arr[ordered_indices]
841                    else:
842                        if arr.ndim > 1:
843                            snpobj[key] = arr[:, mask_combined, ...]
844                        else:
845                            snpobj[key] = arr[mask_combined]
846            return snpobj

Filter samples based on specified names or indexes.

This method updates the samples, calldata_gt, and calldata_lai attributes to include or exclude the specified samples. By default the original order of the samples is preserved. Set reorder=True to match the ordering of the provided samples and/or indexes lists when including.

If both samples and indexes are provided, any sample matching either a name in samples or an index in indexes will be included or excluded.

This method allows inclusion or exclusion of specific samples by their names or indexes. When both sample names and indexes are provided, the union of the specified samples is used. Negative indexes are supported and follow NumPy's indexing conventions.

Arguments:
  • samples (str or array_like of str, optional): Name(s) of the samples to include or exclude. Can be a single sample name or a sequence of sample names. Default is None.
  • indexes (int or array_like of int, optional): Index(es) of the samples to include or exclude. Can be a single index or a sequence of indexes. Negative indexes are supported. Default is None.
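  • include (bool, default=True): If True, includes only the specified samples. If False, excludes the specified samples. Default is True.
  • reorder (bool, default=False): Only used when include=True. If True, the selected samples are arranged in the order given by samples first, then by indexes, followed by any remaining selected samples in their original order. Default is False.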
  • inplace (bool, default=False): If True, modifies self in place. If False, returns a new SNPObject with the samples filtered. Default is False.
Returns:

Optional[SNPObject]: A new SNPObject with the specified samples filtered if inplace=False. If inplace=True, modifies self in place and returns None.

def detect_chromosome_format(self) -> str:
848    def detect_chromosome_format(self) -> str:
849        """
850        Detect the chromosome naming convention in `variants_chrom` based on the prefix 
851        of the first chromosome identifier in `unique_chrom`.
852        
853        **Recognized formats:**
854
855        - `'chr'`: Format with 'chr' prefix, e.g., 'chr1', 'chr2', ..., 'chrX', 'chrY', 'chrM'.
856        - `'chm'`: Format with 'chm' prefix, e.g., 'chm1', 'chm2', ..., 'chmX', 'chmY', 'chmM'.
857        - `'chrom'`: Format with 'chrom' prefix, e.g., 'chrom1', 'chrom2', ..., 'chromX', 'chromY', 'chromM'.
858        - `'plain'`: Plain format without a prefix, e.g., '1', '2', ..., 'X', 'Y', 'M'.
859        
860        If the format does not match any recognized pattern, `'Unknown format'` is returned.
861
862        Returns:
863            **str:** 
864                A string indicating the detected chromosome format (`'chr'`, `'chm'`, `'chrom'`, or `'plain'`).
865                If no recognized format is matched, returns `'Unknown format'`.
866        """
867        # Select the first unique chromosome identifier for format detection
868        chromosome_str = self.unique_chrom[0]
869
870        # Define regular expressions to match each recognized chromosome format
871        patterns = {
872            'chr': r'^chr(\d+|X|Y|M)$',    # Matches 'chr' prefixed format
873            'chm': r'^chm(\d+|X|Y|M)$',    # Matches 'chm' prefixed format
874            'chrom': r'^chrom(\d+|X|Y|M)$', # Matches 'chrom' prefixed format
875            'plain': r'^(\d+|X|Y|M)$'       # Matches plain format without prefix
876        }
877
878        # Iterate through the patterns to identify the chromosome format
879        for prefix, pattern in patterns.items():
880            if re.match(pattern, chromosome_str):
881                return prefix  # Return the recognized format prefix
882
883        # If no pattern matches, return 'Unknown format'
884        return 'Unknown format'

Detect the chromosome naming convention in variants_chrom based on the prefix of the first chromosome identifier in unique_chrom.

Recognized formats:

  • 'chr': Format with 'chr' prefix, e.g., 'chr1', 'chr2', ..., 'chrX', 'chrY', 'chrM'.
  • 'chm': Format with 'chm' prefix, e.g., 'chm1', 'chm2', ..., 'chmX', 'chmY', 'chmM'.
  • 'chrom': Format with 'chrom' prefix, e.g., 'chrom1', 'chrom2', ..., 'chromX', 'chromY', 'chromM'.
  • 'plain': Plain format without a prefix, e.g., '1', '2', ..., 'X', 'Y', 'M'.

If the format does not match any recognized pattern, 'Unknown format' is returned.

Returns:

str: A string indicating the detected chromosome format ('chr', 'chm', 'chrom', or 'plain'). If no recognized format is matched, returns 'Unknown format'.

def convert_chromosome_format( self, from_format: str, to_format: str, inplace: bool = False) -> SNPObject | None:
886    def convert_chromosome_format(
887        self, 
888        from_format: str, 
889        to_format: str, 
890        inplace: bool = False
891    ) -> Optional['SNPObject']:
892        """
893        Convert the chromosome format from one naming convention to another in `variants_chrom`.
894
895        **Supported formats:**
896
897        - `'chr'`: Format with 'chr' prefix, e.g., 'chr1', 'chr2', ..., 'chrX', 'chrY', 'chrM'.
898        - `'chm'`: Format with 'chm' prefix, e.g., 'chm1', 'chm2', ..., 'chmX', 'chmY', 'chmM'.
899        - `'chrom'`: Format with 'chrom' prefix, e.g., 'chrom1', 'chrom2', ..., 'chromX', 'chromY', 'chromM'.
900        - `'plain'`: Plain format without a prefix, e.g., '1', '2', ..., 'X', 'Y', 'M'.
901
902        Args:
903            from_format (str): 
904                The current chromosome format. Acceptable values are `'chr'`, `'chm'`, `'chrom'`, or `'plain'`.
905            to_format (str): 
906                The target format for chromosome data conversion. Acceptable values match `from_format` options.
907            inplace (bool, default=False): 
908                If True, modifies `self` in place. If False, returns a new `SNPObject` with the converted format.
909                Default is False.
910
911        Returns:
912            **Optional[SNPObject]:** A new `SNPObject` with the converted chromosome format if `inplace=False`. 
913            If `inplace=True`, modifies `self` in place and returns None.
914        """
915        # Define the list of standard chromosome identifiers
916        chrom_list = [*map(str, range(1, 23)), 'X', 'Y', 'M']  # M for mitochondrial chromosomes
917
918        # Format mappings for different chromosome naming conventions
919        format_mappings = {
920            'chr': [f'chr{i}' for i in chrom_list],
921            'chm': [f'chm{i}' for i in chrom_list],
922            'chrom': [f'chrom{i}' for i in chrom_list],
923            'plain': chrom_list,
924        }
925
926        # Verify that from_format and to_format are valid naming conventions
927        if from_format not in format_mappings or to_format not in format_mappings:
928            raise ValueError(f"Invalid format: {from_format} or {to_format}. Must be one of {list(format_mappings.keys())}.")
929
930        # Convert chromosomes to string for consistent comparison
931        variants_chrom = self['variants_chrom'].astype(str)
932
933        # Verify that all chromosomes in the object follow the specified `from_format`
934        expected_chroms = set(format_mappings[from_format])
935        mismatched_chroms = set(variants_chrom) - expected_chroms
936
937        if mismatched_chroms:
938            raise ValueError(f"The following chromosomes do not match the `from_format` '{from_format}': {mismatched_chroms}.")
939
940        # Create conditions for selecting based on current `from_format` names
941        conditions = [variants_chrom == chrom for chrom in format_mappings[from_format]]
942
943        # Rename chromosomes based on inplace flag
944        if inplace:
945            self['variants_chrom'] = np.select(conditions, format_mappings[to_format], default='unknown')
946            return None
947        else:
948            snpobject = self.copy()
949            snpobject['variants_chrom'] = np.select(conditions, format_mappings[to_format], default='unknown')
950            return snpobject

Convert the chromosome format from one naming convention to another in variants_chrom.

Supported formats:

  • 'chr': Format with 'chr' prefix, e.g., 'chr1', 'chr2', ..., 'chrX', 'chrY', 'chrM'.
  • 'chm': Format with 'chm' prefix, e.g., 'chm1', 'chm2', ..., 'chmX', 'chmY', 'chmM'.
  • 'chrom': Format with 'chrom' prefix, e.g., 'chrom1', 'chrom2', ..., 'chromX', 'chromY', 'chromM'.
  • 'plain': Plain format without a prefix, e.g., '1', '2', ..., 'X', 'Y', 'M'.
Arguments:
  • from_format (str): The current chromosome format. Acceptable values are 'chr', 'chm', 'chrom', or 'plain'.
  • to_format (str): The target format for chromosome data conversion. Acceptable values match from_format options.
  • inplace (bool, default=False): If True, modifies self in place. If False, returns a new SNPObject with the converted format. Default is False.
Returns:

Optional[SNPObject]: A new SNPObject with the converted chromosome format if inplace=False. If inplace=True, modifies self in place and returns None.

def match_chromosome_format( self, snpobj: SNPObject, inplace: bool = False) -> SNPObject | None:
952    def match_chromosome_format(self, snpobj: 'SNPObject', inplace: bool = False) -> Optional['SNPObject']:
953        """
954        Convert the chromosome format in `variants_chrom` from `self` to match the format of a reference `snpobj`.
955
956        **Recognized formats:**
957
958        - `'chr'`: Format with 'chr' prefix, e.g., 'chr1', 'chr2', ..., 'chrX', 'chrY', 'chrM'.
959        - `'chm'`: Format with 'chm' prefix, e.g., 'chm1', 'chm2', ..., 'chmX', 'chmY', 'chmM'.
960        - `'chrom'`: Format with 'chrom' prefix, e.g., 'chrom1', 'chrom2', ..., 'chromX', 'chromY', 'chromM'.
961        - `'plain'`: Plain format without a prefix, e.g., '1', '2', ..., 'X', 'Y', 'M'.
962
963        Args:
964            snpobj (SNPObject): 
965                The reference SNPObject whose chromosome format will be matched.
966            inplace (bool, default=False): 
967                If True, modifies `self` in place. If False, returns a new `SNPObject` with the 
968                chromosome format matching that of `snpobj`. Default is False.
969
970        Returns:
971            **Optional[SNPObject]:** 
972                A new `SNPObject` with matched chromosome format if `inplace=False`. 
973                If `inplace=True`, modifies `self` in place and returns None.
974        """
975        # Detect the chromosome naming format of the current SNPObject
976        fmt1 = self.detect_chromosome_format()
977        if fmt1 == 'Unknown format':
978            raise ValueError("The chromosome format of the current SNPObject is unrecognized.")
979        
980        # Detect the chromosome naming format of the reference SNPObject
981        fmt2 = snpobj.detect_chromosome_format()
982        if fmt2 == 'Unknown format':
983            raise ValueError("The chromosome format of the reference SNPObject is unrecognized.")
984
985        # Convert the current SNPObject's chromosome format to match the reference format
986        return self.convert_chromosome_format(fmt1, fmt2, inplace=inplace)

Convert the chromosome format in variants_chrom from self to match the format of a reference snpobj.

Recognized formats:

  • 'chr': Format with 'chr' prefix, e.g., 'chr1', 'chr2', ..., 'chrX', 'chrY', 'chrM'.
  • 'chm': Format with 'chm' prefix, e.g., 'chm1', 'chm2', ..., 'chmX', 'chmY', 'chmM'.
  • 'chrom': Format with 'chrom' prefix, e.g., 'chrom1', 'chrom2', ..., 'chromX', 'chromY', 'chromM'.
  • 'plain': Plain format without a prefix, e.g., '1', '2', ..., 'X', 'Y', 'M'.
Arguments:
  • snpobj (SNPObject): The reference SNPObject whose chromosome format will be matched.
  • inplace (bool, default=False): If True, modifies self in place. If False, returns a new SNPObject with the chromosome format matching that of snpobj. Default is False.
Returns:

Optional[SNPObject]: A new SNPObject with matched chromosome format if inplace=False. If inplace=True, modifies self in place and returns None.

def rename_chrom( self, to_replace: Dict[str, str] | str | List[str] = {'^([0-9]+)$': 'chr\\1', '^chr([0-9]+)$': '\\1'}, value: str | List[str] | None = None, regex: bool = True, inplace: bool = False) -> SNPObject | None:
 988    def rename_chrom(
 989        self,
 990        to_replace: Union[Dict[str, str], str, List[str]] = {'^([0-9]+)$': r'chr\1', r'^chr([0-9]+)$': r'\1'},
 991        value: Optional[Union[str, List[str]]] = None,
 992        regex: bool = True,
 993        inplace: bool = False
 994    ) -> Optional['SNPObject']:
 995        """
 996        Replace chromosome values in `variants_chrom` using patterns or exact matches.
 997
 998        This method allows flexible chromosome replacements, using regex or exact matches, useful 
 999        for non-standard chromosome formats. For standard conversions (e.g., 'chr1' to '1'), 
1000        consider `convert_chromosome_format`.
1001
1002        Args:
1003            to_replace (dict, str, or list of str): 
1004                Pattern(s) or exact value(s) to be replaced in chromosome names. Default behavior 
1005                transforms `<chrom_num>` to `chr<chrom_num>` or vice versa. Non-matching values 
1006                remain unchanged.
1007                - If str or list of str: Matches will be replaced with `value`.
1008                - If regex (bool), then any regex matches will be replaced with `value`.
1009                - If dict: Keys defines values to replace, with corresponding replacements as values.
1010            value (str or list of str, optional): 
1011                Replacement value(s) if `to_replace` is a string or list. Ignored if `to_replace` 
1012                is a dictionary.
1013            regex (bool, default=True): 
1014                If True, interprets `to_replace` keys as regex patterns.
1015            inplace (bool, default=False): 
1016                If True, modifies `self` in place. If False, returns a new `SNPObject` with the chromosomes
1017                renamed. Default is False.
1018
1019        Returns:
1020            **Optional[SNPObject]:** A new `SNPObject` with the renamed chromosome format if `inplace=False`. 
1021            If `inplace=True`, modifies `self` in place and returns None.
1022        """
1023        # Standardize input format: convert `to_replace` and `value` to a dictionary if needed
1024        if isinstance(to_replace, (str, int)):
1025            to_replace = [to_replace]
1026        if isinstance(value, (str, int)):
1027            value = [value]
1028        if isinstance(to_replace, list) and isinstance(value, list):
1029            dictionary = dict(zip(to_replace, value))
1030        elif isinstance(to_replace, dict) and value is None:
1031            dictionary = to_replace
1032        else:
1033            raise ValueError(
1034            "Invalid input: `to_replace` and `value` must be compatible types (both str, list of str, or dict)."
1035        )
1036
1037        # Vectorized function for replacing values in chromosome array
1038        vec_replace_values = np.vectorize(self._match_to_replace)
1039
1040        # Rename chromosomes based on inplace flag
1041        if inplace:
1042            self.variants_chrom = vec_replace_values(self.variants_chrom, dictionary, regex)
1043            return None
1044        else:
1045            snpobj = self.copy()
1046            snpobj.variants_chrom = vec_replace_values(self.variants_chrom, dictionary, regex)
1047            return snpobj

Replace chromosome values in variants_chrom using patterns or exact matches.

This method allows flexible chromosome replacements, using regex or exact matches, useful for non-standard chromosome formats. For standard conversions (e.g., 'chr1' to '1'), consider convert_chromosome_format.

Arguments:
  • to_replace (dict, str, or list of str): Pattern(s) or exact value(s) to be replaced in chromosome names. Default behavior transforms <chrom_num> to chr<chrom_num> or vice versa. Non-matching values remain unchanged.
    • If str or list of str: Matches will be replaced with value.
    • If regex is True, any regex matches will be replaced with value.
    • If dict: Keys define the values to replace, with the corresponding replacements as the dict values.
  • value (str or list of str, optional): Replacement value(s) if to_replace is a string or list. Must be left as None when to_replace is a dictionary; otherwise a ValueError is raised.
  • regex (bool, default=True): If True, interprets to_replace keys as regex patterns.
  • inplace (bool, default=False): If True, modifies self in place. If False, returns a new SNPObject with the chromosomes renamed. Default is False.
Returns:

Optional[SNPObject]: A new SNPObject with the renamed chromosome format if inplace=False. If inplace=True, modifies self in place and returns None.

def rename_missings( self, before: int | float | str = -1, after: int | float | str = '.', inplace: bool = False) -> SNPObject | None:
def rename_missings(
    self,
    before: Union[int, float, str] = -1,
    after: Union[int, float, str] = '.',
    inplace: bool = False
) -> Optional['SNPObject']:
    """
    Replace missing values in the `calldata_gt` attribute.

    Every entry of `calldata_gt` that matches `before` is replaced with `after`.
    By default, occurrences of `-1` (often used to signify missing data) become `'.'`.

    NaN is handled explicitly: because NaN never compares equal to anything
    (not even itself), a plain `== before` test cannot find it. When `before`
    is NaN, entries are matched with a self-inequality test instead.

    Args:
        before (int, float, or str, default=-1):
            The current representation of missing values in `calldata_gt`.
            Common values might be -1, '.', or NaN. Default is -1.
        after (int, float, or str, default='.'):
            The value that will replace `before`. Default is '.'.
        inplace (bool, default=False):
            If True, modifies `self` in place. If False, returns a new `SNPObject` with the applied
            replacements. Default is False.

    Returns:
        **Optional[SNPObject]:**
            A new `SNPObject` with the renamed missing values if `inplace=False`.
            If `inplace=True`, modifies `self` in place and returns None.
    """
    # Operate on `self` directly or on a copy, depending on the `inplace` flag
    target = self if inplace else self.copy()
    genotypes = target['calldata_gt']

    if isinstance(before, (float, np.floating)) and np.isnan(before):
        # NaN is the only value that differs from itself, so this selects exactly the NaN entries
        missing_mask = genotypes != genotypes
    else:
        missing_mask = genotypes == before

    target['calldata_gt'] = np.where(missing_mask, after, genotypes)
    return None if inplace else target

Replace missing values in the calldata_gt attribute.

This method identifies missing values in 'calldata_gt' and replaces them with a specified value. By default, it replaces occurrences of -1 (often used to signify missing data) with '.'.

Arguments:
  • before (int, float, or str, default=-1): The current representation of missing values in calldata_gt. Common values might be -1, '.', or NaN. Default is -1.
  • after (int, float, or str, default='.'): The value that will replace before. Default is '.'.
  • inplace (bool, default=False): If True, modifies self in place. If False, returns a new SNPObject with the applied replacements. Default is False.
Returns:

Optional[SNPObject]: A new SNPObject with the renamed missing values if inplace=False. If inplace=True, modifies self in place and returns None.

def get_common_variants_intersection( self, snpobj: SNPObject, index_by: str = 'pos') -> Tuple[List[str], numpy.ndarray, numpy.ndarray]:
def get_common_variants_intersection(
    self,
    snpobj: 'SNPObject',
    index_by: str = 'pos'
) -> Tuple[List[str], np.ndarray, np.ndarray]:
    """
    Identify common variants between `self` and the `snpobj` instance based on the specified `index_by` criterion,
    which may match based on chromosome and position (`variants_chrom`, `variants_pos`), ID (`variants_id`), or both.

    This method returns the identifiers of common variants and their corresponding indices in both objects.

    Args:
        snpobj (SNPObject):
            The reference SNPObject to compare against.
        index_by (str, default='pos'):
            Criteria for matching variants. Options:
            - `'pos'`: Matches by chromosome and position (`variants_chrom`, `variants_pos`), e.g., 'chr1-12345'.
            - `'id'`: Matches by variant ID alone (`variants_id`), e.g., 'rs123'.
            - `'pos+id'`: Matches by chromosome, position, and ID (`variants_chrom`, `variants_pos`, `variants_id`), e.g., 'chr1-12345-rs123'.
            Default is 'pos'.

    Returns:
        Tuple containing:
        - **list of str:** The common variant identifiers, each listed once, in order of first appearance in `self`.
        - **array:** An integer array of indices in `self` where common variants are located.
        - **array:** An integer array of indices in `snpobj` where common variants are located.

    Raises:
        ValueError: If `index_by` is not one of 'pos', 'id', or 'pos+id'.
    """
    if index_by not in ('pos', 'id', 'pos+id'):
        raise ValueError("`index_by` must be one of 'pos', 'id', or 'pos+id'.")

    def build_identifiers(obj) -> list:
        # One identifier per variant of `obj`, following the selected criterion
        if index_by == 'pos':
            return [f"{chrom}-{pos}" for chrom, pos in zip(obj['variants_chrom'], obj['variants_pos'])]
        if index_by == 'id':
            return obj['variants_id'].tolist()
        return [
            f"{chrom}-{pos}-{var_id}"
            for chrom, pos, var_id in zip(obj['variants_chrom'], obj['variants_pos'], obj['variants_id'])
        ]

    query_identifiers = build_identifiers(self)
    reference_identifiers = build_identifiers(snpobj)

    # Set of identifiers present in both objects (used for O(1) membership tests below)
    shared = set(query_identifiers).intersection(reference_identifiers)

    # Collect indices for common identifiers
    query_idx = [i for i, identifier in enumerate(query_identifiers) if identifier in shared]
    reference_idx = [i for i, identifier in enumerate(reference_identifiers) if identifier in shared]

    # Iterating a set of strings has no stable order between interpreter runs, so the
    # identifiers are reported in order of first appearance in `self` instead.
    common_ids = list(dict.fromkeys(query_identifiers[i] for i in query_idx))

    # An explicit integer dtype keeps the result usable as an index even when it is
    # empty (`np.array([])` would otherwise default to float64).
    return common_ids, np.array(query_idx, dtype=np.intp), np.array(reference_idx, dtype=np.intp)

Identify common variants between self and the snpobj instance based on the specified index_by criterion, which may match based on chromosome and position (variants_chrom, variants_pos), ID (variants_id), or both.

This method returns the identifiers of common variants and their corresponding indices in both objects.

Arguments:
  • snpobj (SNPObject): The reference SNPObject to compare against.
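  • index_by (str, default='pos'): Criteria for matching variants. Options:
    • 'pos': Matches by chromosome and position (variants_chrom, variants_pos), e.g., 'chr1-12345'.
    • 'id': Matches by variant ID alone (variants_id), e.g., 'rs123'.
    • 'pos+id': Matches by chromosome, position, and ID (variants_chrom, variants_pos, variants_id), e.g., 'chr1-12345-rs123'.
    Default is 'pos'.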
Returns:

Tuple containing:

  • list of str: A list of common variant identifiers (as strings).
  • array: An array of indices in self where common variants are located.
  • array: An array of indices in snpobj where common variants are located.
def get_common_markers_intersection( self, snpobj: SNPObject) -> Tuple[List[str], numpy.ndarray, numpy.ndarray]:
def get_common_markers_intersection(
    self,
    snpobj: 'SNPObject'
) -> Tuple[List[str], np.ndarray, np.ndarray]:
    """
    Identify common markers between `self` and the `snpobj` instance. Common markers are identified
    based on matching chromosome (`variants_chrom`), position (`variants_pos`), reference (`variants_ref`),
    and alternate (`variants_alt`) alleles.

    This method returns the identifiers of common markers and their corresponding indices in both objects.

    Args:
        snpobj (SNPObject):
            The reference SNPObject to compare against.

    Returns:
        Tuple containing:
        - **list of str:** The common marker identifiers (formatted as 'chrom-pos-ref-alt'), each listed once,
          in order of first appearance in `self`.
        - **array:** An integer array of indices in `self` where common markers are located.
        - **array:** An integer array of indices in `snpobj` where common markers are located.
    """
    def build_identifiers(obj) -> list:
        # One 'chrom-pos-ref-alt' identifier per variant of `obj`
        return [
            f"{chrom}-{pos}-{ref}-{alt}" for chrom, pos, ref, alt in
            zip(obj['variants_chrom'], obj['variants_pos'], obj['variants_ref'], obj['variants_alt'])
        ]

    query_identifiers = build_identifiers(self)
    reference_identifiers = build_identifiers(snpobj)

    # Set of identifiers present in both objects (used for O(1) membership tests below)
    shared = set(query_identifiers).intersection(reference_identifiers)

    # Collect indices for common identifiers in both SNPObjects
    query_idx = [i for i, identifier in enumerate(query_identifiers) if identifier in shared]
    reference_idx = [i for i, identifier in enumerate(reference_identifiers) if identifier in shared]

    # Iterating a set of strings has no stable order between interpreter runs, so the
    # identifiers are reported in order of first appearance in `self` instead.
    common_ids = list(dict.fromkeys(query_identifiers[i] for i in query_idx))

    # An explicit integer dtype keeps the result usable as an index even when it is
    # empty (`np.array([])` would otherwise default to float64).
    return common_ids, np.array(query_idx, dtype=np.intp), np.array(reference_idx, dtype=np.intp)

Identify common markers between self and the snpobj instance. Common markers are identified based on matching chromosome (variants_chrom), position (variants_pos), reference (variants_ref), and alternate (variants_alt) alleles.

This method returns the identifiers of common markers and their corresponding indices in both objects.

Arguments:
  • snpobj (SNPObject): The reference SNPObject to compare against.
Returns:

Tuple containing:

  • list of str: A list of common variant identifiers (as strings).
  • array: An array of indices in self where common variants are located.
  • array: An array of indices in snpobj where common variants are located.
def subset_to_common_variants( self, snpobj: SNPObject, index_by: str = 'pos', common_variants_intersection: Tuple[numpy.ndarray, numpy.ndarray] | None = None, inplace: bool = False) -> SNPObject | None:
def subset_to_common_variants(
    self,
    snpobj: 'SNPObject',
    index_by: str = 'pos',
    common_variants_intersection: Optional[Tuple[np.ndarray, np.ndarray]] = None,
    inplace: bool = False
) -> Optional['SNPObject']:
    """
    Keep only the variants of `self` that are shared with a reference `snpobj`.

    Variants are matched according to `index_by`: by chromosome and position
    (`variants_chrom`, `variants_pos`), by ID (`variants_id`), or by both.

    Args:
        snpobj (SNPObject):
            The reference SNPObject to compare against.
        index_by (str, default='pos'):
            Criteria for matching variants. Options:
            - `'pos'`: Matches by chromosome and position (`variants_chrom`, `variants_pos`), e.g., 'chr1-12345'.
            - `'id'`: Matches by variant ID alone (`variants_id`), e.g., 'rs123'.
            - `'pos+id'`: Matches by chromosome, position, and ID (`variants_chrom`, `variants_pos`, `variants_id`), e.g., 'chr1-12345-rs123'.
            Default is 'pos'. Only used when `common_variants_intersection` is None.
        common_variants_intersection (Tuple[np.ndarray, np.ndarray], optional):
            Precomputed indices of common variants between `self` and `snpobj`. Only the first
            element (used as the indices into `self`) is read here. If None, the intersection is
            computed with `get_common_variants_intersection`.
        inplace (bool, default=False):
            If True, modifies `self` in place. If False, returns a new `SNPObject` with the common variants
            subsetted. Default is False.

    Returns:
        **Optional[SNPObject]:**
            The result of `filter_variants`: a new `SNPObject` with the common variants subsetted if
            `inplace=False`. If `inplace=True`, modifies `self` in place and returns None.
    """
    if common_variants_intersection is not None:
        # Caller already did the matching; take the indices that refer to `self`
        keep_idx, _ = common_variants_intersection
    else:
        _, keep_idx, _ = self.get_common_variants_intersection(snpobj, index_by=index_by)

    # Delegate the actual subsetting (and the `inplace` handling) to filter_variants
    return self.filter_variants(indexes=keep_idx, include=True, inplace=inplace)

Subset self to include only the common variants with a reference snpobj based on the specified index_by criterion, which may match based on chromosome and position (variants_chrom, variants_pos), ID (variants_id), or both.

Arguments:
  • snpobj (SNPObject): The reference SNPObject to compare against.
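  • index_by (str, default='pos'): Criteria for matching variants. Options:
    • 'pos': Matches by chromosome and position (variants_chrom, variants_pos), e.g., 'chr1-12345'.
    • 'id': Matches by variant ID alone (variants_id), e.g., 'rs123'.
    • 'pos+id': Matches by chromosome, position, and ID (variants_chrom, variants_pos, variants_id), e.g., 'chr1-12345-rs123'.
    Default is 'pos'.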
  • common_variants_intersection (Tuple[np.ndarray, np.ndarray], optional): Precomputed indices of common variants between self and snpobj. If None, intersection is computed within the function.
  • inplace (bool, default=False): If True, modifies self in place. If False, returns a new SNPObject with the common variants subsetted. Default is False.
Returns:

Optional[SNPObject]: A new SNPObject with the common variants subsetted if inplace=False. If inplace=True, modifies self in place and returns None.

def subset_to_common_markers( self, snpobj: SNPObject, common_markers_intersection: Tuple[numpy.ndarray, numpy.ndarray] | None = None, inplace: bool = False) -> SNPObject | None:
def subset_to_common_markers(
    self,
    snpobj: 'SNPObject',
    common_markers_intersection: Optional[Tuple[np.ndarray, np.ndarray]] = None,
    inplace: bool = False
) -> Optional['SNPObject']:
    """
    Keep only the markers of `self` that are shared with a reference `snpobj`.

    Markers are considered common when chromosome (`variants_chrom`), position (`variants_pos`),
    reference (`variants_ref`), and alternate (`variants_alt`) alleles all match.

    Args:
        snpobj (SNPObject):
            The reference SNPObject to compare against.
        common_markers_intersection (tuple of arrays, optional):
            Precomputed indices of common markers between `self` and `snpobj`. Only the first
            element (used as the indices into `self`) is read here. If None, the intersection is
            computed with `get_common_markers_intersection`.
        inplace (bool, default=False):
            If True, modifies `self` in place. If False, returns a new `SNPObject` with the common markers
            subsetted. Default is False.

    Returns:
        **Optional[SNPObject]:**
            The result of `filter_variants`: a new `SNPObject` with the common markers subsetted if
            `inplace=False`. If `inplace=True`, modifies `self` in place and returns None.
    """
    if common_markers_intersection is not None:
        # Caller already did the matching; take the indices that refer to `self`
        keep_idx, _ = common_markers_intersection
    else:
        _, keep_idx, _ = self.get_common_markers_intersection(snpobj)

    # Delegate the actual subsetting (and the `inplace` handling) to filter_variants
    return self.filter_variants(indexes=keep_idx, include=True, inplace=inplace)

Subset self to include only the common markers with a reference snpobj. Common markers are identified based on matching chromosome (variants_chrom), position (variants_pos), reference (variants_ref), and alternate (variants_alt) alleles.

Arguments:
  • snpobj (SNPObject): The reference SNPObject to compare against.
  • common_markers_intersection (tuple of arrays, optional): Precomputed indices of common markers between self and snpobj. If None, intersection is computed within the function.
  • inplace (bool, default=False): If True, modifies self in place. If False, returns a new SNPObject with the common markers subsetted. Default is False.
Returns:

Optional[SNPObject]: A new SNPObject with the common markers subsetted if inplace=False. If inplace=True, modifies self in place and returns None.

def merge( self, snpobj: SNPObject, force_samples: bool = False, prefix: str = '2', inplace: bool = False) -> SNPObject | None:
def merge(
        self,
        snpobj: 'SNPObject',
        force_samples: bool = False,
        prefix: str = '2',
        inplace: bool = False
    ) -> Optional['SNPObject']:
    """
    Append the samples of `snpobj` to those of `self` (merge along the sample axis).

    Both SNPObjects are expected to contain the same set of SNPs in the same order; only the
    number of SNPs is checked here. Genotypes (`calldata_gt`) and LAI (`calldata_lai`) are
    concatenated along axis 1, and sample names are concatenated. Any of these three that is
    missing (None) in either object is set to None in the result. Variant-level attributes
    and `ancestry_map` are taken from `self`.

    Args:
        snpobj (SNPObject):
            The SNPObject to merge samples with.
        force_samples (bool, default=False):
            If True, duplicate sample names are resolved by prepending the `prefix` to duplicate sample names in
            `snpobj`. Otherwise, merging fails when duplicate sample names are found. Default is False.
        prefix (str, default='2'):
            A string prepended to duplicate sample names in `snpobj` when `force_samples=True`.
            Duplicates are renamed from `<sample_name>` to `<prefix>:<sample_name>`. For instance,
            if `prefix='2'` and there is a conflict with a sample called "sample_1", it becomes "2:sample_1".
        inplace (bool, default=False):
            If True, modifies `self` in place. If False, returns a new `SNPObject` with the merged samples.
            Default is False.

    Returns:
        **Optional[SNPObject]**: A new SNPObject containing the merged sample data if `inplace=False`.
        If `inplace=True`, `self` is updated in place and `self` is returned.

    Raises:
        ValueError: If the number of SNPs differs in `calldata_gt` or `calldata_lai`, if only one of the
            objects has summed strands, if `calldata_lai` dimensionality differs, or if sample names
            overlap while `force_samples=False`.
    """
    # Genotypes: only merged when both objects carry them
    calldata_gt = None
    if self.calldata_gt is not None and snpobj.calldata_gt is not None:
        if self.calldata_gt.shape[0] != snpobj.calldata_gt.shape[0]:
            raise ValueError(
                f"Cannot merge SNPObjects: Mismatch in the number of SNPs in `calldata_gt`.\n"
                f"`self.calldata_gt` has {self.calldata_gt.shape[0]} SNPs, "
                f"while `snpobj.calldata_gt` has {snpobj.calldata_gt.shape[0]} SNPs."
            )
        self_summed = self.are_strands_summed
        other_summed = snpobj.are_strands_summed
        if self_summed and not other_summed:
            raise ValueError(
                "Cannot merge SNPObjects: `self` has summed strands, but `snpobj` does not.\n"
                "Ensure both objects have the same genotype summation state before merging."
            )
        if other_summed and not self_summed:
            raise ValueError(
                "Cannot merge SNPObjects: `snpobj` has summed strands, but `self` does not.\n"
                "Ensure both objects have the same genotype summation state before merging."
            )
        calldata_gt = np.concatenate([self.calldata_gt, snpobj.calldata_gt], axis=1)

    # Sample names: only merged when both objects carry them
    samples = None
    if self.samples is not None and snpobj.samples is not None:
        incoming_names = snpobj.samples
        overlapping_samples = set(self.samples) & set(snpobj.samples)
        if overlapping_samples:
            if not force_samples:
                raise ValueError(
                    f"Cannot merge SNPObjects: Found overlapping sample names {overlapping_samples}.\n"
                    "Samples must be strictly non-overlapping. To allow merging with renaming, set `force_samples=True`."
                )
            # Disambiguate clashing names coming from `snpobj` by tagging them with the prefix
            incoming_names = [
                f"{prefix}:{sample}" if sample in overlapping_samples else sample
                for sample in snpobj.samples
            ]
        samples = np.concatenate([self.samples, incoming_names], axis=0)

    # LAI: only merged when both objects carry it
    calldata_lai = None
    if self.calldata_lai is not None and snpobj.calldata_lai is not None:
        if self.calldata_lai.ndim != snpobj.calldata_lai.ndim:
            raise ValueError(
                f"Cannot merge SNPObjects: Mismatch in `calldata_lai` dimensions.\n"
                f"`self.calldata_lai` has {self.calldata_lai.ndim} dimensions, "
                f"while `snpobj.calldata_lai` has {snpobj.calldata_lai.ndim} dimensions."
            )
        if self.calldata_lai.shape[0] != snpobj.calldata_lai.shape[0]:
            raise ValueError(
                f"Cannot merge SNPObjects: Mismatch in the number of SNPs in `calldata_lai`.\n"
                f"`self.calldata_lai` has {self.calldata_lai.shape[0]} SNPs, "
                f"while `snpobj.calldata_lai` has {snpobj.calldata_lai.shape[0]} SNPs."
            )
        calldata_lai = np.concatenate([self.calldata_lai, snpobj.calldata_lai], axis=1)

    if not inplace:
        # Build a fresh object; variant-level metadata is carried over from `self`
        return SNPObject(
            calldata_gt=calldata_gt,
            samples=samples,
            variants_ref=self.variants_ref,
            variants_alt=self.variants_alt,
            variants_chrom=self.variants_chrom,
            variants_filter_pass=self.variants_filter_pass,
            variants_id=self.variants_id,
            variants_pos=self.variants_pos,
            variants_qual=self.variants_qual,
            calldata_lai=calldata_lai,
            ancestry_map=self.ancestry_map
        )

    # In-place: overwrite the sample-dependent attributes and hand back `self`
    self.calldata_gt = calldata_gt
    self.calldata_lai = calldata_lai
    self.samples = samples
    return self

Merge self with snpobj along the sample axis.

This method expects both SNPObjects to contain the same set of SNPs in the same order, then combines their genotype (calldata_gt) and LAI (calldata_lai) arrays by concatenating the sample dimension. Samples from snpobj are appended to those in self.

Arguments:
  • snpobj (SNPObject): The SNPObject to merge samples with.
  • force_samples (bool, default=False): If True, duplicate sample names are resolved by prepending the prefix to duplicate sample names in snpobj. Otherwise, merging fails when duplicate sample names are found. Default is False.
  • prefix (str, default='2'): A string prepended to duplicate sample names in snpobj when force_samples=True. Duplicates are renamed from <sample_name> to <prefix>:<sample_name>. For instance, if prefix='2' and there is a conflict with a sample called "sample_1", it becomes "2:sample_1".
  • inplace (bool, default=False): If True, modifies self in place. If False, returns a new SNPObject with the merged samples. Default is False.
Returns:

Optional[SNPObject]: A new SNPObject containing the merged sample data if inplace=False. If inplace=True, self is modified in place and returned.

def concat( self, snpobj: SNPObject, inplace: bool = False) -> SNPObject | None:
def concat(
    self,
    snpobj: 'SNPObject',
    inplace: bool = False
) -> Optional['SNPObject']:
    """
    Append the SNPs of `snpobj` to those of `self` (concatenate along the SNP axis).

    Both SNPObjects are expected to contain the same set of samples in the same order, and the
    chromosome(s) in `snpobj` are expected to follow (i.e. have higher numeric identifiers than)
    those in `self`. Only the number of samples is checked here. Genotypes, LAI and every
    variant-level attribute are concatenated along axis 0; any of them that is missing (None)
    in either object is set to None in the result. `samples` and `ancestry_map` are taken from `self`.

    Args:
        snpobj (SNPObject):
            The SNPObject to concatenate SNPs with.
        inplace (bool, default=False):
            If True, modifies `self` in place. If False, returns a new `SNPObject` with the concatenated SNPs.
            Default is False.

    Returns:
        **Optional[SNPObject]**: A new SNPObject containing the concatenated SNP data if `inplace=False`.
        If `inplace=True`, `self` is updated in place and `self` is returned.

    Raises:
        ValueError: If the number of samples differs in `calldata_gt` or `calldata_lai`, if only one of
            the objects has summed strands, or if `calldata_lai` dimensionality differs.
    """
    # Genotypes: only concatenated when both objects carry them
    calldata_gt = None
    if self.calldata_gt is not None and snpobj.calldata_gt is not None:
        if self.calldata_gt.shape[1] != snpobj.calldata_gt.shape[1]:
            raise ValueError(
                f"Cannot merge SNPObjects: Mismatch in the number of samples in `calldata_gt`.\n"
                f"`self.calldata_gt` has {self.calldata_gt.shape[1]} samples, "
                f"while `snpobj.calldata_gt` has {snpobj.calldata_gt.shape[1]} samples."
            )
        self_summed = self.are_strands_summed
        other_summed = snpobj.are_strands_summed
        if self_summed and not other_summed:
            raise ValueError(
                "Cannot merge SNPObjects: `self` has summed strands, but `snpobj` does not.\n"
                "Ensure both objects have the same genotype summation state before merging."
            )
        if other_summed and not self_summed:
            raise ValueError(
                "Cannot merge SNPObjects: `snpobj` has summed strands, but `self` does not.\n"
                "Ensure both objects have the same genotype summation state before merging."
            )
        calldata_gt = np.concatenate([self.calldata_gt, snpobj.calldata_gt], axis=0)

    def stack_variant_attr(name):
        # Concatenate one per-variant attribute, or give None when either side lacks it
        mine = getattr(self, name, None)
        theirs = getattr(snpobj, name, None)
        if mine is None or theirs is None:
            return None
        return np.concatenate([mine, theirs], axis=0)

    attributes = [
        'variants_ref', 'variants_alt', 'variants_chrom', 'variants_filter_pass', 'variants_id', 'variants_pos', 'variants_qual'
    ]
    merged_attrs = {attr: stack_variant_attr(attr) for attr in attributes}

    # LAI: only concatenated when both objects carry it
    calldata_lai = None
    if self.calldata_lai is not None and snpobj.calldata_lai is not None:
        if self.calldata_lai.ndim != snpobj.calldata_lai.ndim:
            raise ValueError(
                f"Cannot merge SNPObjects: Mismatch in `calldata_lai` dimensions.\n"
                f"`self.calldata_lai` has {self.calldata_lai.ndim} dimensions, "
                f"while `snpobj.calldata_lai` has {snpobj.calldata_lai.ndim} dimensions."
            )
        if self.calldata_lai.shape[1] != snpobj.calldata_lai.shape[1]:
            raise ValueError(
                f"Cannot merge SNPObjects: Mismatch in the number of samples in `calldata_lai`.\n"
                f"`self.calldata_lai` has {self.calldata_lai.shape[1]} samples, "
                f"while `snpobj.calldata_lai` has {snpobj.calldata_lai.shape[1]} samples."
            )
        calldata_lai = np.concatenate([self.calldata_lai, snpobj.calldata_lai], axis=0)

    if not inplace:
        # Build a fresh object; samples and ancestry map are carried over from `self`
        return SNPObject(
            calldata_gt=calldata_gt,
            calldata_lai=calldata_lai,
            samples=self.samples,
            variants_ref=merged_attrs['variants_ref'],
            variants_alt=merged_attrs['variants_alt'],
            variants_chrom=merged_attrs['variants_chrom'],
            variants_id=merged_attrs['variants_id'],
            variants_pos=merged_attrs['variants_pos'],
            variants_qual=merged_attrs['variants_qual'],
            variants_filter_pass=merged_attrs['variants_filter_pass'],
            ancestry_map=self.ancestry_map
        )

    # In-place: overwrite the SNP-dependent attributes and hand back `self`
    self.calldata_gt = calldata_gt
    self.calldata_lai = calldata_lai
    for attr, merged in merged_attrs.items():
        self[attr] = merged
    return self

Concatenate self with snpobj along the SNP axis.

This method expects both SNPObjects to contain the same set of samples in the same order, and that the chromosome(s) in snpobj follow (i.e. have higher numeric identifiers than) those in self.

Arguments:
  • snpobj (SNPObject): The SNPObject to concatenate SNPs with.
  • inplace (bool, default=False): If True, modifies self in place. If False, returns a new SNPObject with the concatenated SNPs. Default is False.
Returns:

Optional[SNPObject]: A new SNPObject containing the concatenated SNP data if inplace=False. If inplace=True, self is modified in place and returned.

def remove_strand_ambiguous_variants( self, inplace: bool = False) -> SNPObject | None:
def remove_strand_ambiguous_variants(self, inplace: bool = False) -> Optional['SNPObject']:
    """
    Filter out strand-ambiguous variants.

    A strand-ambiguous variant has reference (`variants_ref`) and alternate (`variants_alt`) alleles
    in the pairs A/T, T/A, C/G, or G/C, where both alleles are complementary and thus indistinguishable
    in terms of strand orientation. The number of variants found for each pair is logged.

    Args:
        inplace (bool, default=False):
            If True, modifies `self` in place. If False, returns a new `SNPObject` with the
            strand-ambiguous variants removed. Default is False.

    Returns:
        **Optional[SNPObject]:** A new `SNPObject` with non-ambiguous variants only if `inplace=False`.
        If `inplace=True`, modifies `self` in place and returns None.
    """
    def pair_mask(ref_base, alt_base):
        # Boolean mask of variants whose (ref, alt) alleles equal the given pair
        return (self['variants_ref'] == ref_base) & (self['variants_alt'] == alt_base)

    # One mask per complementary allele pair
    is_AT = pair_mask('A', 'T')
    is_TA = pair_mask('T', 'A')
    is_CG = pair_mask('C', 'G')
    is_GC = pair_mask('G', 'C')

    # Per-pair tallies (summing a boolean mask counts its True entries)
    A_T_count = np.sum(is_AT)
    T_A_count = np.sum(is_TA)
    C_G_count = np.sum(is_CG)
    G_C_count = np.sum(is_GC)

    log.info(f'{A_T_count} ambiguities of A-T type.')
    log.info(f'{T_A_count} ambiguities of T-A type.')
    log.info(f'{C_G_count} ambiguities of C-G type.')
    log.info(f'{G_C_count} ambiguities of G-C type.')

    total_ambiguous = A_T_count + T_A_count + C_G_count + G_C_count
    log.debug(f'Removing {total_ambiguous} strand-ambiguous variants...')

    # Positions of every variant that matched none of the four pairs
    ambiguous_mask = is_AT | is_TA | is_CG | is_GC
    keep_idx = np.nonzero(~ambiguous_mask)[0]

    return self.filter_variants(indexes=keep_idx, include=True, inplace=inplace)

Remove strand-ambiguous variants. A strand-ambiguous variant has reference (variants_ref) and alternate (variants_alt) alleles in the pairs A/T, T/A, C/G, or G/C, where both alleles are complementary and thus indistinguishable in terms of strand orientation.

Arguments:
  • inplace (bool, default=False): If True, modifies self in place. If False, returns a new SNPObject with the strand-ambiguous variants removed. Default is False.
Returns:

Optional[SNPObject]: A new SNPObject with non-ambiguous variants only if inplace=False. If inplace=True, modifies self in place and returns None.

def correct_flipped_variants( self, snpobj: SNPObject, check_complement: bool = True, index_by: str = 'pos', common_variants_intersection: Tuple[numpy.ndarray, numpy.ndarray] | None = None, log_stats: bool = True, inplace: bool = False) -> SNPObject | None:
1507    def correct_flipped_variants(
1508        self, 
1509        snpobj: 'SNPObject', 
1510        check_complement: bool = True, 
1511        index_by: str = 'pos', 
1512        common_variants_intersection: Optional[Tuple[np.ndarray, np.ndarray]] = None,
1513        log_stats: bool = True,
1514        inplace: bool = False
1515    ) -> Optional['SNPObject']:
1516        """
1517        Correct flipped variants between between `self` and a reference `snpobj`, where reference (`variants_ref`) 
1518        and alternate (`variants_alt`) alleles are swapped.
1519
1520        **Flip Detection Based on `check_complement`:**
1521
1522        - If `check_complement=False`, only direct allele swaps are considered:
1523            1. **Direct Swap:** `self.variants_ref == snpobj.variants_alt` and `self.variants_alt == snpobj.variants_ref`.
1524
1525        - If `check_complement=True`, both direct and complementary swaps are considered, with four possible cases:
1526            1. **Direct Swap:** `self.variants_ref == snpobj.variants_alt` and `self.variants_alt == snpobj.variants_ref`.
1527            2. **Complement Swap of Ref:** `complement(self.variants_ref) == snpobj.variants_alt` and `self.variants_alt == snpobj.variants_ref`.
1528            3. **Complement Swap of Alt:** `self.variants_ref == snpobj.variants_alt` and `complement(self.variants_alt) == snpobj.variants_ref`.
1529            4. **Complement Swap of both Ref and Alt:** `complement(self.variants_ref) == snpobj.variants_alt` and `complement(self.variants_alt) == snpobj.variants_ref`.
1530
1531        **Note:** Variants where `self.variants_ref == self.variants_alt` are ignored as they are ambiguous.
1532
1533        **Correction Process:** 
1534        - Swaps `variants_ref` and `variants_alt` alleles in `self` to align with `snpobj`.
1535        - Flips `calldata_gt` values (0 becomes 1, and 1 becomes 0) to match the updated allele configuration.
1536
1537        Args:
1538            snpobj (SNPObject): 
1539                The reference SNPObject to compare against.
1540            check_complement (bool, default=True): 
1541                If True, also checks for complementary base pairs (A/T, T/A, C/G, and G/C) when identifying swapped variants.
1542                Default is True.
1543            index_by (str, default='pos'): 
1544                Criteria for matching variants. Options:
1545                - `'pos'`: Matches by chromosome and position (`variants_chrom`, `variants_pos`), e.g., 'chr1-12345'.
1546                - `'id'`: Matches by variant ID alone (`variants_id`), e.g., 'rs123'.
1547                - `'pos+id'`: Matches by chromosome, position, and ID (`variants_chrom`, `variants_pos`, `variants_id`), e.g., 'chr1-12345-rs123'.
1548                Default is 'pos'.
1549            common_variants_intersection (tuple of arrays, optional): 
1550                Precomputed indices of common variants between `self` and `snpobj`. If None, intersection is 
1551                computed within the function.
1552            log_stats (bool, default=True): 
1553                If True, logs statistical information about matching and ambiguous alleles. Default is True.
1554            inplace (bool, default=False): 
1555                If True, modifies `self` in place. If False, returns a new `SNPObject` with corrected 
1556                flips. Default is False.
1557
1558        Returns:
1559            **Optional[SNPObject]**: 
1560                A new `SNPObject` with corrected flips if `inplace=False`. 
1561                If `inplace=True`, modifies `self` in place and returns None.
1562        """
1563        # Define complement mappings for nucleotides
1564        complement_map = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
1565
1566        # Helper function to get the complement of a base
1567        def get_complement(base: str) -> str:
1568            return complement_map.get(base, base)
1569
1570        # Get common variant indices if not provided
1571        if common_variants_intersection != None:
1572            query_idx, reference_idx = common_variants_intersection
1573        else:
1574            _, query_idx, reference_idx = self.get_common_variants_intersection(snpobj, index_by=index_by)
1575
1576        # Log statistics on matching alleles if enabled
1577        if log_stats:
1578            matching_ref = np.sum(self['variants_ref'][query_idx] == snpobj['variants_ref'][reference_idx])
1579            matching_alt = np.sum(self['variants_alt'][query_idx] == snpobj['variants_alt'][reference_idx])
1580            ambiguous = np.sum(self['variants_ref'][query_idx] == self['variants_alt'][query_idx])
1581            log.info(f"Matching reference alleles (ref=ref'): {matching_ref}, Matching alternate alleles (alt=alt'): {matching_alt}.")
1582            log.info(f"Number of ambiguous alleles (ref=alt): {ambiguous}.")
1583
1584        # Identify indices where `ref` and `alt` alleles are swapped
1585        if not check_complement:
1586            # Simple exact match for swapped alleles
1587            swapped_ref = (self['variants_ref'][query_idx] == snpobj['variants_alt'][reference_idx])
1588            swapped_alt = (self['variants_alt'][query_idx] == snpobj['variants_ref'][reference_idx])
1589        else:
1590            # Check for swapped or complementary-swapped alleles
1591            swapped_ref = (
1592                (self['variants_ref'][query_idx] == snpobj['variants_alt'][reference_idx]) |
1593                (np.vectorize(get_complement)(self['variants_ref'][query_idx]) == snpobj['variants_alt'][reference_idx])
1594            )
1595            swapped_alt = (
1596                (self['variants_alt'][query_idx] == snpobj['variants_ref'][reference_idx]) |
1597                (np.vectorize(get_complement)(self['variants_alt'][query_idx]) == snpobj['variants_ref'][reference_idx])
1598            )
1599
1600        # Filter out ambiguous variants where `ref` and `alt` alleles match (ref=alt)
1601        not_ambiguous = (self['variants_ref'][query_idx] != self['variants_alt'][query_idx])
1602
1603        # Indices in `self` of flipped variants
1604        flip_idx_query = query_idx[swapped_ref & swapped_alt & not_ambiguous]
1605
1606        # Correct the identified variant flips
1607        if len(flip_idx_query) > 0:
1608            log.info(f'Correcting {len(flip_idx_query)} variant flips...')
1609
1610            temp_alts = self['variants_alt'][flip_idx_query]
1611            temp_refs = self['variants_ref'][flip_idx_query]
1612
1613            # Correct the variant flips based on whether the operation is in-place or not
1614            if inplace:
1615                self['variants_alt'][flip_idx_query] = temp_refs
1616                self['variants_ref'][flip_idx_query] = temp_alts
1617                self['calldata_gt'][flip_idx_query] = 1 - self['calldata_gt'][flip_idx_query]
1618                return None
1619            else:
1620                snpobj = self.copy()
1621                snpobj['variants_alt'][flip_idx_query] = temp_refs
1622                snpobj['variants_ref'][flip_idx_query] = temp_alts
1623                snpobj['calldata_gt'][flip_idx_query] = 1 - snpobj['calldata_gt'][flip_idx_query]
1624                return snpobj
1625        else:
1626            log.info('No variant flips found to correct.')
1627            return self if not inplace else None

Correct flipped variants between self and a reference snpobj, where reference (variants_ref) and alternate (variants_alt) alleles are swapped.

Flip Detection Based on check_complement:

  • If check_complement=False, only direct allele swaps are considered:

    1. Direct Swap: self.variants_ref == snpobj.variants_alt and self.variants_alt == snpobj.variants_ref.
  • If check_complement=True, both direct and complementary swaps are considered, with four possible cases:

    1. Direct Swap: self.variants_ref == snpobj.variants_alt and self.variants_alt == snpobj.variants_ref.
    2. Complement Swap of Ref: complement(self.variants_ref) == snpobj.variants_alt and self.variants_alt == snpobj.variants_ref.
    3. Complement Swap of Alt: self.variants_ref == snpobj.variants_alt and complement(self.variants_alt) == snpobj.variants_ref.
    4. Complement Swap of both Ref and Alt: complement(self.variants_ref) == snpobj.variants_alt and complement(self.variants_alt) == snpobj.variants_ref.

Note: Variants where self.variants_ref == self.variants_alt are ignored as they are ambiguous.

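Correction Process:

  • Swaps variants_ref and variants_alt alleles in self to align with snpobj.
  • Flips calldata_gt values (0 becomes 1, and 1 becomes 0) to match the updated allele configuration.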

Arguments:
  • snpobj (SNPObject): The reference SNPObject to compare against.
  • check_complement (bool, default=True): If True, also checks for complementary base pairs (A/T, T/A, C/G, and G/C) when identifying swapped variants. Default is True.
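  • index_by (str, default='pos'): Criteria for matching variants. Options:
      - 'pos': Matches by chromosome and position (variants_chrom, variants_pos), e.g., 'chr1-12345'.
      - 'id': Matches by variant ID alone (variants_id), e.g., 'rs123'.
      - 'pos+id': Matches by chromosome, position, and ID (variants_chrom, variants_pos, variants_id), e.g., 'chr1-12345-rs123'.
    Default is 'pos'.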
  • common_variants_intersection (tuple of arrays, optional): Precomputed indices of common variants between self and snpobj. If None, intersection is computed within the function.
  • log_stats (bool, default=True): If True, logs statistical information about matching and ambiguous alleles. Default is True.
  • inplace (bool, default=False): If True, modifies self in place. If False, returns a new SNPObject with corrected flips. Default is False.
Returns:

Optional[SNPObject]: A new SNPObject with corrected flips if inplace=False. If inplace=True, modifies self in place and returns None.

def remove_mismatching_variants( self, snpobj: SNPObject, index_by: str = 'pos', common_variants_intersection: Tuple[numpy.ndarray, numpy.ndarray] | None = None, inplace: bool = False) -> SNPObject | None:
1629    def remove_mismatching_variants(
1630        self, 
1631        snpobj: 'SNPObject', 
1632        index_by: str = 'pos', 
1633        common_variants_intersection: Optional[Tuple[np.ndarray, np.ndarray]] = None,
1634        inplace: bool = False
1635    ) -> Optional['SNPObject']:
1636        """
1637        Remove variants from `self`, where reference (`variants_ref`) and/or alternate (`variants_alt`) alleles 
1638        do not match with a reference `snpobj`.
1639
1640        Args:
1641            snpobj (SNPObject): 
1642                The reference SNPObject to compare against.
1643            index_by (str, default='pos'): 
1644                Criteria for matching variants. Options:
1645                - `'pos'`: Matches by chromosome and position (`variants_chrom`, `variants_pos`), e.g., 'chr1-12345'.
1646                - `'id'`: Matches by variant ID alone (`variants_id`), e.g., 'rs123'.
1647                - `'pos+id'`: Matches by chromosome, position, and ID (`variants_chrom`, `variants_pos`, `variants_id`), e.g., 'chr1-12345-rs123'.
1648                Default is 'pos'.
1649            common_variants_intersection (tuple of arrays, optional): 
1650                Precomputed indices of common variants between `self` and the reference `snpobj`.
1651                If None, the intersection is computed within the function.
1652            inplace (bool, default=False): 
1653                If True, modifies `self` in place. If False, returns a new `SNPObject` without 
1654                mismatching variants. Default is False.
1655
1656        Returns:
1657            **Optional[SNPObject]:** 
1658                A new `SNPObject` without mismatching variants if `inplace=False`. 
1659                If `inplace=True`, modifies `self` in place and returns None.
1660        """
1661        # Get common variant indices if not provided
1662        if common_variants_intersection is not None:
1663            query_idx, reference_idx = common_variants_intersection
1664        else:
1665            _, query_idx, reference_idx = self.get_common_variants_intersection(snpobj, index_by=index_by)
1666
1667        # Vectorized comparison of `ref` and `alt` alleles
1668        ref_mismatch = self['variants_ref'][query_idx] != snpobj['variants_ref'][reference_idx]
1669        alt_mismatch = self['variants_alt'][query_idx] != snpobj['variants_alt'][reference_idx]
1670        mismatch_mask = ref_mismatch | alt_mismatch
1671
1672        # Identify indices in `self` of mismatching variants
1673        mismatch_idx = query_idx[mismatch_mask]
1674
1675        # Compute total number of variant mismatches
1676        total_mismatches = np.sum(mismatch_mask)
1677
1678        # Filter out mismatching variants
1679        log.debug(f'Removing {total_mismatches} mismatching variants...')
1680        return self.filter_variants(indexes=mismatch_idx, include=True, inplace=inplace)

Remove variants from self, where reference (variants_ref) and/or alternate (variants_alt) alleles do not match with a reference snpobj.

Arguments:
  • snpobj (SNPObject): The reference SNPObject to compare against.
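  • index_by (str, default='pos'): Criteria for matching variants. Options:
      - 'pos': Matches by chromosome and position (variants_chrom, variants_pos), e.g., 'chr1-12345'.
      - 'id': Matches by variant ID alone (variants_id), e.g., 'rs123'.
      - 'pos+id': Matches by chromosome, position, and ID (variants_chrom, variants_pos, variants_id), e.g., 'chr1-12345-rs123'.
    Default is 'pos'.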
  • common_variants_intersection (tuple of arrays, optional): Precomputed indices of common variants between self and the reference snpobj. If None, the intersection is computed within the function.
  • inplace (bool, default=False): If True, modifies self in place. If False, returns a new SNPObject without mismatching variants. Default is False.
Returns:

Optional[SNPObject]: A new SNPObject without mismatching variants if inplace=False. If inplace=True, modifies self in place and returns None.

def shuffle_variants( self, inplace: bool = False) -> SNPObject | None:
1682    def shuffle_variants(self, inplace: bool = False) -> Optional['SNPObject']:
1683        """
1684        Randomly shuffle the positions of variants in the SNPObject, ensuring that all associated 
1685        data (e.g., `calldata_gt` and variant-specific attributes) remain aligned.
1686
1687        Args:
1688            inplace (bool, default=False): 
1689                If True, modifies `self` in place. If False, returns a new `SNPObject` with 
1690                shuffled variants. Default is False.
1691
1692        Returns:
1693            **Optional[SNPObject]:** 
1694                A new `SNPObject` without shuffled variant positions if `inplace=False`. 
1695                If `inplace=True`, modifies `self` in place and returns None.
1696        """
1697        # Generate a random permutation index for shuffling variant positions
1698        shuffle_index = np.random.permutation(self.n_snps)
1699
1700        # Apply shuffling to all relevant attributes using the class's dictionary-like interface
1701        if inplace:
1702            for key in self.keys():
1703                if self[key] is not None:
1704                    if key == 'calldata_gt':
1705                        # `calldata_gt`` has a different shape, so it's shuffled along axis 0
1706                        self[key] = self[key][shuffle_index, ...]
1707                    elif 'variant' in key:
1708                        # snpobj attributes are 1D arrays
1709                        self[key] = np.asarray(self[key])[shuffle_index]
1710            return None
1711        else:
1712            shuffled_snpobj = self.copy()
1713            for key in shuffled_snpobj.keys():
1714                if shuffled_snpobj[key] is not None:
1715                    if key == 'calldata_gt':
1716                        shuffled_snpobj[key] = shuffled_snpobj[key][shuffle_index, ...]
1717                    elif 'variant' in key:
1718                        shuffled_snpobj[key] = np.asarray(shuffled_snpobj[key])[shuffle_index]
1719            return shuffled_snpobj

Randomly shuffle the positions of variants in the SNPObject, ensuring that all associated data (e.g., calldata_gt and variant-specific attributes) remain aligned.

Arguments:
  • inplace (bool, default=False): If True, modifies self in place. If False, returns a new SNPObject with shuffled variants. Default is False.
Returns:

Optional[SNPObject]: A new SNPObject with shuffled variant positions if inplace=False. If inplace=True, modifies self in place and returns None.

def set_empty_to_missing( self, inplace: bool = False) -> SNPObject | None:
1721    def set_empty_to_missing(self, inplace: bool = False) -> Optional['SNPObject']:
1722        """
1723        Replace empty strings `''` with missing values `'.'` in attributes of `self`.
1724
1725        Args:
1726            inplace (bool, default=False): 
1727                If True, modifies `self` in place. If False, returns a new `SNPObject` with empty 
1728                strings `''` replaced by missing values `'.'`. Default is False.
1729
1730        Returns:
1731            **Optional[SNPObject]:** 
1732                A new `SNPObject` with empty strings replaced if `inplace=False`. 
1733                If `inplace=True`, modifies `self` in place and returns None.
1734        """
1735        if inplace:
1736            if self.variants_alt is not None:
1737                self.variants_alt[self.variants_alt == ''] = '.'
1738            if self.variants_ref is not None:
1739                self.variants_ref[self.variants_ref == ''] = '.'
1740            if self.variants_qual is not None:
1741                self.variants_qual = self.variants_qual.astype(str)
1742                self.variants_qual[(self.variants_qual == '') | (self.variants_qual == 'nan')] = '.'
1743            if self.variants_chrom is not None:
1744                self.variants_chrom = self.variants_chrom.astype(str)
1745                self.variants_chrom[self.variants_chrom == ''] = '.'
1746            if self.variants_filter_pass is not None:
1747                self.variants_filter_pass[self.variants_filter_pass == ''] = '.'
1748            if self.variants_id is not None:
1749                self.variants_id[self.variants_id == ''] = '.'
1750            return self
1751        else:
1752            snpobj = self.copy()
1753            if snpobj.variants_alt is not None:
1754                snpobj.variants_alt[snpobj.variants_alt == ''] = '.'
1755            if snpobj.variants_ref is not None:
1756                snpobj.variants_ref[snpobj.variants_ref == ''] = '.'
1757            if snpobj.variants_qual is not None:
1758                snpobj.variants_qual = snpobj.variants_qual.astype(str)
1759                snpobj.variants_qual[(snpobj.variants_qual == '') | (snpobj.variants_qual == 'nan')] = '.'
1760            if snpobj.variants_chrom is not None:
1761                snpobj.variants_chrom[snpobj.variants_chrom == ''] = '.'
1762            if snpobj.variants_filter_pass is not None:
1763                snpobj.variants_filter_pass[snpobj.variants_filter_pass == ''] = '.'
1764            if snpobj.variants_id is not None:
1765                snpobj.variants_id[snpobj.variants_id == ''] = '.'
1766            return snpobj

Replace empty strings '' with missing values '.' in attributes of self.

Arguments:
  • inplace (bool, default=False): If True, modifies self in place. If False, returns a new SNPObject with empty strings '' replaced by missing values '.'. Default is False.
Returns:

Optional[SNPObject]: A new SNPObject with empty strings replaced if inplace=False. If inplace=True, modifies self in place and returns self.

def convert_to_window_level( self, window_size: int | None = None, physical_pos: numpy.ndarray | None = None, chromosomes: numpy.ndarray | None = None, window_sizes: numpy.ndarray | None = None, laiobj: LocalAncestryObject | None = None) -> LocalAncestryObject:
1768    def convert_to_window_level(
1769        self,
1770        window_size: Optional[int] = None,
1771        physical_pos: Optional[np.ndarray] = None,
1772        chromosomes: Optional[np.ndarray] = None,
1773        window_sizes: Optional[np.ndarray] = None,
1774        laiobj: Optional['LocalAncestryObject'] = None
1775    ) -> 'LocalAncestryObject':
1776        """
1777        Aggregate the `calldata_lai` attribute into genomic windows within a 
1778        `snputils.ancestry.genobj.LocalAncestryObject`.
1779
1780        **Options for defining windows (in order of precedence):**
1781
1782        1. **Fixed window size**:
1783        - Use `window_size` to specify how many SNPs go into each window. The last window on each 
1784        chromosome may be larger if SNPs are not evenly divisible by the size.
1785
1786        2. **Custom start and end positions**:
1787        - Provide `physical_pos` (2D array of shape (n_windows, 2)) as the [start, end] base-pair 
1788         coordinates for each window. 
1789        - If `chromosomes` is not provided and `self` has exactly one chromosome, all windows are 
1790        assumed to belong to that chromosome. 
1791        - If multiple chromosomes exist but `chromosomes` is missing, an error will be raised.
1792        - Optionally, provide `window_sizes` to store the SNP count per-window.
1793
1794        3. **Matching existing windows**:
1795        - Reuse window definitions (`physical_pos`, `chromosomes`, `window_sizes`) from an existing `laiobj`.
1796
1797        Args:
1798            window_size (int, optional): 
1799                Number of SNPs in each window if defining fixed-size windows. If the total number of 
1800                SNPs in a chromosome is not evenly divisible by the window size, the last window on that 
1801                chromosome will include all remaining SNPs and therefore be larger than the specified size.
1802            physical_pos (array of shape (n_windows, 2), optional): 
1803                A 2D array containing the start and end physical positions for each window.
1804            chromosomes (array of shape (n_windows,), optional): 
1805                An array with chromosome numbers corresponding to each genomic window.
1806            window_sizes (array of shape (n_windows,), optional): 
1807                An array specifying the number of SNPs in each genomic window.
1808            laiobj (LocalAncestryObject, optional): 
1809                A reference `LocalAncestryObject` from which to copy existing window definitions.
1810
1811        Returns:
1812            **LocalAncestryObject:** 
1813                A LocalAncestryObject containing window-level ancestry data.
1814        """
1815        from snputils.ancestry.genobj.local import LocalAncestryObject
1816
1817        if window_size is None and physical_pos is None and laiobj is None:
1818            raise ValueError("One of `window_size`, `physical_pos`, or `laiobj` must be provided.")
1819        
1820        # Fixed window size
1821        if window_size is not None:
1822            physical_pos = []   # Boundaries [start, end] of each window
1823            chromosomes = []    # Chromosome for each window
1824            window_sizes = []   # Number of SNPs for each window
1825            for chrom in self.unique_chrom:
1826                # Extract indices corresponding to this chromosome
1827                mask_chrom = (self.variants_chrom == chrom)
1828                # Subset to this chromosome
1829                pos_chrom = self.variants_pos[mask_chrom]
1830                # Number of SNPs for this chromosome
1831                n_snps_chrom = pos_chrom.size
1832                
1833                # Initialize the start of the first window with the position of the first SNP
1834                current_start = self.variants_pos[0]
1835
1836                # Number of full windows with exactly `window_size` SNPs
1837                n_full_windows = n_snps_chrom // window_size
1838
1839                # Build all but the last window
1840                for i in range(n_full_windows-1):
1841                    current_end = self.variants_pos[(i+1) * window_size - 1]
1842                    physical_pos.append([current_start, current_end])
1843                    chromosomes.append(chrom)
1844                    window_sizes.append(window_size)
1845                    current_start = self.variants_pos[(i+1) * window_size]
1846                
1847                # Build the last window
1848                current_end = self.variants_pos[-1]
1849                physical_pos.append([current_start, current_end])
1850                chromosomes.append(chrom)
1851                window_sizes.append(n_snps_chrom - ((n_full_windows - 1) * window_size))
1852                
1853            physical_pos = np.array(physical_pos)
1854            chromosomes = np.array(chromosomes)
1855            window_sizes = np.array(window_sizes)
1856        
1857        # Custom start and end positions
1858        elif physical_pos is not None:
1859            # Check if there is exactly one chromosome
1860            if chromosomes is None:
1861                unique_chrom = self.unique_chrom
1862                if len(unique_chrom) == 1:
1863                    # We assume all windows belong to this single chromosome
1864                    single_chrom = unique_chrom[0]
1865                    chromosomes = np.array([single_chrom] * physical_pos.shape[0])
1866                else:
1867                    raise ValueError("Multiple chromosomes detected, but `chromosomes` was not provided.")
1868
1869        # Match existing windows to a reference laiobj
1870        elif laiobj is not None:
1871            physical_pos = laiobj.physical_pos
1872            chromosomes = laiobj.chromosomes
1873            window_sizes = laiobj.window_sizes
1874
1875        # Allocate an output LAI array
1876        n_windows = physical_pos.shape[0]
1877        n_samples = self.n_samples
1878        if self.calldata_lai.ndim == 3:
1879            lai = np.zeros((n_windows, n_samples, 2))
1880        else:
1881            lai = np.zeros((n_windows, n_samples*2))
1882
1883        # For each window, find the relevant SNPs and compute the mode of the ancestries
1884        for i, ((start, end), chrom) in enumerate(zip(physical_pos, chromosomes)):
1885            snps_mask = (
1886                (self.variants_chrom == chrom) &
1887                (self.variants_pos >= start) &
1888                (self.variants_pos <= end)
1889            )
1890            if np.any(snps_mask):
1891                lai_mask = self.calldata_lai[snps_mask, ...]
1892                mode_ancestries = mode(lai_mask, axis=0, nan_policy='omit').mode
1893                lai[i] = mode_ancestries
1894            else:
1895                lai[i] = np.nan
1896
1897        # Generate haplotype labels, e.g. "Sample1.0", "Sample1.1"
1898        haplotypes = [f"{sample}.{i}" for sample in self.samples for i in range(2)]
1899
1900        # If original data was (n_snps, n_samples, 2), flatten to (n_windows, n_samples*2)
1901        if self.calldata_lai.ndim == 3:
1902            lai = lai.reshape(n_windows, -1)
1903
1904        # Aggregate into a LocalAncestryObject
1905        return LocalAncestryObject(
1906            haplotypes=haplotypes,
1907            lai=lai,
1908            samples=self.samples,
1909            ancestry_map=self.ancestry_map,
1910            window_sizes=window_sizes,
1911            physical_pos=physical_pos,
1912            chromosomes=chromosomes
1913        )

Aggregate the calldata_lai attribute into genomic windows within a snputils.ancestry.genobj.LocalAncestryObject.

Options for defining windows (in order of precedence):

  1. Fixed window size:
  • Use window_size to specify how many SNPs go into each window. The last window on each chromosome may be larger if SNPs are not evenly divisible by the size.
  2. Custom start and end positions:
  • Provide physical_pos (2D array of shape (n_windows, 2)) as the [start, end] base-pair coordinates for each window.
  • If chromosomes is not provided and self has exactly one chromosome, all windows are assumed to belong to that chromosome.
  • If multiple chromosomes exist but chromosomes is missing, an error will be raised.
  • Optionally, provide window_sizes to store the SNP count per-window.
  3. Matching existing windows:
  • Reuse window definitions (physical_pos, chromosomes, window_sizes) from an existing laiobj.
Arguments:
  • window_size (int, optional): Number of SNPs in each window if defining fixed-size windows. If the total number of SNPs in a chromosome is not evenly divisible by the window size, the last window on that chromosome will include all remaining SNPs and therefore be larger than the specified size.
  • physical_pos (array of shape (n_windows, 2), optional): A 2D array containing the start and end physical positions for each window.
  • chromosomes (array of shape (n_windows,), optional): An array with chromosome numbers corresponding to each genomic window.
  • window_sizes (array of shape (n_windows,), optional): An array specifying the number of SNPs in each genomic window.
  • laiobj (LocalAncestryObject, optional): A reference LocalAncestryObject from which to copy existing window definitions.
Returns:

LocalAncestryObject: A LocalAncestryObject containing window-level ancestry data.

def save(self, file: str | pathlib.Path) -> None:
1915    def save(self, file: Union[str, Path]) -> None:
1916        """
1917        Save the data stored in `self` to a specified file.
1918
1919        The format of the saved file is determined by the file extension provided in the `file` 
1920        argument. 
1921        
1922        **Supported formats:**
1923        
1924        - `.bed`: Binary PED (Plink) format.
1925        - `.pgen`: Plink2 binary genotype format.
1926        - `.vcf`: Variant Call Format.
1927        - `.pkl`: Pickle format for saving `self` in serialized form.
1928
1929        Args:
1930            file (str or pathlib.Path): 
1931                Path to the file where the data will be saved. The extension of the file determines the save format. 
1932                Supported extensions: `.bed`, `.pgen`, `.vcf`, `.pkl`.
1933        """
1934        ext = Path(file).suffix.lower()
1935        if ext == '.bed':
1936            self.save_bed(file)
1937        elif ext == '.pgen':
1938            self.save_pgen(file)
1939        elif ext == '.vcf':
1940            self.save_vcf(file)
1941        elif ext == '.pkl':
1942            self.save_pickle(file)
1943        else:
1944            raise ValueError(f"Unsupported file extension: {ext}")

Save the data stored in self to a specified file.

The format of the saved file is determined by the file extension provided in the file argument.

Supported formats:

  • .bed: Binary PED (Plink) format.
  • .pgen: Plink2 binary genotype format.
  • .vcf: Variant Call Format.
  • .pkl: Pickle format for saving self in serialized form.
Arguments:
  • file (str or pathlib.Path): Path to the file where the data will be saved. The extension of the file determines the save format. Supported extensions: .bed, .pgen, .vcf, .pkl.
def save_bed(self, file: str | pathlib.Path) -> None:
1946    def save_bed(self, file: Union[str, Path]) -> None:
1947        """
1948        Save the data stored in `self` to a `.bed` file.
1949
1950        Args:
1951            file (str or pathlib.Path): 
1952                Path to the file where the data will be saved. It should end with `.bed`. 
1953                If the provided path does not have this extension, it will be appended.
1954        """
1955        from snputils.snp.io.write.bed import BEDWriter
1956        writer = BEDWriter(snpobj=self, filename=file)
1957        writer.write()

Save the data stored in self to a .bed file.

Arguments:
  • file (str or pathlib.Path): Path to the file where the data will be saved. It should end with .bed. If the provided path does not have this extension, it will be appended.
def save_pgen(self, file: str | pathlib.Path) -> None:
1959    def save_pgen(self, file: Union[str, Path]) -> None:
1960        """
1961        Save the data stored in `self` to a `.pgen` file.
1962
1963        Args:
1964            file (str or pathlib.Path): 
1965                Path to the file where the data will be saved. It should end with `.pgen`. 
1966                If the provided path does not have this extension, it will be appended.
1967        """
1968        from snputils.snp.io.write.pgen import PGENWriter
1969        writer = PGENWriter(snpobj=self, filename=file)
1970        writer.write()

Save the data stored in self to a .pgen file.

Arguments:
  • file (str or pathlib.Path): Path to the file where the data will be saved. It should end with .pgen. If the provided path does not have this extension, it will be appended.
def save_vcf(self, file: str | pathlib.Path) -> None:
1972    def save_vcf(self, file: Union[str, Path]) -> None:
1973        """
1974        Save the data stored in `self` to a `.vcf` file.
1975
1976        Args:
1977            file (str or pathlib.Path): 
1978                Path to the file where the data will be saved. It should end with `.vcf`. 
1979                If the provided path does not have this extension, it will be appended.
1980        """
1981        from snputils.snp.io.write.vcf import VCFWriter
1982        writer = VCFWriter(snpobj=self, filename=file)
1983        writer.write()

Save the data stored in self to a .vcf file.

Arguments:
  • file (str or pathlib.Path): Path to the file where the data will be saved. It should end with .vcf. If the provided path does not have this extension, it will be appended.
def save_pickle(self, file: str | pathlib.Path) -> None:
1985    def save_pickle(self, file: Union[str, Path]) -> None:
1986        """
1987        Save `self` in serialized form to a `.pkl` file.
1988
1989        Args:
1990            file (str or pathlib.Path): 
1991                Path to the file where the data will be saved. It should end with `.pkl`. 
1992                If the provided path does not have this extension, it will be appended.
1993        """
1994        import pickle
1995        with open(file, 'wb') as file:
1996            pickle.dump(self, file)

Save self in serialized form to a .pkl file.

Arguments:
  • file (str or pathlib.Path): Path to the file where the data will be saved. It should end with .pkl. If the provided path does not have this extension, it will be appended.
class GRGObject:
 16class GRGObject:
 17    """
 18    A class for Single Nucleotide Polymorphism (SNP) data.
 19    """
 20    def __init__(
 21        self,
 22        calldata_gt: Optional[GRGType] = None,
 23        filename: Optional[str] = None,
 24        mutable: Optional[bool] = None
 25    ) -> None:
 26        """
 27        Args:
 28            calldata_gt (GRG | MutableGRG, optional): 
 29                A Genotype Representation Graph containing genotype data for each sample. 
 30            filename (str, optional)
 31                File storing the GRG.
 32        """
 33        self.__calldata_gt = calldata_gt
 34        self.__filename = filename
 35        self.__mutable = mutable
 36        self.__latest   = False
 37
 38    def __getitem__(self, key: str) -> Any:
 39        """
 40        To access an attribute of the class using the square bracket notation,
 41        similar to a dictionary.
 42        """
 43        try:
 44            return getattr(self, key)
 45        except AttributeError:
 46            raise KeyError(f'Invalid key: {key}.')
 47
 48    def __setitem__(self, key: str, value: Any):
 49        """
 50        To set an attribute of the class using the square bracket notation,
 51        similar to a dictionary.
 52        """
 53        try:
 54            setattr(self, key, value)
 55        except AttributeError:
 56            raise KeyError(f'Invalid key: {key}.')
 57
 58    @property
 59    def calldata_gt(self) -> Optional[GRGType]:
 60        """
 61        Retrieve `calldata_gt`.
 62
 63        Returns:
 64            **GRG | MutableGRG:** 
 65                An GRG containing genotype data for all samples.
 66        """
 67        return self.__calldata_gt
 68
 69    @calldata_gt.setter
 70    def calldata_gt(self, x: GRGType):
 71        """
 72        Update `calldata_gt`.
 73        """
 74        self.__calldata_gt = x
 75
 76
 77    @property
 78    def filename(self) -> str:
 79        """
 80        Retrieve `filename`.
 81
 82        Returns:
 83            **str** 
 84                A string containing the file name.
 85        """
 86        return self.__filename
 87
 88    @filename.setter
 89    def filename(self, x: str):
 90        """
 91        Update `filename`.
 92        """
 93        self.__filename = x
 94    
 95    @property
 96    def mutable(self) -> Optional[bool]:
 97        return self.__mutable
 98
 99    def allele_freq(self) -> np.ndarray:
100        # allele frequency array
101        al_freq = np.ones(self.calldata_gt.num_samples) / self.calldata_gt.num_samples
102        return pyg.dot_product(self.calldata_gt, al_freq, pyg.TraversalDirection.UP)
103
104    def dot_product(self, array: np.ndarray, traversal_direction: pyg.TraversalDirection):
105        return pyg.dot_product(self.calldata_gt, array, traversal_direction)
106    
107    # TODO: consider moving this elsewhere.
108    def allele_freq_from_file(self, filename: Optional[str] = None) -> pd.DataFrame:
109        newfile = filename if filename is not None else self.__filename
110        if newfile is None:
111            raise ValueError("Either pass in a filename, or store an existing GRG filename.")
112
113        with tempfile.NamedTemporaryFile() as fp:
114            subprocess.run(["grg", "process", "freq", f"{newfile}"], stdout=fp, check=True)
115            fp.seek(0) # set the file cursor
116            return pd.read_csv(fp.name, sep="\t")
117        
118        
119    def gwas(self, phenotype_file: str, filename: Optional[str] = None) -> pd.DataFrame:
120        grg_file = filename if filename is not None else self.__filename
121        if grg_file is None:
122            raise ValueError("Either pass in a GRG filename, or store an existing GRG filename.")
123
124        with tempfile.NamedTemporaryFile(suffix=".tsv") as fp:
125            try:
126                subprocess.run(
127                    ["grapp", "assoc", "-p", f"{phenotype_file}", "-o", fp.name, f"{grg_file}"],
128                    check=True,
129                )
130            except FileNotFoundError as exc:
131                raise ImportError(
132                    "GWAS support requires the optional dependency 'grapp'. "
133                    "Install it with: pip install grapp"
134                ) from exc
135            return pd.read_csv(fp.name, sep="\t")
136    
137    def merge(self, combine_nodes: bool = False, *args) -> None:
138        # assert self.__mutable and isinstance(self.calldata_gt, pyg.MutableGRG), "GRG must be mutable"
139        for arg in args:
140            if not isinstance(arg, str):
141                raise TypeError("All merge inputs must be strings.")
142        # list of files, and combine_nodes
143        self.__calldata_gt.merge(list(args), combine_nodes)
144        #pep8 be damned
145        # if inplace: self.__calldata_gt = merged_data
146        # else      : return merged_data
147
148    def n_samples(self, ploidy = 2) -> int:
149        """
150        Get number of samples from GRG. Diploid by default. 
151        """
152        return int(self.__calldata_gt.num_samples / ploidy)
153    
154    def n_snps(self) -> int:
155        return self.__calldata_gt.num_mutations
156
157    def _sample_ids(self, n_samples: int, sample_prefix: str) -> np.ndarray:
158        default_ids = [f"{sample_prefix}_{idx}" for idx in range(n_samples)]
159        if self.__calldata_gt is None:
160            return np.asarray(default_ids, dtype=object)
161
162        has_individual_ids = bool(getattr(self.__calldata_gt, "has_individual_ids", False))
163        num_individuals = int(getattr(self.__calldata_gt, "num_individuals", 0))
164        if has_individual_ids and n_samples == num_individuals:
165            ids = []
166            for idx in range(n_samples):
167                try:
168                    sample_id = str(self.__calldata_gt.get_individual_id(idx))
169                except RuntimeError:
170                    sample_id = ""
171                ids.append(sample_id if sample_id else default_ids[idx])
172        else:
173            ids = default_ids
174
175        # Keep IDs unique for downstream writers.
176        seen = {}
177        unique_ids = []
178        for idx, sample_id in enumerate(ids):
179            count = seen.get(sample_id, 0)
180            unique_ids.append(sample_id if count == 0 else f"{sample_id}_{count}")
181            seen[sample_id] = count + 1
182
183        return np.asarray(unique_ids, dtype=object)
184
185    def to_snpobject(
186        self,
187        sum_strands: bool = False,
188        chrom: str = ".",
189        sample_prefix: str = "sample",
190    ):
191        """
192        Convert the GRG to a dense SNPObject.
193
194        Notes:
195            - This materializes the full genotype matrix, so memory usage scales with
196              `num_mutations * num_samples`.
197            - For diploid GRGs and `sum_strands=False`, output has shape
198              `(n_snps, n_samples, 2)`.
199            - For `sum_strands=True`, output has shape `(n_snps, n_samples)` with
200              per-individual allele counts.
201        """
202        from snputils.snp.genobj.snpobj import SNPObject
203
204        if self.__calldata_gt is None:
205            raise ValueError("Cannot convert to SNPObject: `calldata_gt` is None.")
206
207        grg = self.__calldata_gt
208        n_mutations = int(grg.num_mutations)
209        n_haplotypes = int(grg.num_samples)
210        ploidy = int(getattr(grg, "ploidy", 2))
211
212        if ploidy <= 0:
213            raise ValueError(f"Invalid ploidy in GRG: {ploidy}")
214        if n_haplotypes % ploidy != 0:
215            raise ValueError(
216                f"GRG has {n_haplotypes} haplotypes, not divisible by ploidy {ploidy}."
217            )
218
219        n_individuals = n_haplotypes // ploidy
220        chrom = str(chrom)
221
222        def _empty(shape):
223            return np.empty(shape, dtype=np.int8)
224
225        if sum_strands:
226            if n_mutations == 0:
227                calldata_gt = _empty((0, n_individuals))
228            elif ploidy == 1:
229                mutation_eye = np.eye(n_mutations, dtype=np.float64)
230                hap_matrix = pyg.matmul(grg, mutation_eye, pyg.TraversalDirection.DOWN)
231                calldata_gt = np.rint(hap_matrix).astype(np.int8, copy=False)
232            else:
233                mutation_eye = np.eye(n_mutations, dtype=np.float64)
234                diploid_matrix = pyg.matmul(
235                    grg, mutation_eye, pyg.TraversalDirection.DOWN, by_individual=True
236                )
237                calldata_gt = np.rint(diploid_matrix).astype(np.int8, copy=False)
238            sample_ids = self._sample_ids(n_individuals, sample_prefix)
239        else:
240            if ploidy != 2:
241                raise ValueError(
242                    "Phased SNPObject output requires diploid GRGs. "
243                    "Use `sum_strands=True` for non-diploid data."
244                )
245            if n_mutations == 0:
246                calldata_gt = _empty((0, n_individuals, ploidy))
247            else:
248                mutation_eye = np.eye(n_mutations, dtype=np.float64)
249                hap_matrix = pyg.matmul(grg, mutation_eye, pyg.TraversalDirection.DOWN)
250                hap_matrix = np.rint(hap_matrix).astype(np.int8, copy=False)
251                calldata_gt = hap_matrix.reshape(n_mutations, n_individuals, ploidy)
252            sample_ids = self._sample_ids(n_individuals, sample_prefix)
253
254        variants_ref = np.empty(n_mutations, dtype=object)
255        variants_alt = np.empty(n_mutations, dtype=object)
256        variants_pos = np.empty(n_mutations, dtype=np.int64)
257        variants_id = np.empty(n_mutations, dtype=object)
258
259        for mut_id in range(n_mutations):
260            mutation = grg.get_mutation_by_id(mut_id)
261            position = int(round(float(mutation.position)))
262            ref = str(mutation.ref_allele) if str(mutation.ref_allele) else "."
263            alt = str(mutation.allele) if str(mutation.allele) else "."
264            variants_pos[mut_id] = position
265            variants_ref[mut_id] = ref
266            variants_alt[mut_id] = alt
267            variants_id[mut_id] = f"{chrom}:{position}"
268
269        variants_chrom = np.full(n_mutations, chrom, dtype=object)
270        variants_filter_pass = np.full(n_mutations, "PASS", dtype=object)
271        variants_qual = np.full(n_mutations, np.nan, dtype=np.float32)
272
273        return SNPObject(
274            calldata_gt=calldata_gt,
275            samples=sample_ids,
276            variants_ref=variants_ref,
277            variants_alt=variants_alt,
278            variants_chrom=variants_chrom,
279            variants_filter_pass=variants_filter_pass,
280            variants_id=variants_id,
281            variants_pos=variants_pos,
282            variants_qual=variants_qual,
283        )
284
285    def copy(self) -> GRGObject:
286        """
287        Create and return a copy of `self`.
288
289        Returns:
290            **GRGObject:** 
291                A new instance of the current object.
292        """
293        return copy.deepcopy(self)
294
295    def keys(self) -> List[str]:
296        """
297        Retrieve a list of public attribute names for `self`.
298
299        Returns:
300            **list of str:** 
301                A list of attribute names, with internal name-mangling removed, 
302                for easier reference to public attributes in the instance.
303        """
304        return [attr.replace('_GRGObject__', '') for attr in vars(self)]
305
306    def to_grg(self, filename: str, 
307                     allow_simplify: bool = True):
308        pyg.save_grg(self.__calldata_gt, filename, allow_simplify)

A class for Genotype Representation Graph (GRG) data.

GRGObject( calldata_gt: _grgl.GRG | _grgl.MutableGRG | None = None, filename: str | None = None, mutable: bool | None = None)
20    def __init__(
21        self,
22        calldata_gt: Optional[GRGType] = None,
23        filename: Optional[str] = None,
24        mutable: Optional[bool] = None
25    ) -> None:
26        """
27        Args:
28            calldata_gt (GRG | MutableGRG, optional): 
29                A Genotype Representation Graph containing genotype data for each sample. 
30            filename (str, optional)
31                File storing the GRG.
32        """
33        self.__calldata_gt = calldata_gt
34        self.__filename = filename
35        self.__mutable = mutable
36        self.__latest   = False
Arguments:
  • calldata_gt (GRG | MutableGRG, optional): A Genotype Representation Graph containing genotype data for each sample.
  • filename (str, optional): File storing the GRG.
calldata_gt: _grgl.GRG | _grgl.MutableGRG | None
58    @property
59    def calldata_gt(self) -> Optional[GRGType]:
60        """
61        Retrieve `calldata_gt`.
62
63        Returns:
64            **GRG | MutableGRG:** 
65                An GRG containing genotype data for all samples.
66        """
67        return self.__calldata_gt

Retrieve calldata_gt.

Returns:

GRG | MutableGRG: A GRG containing genotype data for all samples.

filename: str
77    @property
78    def filename(self) -> str:
79        """
80        Retrieve `filename`.
81
82        Returns:
83            **str** 
84                A string containing the file name.
85        """
86        return self.__filename

Retrieve filename.

Returns:

str: A string containing the file name.

mutable: bool | None
95    @property
96    def mutable(self) -> Optional[bool]:
97        return self.__mutable
def allele_freq(self) -> numpy.ndarray:
 99    def allele_freq(self) -> np.ndarray:
100        # allele frequency array
101        al_freq = np.ones(self.calldata_gt.num_samples) / self.calldata_gt.num_samples
102        return pyg.dot_product(self.calldata_gt, al_freq, pyg.TraversalDirection.UP)
def dot_product( self, array: numpy.ndarray, traversal_direction: _grgl.TraversalDirection):
104    def dot_product(self, array: np.ndarray, traversal_direction: pyg.TraversalDirection):
105        return pyg.dot_product(self.calldata_gt, array, traversal_direction)
def allele_freq_from_file(self, filename: str | None = None) -> pandas.DataFrame:
108    def allele_freq_from_file(self, filename: Optional[str] = None) -> pd.DataFrame:
109        newfile = filename if filename is not None else self.__filename
110        if newfile is None:
111            raise ValueError("Either pass in a filename, or store an existing GRG filename.")
112
113        with tempfile.NamedTemporaryFile() as fp:
114            subprocess.run(["grg", "process", "freq", f"{newfile}"], stdout=fp, check=True)
115            fp.seek(0) # set the file cursor
116            return pd.read_csv(fp.name, sep="\t")
def gwas( self, phenotype_file: str, filename: str | None = None) -> pandas.DataFrame:
119    def gwas(self, phenotype_file: str, filename: Optional[str] = None) -> pd.DataFrame:
120        grg_file = filename if filename is not None else self.__filename
121        if grg_file is None:
122            raise ValueError("Either pass in a GRG filename, or store an existing GRG filename.")
123
124        with tempfile.NamedTemporaryFile(suffix=".tsv") as fp:
125            try:
126                subprocess.run(
127                    ["grapp", "assoc", "-p", f"{phenotype_file}", "-o", fp.name, f"{grg_file}"],
128                    check=True,
129                )
130            except FileNotFoundError as exc:
131                raise ImportError(
132                    "GWAS support requires the optional dependency 'grapp'. "
133                    "Install it with: pip install grapp"
134                ) from exc
135            return pd.read_csv(fp.name, sep="\t")
def merge(self, combine_nodes: bool = False, *args) -> None:
137    def merge(self, combine_nodes: bool = False, *args) -> None:
138        # assert self.__mutable and isinstance(self.calldata_gt, pyg.MutableGRG), "GRG must be mutable"
139        for arg in args:
140            if not isinstance(arg, str):
141                raise TypeError("All merge inputs must be strings.")
142        # list of files, and combine_nodes
143        self.__calldata_gt.merge(list(args), combine_nodes)
144        #pep8 be damned
145        # if inplace: self.__calldata_gt = merged_data
146        # else      : return merged_data
def n_samples(self, ploidy=2) -> int:
148    def n_samples(self, ploidy = 2) -> int:
149        """
150        Get number of samples from GRG. Diploid by default. 
151        """
152        return int(self.__calldata_gt.num_samples / ploidy)

Get number of samples from GRG. Diploid by default.

def n_snps(self) -> int:
154    def n_snps(self) -> int:
155        return self.__calldata_gt.num_mutations
def to_snpobject( self, sum_strands: bool = False, chrom: str = '.', sample_prefix: str = 'sample'):
185    def to_snpobject(
186        self,
187        sum_strands: bool = False,
188        chrom: str = ".",
189        sample_prefix: str = "sample",
190    ):
191        """
192        Convert the GRG to a dense SNPObject.
193
194        Notes:
195            - This materializes the full genotype matrix, so memory usage scales with
196              `num_mutations * num_samples`.
197            - For diploid GRGs and `sum_strands=False`, output has shape
198              `(n_snps, n_samples, 2)`.
199            - For `sum_strands=True`, output has shape `(n_snps, n_samples)` with
200              per-individual allele counts.
201        """
202        from snputils.snp.genobj.snpobj import SNPObject
203
204        if self.__calldata_gt is None:
205            raise ValueError("Cannot convert to SNPObject: `calldata_gt` is None.")
206
207        grg = self.__calldata_gt
208        n_mutations = int(grg.num_mutations)
209        n_haplotypes = int(grg.num_samples)
210        ploidy = int(getattr(grg, "ploidy", 2))
211
212        if ploidy <= 0:
213            raise ValueError(f"Invalid ploidy in GRG: {ploidy}")
214        if n_haplotypes % ploidy != 0:
215            raise ValueError(
216                f"GRG has {n_haplotypes} haplotypes, not divisible by ploidy {ploidy}."
217            )
218
219        n_individuals = n_haplotypes // ploidy
220        chrom = str(chrom)
221
222        def _empty(shape):
223            return np.empty(shape, dtype=np.int8)
224
225        if sum_strands:
226            if n_mutations == 0:
227                calldata_gt = _empty((0, n_individuals))
228            elif ploidy == 1:
229                mutation_eye = np.eye(n_mutations, dtype=np.float64)
230                hap_matrix = pyg.matmul(grg, mutation_eye, pyg.TraversalDirection.DOWN)
231                calldata_gt = np.rint(hap_matrix).astype(np.int8, copy=False)
232            else:
233                mutation_eye = np.eye(n_mutations, dtype=np.float64)
234                diploid_matrix = pyg.matmul(
235                    grg, mutation_eye, pyg.TraversalDirection.DOWN, by_individual=True
236                )
237                calldata_gt = np.rint(diploid_matrix).astype(np.int8, copy=False)
238            sample_ids = self._sample_ids(n_individuals, sample_prefix)
239        else:
240            if ploidy != 2:
241                raise ValueError(
242                    "Phased SNPObject output requires diploid GRGs. "
243                    "Use `sum_strands=True` for non-diploid data."
244                )
245            if n_mutations == 0:
246                calldata_gt = _empty((0, n_individuals, ploidy))
247            else:
248                mutation_eye = np.eye(n_mutations, dtype=np.float64)
249                hap_matrix = pyg.matmul(grg, mutation_eye, pyg.TraversalDirection.DOWN)
250                hap_matrix = np.rint(hap_matrix).astype(np.int8, copy=False)
251                calldata_gt = hap_matrix.reshape(n_mutations, n_individuals, ploidy)
252            sample_ids = self._sample_ids(n_individuals, sample_prefix)
253
254        variants_ref = np.empty(n_mutations, dtype=object)
255        variants_alt = np.empty(n_mutations, dtype=object)
256        variants_pos = np.empty(n_mutations, dtype=np.int64)
257        variants_id = np.empty(n_mutations, dtype=object)
258
259        for mut_id in range(n_mutations):
260            mutation = grg.get_mutation_by_id(mut_id)
261            position = int(round(float(mutation.position)))
262            ref = str(mutation.ref_allele) if str(mutation.ref_allele) else "."
263            alt = str(mutation.allele) if str(mutation.allele) else "."
264            variants_pos[mut_id] = position
265            variants_ref[mut_id] = ref
266            variants_alt[mut_id] = alt
267            variants_id[mut_id] = f"{chrom}:{position}"
268
269        variants_chrom = np.full(n_mutations, chrom, dtype=object)
270        variants_filter_pass = np.full(n_mutations, "PASS", dtype=object)
271        variants_qual = np.full(n_mutations, np.nan, dtype=np.float32)
272
273        return SNPObject(
274            calldata_gt=calldata_gt,
275            samples=sample_ids,
276            variants_ref=variants_ref,
277            variants_alt=variants_alt,
278            variants_chrom=variants_chrom,
279            variants_filter_pass=variants_filter_pass,
280            variants_id=variants_id,
281            variants_pos=variants_pos,
282            variants_qual=variants_qual,
283        )

Convert the GRG to a dense SNPObject.

Notes:
  • This materializes the full genotype matrix, so memory usage scales with num_mutations * num_samples.
  • For diploid GRGs and sum_strands=False, output has shape (n_snps, n_samples, 2).
  • For sum_strands=True, output has shape (n_snps, n_samples) with per-individual allele counts.
def copy(self) -> GRGObject:
285    def copy(self) -> GRGObject:
286        """
287        Create and return a copy of `self`.
288
289        Returns:
290            **GRGObject:** 
291                A new instance of the current object.
292        """
293        return copy.deepcopy(self)

Create and return a copy of self.

Returns:

GRGObject: A new instance of the current object.

def keys(self) -> List[str]:
295    def keys(self) -> List[str]:
296        """
297        Retrieve a list of public attribute names for `self`.
298
299        Returns:
300            **list of str:** 
301                A list of attribute names, with internal name-mangling removed, 
302                for easier reference to public attributes in the instance.
303        """
304        return [attr.replace('_GRGObject__', '') for attr in vars(self)]

Retrieve a list of public attribute names for self.

Returns:

list of str: A list of attribute names, with internal name-mangling removed, for easier reference to public attributes in the instance.

def to_grg(self, filename: str, allow_simplify: bool = True):
306    def to_grg(self, filename: str, 
307                     allow_simplify: bool = True):
308        pyg.save_grg(self.__calldata_gt, filename, allow_simplify)
class SNPReader:
 8class SNPReader:
 9    def __new__(cls,
10                filename: Union[str, pathlib.Path],
11                vcf_backend: str = 'polars') -> SNPReader:
12        """
13        Automatically detect the SNP file format from the file extension, and return its corresponding reader.
14
15        Args:
16            filename: Filename of the file to read.
17            vcf_backend: Backend to use for reading the VCF file. Options are 'polars' or 'scikit-allel'. Default is 'polars'.
18
19        Raises:
20            ValueError: If the filename does not have an extension or the extension is not supported.
21        """
22        filename = pathlib.Path(filename)
23        suffixes = filename.suffixes
24        if not suffixes:
25            raise ValueError("The filename should have an extension when using SNPReader.")
26
27        extension = suffixes[-2] if suffixes[-1].lower() in (".zst", ".gz") else suffixes[-1]
28        extension = extension.lower()
29
30        if extension == ".vcf":
31            if vcf_backend == 'polars':
32                from snputils.snp.io.read.vcf import VCFReaderPolars
33
34                return VCFReaderPolars(filename)
35            elif vcf_backend == 'scikit-allel':
36                from snputils.snp.io.read.vcf import VCFReader
37
38                return VCFReader(filename)
39            else:
40                raise ValueError(f"VCF backend not supported: {vcf_backend}")
41        elif extension in (".bed", ".bim", ".fam"):
42            from snputils.snp.io.read.bed import BEDReader
43
44            return BEDReader(filename)
45        elif extension in (".pgen", ".pvar", ".psam", ".pvar.zst"):
46            from snputils.snp.io.read.pgen import PGENReader
47
48            return PGENReader(filename)
49        else:
50            raise ValueError(f"File format not supported: {filename}")
SNPReader(filename: str | pathlib.Path, vcf_backend: str = 'polars')
 9    def __new__(cls,
10                filename: Union[str, pathlib.Path],
11                vcf_backend: str = 'polars') -> SNPReader:
12        """
13        Automatically detect the SNP file format from the file extension, and return its corresponding reader.
14
15        Args:
16            filename: Filename of the file to read.
17            vcf_backend: Backend to use for reading the VCF file. Options are 'polars' or 'scikit-allel'. Default is 'polars'.
18
19        Raises:
20            ValueError: If the filename does not have an extension or the extension is not supported.
21        """
22        filename = pathlib.Path(filename)
23        suffixes = filename.suffixes
24        if not suffixes:
25            raise ValueError("The filename should have an extension when using SNPReader.")
26
27        extension = suffixes[-2] if suffixes[-1].lower() in (".zst", ".gz") else suffixes[-1]
28        extension = extension.lower()
29
30        if extension == ".vcf":
31            if vcf_backend == 'polars':
32                from snputils.snp.io.read.vcf import VCFReaderPolars
33
34                return VCFReaderPolars(filename)
35            elif vcf_backend == 'scikit-allel':
36                from snputils.snp.io.read.vcf import VCFReader
37
38                return VCFReader(filename)
39            else:
40                raise ValueError(f"VCF backend not supported: {vcf_backend}")
41        elif extension in (".bed", ".bim", ".fam"):
42            from snputils.snp.io.read.bed import BEDReader
43
44            return BEDReader(filename)
45        elif extension in (".pgen", ".pvar", ".psam", ".pvar.zst"):
46            from snputils.snp.io.read.pgen import PGENReader
47
48            return PGENReader(filename)
49        else:
50            raise ValueError(f"File format not supported: {filename}")

Automatically detect the SNP file format from the file extension, and return its corresponding reader.

Arguments:
  • filename: Filename of the file to read.
  • vcf_backend: Backend to use for reading the VCF file. Options are 'polars' or 'scikit-allel'. Default is 'polars'.
Raises:
  • ValueError: If the filename does not have an extension or the extension is not supported.
@SNPBaseReader.register
class BEDReader(snputils.snp.io.read.base.SNPBaseReader):
 16@SNPBaseReader.register
 17class BEDReader(SNPBaseReader):
 18    def read(
 19        self,
 20        fields: Optional[List[str]] = None,
 21        exclude_fields: Optional[List[str]] = None,
 22        sample_ids: Optional[np.ndarray] = None,
 23        sample_idxs: Optional[np.ndarray] = None,
 24        variant_ids: Optional[np.ndarray] = None,
 25        variant_idxs: Optional[np.ndarray] = None,
 26        sum_strands: bool = False,
 27        separator: Optional[str] = None,
 28    ) -> SNPObject:
 29        """
 30        Read a bed fileset (bed, bim, fam) into a SNPObject.
 31
 32        Args:
 33            fields (str, None, or list of str, optional): Fields to extract data for that should be included in the returned SNPObject.
 34                Available fields are 'GT', 'IID', 'REF', 'ALT', '#CHROM', 'ID', 'POS'.
 35                To extract all fields, set fields to None. Defaults to None.
 36            exclude_fields (str, None, or list of str, optional): Fields to exclude from the returned SNPObject.
 37                Available fields are 'GT', 'IID', 'REF', 'ALT', '#CHROM', 'ID', 'POS'.
 38                To exclude no fields, set exclude_fields to None. Defaults to None.
 39            sample_ids: List of sample IDs to read. If None and sample_idxs is None, all samples are read.
 40            sample_idxs: List of sample indices to read. If None and sample_ids is None, all samples are read.
 41            variant_ids: List of variant IDs to read. If None and variant_idxs is None, all variants are read.
 42            variant_idxs: List of variant indices to read. If None and variant_ids is None, all variants are read.
 43            sum_strands: If True, maternal and paternal strands are combined into a single `int8` array with values `{0, 1, 2`}. 
 44                If False, strands are stored separately as an `int8` array with values `{0, 1}` for each strand. 
 45                Note: With the pgenlib backend, `False` uses `~8×` more RAM, though `calldata_gt` is only `2×` larger.
 46            separator: Separator used in the pvar file. If None, the separator is automatically detected.
 47                If the automatic detection fails, please specify the separator manually.
 48
 49        Returns:
 50            **SNPObject**: 
 51                A SNPObject instance.
 52        """
 53        assert (
 54            sample_idxs is None or sample_ids is None
 55        ), "Only one of sample_idxs and sample_ids can be specified"
 56        assert (
 57            variant_idxs is None or variant_ids is None
 58        ), "Only one of variant_idxs and variant_ids can be specified"
 59
 60        if isinstance(fields, str):
 61            fields = [fields]
 62        if isinstance(exclude_fields, str):
 63            exclude_fields = [exclude_fields]
 64
 65        fields = fields or ["GT", "IID", "REF", "ALT", "#CHROM", "ID", "POS"]
 66        exclude_fields = exclude_fields or []
 67        fields = [field for field in fields if field not in exclude_fields]
 68        only_read_bed = fields == ["GT"] and variant_idxs is None and sample_idxs is None
 69
 70        filename_noext = str(self.filename)
 71        if filename_noext[-4:].lower() in (".bed", ".bim", ".fam"):
 72            filename_noext = filename_noext[:-4]
 73
 74        if only_read_bed:
 75            with open(filename_noext + '.fam', 'r') as f:
 76                file_num_samples = sum(1 for _ in f)  # Get sample count from fam file
 77            file_num_variants = None  # Not needed
 78        else:
 79            log.info(f"Reading {filename_noext}.bim")
 80
 81            if separator is None:
 82                with open(filename_noext + ".bim", "r") as file:
 83                    separator = csv.Sniffer().sniff(file.readline()).delimiter
 84
 85            bim = pl.read_csv(
 86                filename_noext + ".bim",
 87                separator=separator,
 88                has_header=False,
 89                new_columns=["#CHROM", "ID", "CM", "POS", "ALT", "REF"],
 90                schema_overrides={
 91                    "#CHROM": pl.String,
 92                    "ID": pl.String,
 93                    "CM": pl.Float64,
 94                    "POS": pl.Int64,
 95                    "ALT": pl.String,
 96                    "REF": pl.String
 97                },
 98                null_values=["NA"]
 99            ).with_row_index()
100            file_num_variants = bim.height
101
102            if variant_ids is not None:
103                variant_id_values = [str(v) for v in np.atleast_1d(variant_ids)]
104                variant_id_or_pos = (
105                    pl.col("ID").is_in(variant_id_values)
106                    | pl.concat_str(
107                        [pl.col("#CHROM"), pl.lit(":"), pl.col("POS").cast(pl.String)]
108                    ).is_in(variant_id_values)
109                )
110                variant_idxs = (
111                    bim.filter(variant_id_or_pos)
112                    .select("index")
113                    .to_series()
114                    .to_numpy()
115                )
116
117            if variant_idxs is None:
118                num_variants = file_num_variants
119                variant_idxs = np.arange(num_variants, dtype=np.uint32)
120            else:
121                requested_variant_idxs = np.asarray(variant_idxs, dtype=np.uint32).ravel()
122                bim = bim.filter(pl.col("index").is_in(requested_variant_idxs))
123                variant_idxs = bim.select("index").to_series().to_numpy()
124                variant_idxs = np.asarray(variant_idxs, dtype=np.uint32)
125                num_variants = np.size(variant_idxs)
126
127            log.info(f"Reading {filename_noext}.fam")
128
129            fam = pl.read_csv(
130                filename_noext + ".fam",
131                separator=separator,
132                has_header=False,
133                new_columns=["Family ID", "IID", "Father ID",
134                             "Mother ID", "Sex code", "Phenotype value"],
135                schema_overrides={
136                    "Family ID": pl.String,
137                    "IID": pl.String,
138                    "Father ID": pl.String,
139                    "Mother ID": pl.String,
140                    "Sex code": pl.String,
141                },
142                null_values=["NA"]
143            ).with_row_index()
144            file_num_samples = fam.height
145
146            if sample_ids is not None:
147                sample_idxs = fam.filter(pl.col("IID").is_in(sample_ids)).select("index").to_series().to_numpy()
148
149            if sample_idxs is None:
150                num_samples = file_num_samples
151            else:
152                num_samples = np.size(sample_idxs)
153                sample_idxs = np.array(sample_idxs, dtype=np.uint32)
154                fam = fam.filter(pl.col("index").is_in(sample_idxs))
155
156        if "GT" in fields:
157            log.info(f"Reading {filename_noext}.bed")
158            pgen_reader = pg.PgenReader(
159                str.encode(filename_noext + ".bed"),
160                raw_sample_ct=file_num_samples,
161                variant_ct=file_num_variants,
162                sample_subset=sample_idxs,
163            )
164
165            if only_read_bed:
166                num_samples = pgen_reader.get_raw_sample_ct()
167                num_variants = pgen_reader.get_variant_ct()
168                variant_idxs = np.arange(num_variants, dtype=np.uint32)
169
170            # required arrays: variant_idxs + sample_idxs + genotypes
171            if not sum_strands:
172                required_ram = (num_samples + num_variants + num_variants * 2 * num_samples) * 4
173            else:
174                required_ram = (num_samples + num_variants) * 4 + num_variants * num_samples
175            log.info(f">{required_ram / 1024**3:.2f} GiB of RAM are required to process {num_samples} samples with {num_variants} variants each")
176
177            if not sum_strands:
178                genotypes = np.empty((num_variants, 2 * num_samples), dtype=np.int32)  # cannot use int8 because of pgenlib
179                pgen_reader.read_alleles_list(variant_idxs, genotypes)
180                genotypes = genotypes.astype(np.int8).reshape((num_variants, num_samples, 2))
181            else:
182                genotypes = np.empty((num_variants, num_samples), dtype=np.int8)
183                pgen_reader.read_list(variant_idxs, genotypes)
184            pgen_reader.close()
185        else:
186            genotypes = None
187
188        log.info("Constructing SNPObject")
189
190        snpobj = SNPObject(
191            calldata_gt=genotypes if "GT" in fields else None,
192            samples=fam.get_column("IID").to_numpy() if "IID" in fields and "IID" in fam.columns else None,
193            **{f'variants_{k.lower()}': bim.get_column(v).to_numpy() if v in fields and v in bim.columns else None
194               for k, v in {'ref': 'REF', 'alt': 'ALT', 'chrom': '#CHROM', 'id': 'ID', 'pos': 'POS'}.items()}
195        )
196
197        log.info("Finished constructing SNPObject")
198        return snpobj
199
200    def _resolve_variant_idxs_for_iter(
201        self,
202        *,
203        variant_ids: Optional[np.ndarray],
204        variant_idxs: Optional[np.ndarray],
205        separator: Optional[str],
206    ) -> np.ndarray:
207        """
208        Resolve variant selectors to canonical file-order row indices.
209        """
210        filename_noext = str(self.filename)
211        if filename_noext[-4:].lower() in (".bed", ".bim", ".fam"):
212            filename_noext = filename_noext[:-4]
213
214        local_separator = separator
215        if local_separator is None:
216            with open(filename_noext + ".bim", "r") as file:
217                local_separator = csv.Sniffer().sniff(file.readline()).delimiter
218
219        bim = pl.read_csv(
220            filename_noext + ".bim",
221            separator=local_separator,
222            has_header=False,
223            new_columns=["#CHROM", "ID", "CM", "POS", "ALT", "REF"],
224            schema_overrides={
225                "#CHROM": pl.String,
226                "ID": pl.String,
227                "CM": pl.Float64,
228                "POS": pl.Int64,
229                "ALT": pl.String,
230                "REF": pl.String,
231            },
232            null_values=["NA"],
233        ).with_row_index()
234
235        if variant_ids is not None:
236            variant_id_values = [str(v) for v in np.atleast_1d(variant_ids)]
237            variant_id_or_pos = (
238                pl.col("ID").is_in(variant_id_values)
239                | pl.concat_str([pl.col("#CHROM"), pl.lit(":"), pl.col("POS").cast(pl.String)]).is_in(
240                    variant_id_values
241                )
242            )
243            resolved = (
244                bim.filter(variant_id_or_pos)
245                .select("index")
246                .to_series()
247                .to_numpy()
248            )
249            return np.asarray(resolved, dtype=np.uint32)
250
251        if variant_idxs is not None:
252            requested = np.asarray(variant_idxs, dtype=np.uint32).ravel()
253            resolved = (
254                bim.filter(pl.col("index").is_in(requested))
255                .select("index")
256                .to_series()
257                .to_numpy()
258            )
259            return np.asarray(resolved, dtype=np.uint32)
260
261        return np.arange(bim.height, dtype=np.uint32)
262
263    def iter_read(
264        self,
265        fields: Optional[List[str]] = None,
266        exclude_fields: Optional[List[str]] = None,
267        sample_ids: Optional[np.ndarray] = None,
268        sample_idxs: Optional[np.ndarray] = None,
269        variant_ids: Optional[np.ndarray] = None,
270        variant_idxs: Optional[np.ndarray] = None,
271        sum_strands: bool = False,
272        separator: Optional[str] = None,
273        chunk_size: int = 10_000,
274    ) -> Iterator[SNPObject]:
275        """
276        Stream the BED fileset in variant chunks.
277
278        This yields a sequence of SNPObject chunks along the SNP axis.
279        """
280        if chunk_size < 1:
281            raise ValueError("chunk_size must be >= 1.")
282        if sample_idxs is not None and sample_ids is not None:
283            raise ValueError("Only one of sample_idxs and sample_ids can be specified.")
284        if variant_idxs is not None and variant_ids is not None:
285            raise ValueError("Only one of variant_idxs and variant_ids can be specified.")
286
287        selectors = self._resolve_variant_idxs_for_iter(
288            variant_ids=variant_ids,
289            variant_idxs=variant_idxs,
290            separator=separator,
291        )
292
293        n_selectors = int(selectors.size)
294        for start in range(0, n_selectors, int(chunk_size)):
295            stop = min(start + int(chunk_size), n_selectors)
296            selector_chunk = np.asarray(selectors[start:stop], dtype=np.uint32)
297            yield self.read(
298                fields=fields,
299                exclude_fields=exclude_fields,
300                sample_ids=sample_ids,
301                sample_idxs=sample_idxs,
302                variant_idxs=selector_chunk,
303                sum_strands=sum_strands,
304                separator=separator,
305            )

Abstract class for SNP readers.

Attributes:
  • _filename: The path to the file storing SNP data.
def read( self, fields: List[str] | None = None, exclude_fields: List[str] | None = None, sample_ids: numpy.ndarray | None = None, sample_idxs: numpy.ndarray | None = None, variant_ids: numpy.ndarray | None = None, variant_idxs: numpy.ndarray | None = None, sum_strands: bool = False, separator: str | None = None) -> SNPObject:
 18    def read(
 19        self,
 20        fields: Optional[List[str]] = None,
 21        exclude_fields: Optional[List[str]] = None,
 22        sample_ids: Optional[np.ndarray] = None,
 23        sample_idxs: Optional[np.ndarray] = None,
 24        variant_ids: Optional[np.ndarray] = None,
 25        variant_idxs: Optional[np.ndarray] = None,
 26        sum_strands: bool = False,
 27        separator: Optional[str] = None,
 28    ) -> SNPObject:
 29        """
 30        Read a bed fileset (bed, bim, fam) into a SNPObject.
 31
 32        Args:
 33            fields (str, None, or list of str, optional): Fields to extract data for that should be included in the returned SNPObject.
 34                Available fields are 'GT', 'IID', 'REF', 'ALT', '#CHROM', 'ID', 'POS'.
 35                To extract all fields, set fields to None. Defaults to None.
 36            exclude_fields (str, None, or list of str, optional): Fields to exclude from the returned SNPObject.
 37                Available fields are 'GT', 'IID', 'REF', 'ALT', '#CHROM', 'ID', 'POS'.
 38                To exclude no fields, set exclude_fields to None. Defaults to None.
 39            sample_ids: List of sample IDs to read. If None and sample_idxs is None, all samples are read.
 40            sample_idxs: List of sample indices to read. If None and sample_ids is None, all samples are read.
 41            variant_ids: List of variant IDs to read. If None and variant_idxs is None, all variants are read.
 42            variant_idxs: List of variant indices to read. If None and variant_ids is None, all variants are read.
 43            sum_strands: If True, maternal and paternal strands are combined into a single `int8` array with values `{0, 1, 2`}. 
 44                If False, strands are stored separately as an `int8` array with values `{0, 1}` for each strand. 
 45                Note: With the pgenlib backend, `False` uses `~8×` more RAM, though `calldata_gt` is only `2×` larger.
 46            separator: Separator used in the pvar file. If None, the separator is automatically detected.
 47                If the automatic detection fails, please specify the separator manually.
 48
 49        Returns:
 50            **SNPObject**: 
 51                A SNPObject instance.
 52        """
 53        assert (
 54            sample_idxs is None or sample_ids is None
 55        ), "Only one of sample_idxs and sample_ids can be specified"
 56        assert (
 57            variant_idxs is None or variant_ids is None
 58        ), "Only one of variant_idxs and variant_ids can be specified"
 59
 60        if isinstance(fields, str):
 61            fields = [fields]
 62        if isinstance(exclude_fields, str):
 63            exclude_fields = [exclude_fields]
 64
 65        fields = fields or ["GT", "IID", "REF", "ALT", "#CHROM", "ID", "POS"]
 66        exclude_fields = exclude_fields or []
 67        fields = [field for field in fields if field not in exclude_fields]
 68        only_read_bed = fields == ["GT"] and variant_idxs is None and sample_idxs is None
 69
 70        filename_noext = str(self.filename)
 71        if filename_noext[-4:].lower() in (".bed", ".bim", ".fam"):
 72            filename_noext = filename_noext[:-4]
 73
 74        if only_read_bed:
 75            with open(filename_noext + '.fam', 'r') as f:
 76                file_num_samples = sum(1 for _ in f)  # Get sample count from fam file
 77            file_num_variants = None  # Not needed
 78        else:
 79            log.info(f"Reading {filename_noext}.bim")
 80
 81            if separator is None:
 82                with open(filename_noext + ".bim", "r") as file:
 83                    separator = csv.Sniffer().sniff(file.readline()).delimiter
 84
 85            bim = pl.read_csv(
 86                filename_noext + ".bim",
 87                separator=separator,
 88                has_header=False,
 89                new_columns=["#CHROM", "ID", "CM", "POS", "ALT", "REF"],
 90                schema_overrides={
 91                    "#CHROM": pl.String,
 92                    "ID": pl.String,
 93                    "CM": pl.Float64,
 94                    "POS": pl.Int64,
 95                    "ALT": pl.String,
 96                    "REF": pl.String
 97                },
 98                null_values=["NA"]
 99            ).with_row_index()
100            file_num_variants = bim.height
101
102            if variant_ids is not None:
103                variant_id_values = [str(v) for v in np.atleast_1d(variant_ids)]
104                variant_id_or_pos = (
105                    pl.col("ID").is_in(variant_id_values)
106                    | pl.concat_str(
107                        [pl.col("#CHROM"), pl.lit(":"), pl.col("POS").cast(pl.String)]
108                    ).is_in(variant_id_values)
109                )
110                variant_idxs = (
111                    bim.filter(variant_id_or_pos)
112                    .select("index")
113                    .to_series()
114                    .to_numpy()
115                )
116
117            if variant_idxs is None:
118                num_variants = file_num_variants
119                variant_idxs = np.arange(num_variants, dtype=np.uint32)
120            else:
121                requested_variant_idxs = np.asarray(variant_idxs, dtype=np.uint32).ravel()
122                bim = bim.filter(pl.col("index").is_in(requested_variant_idxs))
123                variant_idxs = bim.select("index").to_series().to_numpy()
124                variant_idxs = np.asarray(variant_idxs, dtype=np.uint32)
125                num_variants = np.size(variant_idxs)
126
127            log.info(f"Reading {filename_noext}.fam")
128
129            fam = pl.read_csv(
130                filename_noext + ".fam",
131                separator=separator,
132                has_header=False,
133                new_columns=["Family ID", "IID", "Father ID",
134                             "Mother ID", "Sex code", "Phenotype value"],
135                schema_overrides={
136                    "Family ID": pl.String,
137                    "IID": pl.String,
138                    "Father ID": pl.String,
139                    "Mother ID": pl.String,
140                    "Sex code": pl.String,
141                },
142                null_values=["NA"]
143            ).with_row_index()
144            file_num_samples = fam.height
145
146            if sample_ids is not None:
147                sample_idxs = fam.filter(pl.col("IID").is_in(sample_ids)).select("index").to_series().to_numpy()
148
149            if sample_idxs is None:
150                num_samples = file_num_samples
151            else:
152                num_samples = np.size(sample_idxs)
153                sample_idxs = np.array(sample_idxs, dtype=np.uint32)
154                fam = fam.filter(pl.col("index").is_in(sample_idxs))
155
156        if "GT" in fields:
157            log.info(f"Reading {filename_noext}.bed")
158            pgen_reader = pg.PgenReader(
159                str.encode(filename_noext + ".bed"),
160                raw_sample_ct=file_num_samples,
161                variant_ct=file_num_variants,
162                sample_subset=sample_idxs,
163            )
164
165            if only_read_bed:
166                num_samples = pgen_reader.get_raw_sample_ct()
167                num_variants = pgen_reader.get_variant_ct()
168                variant_idxs = np.arange(num_variants, dtype=np.uint32)
169
170            # required arrays: variant_idxs + sample_idxs + genotypes
171            if not sum_strands:
172                required_ram = (num_samples + num_variants + num_variants * 2 * num_samples) * 4
173            else:
174                required_ram = (num_samples + num_variants) * 4 + num_variants * num_samples
175            log.info(f">{required_ram / 1024**3:.2f} GiB of RAM are required to process {num_samples} samples with {num_variants} variants each")
176
177            if not sum_strands:
178                genotypes = np.empty((num_variants, 2 * num_samples), dtype=np.int32)  # cannot use int8 because of pgenlib
179                pgen_reader.read_alleles_list(variant_idxs, genotypes)
180                genotypes = genotypes.astype(np.int8).reshape((num_variants, num_samples, 2))
181            else:
182                genotypes = np.empty((num_variants, num_samples), dtype=np.int8)
183                pgen_reader.read_list(variant_idxs, genotypes)
184            pgen_reader.close()
185        else:
186            genotypes = None
187
188        log.info("Constructing SNPObject")
189
190        snpobj = SNPObject(
191            calldata_gt=genotypes if "GT" in fields else None,
192            samples=fam.get_column("IID").to_numpy() if "IID" in fields and "IID" in fam.columns else None,
193            **{f'variants_{k.lower()}': bim.get_column(v).to_numpy() if v in fields and v in bim.columns else None
194               for k, v in {'ref': 'REF', 'alt': 'ALT', 'chrom': '#CHROM', 'id': 'ID', 'pos': 'POS'}.items()}
195        )
196
197        log.info("Finished constructing SNPObject")
198        return snpobj

Read a bed fileset (bed, bim, fam) into a SNPObject.

Arguments:
  • fields (str, None, or list of str, optional): Fields to extract data for that should be included in the returned SNPObject. Available fields are 'GT', 'IID', 'REF', 'ALT', '#CHROM', 'ID', 'POS'. To extract all fields, set fields to None. Defaults to None.
  • exclude_fields (str, None, or list of str, optional): Fields to exclude from the returned SNPObject. Available fields are 'GT', 'IID', 'REF', 'ALT', '#CHROM', 'ID', 'POS'. To exclude no fields, set exclude_fields to None. Defaults to None.
  • sample_ids: List of sample IDs to read. If None and sample_idxs is None, all samples are read.
  • sample_idxs: List of sample indices to read. If None and sample_ids is None, all samples are read.
  • variant_ids: List of variant IDs to read. If None and variant_idxs is None, all variants are read.
  • variant_idxs: List of variant indices to read. If None and variant_ids is None, all variants are read.
  • sum_strands: If True, maternal and paternal strands are combined into a single int8 array with values {0, 1, 2}. If False, strands are stored separately as an int8 array with values {0, 1} for each strand. Note: With the pgenlib backend, False uses ~8× more RAM, though calldata_gt is only 2× larger.
  • separator: Separator used in the bim and fam files. If None, the separator is automatically detected from the bim file. If the automatic detection fails, please specify the separator manually.
Returns:

SNPObject: A SNPObject instance.

def iter_read( self, fields: List[str] | None = None, exclude_fields: List[str] | None = None, sample_ids: numpy.ndarray | None = None, sample_idxs: numpy.ndarray | None = None, variant_ids: numpy.ndarray | None = None, variant_idxs: numpy.ndarray | None = None, sum_strands: bool = False, separator: str | None = None, chunk_size: int = 10000) -> Iterator[SNPObject]:
263    def iter_read(
264        self,
265        fields: Optional[List[str]] = None,
266        exclude_fields: Optional[List[str]] = None,
267        sample_ids: Optional[np.ndarray] = None,
268        sample_idxs: Optional[np.ndarray] = None,
269        variant_ids: Optional[np.ndarray] = None,
270        variant_idxs: Optional[np.ndarray] = None,
271        sum_strands: bool = False,
272        separator: Optional[str] = None,
273        chunk_size: int = 10_000,
274    ) -> Iterator[SNPObject]:
275        """
276        Stream the BED fileset in variant chunks.
277
278        This yields a sequence of SNPObject chunks along the SNP axis.
279        """
280        if chunk_size < 1:
281            raise ValueError("chunk_size must be >= 1.")
282        if sample_idxs is not None and sample_ids is not None:
283            raise ValueError("Only one of sample_idxs and sample_ids can be specified.")
284        if variant_idxs is not None and variant_ids is not None:
285            raise ValueError("Only one of variant_idxs and variant_ids can be specified.")
286
287        selectors = self._resolve_variant_idxs_for_iter(
288            variant_ids=variant_ids,
289            variant_idxs=variant_idxs,
290            separator=separator,
291        )
292
293        n_selectors = int(selectors.size)
294        for start in range(0, n_selectors, int(chunk_size)):
295            stop = min(start + int(chunk_size), n_selectors)
296            selector_chunk = np.asarray(selectors[start:stop], dtype=np.uint32)
297            yield self.read(
298                fields=fields,
299                exclude_fields=exclude_fields,
300                sample_ids=sample_ids,
301                sample_idxs=sample_idxs,
302                variant_idxs=selector_chunk,
303                sum_strands=sum_strands,
304                separator=separator,
305            )

Stream the BED fileset in variant chunks.

This yields a sequence of SNPObject chunks along the SNP axis.

@SNPBaseReader.register
class GRGReader(snputils.snp.io.read.base.SNPBaseReader):
 9@SNPBaseReader.register
10class GRGReader(SNPBaseReader):
11    def read(self,
12             mutable: Optional[bool] = None,
13             load_up_edges: Optional[bool] = None,
14             binary_mutations: Optional[bool] = None) -> GRGObject:
15        """
16        Read in a GRG or TSKit File
17        """
18        file = str(pathlib.Path(self.filename).resolve())
19        extension = pathlib.Path(file).suffix.lower()
20        edges = load_up_edges if load_up_edges is not None else True
21        binmuts = binary_mutations if binary_mutations is not None else False
22
23        if extension == ".trees":
24            return GRGObject(calldata_gt=pyg.grg_from_trees(file, binmuts), filename=file, mutable=True)
25        if mutable:
26            return GRGObject(calldata_gt=pyg.load_mutable_grg(file), filename=file, mutable=True)
27
28        return GRGObject(calldata_gt=pyg.load_immutable_grg(file, edges), filename=file, mutable=False)

Abstract class for SNP readers.

Attributes:
  • _filename: The path to the file storing SNP data.
def read( self, mutable: bool | None = None, load_up_edges: bool | None = None, binary_mutations: bool | None = None) -> GRGObject:
11    def read(self,
12             mutable: Optional[bool] = None,
13             load_up_edges: Optional[bool] = None,
14             binary_mutations: Optional[bool] = None) -> GRGObject:
15        """
16        Read in a GRG or TSKit File
17        """
18        file = str(pathlib.Path(self.filename).resolve())
19        extension = pathlib.Path(file).suffix.lower()
20        edges = load_up_edges if load_up_edges is not None else True
21        binmuts = binary_mutations if binary_mutations is not None else False
22
23        if extension == ".trees":
24            return GRGObject(calldata_gt=pyg.grg_from_trees(file, binmuts), filename=file, mutable=True)
25        if mutable:
26            return GRGObject(calldata_gt=pyg.load_mutable_grg(file), filename=file, mutable=True)
27
28        return GRGObject(calldata_gt=pyg.load_immutable_grg(file, edges), filename=file, mutable=False)

Read in a GRG or TSKit File

class GRGWriter:
 9class GRGWriter:
10    def __init__(self, grgobj: Union[pyg.GRG, pyg.MutableGRG], filename: str):
11        self.grgobj = grgobj
12        self.mutability = False if isinstance(self.grgobj, pyg.GRG) else True
13        self.filename = filename
14    
15    def write(self, allow_simplify : Optional[bool]                         = None, 
16                    subset         : Optional[bool]                         = None,
17                    direction      : Optional[pyg.TraversalDirection]       = None,
18                    seed_list      : Optional[List[int]]                    = None,
19                    bp_range       : Optional[Tuple[int, int]]              = None):
20        """
21        """
22
23        if subset:
24            if direction is None:
25                raise ValueError("If subset is True, 'direction' must be provided.")
26            if seed_list is None:
27                raise ValueError("If subset is True, 'seed_list' must be provided.")
28            _bp_range = (0,0) if bp_range is None else bp_range
29            pyg.save_subset(self.grgobj, self.filename, direction, seed_list, _bp_range) 
30        else:
31            _allow_simplify = True if allow_simplify is None else allow_simplify 
32            pyg.save_grg(self.grgobj, self.filename, _allow_simplify)
GRGWriter(grgobj: _grgl.GRG | _grgl.MutableGRG, filename: str)
10    def __init__(self, grgobj: Union[pyg.GRG, pyg.MutableGRG], filename: str):
11        self.grgobj = grgobj
12        self.mutability = False if isinstance(self.grgobj, pyg.GRG) else True
13        self.filename = filename
grgobj
mutability
filename
def write( self, allow_simplify: bool | None = None, subset: bool | None = None, direction: _grgl.TraversalDirection | None = None, seed_list: List[int] | None = None, bp_range: Tuple[int, int] | None = None):
15    def write(self, allow_simplify : Optional[bool]                         = None, 
16                    subset         : Optional[bool]                         = None,
17                    direction      : Optional[pyg.TraversalDirection]       = None,
18                    seed_list      : Optional[List[int]]                    = None,
19                    bp_range       : Optional[Tuple[int, int]]              = None):
20        """
21        """
22
23        if subset:
24            if direction is None:
25                raise ValueError("If subset is True, 'direction' must be provided.")
26            if seed_list is None:
27                raise ValueError("If subset is True, 'seed_list' must be provided.")
28            _bp_range = (0,0) if bp_range is None else bp_range
29            pyg.save_subset(self.grgobj, self.filename, direction, seed_list, _bp_range) 
30        else:
31            _allow_simplify = True if allow_simplify is None else allow_simplify 
32            pyg.save_grg(self.grgobj, self.filename, _allow_simplify)
@SNPBaseReader.register
class PGENReader(snputils.snp.io.read.base.SNPBaseReader):
 24@SNPBaseReader.register
 25class PGENReader(SNPBaseReader):
 26    def read(
 27        self,
 28        fields: Optional[List[str]] = None,
 29        exclude_fields: Optional[List[str]] = None,
 30        sample_ids: Optional[np.ndarray] = None,
 31        sample_idxs: Optional[np.ndarray] = None,
 32        variant_ids: Optional[np.ndarray] = None,
 33        variant_idxs: Optional[np.ndarray] = None,
 34        sum_strands: bool = False,
 35        separator: str = None,
 36    ) -> SNPObject:
 37        """
 38        Read a pgen fileset (pgen, psam, pvar) into a SNPObject.
 39
 40        Args:
 41            fields (str, None, or list of str, optional): Fields to extract data for that should be included in the returned SNPObject.
 42                Available fields are 'GT', 'IID', 'REF', 'ALT', '#CHROM', 'ID', 'POS', 'FILTER', 'QUAL'.
 43                To extract all fields, set fields to None. Defaults to None.
 44            exclude_fields (str, None, or list of str, optional): Fields to exclude from the returned SNPObject.
 45                Available fields are 'GT', 'IID', 'REF', 'ALT', '#CHROM', 'ID', 'POS', 'FILTER', 'QUAL'.
 46                To exclude no fields, set exclude_fields to None. Defaults to None.
 47            sample_ids: List of sample IDs to read. If None and sample_idxs is None, all samples are read.
 48            sample_idxs: List of sample indices to read. If None and sample_ids is None, all samples are read.
 49            variant_ids: List of variant IDs to read. If None and variant_idxs is None, all variants are read.
 50            variant_idxs: List of variant indices to read. If None and variant_ids is None, all variants are read.
 51            sum_strands: If True, maternal and paternal strands are combined into a single `int8` array with values `{0, 1, 2`}. 
 52                If False, strands are stored separately as an `int8` array with values `{0, 1}` for each strand. 
 53                Note: With the pgenlib backend, `False` uses `~8×` more RAM, though `calldata_gt` is only `2×` larger.
 54            separator: Separator used in the pvar file. If None, the separator is automatically detected.
 55                If the automatic detection fails, please specify the separator manually.
 56
 57        Returns:
 58            **SNPObject**: 
 59                A SNPObject instance.
 60        """
 61        assert (
 62            sample_idxs is None or sample_ids is None
 63        ), "Only one of sample_idxs and sample_ids can be specified"
 64        assert (
 65            variant_idxs is None or variant_ids is None
 66        ), "Only one of variant_idxs and variant_ids can be specified"
 67
 68        if isinstance(fields, str):
 69            fields = [fields]
 70        if isinstance(exclude_fields, str):
 71            exclude_fields = [exclude_fields]
 72
 73        fields = fields or ["GT", "IID", "REF", "ALT", "#CHROM", "ID", "POS", "FILTER", "QUAL"]
 74        exclude_fields = exclude_fields or []
 75        fields = [field for field in fields if field not in exclude_fields]
 76        only_read_pgen = fields == ["GT"] and variant_idxs is None and sample_idxs is None
 77
 78        filename_noext = str(self.filename)
 79        for ext in [".pgen", ".pvar", ".pvar.zst", ".psam"]:
 80            if filename_noext.endswith(ext):
 81                filename_noext = filename_noext[:-len(ext)]
 82                break
 83
 84        if only_read_pgen:
 85            file_num_samples = None  # Not needed for pgen
 86            file_num_variants = None  # Not needed
 87        else:
 88            pvar_extensions = [".pvar", ".pvar.zst"]
 89            pvar_filename = None
 90            for ext in pvar_extensions:
 91                possible_pvar = filename_noext + ext
 92                if os.path.exists(possible_pvar):
 93                    pvar_filename = possible_pvar
 94                    break
 95            if pvar_filename is None:
 96                raise FileNotFoundError(f"No .pvar or .pvar.zst file found for {filename_noext}")
 97
 98            log.info(f"Reading {pvar_filename}")
 99
100            pvar_has_header = True
101            pvar_header_line_num = 0
102            with _open_textfile(pvar_filename) as file:
103                for line_num, line in enumerate(file):
104                    if line.startswith("##"):  # Metadata
105                        continue
106                    else:
107                        if separator is None:
108                            separator = csv.Sniffer().sniff(file.readline()).delimiter
109                        if line.startswith("#CHROM"):  # Header
110                            pvar_header_line_num = line_num
111                            header = line.strip().split()
112                            break
113                        elif not line.startswith("#"):  # If no header, look at line 1
114                            pvar_has_header = False
115                            cols_in_pvar = len(line.strip().split(separator))
116                            if cols_in_pvar == 5:
117                                header = ["#CHROM", "ID", "POS", "ALT", "REF"]
118                            elif cols_in_pvar == 6:
119                                header = ["#CHROM", "ID", "CM", "POS", "ALT", "REF"]
120                            else:
121                                raise ValueError(
122                                    f"{pvar_filename} is not a valid pvar file."
123                                )
124                            break
125
126            pvar_reading_args = {
127                'separator': separator,
128                'skip_rows': pvar_header_line_num,
129                'has_header': pvar_has_header,
130                'new_columns': None if pvar_has_header else header,
131                'schema_overrides': {
132                    "#CHROM": pl.String,
133                    "POS": pl.UInt32,
134                    "ID": pl.String,
135                    "REF": pl.String,
136                    "ALT": pl.String,
137                },
138                'null_values': ["NA"],
139            }
140            if pvar_filename.endswith('.zst'):
141                pvar = pl.read_csv(pvar_filename, **pvar_reading_args).lazy()
142            else:
143                pvar = pl.scan_csv(pvar_filename, **pvar_reading_args)
144
145            # We need to map requested IDs to row positions before reading genotypes.
146            variant_meta = pvar.select(["ID", "#CHROM", "POS"]).with_row_index().collect()
147            file_num_variants = variant_meta.height
148
149            if variant_ids is not None:
150                variant_id_values = [str(v) for v in np.atleast_1d(variant_ids)]
151                variant_id_or_pos = (
152                    pl.col("ID").is_in(variant_id_values)
153                    | pl.concat_str(
154                        [pl.col("#CHROM"), pl.lit(":"), pl.col("POS").cast(pl.String)]
155                    ).is_in(variant_id_values)
156                )
157                variant_idxs = (
158                    variant_meta.filter(variant_id_or_pos)
159                    .select("index")
160                    .to_series()
161                    .to_numpy()
162                )
163
164            if variant_idxs is None:
165                num_variants = file_num_variants
166                variant_idxs = np.arange(num_variants, dtype=np.uint32)
167                pvar = pvar.collect()
168            else:
169                pvar = (
170                    pvar.with_row_index()
171                    .filter(pl.col("index").is_in(np.asarray(variant_idxs, dtype=np.uint32).ravel()))
172                    .collect()
173                )
174                variant_idxs = pvar.select("index").to_series().to_numpy()
175                variant_idxs = np.asarray(variant_idxs, dtype=np.uint32)
176                num_variants = np.size(variant_idxs)
177                pvar = pvar.drop("index")
178
179            log.info(f"Reading {filename_noext}.psam")
180
181            with open(filename_noext + ".psam") as file:
182                first_line = file.readline().strip()
183                psam_has_header = first_line.startswith(("#FID", "FID", "#IID", "IID"))
184
185            psam = pl.read_csv(
186                filename_noext + ".psam",
187                separator=separator,
188                has_header=psam_has_header,
189                new_columns=None if psam_has_header else ["FID", "IID", "PAT", "MAT", "SEX", "PHENO1"],
190                null_values=["NA"],
191            ).with_row_index()
192            if "#IID" in psam.columns:
193                psam = psam.rename({"#IID": "IID"})
194            if "#FID" in psam.columns:
195                psam = psam.rename({"#FID": "FID"})
196
197            file_num_samples = psam.height
198
199            if sample_ids is not None:
200                psam = psam.filter(pl.col("IID").is_in(sample_ids))
201                sample_idxs = psam.select("index").to_series().to_numpy()
202                num_samples = np.size(sample_idxs)
203            elif sample_idxs is not None:
204                num_samples = np.size(sample_idxs)
205                sample_idxs = np.array(sample_idxs, dtype=np.uint32)
206                psam = psam.filter(pl.col("index").is_in(sample_idxs))
207            else:
208                num_samples = file_num_samples
209
210        if "GT" in fields:
211            log.info(f"Reading {filename_noext}.pgen")
212            pgen_reader = pg.PgenReader(
213                str.encode(filename_noext + ".pgen"),
214                raw_sample_ct=file_num_samples,
215                variant_ct=file_num_variants,
216                sample_subset=sample_idxs,
217            )
218
219            if only_read_pgen:
220                num_samples = pgen_reader.get_raw_sample_ct()
221                num_variants = pgen_reader.get_variant_ct()
222                variant_idxs = np.arange(num_variants, dtype=np.uint32)
223
224            # required arrays: variant_idxs + sample_idxs + genotypes
225            if not sum_strands:
226                required_ram = (num_samples + num_variants + num_variants * 2 * num_samples) * 4
227            else:
228                required_ram = (num_samples + num_variants) * 4 + num_variants * num_samples
229            log.info(f">{required_ram / 1024**3:.2f} GiB of RAM are required to process {num_samples} samples with {num_variants} variants each")
230
231            if not sum_strands:
232                genotypes = np.empty((num_variants, 2 * num_samples), dtype=np.int32)  # cannot use int8 because of pgenlib
233                pgen_reader.read_alleles_list(variant_idxs, genotypes)
234                genotypes = genotypes.astype(np.int8).reshape((num_variants, num_samples, 2))
235            else:
236                genotypes = np.empty((num_variants, num_samples), dtype=np.int8)
237                pgen_reader.read_list(variant_idxs, genotypes)
238            pgen_reader.close()
239        else:
240            genotypes = None
241
242        log.info("Constructing SNPObject")
243
244        snpobj = SNPObject(
245            calldata_gt=genotypes if "GT" in fields else None,
246            samples=psam.get_column("IID").to_numpy() if "IID" in fields and "IID" in psam.columns else None,
247            **{f'variants_{k.lower()}': pvar.get_column(v).to_numpy() if v in fields and v in pvar.columns else None
248               for k, v in {'ref': 'REF', 'alt': 'ALT', 'chrom': '#CHROM', 'id': 'ID', 'pos': 'POS', 'filter_pass': 'FILTER', 'qual': 'QUAL'}.items()}
249        )
250
251        log.info("Finished constructing SNPObject")
252        return snpobj
253
254    def _resolve_variant_idxs_for_iter(
255        self,
256        *,
257        variant_ids: Optional[np.ndarray],
258        variant_idxs: Optional[np.ndarray],
259        separator: str = None,
260    ) -> np.ndarray:
261        """
262        Resolve variant selectors to canonical file-order row indices.
263        """
264        filename_noext = str(self.filename)
265        for ext in [".pgen", ".pvar", ".pvar.zst", ".psam"]:
266            if filename_noext.endswith(ext):
267                filename_noext = filename_noext[:-len(ext)]
268                break
269
270        pvar_filename = None
271        for ext in [".pvar", ".pvar.zst"]:
272            candidate = filename_noext + ext
273            if os.path.exists(candidate):
274                pvar_filename = candidate
275                break
276        if pvar_filename is None:
277            raise FileNotFoundError(f"No .pvar or .pvar.zst file found for {filename_noext}")
278
279        local_separator = separator
280
281        pvar_has_header = True
282        pvar_header_line_num = 0
283        with _open_textfile(pvar_filename) as file:
284            for line_num, line in enumerate(file):
285                if line.startswith("##"):
286                    continue
287                if local_separator is None:
288                    local_separator = csv.Sniffer().sniff(file.readline()).delimiter
289                if line.startswith("#CHROM"):
290                    pvar_header_line_num = line_num
291                    header = line.strip().split()
292                    break
293                if not line.startswith("#"):
294                    pvar_has_header = False
295                    cols_in_pvar = len(line.strip().split(local_separator))
296                    if cols_in_pvar == 5:
297                        header = ["#CHROM", "ID", "POS", "ALT", "REF"]
298                    elif cols_in_pvar == 6:
299                        header = ["#CHROM", "ID", "CM", "POS", "ALT", "REF"]
300                    else:
301                        raise ValueError(f"{pvar_filename} is not a valid pvar file.")
302                    break
303
304        pvar_reading_args = {
305            "separator": local_separator,
306            "skip_rows": pvar_header_line_num,
307            "has_header": pvar_has_header,
308            "new_columns": None if pvar_has_header else header,
309            "schema_overrides": {
310                "#CHROM": pl.String,
311                "POS": pl.UInt32,
312                "ID": pl.String,
313                "REF": pl.String,
314                "ALT": pl.String,
315            },
316            "null_values": ["NA"],
317        }
318        if pvar_filename.endswith(".zst"):
319            pvar = pl.read_csv(pvar_filename, **pvar_reading_args)
320        else:
321            pvar = pl.scan_csv(pvar_filename, **pvar_reading_args).collect()
322
323        variant_meta = pvar.select(["ID", "#CHROM", "POS"]).with_row_index()
324
325        if variant_ids is not None:
326            variant_id_values = [str(v) for v in np.atleast_1d(variant_ids)]
327            variant_id_or_pos = (
328                pl.col("ID").is_in(variant_id_values)
329                | pl.concat_str([pl.col("#CHROM"), pl.lit(":"), pl.col("POS").cast(pl.String)]).is_in(
330                    variant_id_values
331                )
332            )
333            resolved = (
334                variant_meta.filter(variant_id_or_pos)
335                .select("index")
336                .to_series()
337                .to_numpy()
338            )
339            return np.asarray(resolved, dtype=np.uint32)
340
341        if variant_idxs is not None:
342            requested = np.asarray(variant_idxs, dtype=np.uint32).ravel()
343            resolved = (
344                variant_meta.filter(pl.col("index").is_in(requested))
345                .select("index")
346                .to_series()
347                .to_numpy()
348            )
349            return np.asarray(resolved, dtype=np.uint32)
350
351        return np.arange(variant_meta.height, dtype=np.uint32)
352
353    def iter_read(
354        self,
355        fields: Optional[List[str]] = None,
356        exclude_fields: Optional[List[str]] = None,
357        sample_ids: Optional[np.ndarray] = None,
358        sample_idxs: Optional[np.ndarray] = None,
359        variant_ids: Optional[np.ndarray] = None,
360        variant_idxs: Optional[np.ndarray] = None,
361        sum_strands: bool = False,
362        separator: str = None,
363        chunk_size: int = 10_000,
364    ) -> Iterator[SNPObject]:
365        """
366        Stream the PGEN fileset in variant chunks.
367
368        This yields a sequence of SNPObject chunks along the SNP axis.
369        """
370        if chunk_size < 1:
371            raise ValueError("chunk_size must be >= 1.")
372        if sample_idxs is not None and sample_ids is not None:
373            raise ValueError("Only one of sample_idxs and sample_ids can be specified.")
374        if variant_idxs is not None and variant_ids is not None:
375            raise ValueError("Only one of variant_idxs and variant_ids can be specified.")
376
377        selectors = self._resolve_variant_idxs_for_iter(
378            variant_ids=variant_ids,
379            variant_idxs=variant_idxs,
380            separator=separator,
381        )
382
383        n_selectors = int(selectors.size)
384        for start in range(0, n_selectors, int(chunk_size)):
385            stop = min(start + int(chunk_size), n_selectors)
386            selector_chunk = np.asarray(selectors[start:stop], dtype=np.uint32)
387            yield self.read(
388                fields=fields,
389                exclude_fields=exclude_fields,
390                sample_ids=sample_ids,
391                sample_idxs=sample_idxs,
392                variant_idxs=selector_chunk,
393                sum_strands=sum_strands,
394                separator=separator,
395            )

Abstract class for SNP readers.

Attributes:
  • _filename: The path to the file storing SNP data.
def read( self, fields: List[str] | None = None, exclude_fields: List[str] | None = None, sample_ids: numpy.ndarray | None = None, sample_idxs: numpy.ndarray | None = None, variant_ids: numpy.ndarray | None = None, variant_idxs: numpy.ndarray | None = None, sum_strands: bool = False, separator: str = None) -> SNPObject:
 26    def read(
 27        self,
 28        fields: Optional[List[str]] = None,
 29        exclude_fields: Optional[List[str]] = None,
 30        sample_ids: Optional[np.ndarray] = None,
 31        sample_idxs: Optional[np.ndarray] = None,
 32        variant_ids: Optional[np.ndarray] = None,
 33        variant_idxs: Optional[np.ndarray] = None,
 34        sum_strands: bool = False,
 35        separator: str = None,
 36    ) -> SNPObject:
 37        """
 38        Read a pgen fileset (pgen, psam, pvar) into a SNPObject.
 39
 40        Args:
 41            fields (str, None, or list of str, optional): Fields to extract data for that should be included in the returned SNPObject.
 42                Available fields are 'GT', 'IID', 'REF', 'ALT', '#CHROM', 'ID', 'POS', 'FILTER', 'QUAL'.
 43                To extract all fields, set fields to None. Defaults to None.
 44            exclude_fields (str, None, or list of str, optional): Fields to exclude from the returned SNPObject.
 45                Available fields are 'GT', 'IID', 'REF', 'ALT', '#CHROM', 'ID', 'POS', 'FILTER', 'QUAL'.
 46                To exclude no fields, set exclude_fields to None. Defaults to None.
 47            sample_ids: List of sample IDs to read. If None and sample_idxs is None, all samples are read.
 48            sample_idxs: List of sample indices to read. If None and sample_ids is None, all samples are read.
 49            variant_ids: List of variant IDs to read. If None and variant_idxs is None, all variants are read.
 50            variant_idxs: List of variant indices to read. If None and variant_ids is None, all variants are read.
 51            sum_strands: If True, maternal and paternal strands are combined into a single `int8` array with values `{0, 1, 2`}. 
 52                If False, strands are stored separately as an `int8` array with values `{0, 1}` for each strand. 
 53                Note: With the pgenlib backend, `False` uses `~8×` more RAM, though `calldata_gt` is only `2×` larger.
 54            separator: Separator used in the pvar file. If None, the separator is automatically detected.
 55                If the automatic detection fails, please specify the separator manually.
 56
 57        Returns:
 58            **SNPObject**: 
 59                A SNPObject instance.
 60        """
 61        assert (
 62            sample_idxs is None or sample_ids is None
 63        ), "Only one of sample_idxs and sample_ids can be specified"
 64        assert (
 65            variant_idxs is None or variant_ids is None
 66        ), "Only one of variant_idxs and variant_ids can be specified"
 67
 68        if isinstance(fields, str):
 69            fields = [fields]
 70        if isinstance(exclude_fields, str):
 71            exclude_fields = [exclude_fields]
 72
 73        fields = fields or ["GT", "IID", "REF", "ALT", "#CHROM", "ID", "POS", "FILTER", "QUAL"]
 74        exclude_fields = exclude_fields or []
 75        fields = [field for field in fields if field not in exclude_fields]
 76        only_read_pgen = fields == ["GT"] and variant_idxs is None and sample_idxs is None
 77
 78        filename_noext = str(self.filename)
 79        for ext in [".pgen", ".pvar", ".pvar.zst", ".psam"]:
 80            if filename_noext.endswith(ext):
 81                filename_noext = filename_noext[:-len(ext)]
 82                break
 83
 84        if only_read_pgen:
 85            file_num_samples = None  # Not needed for pgen
 86            file_num_variants = None  # Not needed
 87        else:
 88            pvar_extensions = [".pvar", ".pvar.zst"]
 89            pvar_filename = None
 90            for ext in pvar_extensions:
 91                possible_pvar = filename_noext + ext
 92                if os.path.exists(possible_pvar):
 93                    pvar_filename = possible_pvar
 94                    break
 95            if pvar_filename is None:
 96                raise FileNotFoundError(f"No .pvar or .pvar.zst file found for {filename_noext}")
 97
 98            log.info(f"Reading {pvar_filename}")
 99
100            pvar_has_header = True
101            pvar_header_line_num = 0
102            with _open_textfile(pvar_filename) as file:
103                for line_num, line in enumerate(file):
104                    if line.startswith("##"):  # Metadata
105                        continue
106                    else:
107                        if separator is None:
108                            separator = csv.Sniffer().sniff(file.readline()).delimiter
109                        if line.startswith("#CHROM"):  # Header
110                            pvar_header_line_num = line_num
111                            header = line.strip().split()
112                            break
113                        elif not line.startswith("#"):  # If no header, look at line 1
114                            pvar_has_header = False
115                            cols_in_pvar = len(line.strip().split(separator))
116                            if cols_in_pvar == 5:
117                                header = ["#CHROM", "ID", "POS", "ALT", "REF"]
118                            elif cols_in_pvar == 6:
119                                header = ["#CHROM", "ID", "CM", "POS", "ALT", "REF"]
120                            else:
121                                raise ValueError(
122                                    f"{pvar_filename} is not a valid pvar file."
123                                )
124                            break
125
126            pvar_reading_args = {
127                'separator': separator,
128                'skip_rows': pvar_header_line_num,
129                'has_header': pvar_has_header,
130                'new_columns': None if pvar_has_header else header,
131                'schema_overrides': {
132                    "#CHROM": pl.String,
133                    "POS": pl.UInt32,
134                    "ID": pl.String,
135                    "REF": pl.String,
136                    "ALT": pl.String,
137                },
138                'null_values': ["NA"],
139            }
140            if pvar_filename.endswith('.zst'):
141                pvar = pl.read_csv(pvar_filename, **pvar_reading_args).lazy()
142            else:
143                pvar = pl.scan_csv(pvar_filename, **pvar_reading_args)
144
145            # We need to map requested IDs to row positions before reading genotypes.
146            variant_meta = pvar.select(["ID", "#CHROM", "POS"]).with_row_index().collect()
147            file_num_variants = variant_meta.height
148
149            if variant_ids is not None:
150                variant_id_values = [str(v) for v in np.atleast_1d(variant_ids)]
151                variant_id_or_pos = (
152                    pl.col("ID").is_in(variant_id_values)
153                    | pl.concat_str(
154                        [pl.col("#CHROM"), pl.lit(":"), pl.col("POS").cast(pl.String)]
155                    ).is_in(variant_id_values)
156                )
157                variant_idxs = (
158                    variant_meta.filter(variant_id_or_pos)
159                    .select("index")
160                    .to_series()
161                    .to_numpy()
162                )
163
164            if variant_idxs is None:
165                num_variants = file_num_variants
166                variant_idxs = np.arange(num_variants, dtype=np.uint32)
167                pvar = pvar.collect()
168            else:
169                pvar = (
170                    pvar.with_row_index()
171                    .filter(pl.col("index").is_in(np.asarray(variant_idxs, dtype=np.uint32).ravel()))
172                    .collect()
173                )
174                variant_idxs = pvar.select("index").to_series().to_numpy()
175                variant_idxs = np.asarray(variant_idxs, dtype=np.uint32)
176                num_variants = np.size(variant_idxs)
177                pvar = pvar.drop("index")
178
179            log.info(f"Reading {filename_noext}.psam")
180
181            with open(filename_noext + ".psam") as file:
182                first_line = file.readline().strip()
183                psam_has_header = first_line.startswith(("#FID", "FID", "#IID", "IID"))
184
185            psam = pl.read_csv(
186                filename_noext + ".psam",
187                separator=separator,
188                has_header=psam_has_header,
189                new_columns=None if psam_has_header else ["FID", "IID", "PAT", "MAT", "SEX", "PHENO1"],
190                null_values=["NA"],
191            ).with_row_index()
192            if "#IID" in psam.columns:
193                psam = psam.rename({"#IID": "IID"})
194            if "#FID" in psam.columns:
195                psam = psam.rename({"#FID": "FID"})
196
197            file_num_samples = psam.height
198
199            if sample_ids is not None:
200                psam = psam.filter(pl.col("IID").is_in(sample_ids))
201                sample_idxs = psam.select("index").to_series().to_numpy()
202                num_samples = np.size(sample_idxs)
203            elif sample_idxs is not None:
204                num_samples = np.size(sample_idxs)
205                sample_idxs = np.array(sample_idxs, dtype=np.uint32)
206                psam = psam.filter(pl.col("index").is_in(sample_idxs))
207            else:
208                num_samples = file_num_samples
209
210        if "GT" in fields:
211            log.info(f"Reading {filename_noext}.pgen")
212            pgen_reader = pg.PgenReader(
213                str.encode(filename_noext + ".pgen"),
214                raw_sample_ct=file_num_samples,
215                variant_ct=file_num_variants,
216                sample_subset=sample_idxs,
217            )
218
219            if only_read_pgen:
220                num_samples = pgen_reader.get_raw_sample_ct()
221                num_variants = pgen_reader.get_variant_ct()
222                variant_idxs = np.arange(num_variants, dtype=np.uint32)
223
224            # required arrays: variant_idxs + sample_idxs + genotypes
225            if not sum_strands:
226                required_ram = (num_samples + num_variants + num_variants * 2 * num_samples) * 4
227            else:
228                required_ram = (num_samples + num_variants) * 4 + num_variants * num_samples
229            log.info(f">{required_ram / 1024**3:.2f} GiB of RAM are required to process {num_samples} samples with {num_variants} variants each")
230
231            if not sum_strands:
232                genotypes = np.empty((num_variants, 2 * num_samples), dtype=np.int32)  # cannot use int8 because of pgenlib
233                pgen_reader.read_alleles_list(variant_idxs, genotypes)
234                genotypes = genotypes.astype(np.int8).reshape((num_variants, num_samples, 2))
235            else:
236                genotypes = np.empty((num_variants, num_samples), dtype=np.int8)
237                pgen_reader.read_list(variant_idxs, genotypes)
238            pgen_reader.close()
239        else:
240            genotypes = None
241
242        log.info("Constructing SNPObject")
243
244        snpobj = SNPObject(
245            calldata_gt=genotypes if "GT" in fields else None,
246            samples=psam.get_column("IID").to_numpy() if "IID" in fields and "IID" in psam.columns else None,
247            **{f'variants_{k.lower()}': pvar.get_column(v).to_numpy() if v in fields and v in pvar.columns else None
248               for k, v in {'ref': 'REF', 'alt': 'ALT', 'chrom': '#CHROM', 'id': 'ID', 'pos': 'POS', 'filter_pass': 'FILTER', 'qual': 'QUAL'}.items()}
249        )
250
251        log.info("Finished constructing SNPObject")
252        return snpobj

Read a pgen fileset (pgen, psam, pvar) into a SNPObject.

Arguments:
  • fields (str, None, or list of str, optional): Fields to extract data for that should be included in the returned SNPObject. Available fields are 'GT', 'IID', 'REF', 'ALT', '#CHROM', 'ID', 'POS', 'FILTER', 'QUAL'. To extract all fields, set fields to None. Defaults to None.
  • exclude_fields (str, None, or list of str, optional): Fields to exclude from the returned SNPObject. Available fields are 'GT', 'IID', 'REF', 'ALT', '#CHROM', 'ID', 'POS', 'FILTER', 'QUAL'. To exclude no fields, set exclude_fields to None. Defaults to None.
  • sample_ids: List of sample IDs to read. If None and sample_idxs is None, all samples are read.
  • sample_idxs: List of sample indices to read. If None and sample_ids is None, all samples are read.
  • variant_ids: List of variant IDs to read. If None and variant_idxs is None, all variants are read.
  • variant_idxs: List of variant indices to read. If None and variant_ids is None, all variants are read.
  • sum_strands: If True, maternal and paternal strands are combined into a single int8 array with values {0, 1, 2}. If False, strands are stored separately as an int8 array with values {0, 1} for each strand. Note: With the pgenlib backend, False uses ~8× more RAM, though calldata_gt is only 2× larger.
  • separator: Separator used in the pvar file. If None, the separator is automatically detected. If the automatic detection fails, please specify the separator manually.
Returns:

SNPObject: A SNPObject instance.

def iter_read( self, fields: List[str] | None = None, exclude_fields: List[str] | None = None, sample_ids: numpy.ndarray | None = None, sample_idxs: numpy.ndarray | None = None, variant_ids: numpy.ndarray | None = None, variant_idxs: numpy.ndarray | None = None, sum_strands: bool = False, separator: str = None, chunk_size: int = 10000) -> Iterator[SNPObject]:
353    def iter_read(
354        self,
355        fields: Optional[List[str]] = None,
356        exclude_fields: Optional[List[str]] = None,
357        sample_ids: Optional[np.ndarray] = None,
358        sample_idxs: Optional[np.ndarray] = None,
359        variant_ids: Optional[np.ndarray] = None,
360        variant_idxs: Optional[np.ndarray] = None,
361        sum_strands: bool = False,
362        separator: str = None,
363        chunk_size: int = 10_000,
364    ) -> Iterator[SNPObject]:
365        """
366        Stream the PGEN fileset in variant chunks.
367
368        This yields a sequence of SNPObject chunks along the SNP axis.
369        """
370        if chunk_size < 1:
371            raise ValueError("chunk_size must be >= 1.")
372        if sample_idxs is not None and sample_ids is not None:
373            raise ValueError("Only one of sample_idxs and sample_ids can be specified.")
374        if variant_idxs is not None and variant_ids is not None:
375            raise ValueError("Only one of variant_idxs and variant_ids can be specified.")
376
377        selectors = self._resolve_variant_idxs_for_iter(
378            variant_ids=variant_ids,
379            variant_idxs=variant_idxs,
380            separator=separator,
381        )
382
383        n_selectors = int(selectors.size)
384        for start in range(0, n_selectors, int(chunk_size)):
385            stop = min(start + int(chunk_size), n_selectors)
386            selector_chunk = np.asarray(selectors[start:stop], dtype=np.uint32)
387            yield self.read(
388                fields=fields,
389                exclude_fields=exclude_fields,
390                sample_ids=sample_ids,
391                sample_idxs=sample_idxs,
392                variant_idxs=selector_chunk,
393                sum_strands=sum_strands,
394                separator=separator,
395            )

Stream the PGEN fileset in variant chunks.

This yields a sequence of SNPObject chunks along the SNP axis.

@SNPBaseReader.register
class VCFReader(snputils.snp.io.read.base.SNPBaseReader):
 21@SNPBaseReader.register
 22class VCFReader(SNPBaseReader):
 23    def __init__(self, filename: Union[str, pathlib.Path]):
 24        super().__init__(filename)
 25        self._igd_path: Optional[pathlib.Path] = None
 26        self._grg_path: Optional[pathlib.Path] = None
 27        self.debug : bool = False
 28    def read(
 29        self,
 30        fields: Optional[List[str]] = None,
 31        exclude_fields: Optional[List[str]] = None,
 32        rename_fields: Optional[dict] = None,
 33        fills: Optional[dict] = None,
 34        region: Optional[str] = None,
 35        samples: Optional[List[str]] = None,
 36        sum_strands: bool = False,
 37    ) -> SNPObject:
 38        """
 39        Read a vcf file into a SNPObject.
 40
 41        Args:
 42            fields: Fields to extract data for. e.g., ['variants/CHROM', 'variants/POS',
 43                'calldata/GT']. If you are feeling lazy, you can drop the 'variants/'
 44                and 'calldata/' prefixes, in which case the fields will be matched
 45                against fields declared in the VCF header, with variants taking priority
 46                over calldata if a field with the same ID exists both in INFO and FORMAT
 47                headers. I.e., ['CHROM', 'POS', 'DP', 'GT'] will work, although watch out
 48                for fields like 'DP' which can be both INFO and FORMAT. To extract all
 49                fields, provide just the string '*'. To extract all variants fields
 50                (including all INFO fields) provide 'variants/*'. To extract all
 51                calldata fields (i.e., defined in FORMAT headers) provide 'calldata/*'.
 52            exclude_fields: Fields to exclude. E.g., for use in combination with fields='*'.
 53            rename_fields: Fields to be renamed. Should be a dictionary mapping old to new names.
 54            fills: Override the fill value used for empty values. Should be a dictionary
 55                mapping field names to fill values.
 56            region: Genomic region to extract variants for. If provided, should be a
 57                tabix-style region string, which can be either just a chromosome name
 58                (e.g., '2L'), or a chromosome name followed by 1-based beginning and
 59                end coordinates (e.g., '2L:100000-200000'). Note that only variants
 60                whose start position (POS) is within the requested range will be included.
 61                This is slightly different from the default tabix behaviour, where a
 62                variant (e.g., deletion) may be included if its position (POS) occurs
 63                before the requested region but its reference allele overlaps the
 64                region - such a variant will not be included in the data returned
 65                by this function.
 66            samples: Selection of samples to extract calldata for. If provided, should be
 67                a list of strings giving sample identifiers. May also be a list of
 68                integers giving indices of selected samples.
 69            sum_strands: If True, maternal and paternal strands are combined into a single `int8` array with values `{0, 1, 2`}. 
 70                If False, strands are stored separately as an `int8` array with values `{0, 1}` for each strand.
 71
 72        Returns:
 73            **SNPObject**: 
 74                A SNPObject instance.
 75        """
 76        log.info(f"Reading {self.filename}")
 77
 78        vcf_dict = allel.read_vcf(
 79            str(self.filename),
 80            fields=fields,
 81            exclude_fields=exclude_fields,
 82            rename_fields=rename_fields,
 83            fills=fills,
 84            region=region,
 85            samples=samples,
 86            alt_number=1,
 87        )
 88        assert vcf_dict is not None  # suppress Flake8 warning
 89
 90        genotypes = vcf_dict["calldata/GT"].astype(np.int8)
 91        if sum_strands:
 92            genotypes = genotypes.sum(axis=2, dtype=np.int8)
 93
 94        snpobj = SNPObject(
 95            calldata_gt=genotypes,
 96            samples=vcf_dict["samples"],
 97            variants_ref=vcf_dict["variants/REF"],
 98            variants_alt=vcf_dict["variants/ALT"],
 99            variants_chrom=vcf_dict["variants/CHROM"],
100            variants_filter_pass=vcf_dict["variants/FILTER_PASS"],
101            variants_id=vcf_dict["variants/ID"],
102            variants_pos=vcf_dict["variants/POS"],
103            variants_qual=vcf_dict["variants/QUAL"],
104        )
105
106        log.info(f"Finished reading {self.filename}")
107        return snpobj
108    def to_igd(self,
109                igd_file : Optional[str] = None,
110                logfile_out : Optional[str] = None,
111                logfile_err : Optional[str] = None) -> None:
112        """
113        Convert the current VCF input file to IGD via `grg convert`.
114
115        Args:
116            igd_file: Output IGD file path. Defaults to `<vcf_stem>.igd`.
117            logfile_out: The file to log standard output to. If None (default), no output will be logged (i.e., piped to dev null).
118            logfile_err: The file to log standard error to. If None (default), no error will be logged (i.e., piped to dev null).
119
120        """
121
122        if not exists(self.filename):
123            raise FileNotFoundError(f"File {self.filename} does not exist")
124
125        lf_o  : Union[int, TextIOWrapper] = subprocess.DEVNULL if logfile_out is None else open(logfile_out, "a")
126        lf_e  : Union[int, TextIOWrapper] = subprocess.DEVNULL if logfile_err is None else open(logfile_err, "a")
127        name, _ext1 = splitext(str(self.filename))
128        name, _ext2 = splitext(name)
129        if igd_file is None:
130            self._igd_path = pathlib.Path(name + ".igd")
131        else:
132            self._igd_path = pathlib.Path(igd_file)
133
134        try:
135            subprocess.run(
136                ["grg", "convert", abspath(str(self.filename)), abspath(str(self._igd_path))],
137                stdout=lf_o,
138                stderr=lf_e,
139                check=True,
140            )
141        finally:
142            if not isinstance(lf_o, int):
143                lf_o.close()
144            if not isinstance(lf_e, int):
145                lf_e.close()
146            
147    def to_grg(self,
148               range: Optional[str] = None,
149               parts: Optional[int] = None,
150               jobs: Optional[int] = None,
151               trees: Optional[int] = None,
152               binmuts: Optional[bool] = None,
153               no_file_cleanup: Optional[bool] = None,
154               maf_flip: Optional[bool] = None,
155               population_ids: Optional[str] = None,
156               mutation_batch_size: Optional[int] = None,
157               igd_file: Optional[str] = None,
158               out_file: Optional[str] = None,
159               verbose: Optional[bool] = None,
160               no_merge: Optional[bool] = None,
161               force: Optional[bool] = None,
162               logfile_out: Optional[str] = None,
163               logfile_err: Optional[str] = None
164               ) -> None:
165        """
166        Convert VCF input to a GRG file via `grg construct`.
167
168        If `igd_file` exists, it is used as construct input. If it does not
169        exist, it is first created via `to_igd` and then used for construction.
170        """
171        input_file = pathlib.Path(self.filename).resolve()
172        if igd_file is not None:
173            candidate_igd = pathlib.Path(igd_file)
174            if candidate_igd.exists():
175                self._igd_path = candidate_igd.resolve()
176            else:
177                self.to_igd(igd_file, logfile_out, logfile_err)
178            input_file = pathlib.Path(self._igd_path).resolve()
179
180        if out_file is not None:
181            self._grg_path = pathlib.Path(out_file)
182        else:
183            default_stem = splitext(str(input_file))[0]
184            if default_stem.endswith(".vcf"):
185                default_stem = splitext(default_stem)[0]
186            self._grg_path = pathlib.Path(default_stem + ".grg")
187
188        lf_o: Union[int, TextIOWrapper] = subprocess.DEVNULL if logfile_out is None else open(logfile_out, "a")
189        lf_e: Union[int, TextIOWrapper] = subprocess.DEVNULL if logfile_err is None else open(logfile_err, "a")
190        args = ["grg", "construct"]
191        args += self._setarg(range, "-r", None)
192        args += self._setarg(parts, "-p", 50)
193        args += self._setarg(jobs, "-j", multiprocessing.cpu_count())
194        args += self._setarg(trees, "-t", 16)
195        args += self._setarg(binmuts, "--binary-muts", None)
196        args += self._setarg(no_file_cleanup, "--no-file-cleanup", None)
197        args += self._setarg(maf_flip, "--maf-flip", None)
198        args += self._setarg(population_ids, "--population-ids", None)
199        args += self._setarg(mutation_batch_size, "--mutation-batch-size", None)
200        args += self._setarg(str(self._grg_path), "--out-file", None)
201        args += self._setarg(verbose, "--verbose", None)
202        args += self._setarg(no_merge, "--no-merge", None)
203        args += self._setarg(force, "--force", None)
204        args += [str(input_file)]
205        log.debug("Running grg construct command: %s", args)
206        try:
207            subprocess.run(args, stdout=lf_o, stderr=lf_e, check=True)
208        finally:
209            if not isinstance(lf_o, int):
210                lf_o.close()
211            if not isinstance(lf_e, int):
212                lf_e.close()
213
214    def _setarg(self, x: Optional[Any], flag: str, default_arg: Optional[Any] = None) -> List[str]:
215        if isinstance(x, bool):
216            return [flag] if x else []
217        if x is None and default_arg is not None:
218            return [flag, f"{default_arg}"] 
219        elif x is not None:
220            return [flag, f"{x}"]
221        else:
222            return []

Abstract class for SNP readers.

Attributes:
  • _filename: The path to the file storing SNP data.
VCFReader(filename: str | pathlib.Path)
23    def __init__(self, filename: Union[str, pathlib.Path]):
24        super().__init__(filename)
25        self._igd_path: Optional[pathlib.Path] = None
26        self._grg_path: Optional[pathlib.Path] = None
27        self.debug : bool = False

Initialize the SNPBaseReader.

Arguments:
  • filename: The path to the file storing SNP data.
debug: bool
def read( self, fields: List[str] | None = None, exclude_fields: List[str] | None = None, rename_fields: dict | None = None, fills: dict | None = None, region: str | None = None, samples: List[str] | None = None, sum_strands: bool = False) -> SNPObject:
 28    def read(
 29        self,
 30        fields: Optional[List[str]] = None,
 31        exclude_fields: Optional[List[str]] = None,
 32        rename_fields: Optional[dict] = None,
 33        fills: Optional[dict] = None,
 34        region: Optional[str] = None,
 35        samples: Optional[List[str]] = None,
 36        sum_strands: bool = False,
 37    ) -> SNPObject:
 38        """
 39        Read a vcf file into a SNPObject.
 40
 41        Args:
 42            fields: Fields to extract data for. e.g., ['variants/CHROM', 'variants/POS',
 43                'calldata/GT']. If you are feeling lazy, you can drop the 'variants/'
 44                and 'calldata/' prefixes, in which case the fields will be matched
 45                against fields declared in the VCF header, with variants taking priority
 46                over calldata if a field with the same ID exists both in INFO and FORMAT
 47                headers. I.e., ['CHROM', 'POS', 'DP', 'GT'] will work, although watch out
 48                for fields like 'DP' which can be both INFO and FORMAT. To extract all
 49                fields, provide just the string '*'. To extract all variants fields
 50                (including all INFO fields) provide 'variants/*'. To extract all
 51                calldata fields (i.e., defined in FORMAT headers) provide 'calldata/*'.
 52            exclude_fields: Fields to exclude. E.g., for use in combination with fields='*'.
 53            rename_fields: Fields to be renamed. Should be a dictionary mapping old to new names.
 54            fills: Override the fill value used for empty values. Should be a dictionary
 55                mapping field names to fill values.
 56            region: Genomic region to extract variants for. If provided, should be a
 57                tabix-style region string, which can be either just a chromosome name
 58                (e.g., '2L'), or a chromosome name followed by 1-based beginning and
 59                end coordinates (e.g., '2L:100000-200000'). Note that only variants
 60                whose start position (POS) is within the requested range will be included.
 61                This is slightly different from the default tabix behaviour, where a
 62                variant (e.g., deletion) may be included if its position (POS) occurs
 63                before the requested region but its reference allele overlaps the
 64                region - such a variant will not be included in the data returned
 65                by this function.
 66            samples: Selection of samples to extract calldata for. If provided, should be
 67                a list of strings giving sample identifiers. May also be a list of
 68                integers giving indices of selected samples.
 69            sum_strands: If True, maternal and paternal strands are combined into a single `int8` array with values `{0, 1, 2`}. 
 70                If False, strands are stored separately as an `int8` array with values `{0, 1}` for each strand.
 71
 72        Returns:
 73            **SNPObject**: 
 74                A SNPObject instance.
 75        """
 76        log.info(f"Reading {self.filename}")
 77
 78        vcf_dict = allel.read_vcf(
 79            str(self.filename),
 80            fields=fields,
 81            exclude_fields=exclude_fields,
 82            rename_fields=rename_fields,
 83            fills=fills,
 84            region=region,
 85            samples=samples,
 86            alt_number=1,
 87        )
 88        assert vcf_dict is not None  # suppress Flake8 warning
 89
 90        genotypes = vcf_dict["calldata/GT"].astype(np.int8)
 91        if sum_strands:
 92            genotypes = genotypes.sum(axis=2, dtype=np.int8)
 93
 94        snpobj = SNPObject(
 95            calldata_gt=genotypes,
 96            samples=vcf_dict["samples"],
 97            variants_ref=vcf_dict["variants/REF"],
 98            variants_alt=vcf_dict["variants/ALT"],
 99            variants_chrom=vcf_dict["variants/CHROM"],
100            variants_filter_pass=vcf_dict["variants/FILTER_PASS"],
101            variants_id=vcf_dict["variants/ID"],
102            variants_pos=vcf_dict["variants/POS"],
103            variants_qual=vcf_dict["variants/QUAL"],
104        )
105
106        log.info(f"Finished reading {self.filename}")
107        return snpobj

Read a vcf file into a SNPObject.

Arguments:
  • fields: Fields to extract data for. e.g., ['variants/CHROM', 'variants/POS', 'calldata/GT']. If you are feeling lazy, you can drop the 'variants/' and 'calldata/' prefixes, in which case the fields will be matched against fields declared in the VCF header, with variants taking priority over calldata if a field with the same ID exists both in INFO and FORMAT headers. I.e., ['CHROM', 'POS', 'DP', 'GT'] will work, although watch out for fields like 'DP' which can be both INFO and FORMAT. To extract all fields, provide just the string '*'. To extract all variants fields (including all INFO fields) provide 'variants/*'. To extract all calldata fields (i.e., defined in FORMAT headers) provide 'calldata/*'.
  • exclude_fields: Fields to exclude. E.g., for use in combination with fields='*'.
  • rename_fields: Fields to be renamed. Should be a dictionary mapping old to new names.
  • fills: Override the fill value used for empty values. Should be a dictionary mapping field names to fill values.
  • region: Genomic region to extract variants for. If provided, should be a tabix-style region string, which can be either just a chromosome name (e.g., '2L'), or a chromosome name followed by 1-based beginning and end coordinates (e.g., '2L:100000-200000'). Note that only variants whose start position (POS) is within the requested range will be included. This is slightly different from the default tabix behaviour, where a variant (e.g., deletion) may be included if its position (POS) occurs before the requested region but its reference allele overlaps the region - such a variant will not be included in the data returned by this function.
  • samples: Selection of samples to extract calldata for. If provided, should be a list of strings giving sample identifiers. May also be a list of integers giving indices of selected samples.
  • sum_strands: If True, maternal and paternal strands are combined into a single int8 array with values {0, 1, 2}. If False, strands are stored separately as an int8 array with values {0, 1} for each strand.
Returns:

SNPObject: A SNPObject instance.

def to_igd( self, igd_file: str | None = None, logfile_out: str | None = None, logfile_err: str | None = None) -> None:
108    def to_igd(self,
109                igd_file : Optional[str] = None,
110                logfile_out : Optional[str] = None,
111                logfile_err : Optional[str] = None) -> None:
112        """
113        Convert the current VCF input file to IGD via `grg convert`.
114
115        Args:
116            igd_file: Output IGD file path. Defaults to `<vcf_stem>.igd`.
117            logfile_out: The file to log standard output to. If None (default), no output will be logged (i.e., piped to dev null).
118            logfile_err: The file to log standard error to. If None (default), no error will be logged (i.e., piped to dev null).
119
120        """
121
122        if not exists(self.filename):
123            raise FileNotFoundError(f"File {self.filename} does not exist")
124
125        lf_o  : Union[int, TextIOWrapper] = subprocess.DEVNULL if logfile_out is None else open(logfile_out, "a")
126        lf_e  : Union[int, TextIOWrapper] = subprocess.DEVNULL if logfile_err is None else open(logfile_err, "a")
127        name, _ext1 = splitext(str(self.filename))
128        name, _ext2 = splitext(name)
129        if igd_file is None:
130            self._igd_path = pathlib.Path(name + ".igd")
131        else:
132            self._igd_path = pathlib.Path(igd_file)
133
134        try:
135            subprocess.run(
136                ["grg", "convert", abspath(str(self.filename)), abspath(str(self._igd_path))],
137                stdout=lf_o,
138                stderr=lf_e,
139                check=True,
140            )
141        finally:
142            if not isinstance(lf_o, int):
143                lf_o.close()
144            if not isinstance(lf_e, int):
145                lf_e.close()

Convert the current VCF input file to IGD via grg convert.

Arguments:
  • igd_file: Output IGD file path. Defaults to <vcf_stem>.igd.
  • logfile_out: The file to log standard output to. If None (default), no output will be logged (i.e., piped to dev null).
  • logfile_err: The file to log standard error to. If None (default), no error will be logged (i.e., piped to dev null).
def to_grg( self, range: str | None = None, parts: int | None = None, jobs: int | None = None, trees: int | None = None, binmuts: bool | None = None, no_file_cleanup: bool | None = None, maf_flip: bool | None = None, population_ids: str | None = None, mutation_batch_size: int | None = None, igd_file: str | None = None, out_file: str | None = None, verbose: bool | None = None, no_merge: bool | None = None, force: bool | None = None, logfile_out: str | None = None, logfile_err: str | None = None) -> None:
147    def to_grg(self,
148               range: Optional[str] = None,
149               parts: Optional[int] = None,
150               jobs: Optional[int] = None,
151               trees: Optional[int] = None,
152               binmuts: Optional[bool] = None,
153               no_file_cleanup: Optional[bool] = None,
154               maf_flip: Optional[bool] = None,
155               population_ids: Optional[str] = None,
156               mutation_batch_size: Optional[int] = None,
157               igd_file: Optional[str] = None,
158               out_file: Optional[str] = None,
159               verbose: Optional[bool] = None,
160               no_merge: Optional[bool] = None,
161               force: Optional[bool] = None,
162               logfile_out: Optional[str] = None,
163               logfile_err: Optional[str] = None
164               ) -> None:
165        """
166        Convert VCF input to a GRG file via `grg construct`.
167
168        If `igd_file` exists, it is used as construct input. If it does not
169        exist, it is first created via `to_igd` and then used for construction.
170        """
171        input_file = pathlib.Path(self.filename).resolve()
172        if igd_file is not None:
173            candidate_igd = pathlib.Path(igd_file)
174            if candidate_igd.exists():
175                self._igd_path = candidate_igd.resolve()
176            else:
177                self.to_igd(igd_file, logfile_out, logfile_err)
178            input_file = pathlib.Path(self._igd_path).resolve()
179
180        if out_file is not None:
181            self._grg_path = pathlib.Path(out_file)
182        else:
183            default_stem = splitext(str(input_file))[0]
184            if default_stem.endswith(".vcf"):
185                default_stem = splitext(default_stem)[0]
186            self._grg_path = pathlib.Path(default_stem + ".grg")
187
188        lf_o: Union[int, TextIOWrapper] = subprocess.DEVNULL if logfile_out is None else open(logfile_out, "a")
189        lf_e: Union[int, TextIOWrapper] = subprocess.DEVNULL if logfile_err is None else open(logfile_err, "a")
190        args = ["grg", "construct"]
191        args += self._setarg(range, "-r", None)
192        args += self._setarg(parts, "-p", 50)
193        args += self._setarg(jobs, "-j", multiprocessing.cpu_count())
194        args += self._setarg(trees, "-t", 16)
195        args += self._setarg(binmuts, "--binary-muts", None)
196        args += self._setarg(no_file_cleanup, "--no-file-cleanup", None)
197        args += self._setarg(maf_flip, "--maf-flip", None)
198        args += self._setarg(population_ids, "--population-ids", None)
199        args += self._setarg(mutation_batch_size, "--mutation-batch-size", None)
200        args += self._setarg(str(self._grg_path), "--out-file", None)
201        args += self._setarg(verbose, "--verbose", None)
202        args += self._setarg(no_merge, "--no-merge", None)
203        args += self._setarg(force, "--force", None)
204        args += [str(input_file)]
205        log.debug("Running grg construct command: %s", args)
206        try:
207            subprocess.run(args, stdout=lf_o, stderr=lf_e, check=True)
208        finally:
209            if not isinstance(lf_o, int):
210                lf_o.close()
211            if not isinstance(lf_e, int):
212                lf_e.close()

Convert VCF input to a GRG file via grg construct.

If igd_file exists, it is used as construct input. If it does not exist, it is first created via to_igd and then used for construction.

class BEDWriter:
 14class BEDWriter:
 15    """Writes an object in bed/bim/fam formats in the specified output path.
 16
 17    Args:
 18        snpobj: The SNPObject to be written.
 19        file: The output file path.
 20
 21    """
 22
 23    def __init__(self, snpobj: SNPObject, filename: str):
 24        self.__snpobj = snpobj.copy()
 25        self.__filename = Path(filename)
 26
 27    def write(
 28            self,
 29            rename_missing_values: bool = True, 
 30            before: Union[int, float, str] = -1, 
 31            after: Union[int, float, str] = '.'
 32        ):
 33        """
 34        Writes the SNPObject to bed/bim/fam formats.
 35
 36        Args:
 37            rename_missing_values (bool, optional):
 38                If True, renames potential missing values in `snpobj.calldata_gt` before writing. 
 39                Defaults to True.
 40            before (int, float, or str, default=-1): 
 41                The current representation of missing values in `calldata_gt`. Common values might be -1, '.', or NaN.
 42                Default is -1.
 43            after (int, float, or str, default='.'): 
 44                The value that will replace `before`. Default is '.'.
 45        """
 46        # Save .bed file
 47        if self.__filename.suffix != '.bed':
 48            self.__filename = self.__filename.with_suffix('.bed')
 49
 50        log.info(f"Writing .bed file: {self.__filename}")
 51
 52        # Optionally rename potential missing values in `snpobj.calldata_gt` before writing
 53        if rename_missing_values:
 54            self.__snpobj.rename_missings(before=before, after=after, inplace=True)
 55
 56        # If the input matrix has three dimensions, it indicates that the data is divided into two strands.
 57        if len(self.__snpobj.calldata_gt.shape) == 3:
 58            # Sum the two strands
 59            self.__snpobj.calldata_gt = self.__snpobj.calldata_gt.transpose(1, 0, 2).sum(axis=2, dtype=np.int8)
 60
 61        # Infer the number of samples and variants from the matrix
 62        samples, variants = self.__snpobj.calldata_gt.shape
 63
 64        # Define the PgenWriter to save the data
 65        data_save = pg.PgenWriter(filename=str(self.__filename).encode('utf-8'),
 66                                  sample_ct=samples,
 67                                  variant_ct=variants,
 68                                  nonref_flags=True,
 69                                  hardcall_phase_present=False,
 70                                  dosage_present=True,
 71                                  dosage_phase_present=False)
 72
 73        # Fill the data_save object with the matrix of individuals x variants
 74        for snp_i in range(0, variants):
 75            data_save.append_biallelic(np.ascontiguousarray(self.__snpobj.calldata_gt[:, snp_i]))
 76
 77        # Save the .bed file
 78        data_save.close()
 79
 80        log.info(f"Finished writing .bed file: {self.__filename}")
 81
 82        # Remove .bed from the file name
 83        if self.__filename.suffix == '.bed':
 84            self.__filename = self.__filename.with_suffix('')
 85
 86        # Save .fam file
 87        log.info(f"Writing .fam file: {self.__filename}")
 88
 89        # Fill .fam file
 90        fam_file = pd.DataFrame(columns=['fid', 'iid', 'father', 'mother', 'gender', 'trait'])
 91        fam_file['iid'] = self.__snpobj.samples
 92        fam_file['fid'] = self.__snpobj.samples
 93
 94        # Save .fam file
 95        fam_file.to_csv(self.__filename.with_suffix('.fam'), sep='\t', index=False, header=False)
 96        log.info(f"Finished writing .fam file: {self.__filename}")
 97
 98        # Save .bim file
 99        log.info(f"Writing .bim file: {self.__filename}")
100
101        # Fill .bim file
102        bim_file = pd.DataFrame(columns=['chrom', 'snp', 'cm', 'pos', 'a0', 'a1'])
103        bim_file['chrom'] = self.__snpobj.variants_chrom
104        bim_file['snp'] = self.__snpobj.variants_id
105        bim_file['cm'] = 0  # TODO: read, save and write too if available?
106        log.warning("The .bim file is being saved with 0 cM values.")
107        bim_file['pos'] = self.__snpobj.variants_pos
108        bim_file['a0'] = self.__snpobj.variants_alt
109        bim_file['a1'] = self.__snpobj.variants_ref
110
111        # Save .bim file
112        bim_file.to_csv(self.__filename.with_suffix('.bim'), sep='\t', index=False, header=False)
113        log.info(f"Finished writing .bim file: {self.__filename}")

Writes an object in bed/bim/fam formats in the specified output path.

Arguments:
  • snpobj: The SNPObject to be written.
  • filename: The output file path.
BEDWriter(snpobj: SNPObject, filename: str)
23    def __init__(self, snpobj: SNPObject, filename: str):
24        self.__snpobj = snpobj.copy()
25        self.__filename = Path(filename)
def write( self, rename_missing_values: bool = True, before: int | float | str = -1, after: int | float | str = '.'):
 27    def write(
 28            self,
 29            rename_missing_values: bool = True, 
 30            before: Union[int, float, str] = -1, 
 31            after: Union[int, float, str] = '.'
 32        ):
 33        """
 34        Writes the SNPObject to bed/bim/fam formats.
 35
 36        Args:
 37            rename_missing_values (bool, optional):
 38                If True, renames potential missing values in `snpobj.calldata_gt` before writing. 
 39                Defaults to True.
 40            before (int, float, or str, default=-1): 
 41                The current representation of missing values in `calldata_gt`. Common values might be -1, '.', or NaN.
 42                Default is -1.
 43            after (int, float, or str, default='.'): 
 44                The value that will replace `before`. Default is '.'.
 45        """
 46        # Save .bed file
 47        if self.__filename.suffix != '.bed':
 48            self.__filename = self.__filename.with_suffix('.bed')
 49
 50        log.info(f"Writing .bed file: {self.__filename}")
 51
 52        # Optionally rename potential missing values in `snpobj.calldata_gt` before writing
 53        if rename_missing_values:
 54            self.__snpobj.rename_missings(before=before, after=after, inplace=True)
 55
 56        # If the input matrix has three dimensions, it indicates that the data is divided into two strands.
 57        if len(self.__snpobj.calldata_gt.shape) == 3:
 58            # Sum the two strands
 59            self.__snpobj.calldata_gt = self.__snpobj.calldata_gt.transpose(1, 0, 2).sum(axis=2, dtype=np.int8)
 60
 61        # Infer the number of samples and variants from the matrix
 62        samples, variants = self.__snpobj.calldata_gt.shape
 63
 64        # Define the PgenWriter to save the data
 65        data_save = pg.PgenWriter(filename=str(self.__filename).encode('utf-8'),
 66                                  sample_ct=samples,
 67                                  variant_ct=variants,
 68                                  nonref_flags=True,
 69                                  hardcall_phase_present=False,
 70                                  dosage_present=True,
 71                                  dosage_phase_present=False)
 72
 73        # Fill the data_save object with the matrix of individuals x variants
 74        for snp_i in range(0, variants):
 75            data_save.append_biallelic(np.ascontiguousarray(self.__snpobj.calldata_gt[:, snp_i]))
 76
 77        # Save the .bed file
 78        data_save.close()
 79
 80        log.info(f"Finished writing .bed file: {self.__filename}")
 81
 82        # Remove .bed from the file name
 83        if self.__filename.suffix == '.bed':
 84            self.__filename = self.__filename.with_suffix('')
 85
 86        # Save .fam file
 87        log.info(f"Writing .fam file: {self.__filename}")
 88
 89        # Fill .fam file
 90        fam_file = pd.DataFrame(columns=['fid', 'iid', 'father', 'mother', 'gender', 'trait'])
 91        fam_file['iid'] = self.__snpobj.samples
 92        fam_file['fid'] = self.__snpobj.samples
 93
 94        # Save .fam file
 95        fam_file.to_csv(self.__filename.with_suffix('.fam'), sep='\t', index=False, header=False)
 96        log.info(f"Finished writing .fam file: {self.__filename}")
 97
 98        # Save .bim file
 99        log.info(f"Writing .bim file: {self.__filename}")
100
101        # Fill .bim file
102        bim_file = pd.DataFrame(columns=['chrom', 'snp', 'cm', 'pos', 'a0', 'a1'])
103        bim_file['chrom'] = self.__snpobj.variants_chrom
104        bim_file['snp'] = self.__snpobj.variants_id
105        bim_file['cm'] = 0  # TODO: read, save and write too if available?
106        log.warning("The .bim file is being saved with 0 cM values.")
107        bim_file['pos'] = self.__snpobj.variants_pos
108        bim_file['a0'] = self.__snpobj.variants_alt
109        bim_file['a1'] = self.__snpobj.variants_ref
110
111        # Save .bim file
112        bim_file.to_csv(self.__filename.with_suffix('.bim'), sep='\t', index=False, header=False)
113        log.info(f"Finished writing .bim file: {self.__filename}")

Writes the SNPObject to bed/bim/fam formats.

Arguments:
  • rename_missing_values (bool, optional): If True, renames potential missing values in snpobj.calldata_gt before writing. Defaults to True.
  • before (int, float, or str, default=-1): The current representation of missing values in calldata_gt. Common values might be -1, '.', or NaN. Default is -1.
  • after (int, float, or str, default='.'): The value that will replace before. Default is '.'.
class PGENWriter:
 15class PGENWriter:
 16    """
 17    Writes a genotype object in PGEN format (.pgen, .psam, and .pvar files) in the specified output path.
 18    """
 19
 20    def __init__(self, snpobj: SNPObject, filename: str):
 21        """
 22        Initializes the PGENWriter instance.
 23
 24        Args:
 25            snpobj (SNPObject): The SNPObject containing genotype data to be written.
 26            filename (str): Base path for the output files (excluding extension).
 27        """
 28        self.__snpobj = snpobj
 29        self.__filename = Path(filename)
 30
 31    def write(
 32            self, 
 33            vzs: bool = False,
 34            rename_missing_values: bool = True, 
 35            before: Union[int, float, str] = -1, 
 36            after: Union[int, float, str] = '.'
 37        ):
 38        """
 39        Writes the SNPObject data to .pgen, .psam, and .pvar files.
 40
 41        Args:
 42            vzs (bool, optional): 
 43                If True, compresses the .pvar file using zstd and saves it as .pvar.zst. Defaults to False.
 44            rename_missing_values (bool, optional):
 45                If True, renames potential missing values in `snpobj.calldata_gt` before writing. 
 46                Defaults to True.
 47            before (int, float, or str, default=-1): 
 48                The current representation of missing values in `calldata_gt`. Common values might be -1, '.', or NaN.
 49                Default is -1.
 50            after (int, float, or str, default='.'): 
 51                The value that will replace `before`. Default is '.'.
 52        """
 53        file_extensions = (".pgen", ".psam", ".pvar", ".pvar.zst")
 54        if self.__filename.suffix in file_extensions:
 55            self.__filename = self.__filename.with_suffix('')
 56
 57        # Optionally rename potential missing values in `snpobj.calldata_gt` before writing
 58        if rename_missing_values:
 59            self.__snpobj.rename_missings(before=before, after=after, inplace=True)
 60
 61        self.write_pvar(vzs=vzs)
 62        self.write_psam()
 63        self.write_pgen()
 64
 65    def write_pvar(self, vzs: bool = False):
 66        """
 67        Writes variant data to the .pvar file.
 68
 69        Args:
 70            vzs (bool, optional): If True, compresses the .pvar file using zstd and saves it as .pvar.zst. Defaults to False.
 71        """
 72        output_filename = f"{self.__filename}.pvar"
 73        if vzs:
 74            output_filename += ".zst"
 75            log.info(f"Writing to {output_filename} (compressed)")
 76        else:
 77            log.info(f"Writing to {output_filename}")
 78
 79        df = pl.DataFrame(
 80            {
 81                "#CHROM": self.__snpobj.variants_chrom,
 82                "POS": self.__snpobj.variants_pos,
 83                "ID": self.__snpobj.variants_id,
 84                "REF": self.__snpobj.variants_ref,
 85                "ALT": self.__snpobj.variants_alt,
 86                "FILTER": self.__snpobj.variants_filter_pass,
 87                # TODO: add INFO column to SNPObject and write it to the .pvar file? (if not it's lost)
 88            }
 89        )
 90        # TODO: add header to the .pvar file, if not it's lost
 91
 92        # Write the DataFrame to a CSV string
 93        csv_data = df.write_csv(None, separator="\t")
 94
 95        if vzs:
 96            # Compress the CSV data using zstd
 97            cctx = zstd.ZstdCompressor()
 98            compressed_data = cctx.compress(csv_data.encode('utf-8'))
 99            with open(output_filename, 'wb') as f:
100                f.write(compressed_data)
101        else:
102            with open(output_filename, 'w') as f:
103                f.write(csv_data)
104
105    def write_psam(self):
106        """
107        Writes sample metadata to the .psam file.
108        """
109        log.info(f"Writing {self.__filename}.psam")
110        df = pl.DataFrame(
111            {
112                "#IID": self.__snpobj.samples,
113                "SEX": "NA",  # Add SEX as nan for now
114                # TODO: add SEX as Optional column to SNPObject and write it to the .psam file (if not it's lost)
115            }
116        )
117        df.write_csv(f"{self.__filename}.psam", separator="\t")
118
119    def write_pgen(self):
120        """
121        Writes the genotype data to a .pgen file.
122        """
123        log.info(f"Writing to {self.__filename}.pgen")
124        summed_strands = False if self.__snpobj.calldata_gt.ndim == 3 else True
125        if not summed_strands:
126            num_variants, num_samples, num_alleles = self.__snpobj.calldata_gt.shape
127            # Flatten the genotype matrix for pgenlib
128            flat_genotypes = self.__snpobj.calldata_gt.reshape(
129                num_variants, num_samples * num_alleles
130            )
131            with pg.PgenWriter(
132                filename=f"{self.__filename}.pgen".encode('utf-8'),
133                sample_ct=num_samples,
134                variant_ct=num_variants,
135                hardcall_phase_present=True,
136            ) as writer:
137                for variant_index in range(num_variants):
138                    writer.append_alleles(
139                        flat_genotypes[variant_index].astype(np.int32), all_phased=True
140                    )
141        else:
142            num_variants, num_samples = self.__snpobj.calldata_gt.shape
143            # Transpose to (samples, variants)
144            genotypes = self.__snpobj.calldata_gt.T  # Shape is (samples, variants)
145            with pg.PgenWriter(
146                filename=f"{self.__filename}.pgen".encode('utf-8'),
147                sample_ct=num_samples,
148                variant_ct=num_variants,
149                hardcall_phase_present=False,
150            ) as writer:
151                for variant_index in range(num_variants):
152                    variant_genotypes = genotypes[:, variant_index].astype(np.int8)
153                    # Map missing genotypes to -9 if necessary
154                    variant_genotypes[variant_genotypes == -1] = -9
155                    writer.append_biallelic(np.ascontiguousarray(variant_genotypes))

Writes a genotype object in PGEN format (.pgen, .psam, and .pvar files) in the specified output path.

PGENWriter(snpobj: SNPObject, filename: str)
20    def __init__(self, snpobj: SNPObject, filename: str):
21        """
22        Initializes the PGENWriter instance.
23
24        Args:
25            snpobj (SNPObject): The SNPObject containing genotype data to be written.
26            filename (str): Base path for the output files (excluding extension).
27        """
28        self.__snpobj = snpobj
29        self.__filename = Path(filename)

Initializes the PGENWriter instance.

Arguments:
  • snpobj (SNPObject): The SNPObject containing genotype data to be written.
  • filename (str): Base path for the output files (excluding extension).
def write( self, vzs: bool = False, rename_missing_values: bool = True, before: int | float | str = -1, after: int | float | str = '.'):
31    def write(
32            self, 
33            vzs: bool = False,
34            rename_missing_values: bool = True, 
35            before: Union[int, float, str] = -1, 
36            after: Union[int, float, str] = '.'
37        ):
38        """
39        Writes the SNPObject data to .pgen, .psam, and .pvar files.
40
41        Args:
42            vzs (bool, optional): 
43                If True, compresses the .pvar file using zstd and saves it as .pvar.zst. Defaults to False.
44            rename_missing_values (bool, optional):
45                If True, renames potential missing values in `snpobj.calldata_gt` before writing. 
46                Defaults to True.
47            before (int, float, or str, default=-1): 
48                The current representation of missing values in `calldata_gt`. Common values might be -1, '.', or NaN.
49                Default is -1.
50            after (int, float, or str, default='.'): 
51                The value that will replace `before`. Default is '.'.
52        """
53        file_extensions = (".pgen", ".psam", ".pvar", ".pvar.zst")
54        if self.__filename.suffix in file_extensions:
55            self.__filename = self.__filename.with_suffix('')
56
57        # Optionally rename potential missing values in `snpobj.calldata_gt` before writing
58        if rename_missing_values:
59            self.__snpobj.rename_missings(before=before, after=after, inplace=True)
60
61        self.write_pvar(vzs=vzs)
62        self.write_psam()
63        self.write_pgen()

Writes the SNPObject data to .pgen, .psam, and .pvar files.

Arguments:
  • vzs (bool, optional): If True, compresses the .pvar file using zstd and saves it as .pvar.zst. Defaults to False.
  • rename_missing_values (bool, optional): If True, renames potential missing values in snpobj.calldata_gt before writing. Defaults to True.
  • before (int, float, or str, default=-1): The current representation of missing values in calldata_gt. Common values might be -1, '.', or NaN. Default is -1.
  • after (int, float, or str, default='.'): The value that will replace before. Default is '.'.
def write_pvar(self, vzs: bool = False):
 65    def write_pvar(self, vzs: bool = False):
 66        """
 67        Writes variant data to the .pvar file.
 68
 69        Args:
 70            vzs (bool, optional): If True, compresses the .pvar file using zstd and saves it as .pvar.zst. Defaults to False.
 71        """
 72        output_filename = f"{self.__filename}.pvar"
 73        if vzs:
 74            output_filename += ".zst"
 75            log.info(f"Writing to {output_filename} (compressed)")
 76        else:
 77            log.info(f"Writing to {output_filename}")
 78
 79        df = pl.DataFrame(
 80            {
 81                "#CHROM": self.__snpobj.variants_chrom,
 82                "POS": self.__snpobj.variants_pos,
 83                "ID": self.__snpobj.variants_id,
 84                "REF": self.__snpobj.variants_ref,
 85                "ALT": self.__snpobj.variants_alt,
 86                "FILTER": self.__snpobj.variants_filter_pass,
 87                # TODO: add INFO column to SNPObject and write it to the .pvar file? (if not it's lost)
 88            }
 89        )
 90        # TODO: add header to the .pvar file, if not it's lost
 91
 92        # Write the DataFrame to a CSV string
 93        csv_data = df.write_csv(None, separator="\t")
 94
 95        if vzs:
 96            # Compress the CSV data using zstd
 97            cctx = zstd.ZstdCompressor()
 98            compressed_data = cctx.compress(csv_data.encode('utf-8'))
 99            with open(output_filename, 'wb') as f:
100                f.write(compressed_data)
101        else:
102            with open(output_filename, 'w') as f:
103                f.write(csv_data)

Writes variant data to the .pvar file.

Arguments:
  • vzs (bool, optional): If True, compresses the .pvar file using zstd and saves it as .pvar.zst. Defaults to False.
def write_psam(self):
105    def write_psam(self):
106        """
107        Writes sample metadata to the .psam file.
108        """
109        log.info(f"Writing {self.__filename}.psam")
110        df = pl.DataFrame(
111            {
112                "#IID": self.__snpobj.samples,
113                "SEX": "NA",  # Add SEX as nan for now
114                # TODO: add SEX as Optional column to SNPObject and write it to the .psam file (if not it's lost)
115            }
116        )
117        df.write_csv(f"{self.__filename}.psam", separator="\t")

Writes sample metadata to the .psam file.

def write_pgen(self):
119    def write_pgen(self):
120        """
121        Writes the genotype data to a .pgen file.
122        """
123        log.info(f"Writing to {self.__filename}.pgen")
124        summed_strands = False if self.__snpobj.calldata_gt.ndim == 3 else True
125        if not summed_strands:
126            num_variants, num_samples, num_alleles = self.__snpobj.calldata_gt.shape
127            # Flatten the genotype matrix for pgenlib
128            flat_genotypes = self.__snpobj.calldata_gt.reshape(
129                num_variants, num_samples * num_alleles
130            )
131            with pg.PgenWriter(
132                filename=f"{self.__filename}.pgen".encode('utf-8'),
133                sample_ct=num_samples,
134                variant_ct=num_variants,
135                hardcall_phase_present=True,
136            ) as writer:
137                for variant_index in range(num_variants):
138                    writer.append_alleles(
139                        flat_genotypes[variant_index].astype(np.int32), all_phased=True
140                    )
141        else:
142            num_variants, num_samples = self.__snpobj.calldata_gt.shape
143            # Transpose to (samples, variants)
144            genotypes = self.__snpobj.calldata_gt.T  # Shape is (samples, variants)
145            with pg.PgenWriter(
146                filename=f"{self.__filename}.pgen".encode('utf-8'),
147                sample_ct=num_samples,
148                variant_ct=num_variants,
149                hardcall_phase_present=False,
150            ) as writer:
151                for variant_index in range(num_variants):
152                    variant_genotypes = genotypes[:, variant_index].astype(np.int8)
153                    # Map missing genotypes to -9 if necessary
154                    variant_genotypes[variant_genotypes == -1] = -9
155                    writer.append_biallelic(np.ascontiguousarray(variant_genotypes))

Writes the genotype data to a .pgen file.

class VCFWriter:
 14class VCFWriter:
 15    """
 16    A writer class for exporting SNP data from a `snputils.snp.genobj.SNPObject` 
 17    into an `.vcf` file.
 18    """
 19    def __init__(self, snpobj: SNPObject, filename: str, n_jobs: int = -1, phased: bool = False):
 20        """
 21        Args:
 22            snpobj (SNPObject):
 23                A SNPObject instance.
 24            file (str or pathlib.Path): 
 25                Path to the file where the data will be saved. It should end with `.vcf`. 
 26                If the provided path does not have this extension, the `.vcf` extension will be appended.
 27            n_jobs: 
 28                Number of jobs to run in parallel. 
 29                - `None`: use 1 job unless within a `joblib.parallel_backend` context.  
 30                - `-1`: use all available processors.  
 31                - Any other integer: use the specified number of jobs.
 32            phased: 
 33                If True, genotype data is written in "maternal|paternal" format.  
 34                If False, genotype data is written in "maternal/paternal" format.
 35        """
 36        self.__snpobj = snpobj
 37        self.__filename = Path(filename)
 38        self.__n_jobs = n_jobs
 39        self.__phased = phased
 40
 41    def write(
 42            self,
 43            chrom_partition: bool = False,
 44            rename_missing_values: bool = True,
 45            before: Union[int, float, str] = -1,
 46            after: Union[int, float, str] = '.',
 47            variants_info: Optional[Sequence[str]] = None,
 48        ):
 49        """
 50        Writes the SNP data to VCF file(s).
 51
 52        Args:
 53            chrom_partition (bool, optional):
 54                If True, individual VCF files are generated for each chromosome.
 55                If False, a single VCF file containing data for all chromosomes is created. Defaults to False.
 56            rename_missing_values (bool, optional):
 57                If True, renames potential missing values in `snpobj.calldata_gt` before writing.
 58                Defaults to True.
 59            before (int, float, or str, default=-1):
 60                The current representation of missing values in `calldata_gt`. Common values might be -1, '.', or NaN.
 61                Default is -1.
 62            after (int, float, or str, default='.'):
 63                The value that will replace `before`. Default is '.'.
 64            variants_info (sequence of str, optional):
 65                Per-variant INFO column values (e.g. ``["END=2000", "END=3000"]``). Length must match variant count.
 66                When provided, a ##INFO header line for END is written if any value contains ``END=``.
 67        """
 68        self.__chrom_partition = chrom_partition
 69
 70        file_extensions = (".vcf", ".bcf")
 71        if self.__filename.suffix in file_extensions:
 72            self.__file_extension = self.__filename.suffix
 73            self.__filename = self.__filename.with_suffix('')
 74        else:
 75            self.__file_extension = ".vcf"
 76
 77        # Optionally rename potential missing values in `snpobj.calldata_gt` before writing
 78        if rename_missing_values:
 79            self.__snpobj.rename_missings(before=before, after=after, inplace=True)
 80
 81        data = self.__snpobj
 82
 83        if self.__chrom_partition:
 84            chroms = data.unique_chrom
 85
 86            for chrom in chroms:
 87                data_chrom = data.filter_variants(chrom=chrom, inplace=False)
 88                if variants_info is not None:
 89                    mask = data.variants_chrom == chrom
 90                    info_chrom = [variants_info[i] for i in np.where(mask)[0]]
 91                else:
 92                    info_chrom = None
 93                log.debug(f'Storing chromosome {chrom}')
 94                self._write_chromosome_data(chrom, data_chrom, info_chrom)
 95        else:
 96            self._write_chromosome_data("All", data, variants_info)
 97
 98    def _write_chromosome_data(
 99        self, chrom, data_chrom, variants_info: Optional[Sequence[str]] = None
100    ):
101        """
102        Writes the SNP data for a specific chromosome to a VCF file.
103
104        Args:
105            chrom: The chromosome name.
106            data_chrom: The SNPObject instance containing the data for the chromosome.
107            variants_info: Optional per-variant INFO strings; length must match variant count.
108        """
109        npy3 = data_chrom.calldata_gt
110        n_windows, n_samples, _ = npy3.shape
111
112        if chrom == "All":
113            file = self.__filename.with_suffix(self.__file_extension)
114        else:
115            file = self.__filename.parent / f"{self.__filename.stem}_{chrom}{self.__file_extension}"
116
117        out = open(file, "w")
118        out.write("##fileformat=VCFv4.1\n")
119        out.write('##FORMAT=<ID=GT,Number=1,Type=String,Description="Phased Genotype">\n')
120        if variants_info is not None and any("END=" in s for s in variants_info):
121            out.write('##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the segment">\n')
122        for c in set(data_chrom.variants_chrom):
123            out.write(f"##contig=<ID={c}>\n")
124        cols = ["#CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT"] + list(data_chrom.samples)
125        out.write("\t".join(cols) + "\n")
126
127        sep = "|" if self.__phased else "/"
128        for i in range(n_windows):
129            chrom_val = data_chrom.variants_chrom[i]
130            pos = data_chrom.variants_pos[i]
131            vid = data_chrom.variants_id[i]
132            ref = data_chrom.variants_ref[i]
133            alt = data_chrom.variants_alt[i]
134            info_str = variants_info[i] if variants_info is not None else "."
135            row = npy3[i]
136            genotypes = [
137                f"{row[s,0]}{sep}{row[s,1]}"
138                for s in range(n_samples)
139            ]
140            line = "\t".join([
141                str(chrom_val), str(pos), vid, ref, alt,
142                ".", "PASS", info_str, "GT", *genotypes
143            ])
144            out.write(line + "\n")
145        out.close()

A writer class for exporting SNP data from a snputils.snp.genobj.SNPObject into a .vcf file.

VCFWriter( snpobj: SNPObject, filename: str, n_jobs: int = -1, phased: bool = False)
19    def __init__(self, snpobj: SNPObject, filename: str, n_jobs: int = -1, phased: bool = False):
20        """
21        Args:
22            snpobj (SNPObject):
23                A SNPObject instance.
24            file (str or pathlib.Path): 
25                Path to the file where the data will be saved. It should end with `.vcf`. 
26                If the provided path does not have this extension, the `.vcf` extension will be appended.
27            n_jobs: 
28                Number of jobs to run in parallel. 
29                - `None`: use 1 job unless within a `joblib.parallel_backend` context.  
30                - `-1`: use all available processors.  
31                - Any other integer: use the specified number of jobs.
32            phased: 
33                If True, genotype data is written in "maternal|paternal" format.  
34                If False, genotype data is written in "maternal/paternal" format.
35        """
36        self.__snpobj = snpobj
37        self.__filename = Path(filename)
38        self.__n_jobs = n_jobs
39        self.__phased = phased
Arguments:
  • snpobj (SNPObject): A SNPObject instance.
  • filename (str or pathlib.Path): Path to the file where the data will be saved. It should end with .vcf. If the provided path does not have this extension, the .vcf extension will be appended.
  • n_jobs: Number of jobs to run in parallel.
    • None: use 1 job unless within a joblib.parallel_backend context.
    • -1: use all available processors.
    • Any other integer: use the specified number of jobs.
  • phased: If True, genotype data is written in "maternal|paternal" format.
    If False, genotype data is written in "maternal/paternal" format.
def write( self, chrom_partition: bool = False, rename_missing_values: bool = True, before: int | float | str = -1, after: int | float | str = '.', variants_info: Sequence[str] | None = None):
41    def write(
42            self,
43            chrom_partition: bool = False,
44            rename_missing_values: bool = True,
45            before: Union[int, float, str] = -1,
46            after: Union[int, float, str] = '.',
47            variants_info: Optional[Sequence[str]] = None,
48        ):
49        """
50        Writes the SNP data to VCF file(s).
51
52        Args:
53            chrom_partition (bool, optional):
54                If True, individual VCF files are generated for each chromosome.
55                If False, a single VCF file containing data for all chromosomes is created. Defaults to False.
56            rename_missing_values (bool, optional):
57                If True, renames potential missing values in `snpobj.calldata_gt` before writing.
58                Defaults to True.
59            before (int, float, or str, default=-1):
60                The current representation of missing values in `calldata_gt`. Common values might be -1, '.', or NaN.
61                Default is -1.
62            after (int, float, or str, default='.'):
63                The value that will replace `before`. Default is '.'.
64            variants_info (sequence of str, optional):
65                Per-variant INFO column values (e.g. ``["END=2000", "END=3000"]``). Length must match variant count.
66                When provided, a ##INFO header line for END is written if any value contains ``END=``.
67        """
68        self.__chrom_partition = chrom_partition
69
70        file_extensions = (".vcf", ".bcf")
71        if self.__filename.suffix in file_extensions:
72            self.__file_extension = self.__filename.suffix
73            self.__filename = self.__filename.with_suffix('')
74        else:
75            self.__file_extension = ".vcf"
76
77        # Optionally rename potential missing values in `snpobj.calldata_gt` before writing
78        if rename_missing_values:
79            self.__snpobj.rename_missings(before=before, after=after, inplace=True)
80
81        data = self.__snpobj
82
83        if self.__chrom_partition:
84            chroms = data.unique_chrom
85
86            for chrom in chroms:
87                data_chrom = data.filter_variants(chrom=chrom, inplace=False)
88                if variants_info is not None:
89                    mask = data.variants_chrom == chrom
90                    info_chrom = [variants_info[i] for i in np.where(mask)[0]]
91                else:
92                    info_chrom = None
93                log.debug(f'Storing chromosome {chrom}')
94                self._write_chromosome_data(chrom, data_chrom, info_chrom)
95        else:
96            self._write_chromosome_data("All", data, variants_info)

Writes the SNP data to VCF file(s).

Arguments:
  • chrom_partition (bool, optional): If True, individual VCF files are generated for each chromosome. If False, a single VCF file containing data for all chromosomes is created. Defaults to False.
  • rename_missing_values (bool, optional): If True, renames potential missing values in snpobj.calldata_gt before writing. Defaults to True.
  • before (int, float, or str, default=-1): The current representation of missing values in calldata_gt. Common values might be -1, '.', or NaN. Default is -1.
  • after (int, float, or str, default='.'): The value that will replace before. Default is '.'.
  • variants_info (sequence of str, optional): Per-variant INFO column values (e.g. ["END=2000", "END=3000"]). Length must match variant count. When provided, a ##INFO header line for END is written if any value contains END=.
def read_snp( filename: str | pathlib.Path, **kwargs) -> SNPObject:
def read_snp(filename: Union[str, pathlib.Path], **kwargs) -> SNPObject:
    """
    Read genotype data into a SNPObject, letting `SNPReader` pick the file format.

    Args:
        filename: Filename of the file to read.
        **kwargs: Additional arguments passed to the reader method.

    Raises:
        ValueError: If the filename does not have an extension or the extension is not supported.
    """
    # Imported here, not at module level, so the reader is only loaded when needed.
    from snputils.snp.io.read.auto import SNPReader as _AutoReader

    reader = _AutoReader(filename)
    return reader.read(**kwargs)

Automatically detect the file format and read it into a SNPObject.

Arguments:
  • filename: Filename of the file to read.
  • **kwargs: Additional arguments passed to the reader method.
Raises:
  • ValueError: If the filename does not have an extension or the extension is not supported.
def read_bed( filename: str | pathlib.Path, **kwargs) -> SNPObject:
def read_bed(filename: Union[str, pathlib.Path], **kwargs) -> SNPObject:
    """
    Load a BED fileset and return it as a SNPObject.

    Args:
        filename: Filename of the BED fileset to read.
        **kwargs: Additional arguments passed to the reader method. See :class:`snputils.snp.io.read.bed.BEDReader` for possible parameters.
    """
    # Imported here, not at module level, so the reader is only loaded when needed.
    from snputils.snp.io.read.bed import BEDReader as _BedReader

    reader = _BedReader(filename)
    return reader.read(**kwargs)

Read a BED fileset into a SNPObject.

Arguments:
  • filename: Filename of the BED fileset to read.
  • **kwargs: Additional arguments passed to the reader method. See snputils.snp.io.read.bed.BEDReader for possible parameters.
def read_pgen( filename: str | pathlib.Path, **kwargs) -> SNPObject:
def read_pgen(filename: Union[str, pathlib.Path], **kwargs) -> SNPObject:
    """
    Load a PGEN fileset into a SNPObject.

    Args:
        filename: Filename of the PGEN fileset to read.
        **kwargs: Additional arguments passed to the reader method. See :class:`snputils.snp.io.read.pgen.PGENReader` for possible parameters.

    Returns:
        The object produced by `PGENReader(filename).read(**kwargs)`.
    """
    # Imported lazily so that importing this module does not pull in the reader.
    from snputils.snp.io.read.pgen import PGENReader

    reader = PGENReader(filename)
    return reader.read(**kwargs)

Read a PGEN fileset into a SNPObject.

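Arguments:
  • filename: Filename of the PGEN fileset to read.
  • **kwargs: Additional arguments passed to the reader method. See snputils.snp.io.read.pgen.PGENReader for possible parameters.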
def read_vcf( filename: str | pathlib.Path, backend: str = 'polars', **kwargs) -> SNPObject:
def read_vcf(
    filename: Union[str, pathlib.Path],
    backend: str = 'polars',
    **kwargs
) -> SNPObject:
    """
    Load a VCF fileset into a SNPObject using the selected backend.

    A one-line message naming the file and the chosen backend is printed to
    standard output before reading.

    Args:
        filename: Filename of the VCF fileset to read.
        backend: Backend to use for reading the VCF file. Options are 'polars' or 'scikit-allel'.
            Any value other than 'polars' selects the scikit-allel reader.
        **kwargs: Additional arguments passed to the reader method. See :class:`snputils.snp.io.read.vcf.VCFReader` for possible parameters.

    Returns:
        The object produced by the selected reader's `read(**kwargs)`.
    """
    from snputils.snp.io.read.vcf import VCFReader, VCFReaderPolars

    # Everything that is not explicitly 'polars' falls through to scikit-allel.
    if backend != 'polars':
        print(f"Reading {filename} with scikit-allel backend")
        return VCFReader(filename).read(**kwargs)

    print(f"Reading {filename} with polars backend")
    return VCFReaderPolars(filename).read(**kwargs)

Read a VCF fileset into a SNPObject.

Arguments:
  • filename: Filename of the VCF fileset to read.
  • backend: Backend to use for reading the VCF file. Options are 'polars' or 'scikit-allel'.
  • **kwargs: Additional arguments passed to the reader method. See snputils.snp.io.read.vcf.VCFReader for possible parameters.
def read_grg( filename: str | pathlib.Path, **kwargs) -> GRGObject:
def read_grg(filename: Union[str, pathlib.Path], **kwargs) -> "GRGObject":
    """
    Load a GRG file into a GRGObject.

    Args:
        filename: Filename of the GRG file to read.
        **kwargs: Additional arguments passed to the reader method.

    Returns:
        The object produced by `GRGReader(filename).read(**kwargs)`.

    Raises:
        ImportError: If importing the GRG reader fails because the module
            named `pygrgl` is missing.
        ModuleNotFoundError: Re-raised unchanged when some other module is missing.
    """
    try:
        from snputils.snp.io.read.grg import GRGReader
    except ModuleNotFoundError as exc:
        # Only a missing `pygrgl` is turned into the installation hint; any
        # other missing module propagates as-is.
        if exc.name != "pygrgl":
            raise
        raise ImportError(
            "GRG support requires the optional dependency 'pygrgl'. "
            "Install it with: pip install pygrgl"
        ) from exc

    reader = GRGReader(filename)
    return reader.read(**kwargs)

Read a GRG file into a GRGObject.

Arguments:
  • filename: Filename of the GRG file to read.
  • **kwargs: Additional arguments passed to the reader method.
class LocalAncestryObject(snputils.ancestry.genobj.base.AncestryObject):
 17class LocalAncestryObject(AncestryObject):
 18    """
 19    A class for window-level Local Ancestry Inference (LAI) data.
 20    """
 21    def __init__(
 22        self,
 23        haplotypes: List[str], 
 24        lai: np.ndarray,
 25        samples: Optional[List[str]] = None, 
 26        ancestry_map: Optional[Dict[str, str]] = None, 
 27        window_sizes: Optional[np.ndarray] = None,
 28        centimorgan_pos: Optional[np.ndarray] = None,
 29        chromosomes: Optional[np.ndarray] = None,
 30        physical_pos: Optional[np.ndarray] = None
 31    ) -> None:
 32        """
 33        Args:
 34            haplotypes (list of str of length n_haplotypes):
 35                A list of unique haplotype identifiers.
 36            lai (array of shape (n_windows, n_haplotypes)): 
 37                A 2D array containing local ancestry inference values, where each row represents a 
 38                genomic window, and each column corresponds to a haplotype phase for each sample.
 39            samples (list of str of length n_samples, optional):
 40                A list of unique sample identifiers.
 41            ancestry_map (dict of str to str, optional):
 42                A dictionary mapping ancestry codes to region names.
 43            window_sizes (array of shape (n_windows,), optional): 
 44                An array specifying the number of SNPs in each genomic window.
 45            centimorgan_pos (array of shape (n_windows, 2), optional): 
 46                A 2D array containing the start and end centimorgan positions for each window.
 47            chromosomes (array of shape (n_windows,), optional): 
 48                An array with chromosome numbers corresponding to each genomic window.
 49            physical_pos (array of shape (n_windows, 2), optional): 
 50                A 2D array containing the start and end physical positions for each window.
 51        """
 52        if lai.ndim != 2:
 53            raise ValueError("`lai` must be a 2D array with shape (n_windows, n_haplotypes).")
 54        
 55        # Determine the number of unique ancestries and samples from the LAI array
 56        n_ancestries = len(np.unique(lai))
 57        n_haplotypes = lai.shape[1]
 58        n_samples = n_haplotypes // 2
 59
 60        super(LocalAncestryObject, self).__init__(n_samples, n_ancestries)
 61
 62        self.__haplotypes = haplotypes
 63        self.__lai = lai
 64        self.__window_sizes = window_sizes
 65        self.__centimorgan_pos = centimorgan_pos
 66        self.__samples = samples
 67        self.__chromosomes = chromosomes
 68        self.__physical_pos = physical_pos
 69        self.__ancestry_map = ancestry_map
 70
 71        # Perform sanity check to ensure all unique ancestries in LAI data are represented in the ancestry map
 72        self._sanity_check()
 73
 74    def __getitem__(self, key):
 75        """
 76        To access an attribute of the class using the square bracket notation,
 77        similar to a dictionary.
 78        """
 79        try:
 80            return getattr(self, key)
 81        except AttributeError:
 82            raise KeyError(f'Invalid key: {key}')
 83
 84    def __setitem__(self, key, value):
 85        """
 86        To set an attribute of the class using the square bracket notation,
 87        similar to a dictionary.
 88        """
 89        try:
 90            setattr(self, key, value)
 91        except AttributeError:
 92            raise KeyError(f'Invalid key: {key}')
 93
    @property
    def haplotypes(self) -> List[str]:
        """
        Retrieve `haplotypes`.

        Returns:
            **list of length n_haplotypes:** A list of unique haplotype identifiers.
                The stored object itself is returned, not a copy.
        """
        return self.__haplotypes
103
    @haplotypes.setter
    def haplotypes(self, x: List[str]) -> None:
        """
        Update `haplotypes`.

        Args:
            x (list of str): Replacement haplotype identifiers. The value is stored
                as given; no consistency check against `lai` happens here.
        """
        self.__haplotypes = x
110
    @property
    def lai(self) -> np.ndarray:
        """
        Retrieve `lai`.

        Returns:
            **array of shape (n_windows, n_haplotypes):**
                A 2D array containing local ancestry inference values, where each row represents a
                genomic window, and each column corresponds to a haplotype phase for each sample.
                The stored array itself is returned, not a copy.
        """
        return self.__lai
122
    @lai.setter
    def lai(self, x: np.ndarray) -> None:
        """
        Update `lai`.

        Args:
            x (array of shape (n_windows, n_haplotypes)): Replacement ancestry
                array. The value is stored as given; its shape is not checked here.
        """
        self.__lai = x
129
130    @property
131    def samples(self) -> Optional[List[str]]:
132        """
133        Retrieve `samples`.
134
135        Returns:
136            **list of str:** A list of unique sample identifiers.
137        """
138        if self.__samples is not None:
139            return self.__samples
140        elif self.__haplotypes is not None:
141            return [hap.split('.')[0] for hap in self.__haplotypes][::2]
142        else:
143            return None
144    
    @samples.setter
    def samples(self, x: Optional[List[str]]) -> None:
        """
        Update `samples`.

        Args:
            x (list of str or None): Replacement sample identifiers. The value is
                stored as given; no consistency check against `haplotypes` happens here.
        """
        self.__samples = x
151
    @property
    def ancestry_map(self) -> Optional[Dict[str, str]]:
        """
        Retrieve `ancestry_map`.

        Returns:
            **dict of str to str:** A dictionary mapping ancestry codes to region names.
                The stored object itself is returned, not a copy.
        """
        return self.__ancestry_map
161
    @ancestry_map.setter
    def ancestry_map(self, x: Optional[Dict[str, str]]) -> None:
        """
        Update `ancestry_map`.

        Args:
            x (dict of str to str or None): Replacement mapping from ancestry codes
                to region names. The value is stored as given, without validation.
        """
        self.__ancestry_map = x
168
    @property
    def window_sizes(self) -> Optional[np.ndarray]:
        """
        Retrieve `window_sizes`.

        Returns:
            **array of shape (n_windows,):**
                An array specifying the number of SNPs in each genomic window.
                The stored object itself is returned, not a copy.
        """
        return self.__window_sizes
179        
    @window_sizes.setter
    def window_sizes(self, x: Optional[np.ndarray]) -> None:
        """
        Update `window_sizes`.

        Args:
            x (array of shape (n_windows,) or None): Replacement window sizes.
                The value is stored as given; its length is not checked here.
        """
        self.__window_sizes = x
186
    @property
    def centimorgan_pos(self) -> Optional[np.ndarray]:
        """
        Retrieve `centimorgan_pos`.

        Returns:
            **array of shape (n_windows, 2):**
                A 2D array containing the start and end centimorgan positions for each window.
                The stored object itself is returned, not a copy.
        """
        return self.__centimorgan_pos
197
    @centimorgan_pos.setter
    def centimorgan_pos(self, x: Optional[np.ndarray]) -> None:
        """
        Update `centimorgan_pos`.

        Args:
            x (array of shape (n_windows, 2) or None): Replacement centimorgan
                positions. The value is stored as given; its shape is not checked here.
        """
        self.__centimorgan_pos = x
204
    @property
    def chromosomes(self) -> Optional[np.ndarray]:
        """
        Retrieve `chromosomes`.

        Returns:
            **array of shape (n_windows,):**
                An array with chromosome numbers corresponding to each genomic window.
                The stored object itself is returned, not a copy.
        """
        return self.__chromosomes
215        
    @chromosomes.setter
    def chromosomes(self, x: Optional[np.ndarray]) -> None:
        """
        Update `chromosomes`.

        Args:
            x (array of shape (n_windows,) or None): Replacement chromosome labels.
                The value is stored as given; its length is not checked here.
        """
        self.__chromosomes = x
222
    @property
    def physical_pos(self) -> Optional[np.ndarray]:
        """
        Retrieve `physical_pos`.

        Returns:
            **array of shape (n_windows, 2):**
                A 2D array containing the start and end physical positions for each window.
                The stored object itself is returned, not a copy.
        """
        return self.__physical_pos
233
    @physical_pos.setter
    def physical_pos(self, x: Optional[np.ndarray]) -> None:
        """
        Update `physical_pos`.

        Args:
            x (array of shape (n_windows, 2) or None): Replacement physical
                positions. The value is stored as given; its shape is not checked here.
        """
        self.__physical_pos = x
240
241    @property
242    def n_samples(self) -> int:
243        """
244        Retrieve `n_samples`.
245
246        Returns:
247            **int:** 
248                The total number of samples.
249        """
250        if self.__samples is not None:
251            return len(self.__samples)
252        elif self.__haplotypes is not None:
253            # Divide by 2 because each sample has two associated haplotypes
254            return len(self.__haplotypes) // 2
255        else:
256            # Divide by 2 because columns represent haplotypes
257            return self.__lai.shape[1] // 2
258
    @property
    def n_ancestries(self) -> int:
        """
        Retrieve `n_ancestries`.

        The value is recomputed on every access as the number of distinct values
        present in `lai`.

        Returns:
            **int:** The total number of unique ancestries.
        """
        return len(np.unique(self.__lai))
268    
269    @property
270    def n_haplotypes(self) -> int:
271        """
272        Retrieve `n_haplotypes`.
273
274        Returns:
275            **int:** The total number of haplotypes.
276        """
277        if self.__haplotypes is not None:
278            return len(self.__haplotypes)
279        else:
280            return self.__lai.shape[1]
281
    @property
    def n_windows(self) -> int:
        """
        Retrieve `n_windows`.

        The value is the number of rows of `lai`.

        Returns:
            **int:** The total number of genomic windows.
        """
        return self.__lai.shape[0]
291
    def copy(self) -> 'LocalAncestryObject':
        """
        Create and return a copy of `self`.

        The copy is made with `copy.copy`, i.e. it is shallow: the new instance
        refers to the same attribute objects (arrays, lists, dict) as `self` until
        those attributes are reassigned.

        Returns:
            **LocalAncestryObject:**
                A new instance of the current object.
        """
        return copy.copy(self)
301
302    def keys(self) -> List[str]:
303        """
304        Retrieve a list of public attribute names for `self`.
305
306        Returns:
307            **list of str:** 
308                A list of attribute names, with internal name-mangling removed, 
309                for easier reference to public attributes in the instance.
310        """
311        return [attr.replace('_LocalAncestryObject__', '').replace('_AncestryObject__', '') for attr in vars(self)]
312
313    def filter_windows(
314            self,
315            indexes: Union[int, Sequence[int], np.ndarray],
316            include: bool = True,
317            inplace: bool = False
318        ) -> Optional['LocalAncestryObject']:
319        """
320        Filter genomic windows based on specified indexes. 
321
322        This method updates the `lai` attribute to include or exclude the specified genomic windows. 
323        Attributes such as `window_sizes`, `centimorgan_pos`, `chromosomes`, and `physical_pos` will also be 
324        updated accordingly if they are not None. The order of genomic windows is preserved.
325
326        Negative indexes are supported and follow 
327        [NumPy's indexing conventions](https://numpy.org/doc/stable/user/basics.indexing.html). 
328
329        Args:
330            indexes (int or array-like of int): 
331                Index(es) of the windows to include or exclude. Can be a single integer or a
332                sequence of integers. Negative indexes are supported.
333            include (bool, default=True): 
334                If True, includes only the specified windows. If False, excludes the specified
335                windows. Default is True.
336            inplace (bool, default=False): 
337                If True, modifies `self` in place. If False, returns a new `LocalAncestryObject` with 
338                the windows filtered. Default is False.
339
340        Returns:
341            **Optional[LocalAncestryObject]:** 
342                A new `LocalAncestryObject` with the specified windows filtered if `inplace=False`. 
343                If `inplace=True`, modifies `self` in place and returns None.
344        """
345        # Convert indexes to a NumPy array
346        indexes = np.atleast_1d(indexes)
347
348        # Get total number of windows
349        n_windows = self.n_windows
350
351        # Validate indexes, allowing negative indexes
352        if np.any((indexes < -n_windows) | (indexes >= n_windows)):
353            raise IndexError("One or more indexes are out of bounds.")
354
355        # Create boolean mask
356        mask = np.zeros(n_windows, dtype=bool)
357        mask[indexes] = True
358
359        # Invert mask if `include=False`
360        if not include:
361            mask = ~mask
362        
363        # Filter `lai`
364        filtered_lai = self['lai'][mask, :] 
365        
366        # Filter `window_sizes`, `chromosomes`, `centimorgan_pos`, and `physical_pos`, checking if they are None before filtering
367        filtered_window_sizes = self['window_sizes'][mask] if self['window_sizes'] is not None else None
368        filtered_chromosomes = self['chromosomes'][mask] if self['chromosomes'] is not None else None
369        filtered_centimorgan_pos = self['centimorgan_pos'][mask, :] if self['centimorgan_pos'] is not None else None
370        filtered_physical_pos = self['physical_pos'][mask, :] if self['physical_pos'] is not None else None
371
372        # Modify the original object if `inplace=True`, otherwise create and return a copy
373        if inplace:
374            self['lai'] = filtered_lai
375            if filtered_window_sizes is not None:
376                self['window_sizes'] = filtered_window_sizes
377            if filtered_chromosomes is not None:
378                self['chromosomes'] = filtered_chromosomes
379            if filtered_centimorgan_pos is not None:
380                self['centimorgan_pos'] = filtered_centimorgan_pos
381            if filtered_physical_pos is not None:
382                self['physical_pos'] = filtered_physical_pos
383            return None
384        else:
385            laiobj = self.copy()
386            laiobj['lai'] = filtered_lai
387            if filtered_window_sizes is not None:
388                laiobj['window_sizes'] = filtered_window_sizes
389            if filtered_chromosomes is not None:
390                laiobj['chromosomes'] = filtered_chromosomes
391            if filtered_centimorgan_pos is not None:
392                laiobj['centimorgan_pos'] = filtered_centimorgan_pos
393            if filtered_physical_pos is not None:
394                laiobj['physical_pos'] = filtered_physical_pos
395            return laiobj
396
397    def filter_samples(
398        self,
399        samples: Optional[Union[str, Sequence[str], np.ndarray, None]] = None,
400        indexes: Optional[Union[int, Sequence[int], np.ndarray, None]] = None,
401        include: bool = True,
402        reorder: bool = False,
403        inplace: bool = False
404    ) -> Optional['LocalAncestryObject']:
405        """
406        Filter samples based on specified names or indexes.
407
408        This method updates the `lai`, `haplotypes`, and `samples` attributes to include or exclude the specified 
409        samples. Each sample is associated with two haplotypes, which are included or excluded together.
410        The order of the samples is preserved. Set `reorder=True` to match the ordering of the
411        provided `samples` and/or `indexes` lists when including.
412
413        If both samples and indexes are provided, any sample matching either a name in samples or an index in 
414        indexes will be included or excluded.
415        
416        Negative indexes are supported and follow 
417        [NumPy's indexing conventions](https://numpy.org/doc/stable/user/basics.indexing.html).
418
419        Args:
420            samples (str or array_like of str, optional): 
421                 Name(s) of the samples to include or exclude. Can be a single sample name or a
422                 sequence of sample names. Default is None.
423            indexes (int or array_like of int, optional):
424                Index(es) of the samples to include or exclude. Can be a single index or a sequence
425                of indexes. Negative indexes are supported. Default is None.
426            include (bool, default=True): 
427                If True, includes only the specified samples. If False, excludes the specified
428                samples. Default is True.
429            inplace (bool, default=False): 
430                If True, modifies `self` in place. If False, returns a new `LocalAncestryObject` with the 
431                samples filtered. Default is False.
432
433        Returns:
434            **Optional[LocalAncestryObject]:** 
435                A new `LocalAncestryObject` with the specified samples filtered if `inplace=False`. 
436                If `inplace=True`, modifies `self` in place and returns None.
437        """
438        if samples is None and indexes is None:
439            raise UserWarning("At least one of 'samples' or 'indexes' must be provided.")
440
441        n_haplotypes = self.n_haplotypes
442        n_samples = self.n_samples
443
444        # Create mask based on sample names
445        if samples is not None:
446            samples = np.asarray(samples).ravel()
447            # Extract sample names from haplotype identifiers
448            haplotype_ids = np.array(self['haplotypes'])
449            sample_names = np.array([hap.split('.')[0] for hap in haplotype_ids])
450            # Create mask for haplotypes belonging to specified samples
451            mask_samples = np.isin(sample_names, samples)
452        else:
453            mask_samples = np.zeros(n_haplotypes, dtype=bool)
454
455        # Create mask based on sample indexes
456        if indexes is not None:
457            indexes = np.asarray(indexes).ravel()
458
459            # Validate indexes, allowing negative indexes
460            out_of_bounds_indexes = indexes[(indexes < -n_samples) | (indexes >= n_samples)]
461            if out_of_bounds_indexes.size > 0:
462                raise ValueError(f"One or more sample indexes are out of bounds.")
463
464            # Adjust negative indexes
465            indexes = np.mod(indexes, n_samples)
466            
467            # Get haplotype indexes for the specified sample indexes
468            haplotype_indexes = np.concatenate([2*indexes, 2*indexes+1])
469            # Create mask for haplotypes
470            mask_indexes = np.zeros(n_haplotypes, dtype=bool)
471            mask_indexes[haplotype_indexes] = True
472        else:
473            mask_indexes = np.zeros(n_haplotypes, dtype=bool)
474
475        # Combine masks using logical OR (union of samples)
476        mask_combined = mask_samples | mask_indexes
477
478        if not include:
479            mask_combined = ~mask_combined
480
481        # Optionally compute an ordering of selected samples that follows the provided lists
482        ordered_sample_indices = None
483        sample_mask = mask_combined.reshape(-1, 2).any(axis=1)
484        if include and reorder:
485            sel_sample_indices = np.where(sample_mask)[0]
486            ordered_list: List[int] = []
487            added = np.zeros(self.n_samples, dtype=bool)
488
489            # Source of sample names for ordering logic
490            haplotype_ids = np.array(self['haplotypes'])
491            sample_names_by_sample = np.array([hap.split('.')[0] for hap in haplotype_ids])[::2]
492
493            # Respect the order in `samples`
494            if samples is not None:
495                for s in np.atleast_1d(samples):
496                    # Find the sample index by name (first occurrence)
497                    matches = np.where(sample_names_by_sample == s)[0]
498                    for idx in matches:
499                        if sample_mask[idx] and not added[idx]:
500                            ordered_list.append(int(idx))
501                            added[idx] = True
502
503            # Then respect the order in `indexes`
504            if indexes is not None:
505                adj_idx = np.mod(np.atleast_1d(indexes), self.n_samples)
506                for idx in adj_idx:
507                    if sample_mask[idx] and not added[idx]:
508                        ordered_list.append(int(idx))
509                        added[idx] = True
510
511            # Append any remaining selected samples in their original order
512            for idx in sel_sample_indices:
513                if not added[idx]:
514                    ordered_list.append(int(idx))
515
516            ordered_sample_indices = np.asarray(ordered_list, dtype=int)
517
518        # Filter / reorder arrays
519        if ordered_sample_indices is not None:
520            hap_idx = np.concatenate([2*ordered_sample_indices, 2*ordered_sample_indices + 1])
521            filtered_lai = self['lai'][:, hap_idx]
522            filtered_haplotypes = np.array(self['haplotypes'])[hap_idx].tolist()
523            filtered_samples = (
524                np.array(self['samples'])[ordered_sample_indices].tolist()
525                if self['samples'] is not None else None
526            )
527        else:
528            # Filter `lai`
529            filtered_lai = self['lai'][:, mask_combined]
530            # Filter `haplotypes`
531            filtered_haplotypes = np.array(self['haplotypes'])[mask_combined].tolist()
532            # Filter `samples`, checking if they are None before filtering
533            filtered_samples = np.array(self['samples'])[sample_mask].tolist() if self['samples'] is not None else None
534
535        if inplace:
536            self['haplotypes'] = filtered_haplotypes
537            self['samples'] = filtered_samples
538            self['lai'] = filtered_lai
539            return None
540        else:
541            laiobj = self.copy()
542            laiobj['haplotypes'] = filtered_haplotypes
543            laiobj['samples'] = filtered_samples
544            laiobj['lai'] = filtered_lai
545            return laiobj
546
547    def convert_to_snp_level(
548        self,
549        snpobject: Optional['SNPObject'] = None,
550        variants_chrom: Optional[np.ndarray] = None,
551        variants_pos: Optional[np.ndarray] = None,
552        variants_ref: Optional[np.ndarray] = None,
553        variants_alt: Optional[np.ndarray] = None,
554        variants_filter_pass: Optional[np.ndarray] = None,
555        variants_id: Optional[np.ndarray] = None,
556        variants_qual: Optional[np.ndarray] = None,
557        lai_format: str = "3D"
558    ) -> 'SNPObject':
559        """
560        Convert `self` into a `snputils.snp.genobj.SNPObject` SNP-level Local Ancestry Information (LAI), 
561        with optional support for SNP data.
562        
563        If SNP positions (`variants_pos`) and/or chromosomes (`variants_chrom`) are not specified, the method generates 
564        SNPs uniformly across the start and end positions of each genomic window. Otherwise, the provided SNP 
565        coordinates are used to assign ancestry values based on their respective windows.
566
567        If a `SNPObject` is provided, its attributes are used unless explicitly overridden by the function arguments.
568        In that case, the SNPObject is updated with the (optional) new attributes and the computed `calldata_lai`, then returned.
569
570        Args:
571            snpobject (SNPObject, optional):
572                An existing `SNPObject` to extract SNP attributes from.
573            variants_chrom (array of shape (n_snps,), optional): 
574                An array containing the chromosome for each SNP.
575            variants_pos (array of shape (n_snps,), optional): 
576                An array containing the chromosomal positions for each SNP.
577            variants_ref (array of shape (n_snps,), optional): 
578                An array containing the reference allele for each SNP.
579            variants_alt (array of shape (n_snps,), optional): 
580                An array containing the alternate allele for each SNP.
581            variants_filter_pass (array of shape (n_snps,), optional): 
582                An array indicating whether each SNP passed control checks.
583            variants_id (array of shape (n_snps,), optional): 
584                An array containing unique identifiers (IDs) for each SNP.
585            variants_qual (array of shape (n_snps,), optional): 
586                An array containing the Phred-scaled quality score for each SNP.
587            lai_format (str, optional):
588                Determines the shape of `calldata_lai`:
589                    - `"3D"` (default): Shape `(n_snps, n_samples, 2)`.
590                    - `"2D"`: Shape `(n_snps, n_samples * 2)`.
591
592        Returns:
593            **SNPObject**: 
594                A `SNPObject` containing SNP-level ancestry data and updated SNP attributes.
595        """
596        from snputils.snp.genobj.snpobj import SNPObject
597
598        assert lai_format in {"2D", "3D"}, "Invalid `lai_format`. Must be '2D' or '3D'."
599
600        # Extract attributes from SNPObject if provided
601        if snpobject is not None:
602            variants_chrom = variants_chrom or snpobject.variants_chrom
603            variants_pos = variants_pos or snpobject.variants_pos
604            variants_ref = variants_ref or snpobject.variants_ref
605            variants_alt = variants_alt or snpobject.variants_alt
606            variants_filter_pass = variants_filter_pass or snpobject.variants_filter_pass
607            variants_id = variants_id or snpobject.variants_id
608            variants_qual = variants_qual or snpobject.variants_qual
609
610        n_samples = self.n_samples
611        lai_reshaped = self.lai.reshape(self.n_windows, n_samples, 2).astype(int) if lai_format == "3D" else None
612
613        if variants_pos is None or variants_chrom is None:
614            # Generate all SNP positions and corresponding chromosome labels between window boundaries
615            variants_pos_list = []
616            variants_chrom_list = []
617            ancestry_list = []
618
619            for i in range(self.n_windows):
620                start = int(self.physical_pos[i, 0])
621                end = int(self.physical_pos[i, 1])
622                chrom = self.chromosomes[i]
623
624                # Generate SNP positions at each base pair within the window range
625                positions_in_window = np.arange(start, end + 1)
626                if positions_in_window.size == 0:
627                    continue  # Skip windows that contain no valid SNP positions
628
629                n_positions = positions_in_window.size
630                variants_pos_list.append(positions_in_window)
631                variants_chrom_list.append(np.full(n_positions, chrom))
632
633                ancestry_repeated = (
634                    np.repeat(lai_reshaped[i, np.newaxis, :, :], n_positions, axis=0)
635                    if lai_format == "3D" else np.repeat(self.lai[i, np.newaxis, :], n_positions, axis=0)
636                )
637                ancestry_list.append(ancestry_repeated)
638
639            # Store SNP positions, their corresponding chromosome labels, and their associated ancestry
640            variants_pos = np.concatenate(variants_pos_list)
641            variants_chrom = np.concatenate(variants_chrom_list)
642            calldata_lai = np.concatenate(ancestry_list)
643        else:
644            # Use the provided SNP positions and chromosomes
645            n_snps = len(variants_pos)
646            if len(variants_chrom) != n_snps:
647                raise ValueError("`variants_pos` and `variants_chrom` must have the same length.")
648
649            # Initialize an array to store the corresponding window index for each SNP
650            # Default value is -1, meaning no matching window found
651            snp_to_window_indices = np.full(n_snps, -1, dtype=int)
652
653            # Identify unique chromosome names sorted by order of appearance
654            _, idx = np.unique(variants_chrom, return_index=True)
655            unique_chroms = variants_chrom[np.sort(idx)]
656
657            # Iterate through each unique chromosome to map SNPs to windows
658            for chrom in unique_chroms:
659                # Get indices of SNPs that belong to the current chromosome
660                snp_indices = np.where(variants_chrom == chrom)[0]
661                snp_pos_chr = variants_pos[snp_indices]
662                
663                # Get indices of windows that belong to the current chromosome
664                window_indices = np.where(self.chromosomes == chrom)[0]
665                if window_indices.size == 0:
666                    continue  # Skip if no windows exist for this chromosome
667                
668                # Extract start and end positions of the windows on this chromosome
669                window_starts_chr = self.physical_pos[:, 0][window_indices]
670                window_ends_chr = self.physical_pos[:, 1][window_indices]
671                
672                # Find the right-most window that a SNP would fit into (sorted order)
673                inds = np.searchsorted(window_starts_chr, snp_pos_chr, side='right') - 1
674                
675                # Mask valid SNPs: ensure they are within a valid range and fall inside window boundaries
676                valid_mask = (inds >= 0) & (inds < len(window_starts_chr)) & (snp_pos_chr <= window_ends_chr[inds])
677
678                # Assign valid SNPs to their corresponding window indices
679                snp_to_window_indices[snp_indices[valid_mask]] = window_indices[inds[valid_mask]]
680                log.debug(f"Number of SNPs within window ranges for chromosome {chrom}: {valid_mask.sum()}")
681
682            # Initialize SNP-level ancestry array with a missing-value sentinel.
683            # `-1` marks SNPs that do not fall within any LAI window.
684            shape = (n_snps, n_samples, 2) if lai_format == "3D" else (n_snps, n_samples * 2)
685            calldata_lai = np.full(shape, -1, dtype=np.int16)
686
687            # Assign ancestry values to SNPs with valid window assignments
688            valid_mask = (snp_to_window_indices != -1)
689            snp_indices = np.where(valid_mask)[0]
690            snp_to_window_indices = snp_to_window_indices[snp_indices]
691
692            if lai_format == "3D":
693                calldata_lai[snp_indices] = lai_reshaped[snp_to_window_indices]
694            else:  # "2D"
695                calldata_lai[snp_indices] = self.lai[snp_to_window_indices]
696
697        if snpobject is not None:
698            # If a SNPObject was provided, update its attributes with any new values and add `calldata_lai``
699            snpobject.variants_chrom = variants_chrom
700            snpobject.variants_pos = variants_pos
701            snpobject.variants_ref = variants_ref
702            snpobject.variants_alt = variants_alt
703            snpobject.variants_filter_pass = variants_filter_pass
704            snpobject.variants_id = variants_id
705            snpobject.variants_qual = variants_qual
706            snpobject.calldata_lai = calldata_lai
707            snpobject.ancestry_map = self.ancestry_map
708            return snpobject
709        else:
710            # Otherwise, create a new SNPObject
711            return SNPObject(
712                calldata_lai=calldata_lai.view(),
713                samples=self.samples,
714                variants_ref=variants_ref.view() if isinstance(variants_ref, np.ndarray) else variants_ref,
715                variants_alt=variants_alt.view() if isinstance(variants_alt, np.ndarray) else variants_alt,
716                variants_filter_pass=variants_filter_pass.view() if isinstance(variants_filter_pass, np.ndarray) else variants_filter_pass,
717                variants_chrom=variants_chrom.view(),
718                variants_id=variants_id.view() if isinstance(variants_id, np.ndarray) else variants_id,
719                variants_pos=variants_pos.view(),
720                variants_qual=variants_qual.view() if isinstance(variants_qual, np.ndarray) else variants_qual,
721                ancestry_map=self.ancestry_map
722            )
723
724    def _sanity_check(self) -> None:
725        """
726        Perform sanity checks on the parsed data to ensure data integrity.
727
728        This method checks that all unique ancestries in LAI are represented 
729        in the ancestry map.
730
731        Args:
732            lai (np.ndarray): The LAI data array.
733            ancestry_map (dict, optional): A dictionary mapping ancestry codes to region names, if available.
734        """
735        # Get unique ancestries from LAI data
736        unique_ancestries = np.unique(self.lai)
737
738        if self.ancestry_map is not None:
739            # Check if all unique ancestries in the LAI are present in the ancestry map
740            for ancestry in unique_ancestries:
741                if str(ancestry) not in self.ancestry_map:
742                    warnings.warn(
743                        f"Ancestry '{ancestry}' found in LAI data is not represented in the ancestry map."
744                    )
745
746    def save(self, file: Union[str, Path]) -> None:
747        """
748        Save the data stored in `self` to a specified file.
749        If the file already exists, it will be overwritten.
750
751        The format of the saved file is determined by the file extension provided in the `file` 
752        argument.
753
754        **Supported formats:**
755
756        - `.msp`: Text-based MSP format.
757        - `.msp.tsv`: Text-based MSP format with TSV extension.
758        - `.pkl`: Pickle format for saving `self` in serialized form.
759
760        Args:
761            file (str or pathlib.Path): 
762                Path to the file where the data will be saved. The extension of the file determines the save format. 
763                Supported extensions: `.msp`, `.msp.tsv`, `.pkl`.
764        """
765        path = Path(file)
766        suffixes = [suffix.lower() for suffix in path.suffixes]
767
768        if suffixes[-2:] == ['.msp', '.tsv'] or suffixes[-1] == '.msp':
769            self.save_msp(file)
770        elif suffixes[-1] == '.pkl':
771            self.save_pickle(file)
772        else:
773            raise ValueError(
774                f"Unsupported file extension: {suffixes[-1]}"
775                "Supported extensions are: .msp, .msp.tsv, .pkl."
776            )
777
778    def save_msp(self, file: Union[str, Path]) -> None:
779        """
780        Save the data stored in `self` to a `.msp` file.
781        If the file already exists, it will be overwritten.
782
783        Args:
784            file (str or pathlib.Path): 
785                Path to the file where the data will be saved. It should end with `.msp` or `.msp.tsv`. 
786                If the provided path does not have one of these extensions, the `.msp` extension will be appended.
787        """
788        from snputils.ancestry.io.local.write import MSPWriter
789
790        MSPWriter(self, file).write()
791
792    def save_pickle(self, file: Union[str, Path]) -> None:
793        """
794        Save `self` in serialized form to a `.pkl` file.
795        If the file already exists, it will be overwritten.
796
797        Args:
798            file (str or pathlib.Path): 
799                Path to the file where the data will be saved. It should end with `.pkl`. 
800                If the provided path does not have this extension, it will be appended.
801        """
802        import pickle
803        with open(file, 'wb') as file:
804            pickle.dump(self, file)

A class for window-level Local Ancestry Inference (LAI) data.

LocalAncestryObject( haplotypes: List[str], lai: numpy.ndarray, samples: List[str] | None = None, ancestry_map: Dict[str, str] | None = None, window_sizes: numpy.ndarray | None = None, centimorgan_pos: numpy.ndarray | None = None, chromosomes: numpy.ndarray | None = None, physical_pos: numpy.ndarray | None = None)
21    def __init__(
22        self,
23        haplotypes: List[str], 
24        lai: np.ndarray,
25        samples: Optional[List[str]] = None, 
26        ancestry_map: Optional[Dict[str, str]] = None, 
27        window_sizes: Optional[np.ndarray] = None,
28        centimorgan_pos: Optional[np.ndarray] = None,
29        chromosomes: Optional[np.ndarray] = None,
30        physical_pos: Optional[np.ndarray] = None
31    ) -> None:
32        """
33        Args:
34            haplotypes (list of str of length n_haplotypes):
35                A list of unique haplotype identifiers.
36            lai (array of shape (n_windows, n_haplotypes)): 
37                A 2D array containing local ancestry inference values, where each row represents a 
38                genomic window, and each column corresponds to a haplotype phase for each sample.
39            samples (list of str of length n_samples, optional):
40                A list of unique sample identifiers.
41            ancestry_map (dict of str to str, optional):
42                A dictionary mapping ancestry codes to region names.
43            window_sizes (array of shape (n_windows,), optional): 
44                An array specifying the number of SNPs in each genomic window.
45            centimorgan_pos (array of shape (n_windows, 2), optional): 
46                A 2D array containing the start and end centimorgan positions for each window.
47            chromosomes (array of shape (n_windows,), optional): 
48                An array with chromosome numbers corresponding to each genomic window.
49            physical_pos (array of shape (n_windows, 2), optional): 
50                A 2D array containing the start and end physical positions for each window.
51        """
52        if lai.ndim != 2:
53            raise ValueError("`lai` must be a 2D array with shape (n_windows, n_haplotypes).")
54        
55        # Determine the number of unique ancestries and samples from the LAI array
56        n_ancestries = len(np.unique(lai))
57        n_haplotypes = lai.shape[1]
58        n_samples = n_haplotypes // 2
59
60        super(LocalAncestryObject, self).__init__(n_samples, n_ancestries)
61
62        self.__haplotypes = haplotypes
63        self.__lai = lai
64        self.__window_sizes = window_sizes
65        self.__centimorgan_pos = centimorgan_pos
66        self.__samples = samples
67        self.__chromosomes = chromosomes
68        self.__physical_pos = physical_pos
69        self.__ancestry_map = ancestry_map
70
71        # Perform sanity check to ensure all unique ancestries in LAI data are represented in the ancestry map
72        self._sanity_check()
Arguments:
  • haplotypes (list of str of length n_haplotypes): A list of unique haplotype identifiers.
  • lai (array of shape (n_windows, n_haplotypes)): A 2D array containing local ancestry inference values, where each row represents a genomic window, and each column corresponds to a haplotype phase for each sample.
  • samples (list of str of length n_samples, optional): A list of unique sample identifiers.
  • ancestry_map (dict of str to str, optional): A dictionary mapping ancestry codes to region names.
  • window_sizes (array of shape (n_windows,), optional): An array specifying the number of SNPs in each genomic window.
  • centimorgan_pos (array of shape (n_windows, 2), optional): A 2D array containing the start and end centimorgan positions for each window.
  • chromosomes (array of shape (n_windows,), optional): An array with chromosome numbers corresponding to each genomic window.
  • physical_pos (array of shape (n_windows, 2), optional): A 2D array containing the start and end physical positions for each window.
haplotypes: List[str]
 94    @property
 95    def haplotypes(self) -> List[str]:
 96        """
 97        Retrieve `haplotypes`.
 98
 99        Returns:
100            **list of length n_haplotypes:** A list of unique haplotype identifiers.
101        """
102        return self.__haplotypes

Retrieve haplotypes.

Returns:

list of length n_haplotypes: A list of unique haplotype identifiers.

lai: numpy.ndarray
111    @property
112    def lai(self) -> np.ndarray:
113        """
114        Retrieve `lai`.
115
116        Returns:
117            **array of shape (n_windows, n_haplotypes):** 
118                A 2D array containing local ancestry inference values, where each row represents a 
119                genomic window, and each column corresponds to a haplotype phase for each sample.
120        """
121        return self.__lai

Retrieve lai.

Returns:

array of shape (n_windows, n_haplotypes): A 2D array containing local ancestry inference values, where each row represents a genomic window, and each column corresponds to a haplotype phase for each sample.

samples: List[str] | None
130    @property
131    def samples(self) -> Optional[List[str]]:
132        """
133        Retrieve `samples`.
134
135        Returns:
136            **list of str:** A list of unique sample identifiers.
137        """
138        if self.__samples is not None:
139            return self.__samples
140        elif self.__haplotypes is not None:
141            return [hap.split('.')[0] for hap in self.__haplotypes][::2]
142        else:
143            return None

Retrieve samples.

Returns:

list of str: A list of unique sample identifiers.

ancestry_map: Dict[str, str] | None
152    @property
153    def ancestry_map(self) -> Optional[Dict[str, str]]:
154        """
155        Retrieve `ancestry_map`.
156
157        Returns:
158            **dict of str to str:** A dictionary mapping ancestry codes to region names.
159        """
160        return self.__ancestry_map

Retrieve ancestry_map.

Returns:

dict of str to str: A dictionary mapping ancestry codes to region names.

window_sizes: numpy.ndarray | None
169    @property
170    def window_sizes(self) -> Optional[np.ndarray]:
171        """
172        Retrieve `window_sizes`.
173
174        Returns:
175            **array of shape (n_windows,):** 
176                An array specifying the number of SNPs in each genomic window.
177        """
178        return self.__window_sizes

Retrieve window_sizes.

Returns:

array of shape (n_windows,): An array specifying the number of SNPs in each genomic window.

centimorgan_pos: numpy.ndarray | None
187    @property
188    def centimorgan_pos(self) -> Optional[np.ndarray]:
189        """
190        Retrieve `centimorgan_pos`.
191
192        Returns:
193            **array of shape (n_windows, 2):** 
194                A 2D array containing the start and end centimorgan positions for each window.
195        """
196        return self.__centimorgan_pos

Retrieve centimorgan_pos.

Returns:

array of shape (n_windows, 2): A 2D array containing the start and end centimorgan positions for each window.

chromosomes: numpy.ndarray | None
205    @property
206    def chromosomes(self) -> Optional[np.ndarray]:
207        """
208        Retrieve `chromosomes`.
209
210        Returns:
211            **array of shape (n_windows,):** 
212                An array with chromosome numbers corresponding to each genomic window.
213        """
214        return self.__chromosomes

Retrieve chromosomes.

Returns:

array of shape (n_windows,): An array with chromosome numbers corresponding to each genomic window.

physical_pos: numpy.ndarray | None
223    @property
224    def physical_pos(self) -> Optional[np.ndarray]:
225        """
226        Retrieve `physical_pos`.
227
228        Returns:
229            **array of shape (n_windows, 2):** 
230                A 2D array containing the start and end physical positions for each window.
231        """
232        return self.__physical_pos

Retrieve physical_pos.

Returns:

array of shape (n_windows, 2): A 2D array containing the start and end physical positions for each window.

n_samples: int
241    @property
242    def n_samples(self) -> int:
243        """
244        Retrieve `n_samples`.
245
246        Returns:
247            **int:** 
248                The total number of samples.
249        """
250        if self.__samples is not None:
251            return len(self.__samples)
252        elif self.__haplotypes is not None:
253            # Divide by 2 because each sample has two associated haplotypes
254            return len(self.__haplotypes) // 2
255        else:
256            # Divide by 2 because columns represent haplotypes
257            return self.__lai.shape[1] // 2

Retrieve n_samples.

Returns:

int: The total number of samples.

n_ancestries: int
259    @property
260    def n_ancestries(self) -> int:
261        """
262        Retrieve `n_ancestries`.
263
264        Returns:
265            **int:** The total number of unique ancestries.
266        """
267        return len(np.unique(self.__lai))

Retrieve n_ancestries.

Returns:

int: The total number of unique ancestries.

n_haplotypes: int
269    @property
270    def n_haplotypes(self) -> int:
271        """
272        Retrieve `n_haplotypes`.
273
274        Returns:
275            **int:** The total number of haplotypes.
276        """
277        if self.__haplotypes is not None:
278            return len(self.__haplotypes)
279        else:
280            return self.__lai.shape[1]

Retrieve n_haplotypes.

Returns:

int: The total number of haplotypes.

n_windows: int
282    @property
283    def n_windows(self) -> int:
284        """
285        Retrieve `n_windows`.
286
287        Returns:
288            **int:** The total number of genomic windows.
289        """
290        return self.__lai.shape[0]

Retrieve n_windows.

Returns:

int: The total number of genomic windows.

def copy(self) -> LocalAncestryObject:
292    def copy(self) -> 'LocalAncestryObject':
293        """
294        Create and return a copy of `self`.
295
296        Returns:
297            **LocalAncestryObject:** 
298                A new instance of the current object.
299        """
300        return copy.copy(self)

Create and return a copy of self.

Returns:

LocalAncestryObject: A new instance of the current object.

def keys(self) -> List[str]:
302    def keys(self) -> List[str]:
303        """
304        Retrieve a list of public attribute names for `self`.
305
306        Returns:
307            **list of str:** 
308                A list of attribute names, with internal name-mangling removed, 
309                for easier reference to public attributes in the instance.
310        """
311        return [attr.replace('_LocalAncestryObject__', '').replace('_AncestryObject__', '') for attr in vars(self)]

Retrieve a list of public attribute names for self.

Returns:

list of str: A list of attribute names, with internal name-mangling removed, for easier reference to public attributes in the instance.

def filter_windows( self, indexes: int | Sequence[int] | numpy.ndarray, include: bool = True, inplace: bool = False) -> LocalAncestryObject | None:
313    def filter_windows(
314            self,
315            indexes: Union[int, Sequence[int], np.ndarray],
316            include: bool = True,
317            inplace: bool = False
318        ) -> Optional['LocalAncestryObject']:
319        """
320        Filter genomic windows based on specified indexes. 
321
322        This method updates the `lai` attribute to include or exclude the specified genomic windows. 
323        Attributes such as `window_sizes`, `centimorgan_pos`, `chromosomes`, and `physical_pos` will also be 
324        updated accordingly if they are not None. The order of genomic windows is preserved.
325
326        Negative indexes are supported and follow 
327        [NumPy's indexing conventions](https://numpy.org/doc/stable/user/basics.indexing.html). 
328
329        Args:
330            indexes (int or array-like of int): 
331                Index(es) of the windows to include or exclude. Can be a single integer or a
332                sequence of integers. Negative indexes are supported.
333            include (bool, default=True): 
334                If True, includes only the specified windows. If False, excludes the specified
335                windows. Default is True.
336            inplace (bool, default=False): 
337                If True, modifies `self` in place. If False, returns a new `LocalAncestryObject` with 
338                the windows filtered. Default is False.
339
340        Returns:
341            **Optional[LocalAncestryObject]:** 
342                A new `LocalAncestryObject` with the specified windows filtered if `inplace=False`. 
343                If `inplace=True`, modifies `self` in place and returns None.
344        """
345        # Convert indexes to a NumPy array
346        indexes = np.atleast_1d(indexes)
347
348        # Get total number of windows
349        n_windows = self.n_windows
350
351        # Validate indexes, allowing negative indexes
352        if np.any((indexes < -n_windows) | (indexes >= n_windows)):
353            raise IndexError("One or more indexes are out of bounds.")
354
355        # Create boolean mask
356        mask = np.zeros(n_windows, dtype=bool)
357        mask[indexes] = True
358
359        # Invert mask if `include=False`
360        if not include:
361            mask = ~mask
362        
363        # Filter `lai`
364        filtered_lai = self['lai'][mask, :] 
365        
366        # Filter `window_sizes`, `chromosomes`, `centimorgan_pos`, and `physical_pos`, checking if they are None before filtering
367        filtered_window_sizes = self['window_sizes'][mask] if self['window_sizes'] is not None else None
368        filtered_chromosomes = self['chromosomes'][mask] if self['chromosomes'] is not None else None
369        filtered_centimorgan_pos = self['centimorgan_pos'][mask, :] if self['centimorgan_pos'] is not None else None
370        filtered_physical_pos = self['physical_pos'][mask, :] if self['physical_pos'] is not None else None
371
372        # Modify the original object if `inplace=True`, otherwise create and return a copy
373        if inplace:
374            self['lai'] = filtered_lai
375            if filtered_window_sizes is not None:
376                self['window_sizes'] = filtered_window_sizes
377            if filtered_chromosomes is not None:
378                self['chromosomes'] = filtered_chromosomes
379            if filtered_centimorgan_pos is not None:
380                self['centimorgan_pos'] = filtered_centimorgan_pos
381            if filtered_physical_pos is not None:
382                self['physical_pos'] = filtered_physical_pos
383            return None
384        else:
385            laiobj = self.copy()
386            laiobj['lai'] = filtered_lai
387            if filtered_window_sizes is not None:
388                laiobj['window_sizes'] = filtered_window_sizes
389            if filtered_chromosomes is not None:
390                laiobj['chromosomes'] = filtered_chromosomes
391            if filtered_centimorgan_pos is not None:
392                laiobj['centimorgan_pos'] = filtered_centimorgan_pos
393            if filtered_physical_pos is not None:
394                laiobj['physical_pos'] = filtered_physical_pos
395            return laiobj

Filter genomic windows based on specified indexes.

This method updates the lai attribute to include or exclude the specified genomic windows. Attributes such as window_sizes, centimorgan_pos, chromosomes, and physical_pos will also be updated accordingly if they are not None. The order of genomic windows is preserved.

Negative indexes are supported and follow NumPy's indexing conventions.

Arguments:
  • indexes (int or array-like of int): Index(es) of the windows to include or exclude. Can be a single integer or a sequence of integers. Negative indexes are supported.
  • include (bool, default=True): If True, includes only the specified windows. If False, excludes the specified windows. Default is True.
  • inplace (bool, default=False): If True, modifies self in place. If False, returns a new LocalAncestryObject with the windows filtered. Default is False.
Returns:

Optional[LocalAncestryObject]: A new LocalAncestryObject with the specified windows filtered if inplace=False. If inplace=True, modifies self in place and returns None.

def filter_samples( self, samples: str | Sequence[str] | numpy.ndarray | None = None, indexes: int | Sequence[int] | numpy.ndarray | None = None, include: bool = True, reorder: bool = False, inplace: bool = False) -> LocalAncestryObject | None:
397    def filter_samples(
398        self,
399        samples: Optional[Union[str, Sequence[str], np.ndarray, None]] = None,
400        indexes: Optional[Union[int, Sequence[int], np.ndarray, None]] = None,
401        include: bool = True,
402        reorder: bool = False,
403        inplace: bool = False
404    ) -> Optional['LocalAncestryObject']:
405        """
406        Filter samples based on specified names or indexes.
407
408        This method updates the `lai`, `haplotypes`, and `samples` attributes to include or exclude the specified 
409        samples. Each sample is associated with two haplotypes, which are included or excluded together.
410        The order of the samples is preserved. Set `reorder=True` to match the ordering of the
411        provided `samples` and/or `indexes` lists when including.
412
413        If both samples and indexes are provided, any sample matching either a name in samples or an index in 
414        indexes will be included or excluded.
415        
416        Negative indexes are supported and follow 
417        [NumPy's indexing conventions](https://numpy.org/doc/stable/user/basics.indexing.html).
418
419        Args:
420            samples (str or array_like of str, optional): 
421                 Name(s) of the samples to include or exclude. Can be a single sample name or a
422                 sequence of sample names. Default is None.
423            indexes (int or array_like of int, optional):
424                Index(es) of the samples to include or exclude. Can be a single index or a sequence
425                of indexes. Negative indexes are supported. Default is None.
426            include (bool, default=True): 
427                If True, includes only the specified samples. If False, excludes the specified
428                samples. Default is True.
429            inplace (bool, default=False): 
430                If True, modifies `self` in place. If False, returns a new `LocalAncestryObject` with the 
431                samples filtered. Default is False.
432
433        Returns:
434            **Optional[LocalAncestryObject]:** 
435                A new `LocalAncestryObject` with the specified samples filtered if `inplace=False`. 
436                If `inplace=True`, modifies `self` in place and returns None.
437        """
438        if samples is None and indexes is None:
439            raise UserWarning("At least one of 'samples' or 'indexes' must be provided.")
440
441        n_haplotypes = self.n_haplotypes
442        n_samples = self.n_samples
443
444        # Create mask based on sample names
445        if samples is not None:
446            samples = np.asarray(samples).ravel()
447            # Extract sample names from haplotype identifiers
448            haplotype_ids = np.array(self['haplotypes'])
449            sample_names = np.array([hap.split('.')[0] for hap in haplotype_ids])
450            # Create mask for haplotypes belonging to specified samples
451            mask_samples = np.isin(sample_names, samples)
452        else:
453            mask_samples = np.zeros(n_haplotypes, dtype=bool)
454
455        # Create mask based on sample indexes
456        if indexes is not None:
457            indexes = np.asarray(indexes).ravel()
458
459            # Validate indexes, allowing negative indexes
460            out_of_bounds_indexes = indexes[(indexes < -n_samples) | (indexes >= n_samples)]
461            if out_of_bounds_indexes.size > 0:
462                raise ValueError(f"One or more sample indexes are out of bounds.")
463
464            # Adjust negative indexes
465            indexes = np.mod(indexes, n_samples)
466            
467            # Get haplotype indexes for the specified sample indexes
468            haplotype_indexes = np.concatenate([2*indexes, 2*indexes+1])
469            # Create mask for haplotypes
470            mask_indexes = np.zeros(n_haplotypes, dtype=bool)
471            mask_indexes[haplotype_indexes] = True
472        else:
473            mask_indexes = np.zeros(n_haplotypes, dtype=bool)
474
475        # Combine masks using logical OR (union of samples)
476        mask_combined = mask_samples | mask_indexes
477
478        if not include:
479            mask_combined = ~mask_combined
480
481        # Optionally compute an ordering of selected samples that follows the provided lists
482        ordered_sample_indices = None
483        sample_mask = mask_combined.reshape(-1, 2).any(axis=1)
484        if include and reorder:
485            sel_sample_indices = np.where(sample_mask)[0]
486            ordered_list: List[int] = []
487            added = np.zeros(self.n_samples, dtype=bool)
488
489            # Source of sample names for ordering logic
490            haplotype_ids = np.array(self['haplotypes'])
491            sample_names_by_sample = np.array([hap.split('.')[0] for hap in haplotype_ids])[::2]
492
493            # Respect the order in `samples`
494            if samples is not None:
495                for s in np.atleast_1d(samples):
496                    # Find the sample index by name (first occurrence)
497                    matches = np.where(sample_names_by_sample == s)[0]
498                    for idx in matches:
499                        if sample_mask[idx] and not added[idx]:
500                            ordered_list.append(int(idx))
501                            added[idx] = True
502
503            # Then respect the order in `indexes`
504            if indexes is not None:
505                adj_idx = np.mod(np.atleast_1d(indexes), self.n_samples)
506                for idx in adj_idx:
507                    if sample_mask[idx] and not added[idx]:
508                        ordered_list.append(int(idx))
509                        added[idx] = True
510
511            # Append any remaining selected samples in their original order
512            for idx in sel_sample_indices:
513                if not added[idx]:
514                    ordered_list.append(int(idx))
515
516            ordered_sample_indices = np.asarray(ordered_list, dtype=int)
517
518        # Filter / reorder arrays
519        if ordered_sample_indices is not None:
520            hap_idx = np.concatenate([2*ordered_sample_indices, 2*ordered_sample_indices + 1])
521            filtered_lai = self['lai'][:, hap_idx]
522            filtered_haplotypes = np.array(self['haplotypes'])[hap_idx].tolist()
523            filtered_samples = (
524                np.array(self['samples'])[ordered_sample_indices].tolist()
525                if self['samples'] is not None else None
526            )
527        else:
528            # Filter `lai`
529            filtered_lai = self['lai'][:, mask_combined]
530            # Filter `haplotypes`
531            filtered_haplotypes = np.array(self['haplotypes'])[mask_combined].tolist()
532            # Filter `samples`, checking if they are None before filtering
533            filtered_samples = np.array(self['samples'])[sample_mask].tolist() if self['samples'] is not None else None
534
535        if inplace:
536            self['haplotypes'] = filtered_haplotypes
537            self['samples'] = filtered_samples
538            self['lai'] = filtered_lai
539            return None
540        else:
541            laiobj = self.copy()
542            laiobj['haplotypes'] = filtered_haplotypes
543            laiobj['samples'] = filtered_samples
544            laiobj['lai'] = filtered_lai
545            return laiobj

Filter samples based on specified names or indexes.

This method updates the lai, haplotypes, and samples attributes to include or exclude the specified samples. Each sample is associated with two haplotypes, which are included or excluded together. The order of the samples is preserved. Set reorder=True to match the ordering of the provided samples and/or indexes lists when including.

If both samples and indexes are provided, any sample matching either a name in samples or an index in indexes will be included or excluded.

Negative indexes are supported and follow NumPy's indexing conventions.

Arguments:
  • samples (str or array_like of str, optional): Name(s) of the samples to include or exclude. Can be a single sample name or a sequence of sample names. Default is None.
  • indexes (int or array_like of int, optional): Index(es) of the samples to include or exclude. Can be a single index or a sequence of indexes. Negative indexes are supported. Default is None.
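  • include (bool, default=True): If True, includes only the specified samples. If False, excludes the specified samples. Default is True.
  • reorder (bool, default=False): Only used when include=True. If True, the selected samples are ordered as listed in samples first, then as listed in indexes, followed by any remaining selected samples in their original order. Default is False.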
  • inplace (bool, default=False): If True, modifies self in place. If False, returns a new LocalAncestryObject with the samples filtered. Default is False.
Returns:

Optional[LocalAncestryObject]: A new LocalAncestryObject with the specified samples filtered if inplace=False. If inplace=True, modifies self in place and returns None.

def convert_to_snp_level( self, snpobject: SNPObject | None = None, variants_chrom: numpy.ndarray | None = None, variants_pos: numpy.ndarray | None = None, variants_ref: numpy.ndarray | None = None, variants_alt: numpy.ndarray | None = None, variants_filter_pass: numpy.ndarray | None = None, variants_id: numpy.ndarray | None = None, variants_qual: numpy.ndarray | None = None, lai_format: str = '3D') -> SNPObject:
547    def convert_to_snp_level(
548        self,
549        snpobject: Optional['SNPObject'] = None,
550        variants_chrom: Optional[np.ndarray] = None,
551        variants_pos: Optional[np.ndarray] = None,
552        variants_ref: Optional[np.ndarray] = None,
553        variants_alt: Optional[np.ndarray] = None,
554        variants_filter_pass: Optional[np.ndarray] = None,
555        variants_id: Optional[np.ndarray] = None,
556        variants_qual: Optional[np.ndarray] = None,
557        lai_format: str = "3D"
558    ) -> 'SNPObject':
559        """
560        Convert `self` into a `snputils.snp.genobj.SNPObject` SNP-level Local Ancestry Information (LAI), 
561        with optional support for SNP data.
562        
563        If SNP positions (`variants_pos`) and/or chromosomes (`variants_chrom`) are not specified, the method generates 
564        SNPs uniformly across the start and end positions of each genomic window. Otherwise, the provided SNP 
565        coordinates are used to assign ancestry values based on their respective windows.
566
567        If a `SNPObject` is provided, its attributes are used unless explicitly overridden by the function arguments.
568        In that case, the SNPObject is updated with the (optional) new attributes and the computed `calldata_lai`, then returned.
569
570        Args:
571            snpobject (SNPObject, optional):
572                An existing `SNPObject` to extract SNP attributes from.
573            variants_chrom (array of shape (n_snps,), optional): 
574                An array containing the chromosome for each SNP.
575            variants_pos (array of shape (n_snps,), optional): 
576                An array containing the chromosomal positions for each SNP.
577            variants_ref (array of shape (n_snps,), optional): 
578                An array containing the reference allele for each SNP.
579            variants_alt (array of shape (n_snps,), optional): 
580                An array containing the alternate allele for each SNP.
581            variants_filter_pass (array of shape (n_snps,), optional): 
582                An array indicating whether each SNP passed control checks.
583            variants_id (array of shape (n_snps,), optional): 
584                An array containing unique identifiers (IDs) for each SNP.
585            variants_qual (array of shape (n_snps,), optional): 
586                An array containing the Phred-scaled quality score for each SNP.
587            lai_format (str, optional):
588                Determines the shape of `calldata_lai`:
589                    - `"3D"` (default): Shape `(n_snps, n_samples, 2)`.
590                    - `"2D"`: Shape `(n_snps, n_samples * 2)`.
591
592        Returns:
593            **SNPObject**: 
594                A `SNPObject` containing SNP-level ancestry data and updated SNP attributes.
595        """
596        from snputils.snp.genobj.snpobj import SNPObject
597
598        assert lai_format in {"2D", "3D"}, "Invalid `lai_format`. Must be '2D' or '3D'."
599
600        # Extract attributes from SNPObject if provided
601        if snpobject is not None:
602            variants_chrom = variants_chrom or snpobject.variants_chrom
603            variants_pos = variants_pos or snpobject.variants_pos
604            variants_ref = variants_ref or snpobject.variants_ref
605            variants_alt = variants_alt or snpobject.variants_alt
606            variants_filter_pass = variants_filter_pass or snpobject.variants_filter_pass
607            variants_id = variants_id or snpobject.variants_id
608            variants_qual = variants_qual or snpobject.variants_qual
609
610        n_samples = self.n_samples
611        lai_reshaped = self.lai.reshape(self.n_windows, n_samples, 2).astype(int) if lai_format == "3D" else None
612
613        if variants_pos is None or variants_chrom is None:
614            # Generate all SNP positions and corresponding chromosome labels between window boundaries
615            variants_pos_list = []
616            variants_chrom_list = []
617            ancestry_list = []
618
619            for i in range(self.n_windows):
620                start = int(self.physical_pos[i, 0])
621                end = int(self.physical_pos[i, 1])
622                chrom = self.chromosomes[i]
623
624                # Generate SNP positions at each base pair within the window range
625                positions_in_window = np.arange(start, end + 1)
626                if positions_in_window.size == 0:
627                    continue  # Skip windows that contain no valid SNP positions
628
629                n_positions = positions_in_window.size
630                variants_pos_list.append(positions_in_window)
631                variants_chrom_list.append(np.full(n_positions, chrom))
632
633                ancestry_repeated = (
634                    np.repeat(lai_reshaped[i, np.newaxis, :, :], n_positions, axis=0)
635                    if lai_format == "3D" else np.repeat(self.lai[i, np.newaxis, :], n_positions, axis=0)
636                )
637                ancestry_list.append(ancestry_repeated)
638
639            # Store SNP positions, their corresponding chromosome labels, and their associated ancestry
640            variants_pos = np.concatenate(variants_pos_list)
641            variants_chrom = np.concatenate(variants_chrom_list)
642            calldata_lai = np.concatenate(ancestry_list)
643        else:
644            # Use the provided SNP positions and chromosomes
645            n_snps = len(variants_pos)
646            if len(variants_chrom) != n_snps:
647                raise ValueError("`variants_pos` and `variants_chrom` must have the same length.")
648
649            # Initialize an array to store the corresponding window index for each SNP
650            # Default value is -1, meaning no matching window found
651            snp_to_window_indices = np.full(n_snps, -1, dtype=int)
652
653            # Identify unique chromosome names sorted by order of appearence
654            _, idx = np.unique(variants_chrom, return_index=True)
655            unique_chroms = variants_chrom[np.sort(idx)]
656
657            # Iterate through each unique chromosome to map SNPs to windows
658            for chrom in unique_chroms:
659                # Get indices of SNPs that belong to the current chromosome
660                snp_indices = np.where(variants_chrom == chrom)[0]
661                snp_pos_chr = variants_pos[snp_indices]
662                
663                # Get indices of windows that belong to the current chromosome
664                window_indices = np.where(self.chromosomes == chrom)[0]
665                if window_indices.size == 0:
666                    continue  # Skip if no windows exist for this chromosome
667                
668                # Extract start and end positions of the windows on this chromosome
669                window_starts_chr = self.physical_pos[:, 0][window_indices]
670                window_ends_chr = self.physical_pos[:, 1][window_indices]
671                
672                # Find the right-most window that a SNP would fit into (sorted order)
673                inds = np.searchsorted(window_starts_chr, snp_pos_chr, side='right') - 1
674                
675                # Mask valid SNPs: ensure they are within a valid range and fall inside window boundaries
676                valid_mask = (inds >= 0) & (inds < len(window_starts_chr)) & (snp_pos_chr <= window_ends_chr[inds])
677
678                # Assign valid SNPs to their corresponding window indices
679                snp_to_window_indices[snp_indices[valid_mask]] = window_indices[inds[valid_mask]]
680                log.debug(f"Number of SNPs within window ranges for chromosome {chrom}: {valid_mask.sum()}")
681
682            # Initialize SNP-level ancestry array with a missing-value sentinel.
683            # `-1` marks SNPs that do not fall within any LAI window.
684            shape = (n_snps, n_samples, 2) if lai_format == "3D" else (n_snps, n_samples * 2)
685            calldata_lai = np.full(shape, -1, dtype=np.int16)
686
687            # Assign ancestry values to SNPs with valid window assignments
688            valid_mask = (snp_to_window_indices != -1)
689            snp_indices = np.where(valid_mask)[0]
690            snp_to_window_indices = snp_to_window_indices[snp_indices]
691
692            if lai_format == "3D":
693                calldata_lai[snp_indices] = lai_reshaped[snp_to_window_indices]
694            else:  # "2D"
695                calldata_lai[snp_indices] = self.lai[snp_to_window_indices]
696
697        if snpobject is not None:
698            # If a SNPObject was provided, update its attributes with any new values and add `calldata_lai``
699            snpobject.variants_chrom = variants_chrom
700            snpobject.variants_pos = variants_pos
701            snpobject.variants_ref = variants_ref
702            snpobject.variants_alt = variants_alt
703            snpobject.variants_filter_pass = variants_filter_pass
704            snpobject.variants_id = variants_id
705            snpobject.variants_qual = variants_qual
706            snpobject.calldata_lai = calldata_lai
707            snpobject.ancestry_map = self.ancestry_map
708            return snpobject
709        else:
710            # Otherwise, create a new SNPObject
711            return SNPObject(
712                calldata_lai=calldata_lai.view(),
713                samples=self.samples,
714                variants_ref=variants_ref.view() if isinstance(variants_ref, np.ndarray) else variants_ref,
715                variants_alt=variants_alt.view() if isinstance(variants_alt, np.ndarray) else variants_alt,
716                variants_filter_pass=variants_filter_pass.view() if isinstance(variants_filter_pass, np.ndarray) else variants_filter_pass,
717                variants_chrom=variants_chrom.view(),
718                variants_id=variants_id.view() if isinstance(variants_id, np.ndarray) else variants_id,
719                variants_pos=variants_pos.view(),
720                variants_qual=variants_qual.view() if isinstance(variants_qual, np.ndarray) else variants_qual,
721                ancestry_map=self.ancestry_map
722            )

Convert self into a snputils.snp.genobj.SNPObject carrying SNP-level Local Ancestry Information (LAI), with optional support for SNP data.

If SNP positions (variants_pos) and/or chromosomes (variants_chrom) are not specified, the method generates one SNP at every base-pair position from the start to the end (inclusive) of each genomic window. Otherwise, the provided SNP coordinates are used to assign ancestry values based on their respective windows.

If a SNPObject is provided, its attributes are used unless explicitly overridden by the function arguments. In that case, the SNPObject is updated with the (optional) new attributes and the computed calldata_lai, then returned.

Arguments:
  • snpobject (SNPObject, optional): An existing SNPObject to extract SNP attributes from.
  • variants_chrom (array of shape (n_snps,), optional): An array containing the chromosome for each SNP.
  • variants_pos (array of shape (n_snps,), optional): An array containing the chromosomal positions for each SNP.
  • variants_ref (array of shape (n_snps,), optional): An array containing the reference allele for each SNP.
  • variants_alt (array of shape (n_snps,), optional): An array containing the alternate allele for each SNP.
  • variants_filter_pass (array of shape (n_snps,), optional): An array indicating whether each SNP passed control checks.
  • variants_id (array of shape (n_snps,), optional): An array containing unique identifiers (IDs) for each SNP.
  • variants_qual (array of shape (n_snps,), optional): An array containing the Phred-scaled quality score for each SNP.
  • lai_format (str, optional): Determines the shape of calldata_lai:
    • "3D" (default): Shape (n_snps, n_samples, 2).
    • "2D": Shape (n_snps, n_samples * 2).
Returns:

SNPObject: A SNPObject containing SNP-level ancestry data and updated SNP attributes.

def save(self, file: str | pathlib.Path) -> None:
746    def save(self, file: Union[str, Path]) -> None:
747        """
748        Save the data stored in `self` to a specified file.
749        If the file already exists, it will be overwritten.
750
751        The format of the saved file is determined by the file extension provided in the `file` 
752        argument.
753
754        **Supported formats:**
755
756        - `.msp`: Text-based MSP format.
757        - `.msp.tsv`: Text-based MSP format with TSV extension.
758        - `.pkl`: Pickle format for saving `self` in serialized form.
759
760        Args:
761            file (str or pathlib.Path): 
762                Path to the file where the data will be saved. The extension of the file determines the save format. 
763                Supported extensions: `.msp`, `.msp.tsv`, `.pkl`.
764        """
765        path = Path(file)
766        suffixes = [suffix.lower() for suffix in path.suffixes]
767
768        if suffixes[-2:] == ['.msp', '.tsv'] or suffixes[-1] == '.msp':
769            self.save_msp(file)
770        elif suffixes[-1] == '.pkl':
771            self.save_pickle(file)
772        else:
773            raise ValueError(
774                f"Unsupported file extension: {suffixes[-1]}"
775                "Supported extensions are: .msp, .msp.tsv, .pkl."
776            )

Save the data stored in self to a specified file. If the file already exists, it will be overwritten.

The format of the saved file is determined by the file extension provided in the file argument.

Supported formats:

  • .msp: Text-based MSP format.
  • .msp.tsv: Text-based MSP format with TSV extension.
  • .pkl: Pickle format for saving self in serialized form.
Arguments:
  • file (str or pathlib.Path): Path to the file where the data will be saved. The extension of the file determines the save format. Supported extensions: .msp, .msp.tsv, .pkl.
def save_msp(self, file: str | pathlib.Path) -> None:
778    def save_msp(self, file: Union[str, Path]) -> None:
779        """
780        Save the data stored in `self` to a `.msp` file.
781        If the file already exists, it will be overwritten.
782
783        Args:
784            file (str or pathlib.Path): 
785                Path to the file where the data will be saved. It should end with `.msp` or `.msp.tsv`. 
786                If the provided path does not have one of these extensions, the `.msp` extension will be appended.
787        """
788        from snputils.ancestry.io.local.write import MSPWriter
789
790        MSPWriter(self, file).write()

Save the data stored in self to a .msp file. If the file already exists, it will be overwritten.

Arguments:
  • file (str or pathlib.Path): Path to the file where the data will be saved. It should end with .msp or .msp.tsv. If the provided path does not have one of these extensions, the .msp extension will be appended.
def save_pickle(self, file: str | pathlib.Path) -> None:
792    def save_pickle(self, file: Union[str, Path]) -> None:
793        """
794        Save `self` in serialized form to a `.pkl` file.
795        If the file already exists, it will be overwritten.
796
797        Args:
798            file (str or pathlib.Path): 
799                Path to the file where the data will be saved. It should end with `.pkl`. 
800                If the provided path does not have this extension, it will be appended.
801        """
802        import pickle
803        with open(file, 'wb') as file:
804            pickle.dump(self, file)

Save self in serialized form to a .pkl file. If the file already exists, it will be overwritten.

Arguments:
  • file (str or pathlib.Path): Path to the file where the data will be saved. It should end with .pkl. The path is used exactly as given; no extension is appended if it is missing.
class GlobalAncestryObject(snputils.ancestry.genobj.base.AncestryObject):
 10class GlobalAncestryObject(AncestryObject):
 11    """
 12    A class for Global Ancestry Inference (GAI) data.
 13    """
 14    def __init__(
 15        self,
 16        Q: np.ndarray,
 17        P: Optional[np.ndarray] = None,
 18        samples: Optional[Sequence] = None,
 19        snps: Optional[Sequence] = None,
 20        ancestries: Optional[Sequence] = None
 21    ) -> None:
 22        """
 23        Args:
 24            Q (array of shape (n_samples, n_ancestries)):
 25                A 2D array containing per-sample ancestry proportions. Each row corresponds to a sample,
 26                and each column corresponds to an ancestry.
 27            P (array of shape (n_snps, n_ancestries)):
 28                A 2D array containing per-ancestry SNP frequencies. Each row corresponds to a SNP,
 29                and each column corresponds to an ancestry.
 30            samples (sequence of length n_samples, optional):
 31                A sequence containing unique identifiers for each sample. If None, sample identifiers 
 32                are assigned as integers from `0` to `n_samples - 1`.
 33            snps (sequence of length n_snps, optional):
 34                A sequence containing identifiers for each SNP. If None, SNPs are assigned as integers 
 35                from `0` to `n_snps - 1`.
 36            ancestries (sequence of length n_samples, optional):
 37                A sequence containing ancestry labels for each sample.
 38        """
 39        # Determine dimensions
 40        n_samples, n_ancestries_Q = Q.shape
 41        if P is not None:
 42            n_snps, n_ancestries_P = P.shape
 43            if n_ancestries_Q != n_ancestries_P:
 44                raise ValueError(
 45                    f"The number of ancestries in Q ({n_ancestries_Q}) and P ({n_ancestries_P}) must be the same."
 46                )
 47
 48        n_ancestries = n_ancestries_Q
 49
 50        # Assign default sample identifiers if none provided
 51        if samples is None:
 52            samples = list(range(n_samples))
 53        else:
 54            samples = list(samples)
 55            if len(samples) != n_samples:
 56                raise ValueError(
 57                    f"Length of samples ({len(samples)}) does not match number of samples ({n_samples})."
 58                )
 59
 60        # Assign default SNP identifiers if none provided
 61        if P is None:
 62            snps = None
 63        else:
 64            if snps is None:
 65                snps = list(range(n_snps))
 66            else:
 67                snps = list(snps)
 68                if len(snps) != n_snps:
 69                    raise ValueError(
 70                        f"Length of snps ({len(snps)}) does not match number of SNPs ({n_snps})."
 71                    )
 72
 73        if ancestries is not None:
 74            if len(ancestries) != n_samples:
 75                raise ValueError(
 76                    f"Length of ancestries ({len(ancestries)}) does not match number of samples ({n_samples})."
 77                )
 78
 79        super().__init__(n_samples, n_ancestries)
 80
 81        # Store attributes
 82        self.__Q = Q
 83        self.__P = P
 84        self.__samples = np.asarray(samples)
 85        self.__snps = np.asarray(snps) if snps is not None else None
 86        self.__ancestries = np.asarray(ancestries) if ancestries is not None else None
 87
 88        # Perform sanity checks
 89        self._sanity_check()
 90
 91    def __getitem__(self, key):
 92        """
 93        To access an attribute of the class using the square bracket notation,
 94        similar to a dictionary.
 95        """
 96        try:
 97            return getattr(self, key)
 98        except AttributeError:
 99            raise KeyError(f'Invalid key: {key}')
100
101    def __setitem__(self, key, value):
102        """
103        To set an attribute of the class using the square bracket notation,
104        similar to a dictionary.
105        """
106        try:
107            setattr(self, key, value)
108        except AttributeError:
109            raise KeyError(f'Invalid key: {key}')
110
111    @property
112    def Q(self) -> np.ndarray:
113        """
114        Retrieve `Q`.
115
116        Returns:
117            **array of shape (n_samples, n_ancestries):** 
118                A 2D array containing per-sample ancestry proportions. Each row corresponds to a sample,
119                and each column corresponds to an ancestry.
120        """
121        return self.__Q
122    
123    @Q.setter
124    def Q(self, x: np.ndarray):
125        """
126        Update `Q`.
127        """
128        if x.shape != (self.n_samples, self.n_ancestries):
129            raise ValueError(
130                f"Q must have shape ({self.n_samples}, {self.n_ancestries}); got {x.shape}."
131            )
132        self.__Q = x
133    
134    @property
135    def P(self) -> np.ndarray:
136        """
137        Retrieve `P`.
138
139        Returns:
140            **array of shape (n_snps, n_ancestries):** 
141                A 2D array containing per-ancestry SNP frequencies. Each row corresponds to a SNP,
142                and each column corresponds to an ancestry.
143        """
144        return self.__P
145
146    @P.setter
147    def P(self, x: np.ndarray):
148        """
149        Update `P`.
150        """
151        if x.shape[1] != self.n_ancestries:
152            raise ValueError(
153                f"P must have {self.n_ancestries} columns (one per ancestry); got shape {x.shape}."
154            )
155        self.__P = x
156        self._sanity_check()
157    
158    @property
159    def F(self) -> np.ndarray:
160        """
161        Alias for `P`.
162
163        Returns:
164            **array of shape (n_snps, n_ancestries):** 
165                A 2D array containing per-ancestry SNP frequencies. Each row corresponds to a SNP,
166                and each column corresponds to an ancestry.
167        """
168        return self.P
169
170    @F.setter
171    def F(self, x: np.ndarray):
172        """
173        Update `F`.
174        """
175        if x.shape[1] != self.n_ancestries:
176            raise ValueError(
177                f"F must have {self.n_ancestries} columns (one per ancestry); got shape {x.shape}."
178            )
179        self.__P = x
180    
181    @property
182    def samples(self) -> Optional[np.ndarray]:
183        """
184        Retrieve `samples`.
185
186        Returns:
187            **array of shape (n_samples,):** 
188                An array containing unique identifiers for each sample. If None, sample 
189                identifiers are assigned as integers from `0` to `n_samples - 1`.
190        """
191        return self.__samples
192        
193    @samples.setter
194    def samples(self, x: Sequence):
195        """
196        Update `samples`.
197        """
198        x = list(x)
199        if len(x) != self.n_samples:
200            raise ValueError(
201                f"samples must have length {self.n_samples}; got length {len(x)}."
202            )
203        self.__samples = x
204
205    @property
206    def snps(self) -> Optional[np.ndarray]:
207        """
208        Retrieve `snps`.
209
210        Returns:
211            **array of shape (n_snps,):** 
212                An array containing identifiers for each SNP. If None, SNPs are assigned as integers 
213                from `0` to `n_snps - 1`.
214        """
215        return self.__snps
216
217    @snps.setter
218    def snps(self, x: Sequence):
219        """
220        Update `snps`.
221        """
222        x = list(x)
223        if len(x) != self.n_snps:
224            raise ValueError(
225                f"snps must have length {self.n_snps}; got length {len(x)}."
226            )
227        self.__snps = np.asarray(x)
228
229    @property
230    def ancestries(self) -> Optional[np.ndarray]:
231        """
232        Retrieve `ancestries`.
233
234        Returns:
235            **array of shape (n_samples,):** 
236                An array containing ancestry labels for each sample.
237        """
238        return self.__ancestries
239    
240    @ancestries.setter
241    def ancestries(self, x: Sequence):
242        """
243        Update `ancestries`.
244        """
245        x = list(x)
246        num_x = len(x)
247        num_unique_x = len(np.unique(x))
248
249        if num_x != self.n_samples:
250            raise ValueError(
251                f"ancestries must have length {self.n_samples}; got length {num_x}."
252            )
253        if num_unique_x > self.n_ancestries:
254            raise ValueError(
255                f"Number of unique ancestry labels must be less than or equal to {self.n_ancestries}; got {num_unique_x} unique labels."
256            )
257        self.__ancestries = np.asarray(x)
258    
259    @property
260    def n_samples(self) -> int:
261        """
262        Retrieve `n_samples`.
263
264        Returns:
265            **int:** The total number of samples.
266        """
267        return self.__Q.shape[0]
268
269    @property
270    def n_snps(self) -> int:
271        """
272        Retrieve `n_snps`.
273
274        Returns:
275            **int:** The total number of SNPs.
276        """
277        return 0 if self.__P is None else self.__P.shape[0]
278
279    @property
280    def n_ancestries(self) -> int:
281        """
282        Retrieve `n_ancestries`.
283
284        Returns:
285            **int:** The total number of unique ancestries.
286        """
287        return self.__Q.shape[1]
288
289    def copy(self) -> 'GlobalAncestryObject':
290        """
291        Create and return a copy of `self`.
292
293        Returns:
294            **GlobalAncestryObject:** A new instance of the current object.
295        """
296        return copy.copy(self)
297
298    def keys(self) -> List[str]:
299        """
300        Retrieve a list of public attribute names for `self`.
301
302        Returns:
303            **list of str:** 
304                A list of attribute names, with internal name-mangling removed, 
305                for easier reference to public attributes in the instance.
306        """
307        return [attr.replace('_GlobalAncestryObject__', '').replace('_AncestryObject__', '') for attr in vars(self)]
308
309    def _sanity_check(self) -> None:
310        """
311        Perform sanity checks to ensure that matrix dimensions are consistent with expected sizes.
312        
313        Raises:
314            **ValueError:** If any of the matrix dimensions do not match the expected sizes.
315        """       
316        # Check that the Q matrix has the correct shape
317        if self.__Q.shape != (self.n_samples, self.n_ancestries):
318            raise ValueError(
319                f"Q must have shape ({self.n_samples}, {self.n_ancestries}); got {self.__Q.shape}."
320            )
321
322        # Check that the P matrix has the correct shape (if provided)
323        if self.__P is not None:
324            if self.__P.shape != (self.n_snps, self.n_ancestries):
325                raise ValueError(
326                    f"P must have shape ({self.n_snps}, {self.n_ancestries}); got {self.__P.shape}."
327                )
328
329        # Check that samples length matches n_samples
330        if self.samples is not None:
331            if len(self.__samples) != self.n_samples:
332                raise ValueError(
333                    f"samples must have length {self.n_samples}; got length {len(self.__samples)}."
334                )
335
336        # Check that snps length matches n_snps
337        if self.snps is not None:
338            if len(self.__snps) != self.n_snps:
339                raise ValueError(
340                    f"snps must have length {self.n_snps}; got length {len(self.__snps)}."
341                )
342
343        # Check that ancestries length matches n_samples
344        if self.ancestries is not None:
345            if len(self.__ancestries) != self.n_samples:
346                raise ValueError(
347                    f"ancestries must have length {self.n_samples}; got length {len(self.__ancestries)}."
348                )
349
350            # Check number of unique ancestry labels
351            num_unique_ancestries = len(np.unique(self.__ancestries))
352            if num_unique_ancestries > self.n_ancestries:
353                raise ValueError(
354                    f"Number of unique ancestry labels must be less than or equal to {self.n_ancestries}; got {num_unique_ancestries} unique labels."
355                )
356
357    def save(self, file: Union[str, Path]) -> None:
358        """
359        Save the data stored in `self` to a specified file or set of files.
360
361        The format of the saved file(s) is determined by the file extension provided in the `file` 
362        argument. If the extension is `.pkl`, the object is serialized as a pickle file. Otherwise, 
363        the file is treated as a prefix for saving ADMIXTURE files.
364
365        **Supported formats:**
366
367        - `.pkl`: Pickle format for saving `self` in serialized form.
368        - Any other extension or no extension: Treated as a prefix for ADMIXTURE files.
369
370        Args:
371            file (str or pathlib.Path): 
372                Path to the file where the data will be saved. If the extension is `.pkl`, the object
373                is serialized. Otherwise, it is treated as a prefix for ADMIXTURE files.
374        """
375        path = Path(file)
376        suffix = path.suffix.lower()
377
378        if suffix == '.pkl':
379            self.save_pickle(path)
380        else:
381            self.save_admixture(path)
382
383    def save_admixture(self, file_prefix: Union[str, Path]) -> None:
384        """
385        Save the data stored in `self` into multiple ADMIXTURE files.
386        If the file already exists, it will be overwritten.
387
388        **Output files:**
389
390        - `<file_prefix>.K.Q`: Q matrix file. The file uses space (' ') as the delimiter.
391        - `<file_prefix>.K.P`: P matrix file. The file uses space (' ') as the delimiter.
392        - `<file_prefix>.sample_ids.txt`: Sample IDs file (if sample IDs are available).
393        - `<file_prefix>.snp_ids.txt`: SNP IDs file (if SNP IDs are available).
394        - `<file_prefix>.map`: Ancestry file (if ancestries information is available).
395        
396        Args:
397            file_prefix (str or pathlib.Path): 
398                The base prefix for output file names, including directory path but excluding file extensions. 
399                The prefix is used to generate specific file names for each output, with file-specific 
400                suffixes appended as described above (e.g., `file_prefix.n_ancestries.Q` for the Q matrix file).
401        """
402        from snputils.ancestry.io.wide.write.admixture import AdmixtureWriter
403
404        AdmixtureWriter(self, file_prefix).write()
405
406    def save_pickle(self, file: Union[str, Path]) -> None:
407        """
408        Save `self` in serialized form to a `.pkl` file.
409        If the file already exists, it will be overwritten.
410
411        Args:
412            file (str or pathlib.Path): 
413                Path to the file where the data will be saved. It should end with `.pkl`. 
414                If the provided path does not have this extension, it will be appended.
415        """
416        import pickle
417        with open(file, 'wb') as file:
418            pickle.dump(self, file)

A class for Global Ancestry Inference (GAI) data.

GlobalAncestryObject( Q: numpy.ndarray, P: numpy.ndarray | None = None, samples: Sequence | None = None, snps: Sequence | None = None, ancestries: Sequence | None = None)
14    def __init__(
15        self,
16        Q: np.ndarray,
17        P: Optional[np.ndarray] = None,
18        samples: Optional[Sequence] = None,
19        snps: Optional[Sequence] = None,
20        ancestries: Optional[Sequence] = None
21    ) -> None:
22        """
23        Args:
24            Q (array of shape (n_samples, n_ancestries)):
25                A 2D array containing per-sample ancestry proportions. Each row corresponds to a sample,
26                and each column corresponds to an ancestry.
27            P (array of shape (n_snps, n_ancestries)):
28                A 2D array containing per-ancestry SNP frequencies. Each row corresponds to a SNP,
29                and each column corresponds to an ancestry.
30            samples (sequence of length n_samples, optional):
31                A sequence containing unique identifiers for each sample. If None, sample identifiers 
32                are assigned as integers from `0` to `n_samples - 1`.
33            snps (sequence of length n_snps, optional):
34                A sequence containing identifiers for each SNP. If None, SNPs are assigned as integers 
35                from `0` to `n_snps - 1`.
36            ancestries (sequence of length n_samples, optional):
37                A sequence containing ancestry labels for each sample.
38        """
39        # Determine dimensions
40        n_samples, n_ancestries_Q = Q.shape
41        if P is not None:
42            n_snps, n_ancestries_P = P.shape
43            if n_ancestries_Q != n_ancestries_P:
44                raise ValueError(
45                    f"The number of ancestries in Q ({n_ancestries_Q}) and P ({n_ancestries_P}) must be the same."
46                )
47
48        n_ancestries = n_ancestries_Q
49
50        # Assign default sample identifiers if none provided
51        if samples is None:
52            samples = list(range(n_samples))
53        else:
54            samples = list(samples)
55            if len(samples) != n_samples:
56                raise ValueError(
57                    f"Length of samples ({len(samples)}) does not match number of samples ({n_samples})."
58                )
59
60        # Assign default SNP identifiers if none provided
61        if P is None:
62            snps = None
63        else:
64            if snps is None:
65                snps = list(range(n_snps))
66            else:
67                snps = list(snps)
68                if len(snps) != n_snps:
69                    raise ValueError(
70                        f"Length of snps ({len(snps)}) does not match number of SNPs ({n_snps})."
71                    )
72
73        if ancestries is not None:
74            if len(ancestries) != n_samples:
75                raise ValueError(
76                    f"Length of ancestries ({len(ancestries)}) does not match number of samples ({n_samples})."
77                )
78
79        super().__init__(n_samples, n_ancestries)
80
81        # Store attributes
82        self.__Q = Q
83        self.__P = P
84        self.__samples = np.asarray(samples)
85        self.__snps = np.asarray(snps) if snps is not None else None
86        self.__ancestries = np.asarray(ancestries) if ancestries is not None else None
87
88        # Perform sanity checks
89        self._sanity_check()
Arguments:
  • Q (array of shape (n_samples, n_ancestries)): A 2D array containing per-sample ancestry proportions. Each row corresponds to a sample, and each column corresponds to an ancestry.
  • P (array of shape (n_snps, n_ancestries), optional): A 2D array containing per-ancestry SNP frequencies. Each row corresponds to a SNP, and each column corresponds to an ancestry.
  • samples (sequence of length n_samples, optional): A sequence containing unique identifiers for each sample. If None, sample identifiers are assigned as integers from 0 to n_samples - 1.
  • snps (sequence of length n_snps, optional): A sequence containing identifiers for each SNP. If None, SNPs are assigned as integers from 0 to n_snps - 1.
  • ancestries (sequence of length n_samples, optional): A sequence containing ancestry labels for each sample.
Q: numpy.ndarray
111    @property
112    def Q(self) -> np.ndarray:
113        """
114        Retrieve `Q`.
115
116        Returns:
117            **array of shape (n_samples, n_ancestries):** 
118                A 2D array containing per-sample ancestry proportions. Each row corresponds to a sample,
119                and each column corresponds to an ancestry.
120        """
121        return self.__Q

Retrieve Q.

Returns:

array of shape (n_samples, n_ancestries): A 2D array containing per-sample ancestry proportions. Each row corresponds to a sample, and each column corresponds to an ancestry.

P: numpy.ndarray
134    @property
135    def P(self) -> np.ndarray:
136        """
137        Retrieve `P`.
138
139        Returns:
140            **array of shape (n_snps, n_ancestries):** 
141                A 2D array containing per-ancestry SNP frequencies. Each row corresponds to a SNP,
142                and each column corresponds to an ancestry.
143        """
144        return self.__P

Retrieve P.

Returns:

array of shape (n_snps, n_ancestries): A 2D array containing per-ancestry SNP frequencies. Each row corresponds to a SNP, and each column corresponds to an ancestry.

F: numpy.ndarray
158    @property
159    def F(self) -> np.ndarray:
160        """
161        Alias for `P`.
162
163        Returns:
164            **array of shape (n_snps, n_ancestries):** 
165                A 2D array containing per-ancestry SNP frequencies. Each row corresponds to a SNP,
166                and each column corresponds to an ancestry.
167        """
168        return self.P

Alias for P.

Returns:

array of shape (n_snps, n_ancestries): A 2D array containing per-ancestry SNP frequencies. Each row corresponds to a SNP, and each column corresponds to an ancestry.

samples: numpy.ndarray | None
181    @property
182    def samples(self) -> Optional[np.ndarray]:
183        """
184        Retrieve `samples`.
185
186        Returns:
187            **array of shape (n_samples,):** 
188                An array containing unique identifiers for each sample. If None, sample 
189                identifiers are assigned as integers from `0` to `n_samples - 1`.
190        """
191        return self.__samples

Retrieve samples.

Returns:

array of shape (n_samples,): An array containing unique identifiers for each sample. If None, sample identifiers are assigned as integers from 0 to n_samples - 1.

snps: numpy.ndarray | None
205    @property
206    def snps(self) -> Optional[np.ndarray]:
207        """
208        Retrieve `snps`.
209
210        Returns:
211            **array of shape (n_snps,):** 
212                An array containing identifiers for each SNP. If None, SNPs are assigned as integers 
213                from `0` to `n_snps - 1`.
214        """
215        return self.__snps

Retrieve snps.

Returns:

array of shape (n_snps,): An array containing identifiers for each SNP. If None, SNPs are assigned as integers from 0 to n_snps - 1.

ancestries: numpy.ndarray | None
229    @property
230    def ancestries(self) -> Optional[np.ndarray]:
231        """
232        Retrieve `ancestries`.
233
234        Returns:
235            **array of shape (n_samples,):** 
236                An array containing ancestry labels for each sample.
237        """
238        return self.__ancestries

Retrieve ancestries.

Returns:

array of shape (n_samples,): An array containing ancestry labels for each sample.

n_samples: int
259    @property
260    def n_samples(self) -> int:
261        """
262        Retrieve `n_samples`.
263
264        Returns:
265            **int:** The total number of samples.
266        """
267        return self.__Q.shape[0]

Retrieve n_samples.

Returns:

int: The total number of samples.

n_snps: int
269    @property
270    def n_snps(self) -> int:
271        """
272        Retrieve `n_snps`.
273
274        Returns:
275            **int:** The total number of SNPs.
276        """
277        return 0 if self.__P is None else self.__P.shape[0]

Retrieve n_snps.

Returns:

int: The total number of SNPs.

n_ancestries: int
279    @property
280    def n_ancestries(self) -> int:
281        """
282        Retrieve `n_ancestries`.
283
284        Returns:
285            **int:** The total number of unique ancestries.
286        """
287        return self.__Q.shape[1]

Retrieve n_ancestries.

Returns:

int: The total number of unique ancestries.

def copy(self) -> GlobalAncestryObject:
289    def copy(self) -> 'GlobalAncestryObject':
290        """
291        Create and return a copy of `self`.
292
293        Returns:
294            **GlobalAncestryObject:** A new instance of the current object.
295        """
296        return copy.copy(self)

Create and return a copy of self.

Returns:

GlobalAncestryObject: A new instance of the current object.

def keys(self) -> List[str]:
298    def keys(self) -> List[str]:
299        """
300        Retrieve a list of public attribute names for `self`.
301
302        Returns:
303            **list of str:** 
304                A list of attribute names, with internal name-mangling removed, 
305                for easier reference to public attributes in the instance.
306        """
307        return [attr.replace('_GlobalAncestryObject__', '').replace('_AncestryObject__', '') for attr in vars(self)]

Retrieve a list of public attribute names for self.

Returns:

list of str: A list of attribute names, with internal name-mangling removed, for easier reference to public attributes in the instance.

def save(self, file: str | pathlib.Path) -> None:
357    def save(self, file: Union[str, Path]) -> None:
358        """
359        Save the data stored in `self` to a specified file or set of files.
360
361        The format of the saved file(s) is determined by the file extension provided in the `file` 
362        argument. If the extension is `.pkl`, the object is serialized as a pickle file. Otherwise, 
363        the file is treated as a prefix for saving ADMIXTURE files.
364
365        **Supported formats:**
366
367        - `.pkl`: Pickle format for saving `self` in serialized form.
368        - Any other extension or no extension: Treated as a prefix for ADMIXTURE files.
369
370        Args:
371            file (str or pathlib.Path): 
372                Path to the file where the data will be saved. If the extension is `.pkl`, the object
373                is serialized. Otherwise, it is treated as a prefix for ADMIXTURE files.
374        """
375        path = Path(file)
376        suffix = path.suffix.lower()
377
378        if suffix == '.pkl':
379            self.save_pickle(path)
380        else:
381            self.save_admixture(path)

Save the data stored in self to a specified file or set of files.

The format of the saved file(s) is determined by the file extension provided in the file argument. If the extension is .pkl, the object is serialized as a pickle file. Otherwise, the file is treated as a prefix for saving ADMIXTURE files.

Supported formats:

  • .pkl: Pickle format for saving self in serialized form.
  • Any other extension or no extension: Treated as a prefix for ADMIXTURE files.
Arguments:
  • file (str or pathlib.Path): Path to the file where the data will be saved. If the extension is .pkl, the object is serialized. Otherwise, it is treated as a prefix for ADMIXTURE files.
def save_admixture(self, file_prefix: str | pathlib.Path) -> None:
383    def save_admixture(self, file_prefix: Union[str, Path]) -> None:
384        """
385        Save the data stored in `self` into multiple ADMIXTURE files.
386        If the file already exists, it will be overwritten.
387
388        **Output files:**
389
390        - `<file_prefix>.K.Q`: Q matrix file. The file uses space (' ') as the delimiter.
391        - `<file_prefix>.K.P`: P matrix file. The file uses space (' ') as the delimiter.
392        - `<file_prefix>.sample_ids.txt`: Sample IDs file (if sample IDs are available).
393        - `<file_prefix>.snp_ids.txt`: SNP IDs file (if SNP IDs are available).
394        - `<file_prefix>.map`: Ancestry file (if ancestries information is available).
395        
396        Args:
397            file_prefix (str or pathlib.Path): 
398                The base prefix for output file names, including directory path but excluding file extensions. 
399                The prefix is used to generate specific file names for each output, with file-specific 
400                suffixes appended as described above (e.g., `file_prefix.n_ancestries.Q` for the Q matrix file).
401        """
402        from snputils.ancestry.io.wide.write.admixture import AdmixtureWriter
403
404        AdmixtureWriter(self, file_prefix).write()

Save the data stored in self into multiple ADMIXTURE files. If the file already exists, it will be overwritten.

Output files:

  • <file_prefix>.K.Q: Q matrix file. The file uses space (' ') as the delimiter.
  • <file_prefix>.K.P: P matrix file. The file uses space (' ') as the delimiter.
  • <file_prefix>.sample_ids.txt: Sample IDs file (if sample IDs are available).
  • <file_prefix>.snp_ids.txt: SNP IDs file (if SNP IDs are available).
  • <file_prefix>.map: Ancestry file (if ancestries information is available).
Arguments:
  • file_prefix (str or pathlib.Path): The base prefix for output file names, including directory path but excluding file extensions. The prefix is used to generate specific file names for each output, with file-specific suffixes appended as described above (e.g., file_prefix.n_ancestries.Q for the Q matrix file).
def save_pickle(self, file: str | pathlib.Path) -> None:
406    def save_pickle(self, file: Union[str, Path]) -> None:
407        """
408        Save `self` in serialized form to a `.pkl` file.
409        If the file already exists, it will be overwritten.
410
411        Args:
412            file (str or pathlib.Path): 
413                Path to the file where the data will be saved. It should end with `.pkl`. 
414                If the provided path does not have this extension, it will be appended.
415        """
416        import pickle
417        with open(file, 'wb') as file:
418            pickle.dump(self, file)

Save self in serialized form to a .pkl file. If the file already exists, it will be overwritten.

Arguments:
  • file (str or pathlib.Path): Path to the file where the data will be saved. It should end with .pkl. If the provided path does not have this extension, it will be appended.
class MSPReader(snputils.ancestry.io.local.read.base.LAIBaseReader):
 30class MSPReader(LAIBaseReader):
 31    """
 32    A reader class for parsing Local Ancestry Inference (LAI) data from an `.msp` or `msp.tsv` file
 33    and constructing a `snputils.ancestry.genobj.LocalAncestryObject`.
 34    """
 35    def __init__(self, file: Union[str, Path]) -> None:
 36        """
 37        Args:
 38            file (str or pathlib.Path): 
 39                Path to the file to be read. It should end with `.msp` or `.msp.tsv`.
 40        """
 41        self.__file = Path(file)
 42
 43    @property
 44    def file(self) -> Path:
 45        """
 46        Retrieve `file`.
 47
 48        Returns:
 49            **pathlib.Path:** 
 50                Path to the file to be read. It should end with `.msp` or `.msp.tsv`.
 51        """
 52        return self.__file
 53
 54    def _get_samples(self, msp_df: pd.DataFrame, first_lai_col_indx: int) -> List[str]:
 55        """
 56        Extract unique sample identifiers from the pandas DataFrame.
 57
 58        Args:
 59            msp_df (pd.DataFrame): 
 60                The DataFrame representing the `.msp` data, including LAI columns.
 61            first_lai_col_indx (int): 
 62                Index of the first column containing LAI data.
 63
 64        Returns:
 65            **list:** List of unique sample identifiers.
 66        """
 67        # Get all columns starting from the first LAI data column
 68        query_samples_dub = msp_df.columns[first_lai_col_indx:]
 69
 70        # Select only one of the maternal/paternal samples by taking every second sample
 71        single_ind_idx = np.arange(0, len(query_samples_dub), 2)
 72        query_samples_sing = query_samples_dub[single_ind_idx]
 73
 74        # Remove the suffix from sample names to get clean identifiers
 75        query_samples = [qs[:-2] for qs in query_samples_sing]
 76
 77        return query_samples
 78
 79    def _get_samples_from_haplotypes(self, haplotypes: List[str]) -> List[str]:
 80        query_samples_dub = np.asarray(haplotypes, dtype=object)
 81        single_ind_idx = np.arange(0, len(query_samples_dub), 2)
 82        query_samples_sing = query_samples_dub[single_ind_idx]
 83        return [str(qs)[:-2] for qs in query_samples_sing]
 84
 85    def _parse_header_and_comment(self) -> tuple[Optional[str], List[str]]:
 86        with open(self.file) as f:
 87            first_line = f.readline()
 88            second_line = f.readline()
 89
 90        first_line_ = [h.strip() for h in first_line.split("\t")]
 91        second_line_ = [h.strip() for h in second_line.split("\t")]
 92
 93        if "#chm" in first_line_:
 94            return None, first_line_
 95        if "#chm" in second_line_:
 96            return first_line, second_line_
 97
 98        raise ValueError(
 99            f"Header not found. Expected '#chm' in the first two lines. "
100            f"First line: {first_line.strip()} | Second line: {second_line.strip()}"
101        )
102
103    def _get_first_lai_col_indx(self, header: List[str]) -> int:
104        column_counter = 1
105        if "spos" in header and "epos" in header:
106            column_counter += 2
107        if "sgpos" in header and "egpos" in header:
108            column_counter += 2
109        if "n snps" in header:
110            column_counter += 1
111        return column_counter
112
113    def read_metadata(self) -> MSPMetadata:
114        comment, header = self._parse_header_and_comment()
115
116        if len(header) != len(set(header)):
117            raise ValueError("Duplicate columns detected in the header.")
118
119        first_lai_col_indx = self._get_first_lai_col_indx(header)
120        haplotypes = header[first_lai_col_indx:]
121        samples = self._get_samples_from_haplotypes(haplotypes)
122        ancestry_map = self._get_ancestry_map_from_comment(comment) if comment is not None else None
123
124        return MSPMetadata(
125            header=header,
126            comment=comment,
127            first_lai_col_indx=first_lai_col_indx,
128            haplotypes=haplotypes,
129            samples=samples,
130            ancestry_map=ancestry_map,
131            has_physical_pos=("spos" in header and "epos" in header),
132            has_centimorgan_pos=("sgpos" in header and "egpos" in header),
133            has_window_sizes=("n snps" in header),
134        )
135
136    def iter_windows(
137        self,
138        chunk_size: int = 1024,
139        sample_indices: Optional[np.ndarray] = None,
140    ) -> Iterator[Dict[str, np.ndarray]]:
141        metadata = self.read_metadata()
142
143        if chunk_size < 1:
144            raise ValueError("chunk_size must be >= 1.")
145
146        header = metadata.header
147        first_lai_col_indx = metadata.first_lai_col_indx
148        column_index = {name: i for i, name in enumerate(header)}
149        chrom_col_idx = column_index["#chm"]
150
151        spos_col_idx: Optional[int] = None
152        epos_col_idx: Optional[int] = None
153        if metadata.has_physical_pos:
154            spos_col_idx = column_index["spos"]
155            epos_col_idx = column_index["epos"]
156
157        if sample_indices is None:
158            hap_col_indices = list(range(first_lai_col_indx, len(header)))
159        else:
160            sample_indices = np.asarray(sample_indices, dtype=np.int64)
161            if sample_indices.size == 0:
162                raise ValueError("sample_indices cannot be empty.")
163            if np.any(sample_indices < 0) or np.any(sample_indices >= len(metadata.samples)):
164                raise ValueError("sample_indices contain out-of-bounds sample indexes.")
165
166            hap_indices = np.empty(sample_indices.size * 2, dtype=np.int64)
167            hap_indices[0::2] = 2 * sample_indices
168            hap_indices[1::2] = 2 * sample_indices + 1
169            hap_col_indices = (first_lai_col_indx + hap_indices).astype(np.int64).tolist()
170
171        n_selected_haps = len(hap_col_indices)
172        n_total_haps = len(metadata.haplotypes)
173        all_haps_selected = (
174            n_selected_haps == n_total_haps
175            and n_selected_haps > 0
176            and hap_col_indices[0] == first_lai_col_indx
177            and hap_col_indices[-1] == (len(header) - 1)
178        )
179
180        # Pre-compute relative indices for the sample-subset path so the
181        # inner loop can use np.fromstring (C-level) + numpy fancy indexing
182        # instead of a Python for-loop over potentially millions of columns.
183        if not all_haps_selected:
184            _relative_hap_idx = np.array(hap_col_indices, dtype=np.intp) - first_lai_col_indx
185        else:
186            _relative_hap_idx = None
187
188        row_in_chunk = 0
189        window_start = 0
190        chromosomes_chunk = np.empty(int(chunk_size), dtype=object)
191        lai_chunk = np.empty((int(chunk_size), n_selected_haps), dtype=np.uint8)
192        physical_pos_chunk = (
193            np.empty((int(chunk_size), 2), dtype=np.int64)
194            if metadata.has_physical_pos
195            else None
196        )
197
198        with open(self.file, "r", encoding="utf-8") as handle:
199            for line_no, raw_line in enumerate(handle, start=1):
200                if not raw_line:
201                    continue
202                if raw_line.startswith("#"):
203                    continue
204
205                line = raw_line.rstrip("\n")
206                if not line:
207                    continue
208
209                # Both paths split only at the metadata/haplotype boundary,
210                # then use np.fromstring (C parser) for the haplotype tail.
211                fields = line.split("\t", first_lai_col_indx)
212                if len(fields) != (first_lai_col_indx + 1):
213                    raise ValueError(
214                        f"Malformed MSP row at line {line_no}: expected {first_lai_col_indx + 1} "
215                        f"prefix segments when parsing haplotypes."
216                    )
217
218                chromosomes_chunk[row_in_chunk] = fields[chrom_col_idx]
219                if physical_pos_chunk is not None and spos_col_idx is not None and epos_col_idx is not None:
220                    physical_pos_chunk[row_in_chunk, 0] = int(fields[spos_col_idx])
221                    physical_pos_chunk[row_in_chunk, 1] = int(fields[epos_col_idx])
222
223                lai_row = np.fromstring(fields[first_lai_col_indx], sep="\t", dtype=np.uint8)
224
225                if all_haps_selected:
226                    if lai_row.size != n_selected_haps:
227                        raise ValueError(
228                            f"Malformed MSP haplotype row at line {line_no}: expected "
229                            f"{n_selected_haps} haplotype values, got {lai_row.size}."
230                        )
231                    lai_chunk[row_in_chunk, :] = lai_row
232                else:
233                    if lai_row.size < n_total_haps:
234                        raise ValueError(
235                            f"Malformed MSP haplotype row at line {line_no}: expected at least "
236                            f"{n_total_haps} haplotype values, got {lai_row.size}."
237                        )
238                    lai_chunk[row_in_chunk, :] = lai_row[_relative_hap_idx]
239
240                row_in_chunk += 1
241                if row_in_chunk == chunk_size:
242                    window_indexes = np.arange(window_start, window_start + row_in_chunk, dtype=np.int64)
243                    yield {
244                        "window_indexes": window_indexes,
245                        "chromosomes": chromosomes_chunk,
246                        "physical_pos": physical_pos_chunk,
247                        "lai": lai_chunk,
248                    }
249
250                    window_start += row_in_chunk
251                    row_in_chunk = 0
252                    chromosomes_chunk = np.empty(int(chunk_size), dtype=object)
253                    lai_chunk = np.empty((int(chunk_size), n_selected_haps), dtype=np.uint8)
254                    if metadata.has_physical_pos:
255                        physical_pos_chunk = np.empty((int(chunk_size), 2), dtype=np.int64)
256                    else:
257                        physical_pos_chunk = None
258
259        if row_in_chunk > 0:
260            window_indexes = np.arange(window_start, window_start + row_in_chunk, dtype=np.int64)
261            yield {
262                "window_indexes": window_indexes,
263                "chromosomes": chromosomes_chunk[:row_in_chunk],
264                "physical_pos": (
265                    physical_pos_chunk[:row_in_chunk]
266                    if physical_pos_chunk is not None
267                    else None
268                ),
269                "lai": lai_chunk[:row_in_chunk],
270            }
271
272    def _get_ancestry_map_from_comment(self, comment: str) -> Dict[str, str]:
273        """
274        Construct an ancestry map from the comment line of the `.msp` file.
275
276        This method parses the comment string to create a mapping of ancestry numerical identifiers 
277        to their corresponding ancestry names (e.g., '0': 'African').
278
279        Args:
280            comment (str): 
281                The comment line containing ancestry mapping information.
282
283        Returns:
284            dict: A dictionary mapping ancestry codes (as strings) to ancestry names.
285        """
286        comment = comment.strip()
287
288        # Remove everything before the colon, if present
289        if ':' in comment:
290            comment = comment.split(':', 1)[1].strip()
291
292        ancestry_map: Dict[str, str] = {}
293
294        # Split on tabs, spaces, commas, semicolons or any combination of them
295        tokens = [tok.strip() for tok in re.split(r'[,\t; ]+', comment) if tok]
296
297        for tok in tokens:
298            if '=' not in tok:
299                continue  # Skip invalid pieces
300
301            left, right = (p.strip() for p in tok.split('=', 1))
302
303            # Detect whether format is "Pop=0" or "0=Pop"
304            if left.isdigit() and not right.isdigit():
305                ancestry_map[left] = right       # 0=Africa
306            elif right.isdigit() and not left.isdigit():
307                ancestry_map[right] = left       # Africa=0
308            else:
309                # Fallback (if both sides are digits or both are pops, keep left as code)
310                ancestry_map[left] = right
311
312        return ancestry_map
313
314    def _replace_nan_with_none(self, array: Optional[np.ndarray]) -> Optional[np.ndarray]:
315        """
316        Replace arrays that are fully NaN with `None`.
317
318        Args:
319            array (np.ndarray): Array to check.
320
321        Returns:
322            Optional[np.ndarray]: Returns `None` if the array is fully NaN, otherwise returns the original array.
323        """
324        if array is not None:
325            if array.size == 0:  # Check if the array is empty
326                return None
327            if np.issubdtype(array.dtype, np.number):  # Check for numeric types
328                if np.isnan(array).all():  # Fully NaN numeric array
329                    return None
330            elif array.dtype == np.object_ or np.issubdtype(array.dtype, np.str_):  # String or object types
331                if np.all((array == '') | (array == None)):  # Empty or None strings
332                    return None
333        return array
334
335    def read(self) -> 'LocalAncestryObject':
336        """
337        Read data from the provided `.msp` or `msp.tsv` `file` and construct a 
338        `snputils.ancestry.genobj.LocalAncestryObject`.
339
340        **Expected MSP content:**
341
342        The `.msp` file should contain local ancestry assignments for each haplotype across genomic windows.
343        Each row should correspond to a genomic window and include the following columns:
344
345        - `#chm`: Chromosome numbers corresponding to each genomic window.
346        - `spos`: Start physical position for each window.
347        - `epos`: End physical position for each window.
348        - `sgpos`: Start centimorgan position for each window.
349        - `egpos`: End centimorgan position for each window.
350        - `n snps`: Number of SNPs in each genomic window.
351        - `SampleID.0`: Local ancestry for the first haplotype of the sample for each window.
352        - `SampleID.1`: Local ancestry for the second haplotype of the sample for each window.
353
354        Returns:
355            **LocalAncestryObject:**
356                A LocalAncestryObject instance.
357        """
358        log.info(f"Reading '{self.file}'...")
359        metadata = self.read_metadata()
360        comment = metadata.comment
361        header = metadata.header
362
363        # Read the main data into a DataFrame, skipping comment lines
364        msp_df = pd.read_csv(self.file, sep="\t", comment="#", names=header)
365
366        # Extract chromosomes data
367        chromosomes = msp_df['#chm'].astype(str).to_numpy()
368
369        # Extract physical positions (if available)
370        column_counter = metadata.first_lai_col_indx
371        if metadata.has_physical_pos:
372            physical_pos = msp_df[['spos', 'epos']].to_numpy()
373        else:
374            physical_pos = None
375            log.warning("Physical positions ('spos' and 'epos') not found.")
376        
377        # Extract centimorgan positions (if available)
378        if metadata.has_centimorgan_pos:
379            centimorgan_pos = msp_df[['sgpos', 'egpos']].to_numpy()
380        else:
381            centimorgan_pos = None
382            log.warning("Genetic (centimorgan) positions ('sgpos' and 'egpos') not found.")
383
384        # Extract window sizes (if available)
385        if metadata.has_window_sizes:
386            window_sizes = msp_df['n snps'].to_numpy()
387        else:
388            window_sizes = None
389            log.warning("Window sizes ('n snps') not found.")
390        
391        # Extract LAI data (haplotype-level)
392        lai = msp_df.iloc[:, column_counter:].to_numpy(dtype=np.uint8, copy=False)
393
394        # Extract haplotype identifiers
395        haplotypes = metadata.haplotypes
396
397        # Extract haplotype identifiers and sample identifiers
398        samples = metadata.samples
399        del msp_df
400        gc.collect()
401
402        # Validate the number of samples matches the LAI data dimensions
403        n_samples = len(samples)
404        if n_samples != int(lai.shape[1] / 2):
405            raise ValueError(
406                "Mismatch between the number of sample identifiers and the expected number of samples in the LAI array. "
407                f"Expected {int(lai.shape[1] / 2)} samples (derived from LAI data); found {n_samples}."
408            )
409        
410        # Count number of unique ancestries in the LAI data
411        n_ancestries = len(np.unique(lai))
412
413        # Parse ancestry map from the comment (if available)
414        ancestry_map = None
415        if comment is not None:
416            ancestry_map = metadata.ancestry_map
417            if len(ancestry_map) != n_ancestries:
418                warnings.warn(
419                    "Mismatch between the number of unique ancestries in the LAI data "
420                    f"({n_ancestries}) and the number of classes in the ancestry map "
421                    f"({len(ancestry_map)})."
422                )
423        else:
424            # Provide default ancestry mapping if no comment is provided
425            ancestry_map = None
426            warnings.warn(
427                "Ancestry map not found. It is recommended to provide an .msp file that contains the ancestry "
428                "map as a comment in the first line."
429            )
430
431        # Replace fully NaN attributes with None
432        window_sizes = self._replace_nan_with_none(window_sizes)
433        centimorgan_pos = self._replace_nan_with_none(centimorgan_pos)
434        chromosomes = self._replace_nan_with_none(chromosomes)
435        physical_pos = self._replace_nan_with_none(physical_pos)
436
437        return LocalAncestryObject(
438            haplotypes=haplotypes,
439            lai=lai,
440            samples=samples,
441            ancestry_map=ancestry_map,
442            window_sizes=window_sizes,
443            centimorgan_pos=centimorgan_pos,
444            chromosomes=chromosomes,
445            physical_pos=physical_pos
446        )

A reader class for parsing Local Ancestry Inference (LAI) data from an .msp or .msp.tsv file and constructing a snputils.ancestry.genobj.LocalAncestryObject.

MSPReader(file: str | pathlib.Path)
35    def __init__(self, file: Union[str, Path]) -> None:
36        """
37        Args:
38            file (str or pathlib.Path): 
39                Path to the file to be read. It should end with `.msp` or `.msp.tsv`.
40        """
41        self.__file = Path(file)
Arguments:
  • file (str or pathlib.Path): Path to the file to be read. It should end with .msp or .msp.tsv.
file: pathlib.Path
43    @property
44    def file(self) -> Path:
45        """
46        Retrieve `file`.
47
48        Returns:
49            **pathlib.Path:** 
50                Path to the file to be read. It should end with `.msp` or `.msp.tsv`.
51        """
52        return self.__file

Retrieve file.

Returns:

pathlib.Path: Path to the file to be read. It should end with .msp or .msp.tsv.

def read_metadata(self) -> snputils.ancestry.io.local.read.msp.MSPMetadata:
113    def read_metadata(self) -> MSPMetadata:
114        comment, header = self._parse_header_and_comment()
115
116        if len(header) != len(set(header)):
117            raise ValueError("Duplicate columns detected in the header.")
118
119        first_lai_col_indx = self._get_first_lai_col_indx(header)
120        haplotypes = header[first_lai_col_indx:]
121        samples = self._get_samples_from_haplotypes(haplotypes)
122        ancestry_map = self._get_ancestry_map_from_comment(comment) if comment is not None else None
123
124        return MSPMetadata(
125            header=header,
126            comment=comment,
127            first_lai_col_indx=first_lai_col_indx,
128            haplotypes=haplotypes,
129            samples=samples,
130            ancestry_map=ancestry_map,
131            has_physical_pos=("spos" in header and "epos" in header),
132            has_centimorgan_pos=("sgpos" in header and "egpos" in header),
133            has_window_sizes=("n snps" in header),
134        )
def iter_windows( self, chunk_size: int = 1024, sample_indices: numpy.ndarray | None = None) -> Iterator[Dict[str, numpy.ndarray]]:
136    def iter_windows(
137        self,
138        chunk_size: int = 1024,
139        sample_indices: Optional[np.ndarray] = None,
140    ) -> Iterator[Dict[str, np.ndarray]]:
141        metadata = self.read_metadata()
142
143        if chunk_size < 1:
144            raise ValueError("chunk_size must be >= 1.")
145
146        header = metadata.header
147        first_lai_col_indx = metadata.first_lai_col_indx
148        column_index = {name: i for i, name in enumerate(header)}
149        chrom_col_idx = column_index["#chm"]
150
151        spos_col_idx: Optional[int] = None
152        epos_col_idx: Optional[int] = None
153        if metadata.has_physical_pos:
154            spos_col_idx = column_index["spos"]
155            epos_col_idx = column_index["epos"]
156
157        if sample_indices is None:
158            hap_col_indices = list(range(first_lai_col_indx, len(header)))
159        else:
160            sample_indices = np.asarray(sample_indices, dtype=np.int64)
161            if sample_indices.size == 0:
162                raise ValueError("sample_indices cannot be empty.")
163            if np.any(sample_indices < 0) or np.any(sample_indices >= len(metadata.samples)):
164                raise ValueError("sample_indices contain out-of-bounds sample indexes.")
165
166            hap_indices = np.empty(sample_indices.size * 2, dtype=np.int64)
167            hap_indices[0::2] = 2 * sample_indices
168            hap_indices[1::2] = 2 * sample_indices + 1
169            hap_col_indices = (first_lai_col_indx + hap_indices).astype(np.int64).tolist()
170
171        n_selected_haps = len(hap_col_indices)
172        n_total_haps = len(metadata.haplotypes)
173        all_haps_selected = (
174            n_selected_haps == n_total_haps
175            and n_selected_haps > 0
176            and hap_col_indices[0] == first_lai_col_indx
177            and hap_col_indices[-1] == (len(header) - 1)
178        )
179
180        # Pre-compute relative indices for the sample-subset path so the
181        # inner loop can use np.fromstring (C-level) + numpy fancy indexing
182        # instead of a Python for-loop over potentially millions of columns.
183        if not all_haps_selected:
184            _relative_hap_idx = np.array(hap_col_indices, dtype=np.intp) - first_lai_col_indx
185        else:
186            _relative_hap_idx = None
187
188        row_in_chunk = 0
189        window_start = 0
190        chromosomes_chunk = np.empty(int(chunk_size), dtype=object)
191        lai_chunk = np.empty((int(chunk_size), n_selected_haps), dtype=np.uint8)
192        physical_pos_chunk = (
193            np.empty((int(chunk_size), 2), dtype=np.int64)
194            if metadata.has_physical_pos
195            else None
196        )
197
198        with open(self.file, "r", encoding="utf-8") as handle:
199            for line_no, raw_line in enumerate(handle, start=1):
200                if not raw_line:
201                    continue
202                if raw_line.startswith("#"):
203                    continue
204
205                line = raw_line.rstrip("\n")
206                if not line:
207                    continue
208
209                # Both paths split only at the metadata/haplotype boundary,
210                # then use np.fromstring (C parser) for the haplotype tail.
211                fields = line.split("\t", first_lai_col_indx)
212                if len(fields) != (first_lai_col_indx + 1):
213                    raise ValueError(
214                        f"Malformed MSP row at line {line_no}: expected {first_lai_col_indx + 1} "
215                        f"prefix segments when parsing haplotypes."
216                    )
217
218                chromosomes_chunk[row_in_chunk] = fields[chrom_col_idx]
219                if physical_pos_chunk is not None and spos_col_idx is not None and epos_col_idx is not None:
220                    physical_pos_chunk[row_in_chunk, 0] = int(fields[spos_col_idx])
221                    physical_pos_chunk[row_in_chunk, 1] = int(fields[epos_col_idx])
222
223                lai_row = np.fromstring(fields[first_lai_col_indx], sep="\t", dtype=np.uint8)
224
225                if all_haps_selected:
226                    if lai_row.size != n_selected_haps:
227                        raise ValueError(
228                            f"Malformed MSP haplotype row at line {line_no}: expected "
229                            f"{n_selected_haps} haplotype values, got {lai_row.size}."
230                        )
231                    lai_chunk[row_in_chunk, :] = lai_row
232                else:
233                    if lai_row.size < n_total_haps:
234                        raise ValueError(
235                            f"Malformed MSP haplotype row at line {line_no}: expected at least "
236                            f"{n_total_haps} haplotype values, got {lai_row.size}."
237                        )
238                    lai_chunk[row_in_chunk, :] = lai_row[_relative_hap_idx]
239
240                row_in_chunk += 1
241                if row_in_chunk == chunk_size:
242                    window_indexes = np.arange(window_start, window_start + row_in_chunk, dtype=np.int64)
243                    yield {
244                        "window_indexes": window_indexes,
245                        "chromosomes": chromosomes_chunk,
246                        "physical_pos": physical_pos_chunk,
247                        "lai": lai_chunk,
248                    }
249
250                    window_start += row_in_chunk
251                    row_in_chunk = 0
252                    chromosomes_chunk = np.empty(int(chunk_size), dtype=object)
253                    lai_chunk = np.empty((int(chunk_size), n_selected_haps), dtype=np.uint8)
254                    if metadata.has_physical_pos:
255                        physical_pos_chunk = np.empty((int(chunk_size), 2), dtype=np.int64)
256                    else:
257                        physical_pos_chunk = None
258
259        if row_in_chunk > 0:
260            window_indexes = np.arange(window_start, window_start + row_in_chunk, dtype=np.int64)
261            yield {
262                "window_indexes": window_indexes,
263                "chromosomes": chromosomes_chunk[:row_in_chunk],
264                "physical_pos": (
265                    physical_pos_chunk[:row_in_chunk]
266                    if physical_pos_chunk is not None
267                    else None
268                ),
269                "lai": lai_chunk[:row_in_chunk],
270            }
def read(self) -> LocalAncestryObject:
335    def read(self) -> 'LocalAncestryObject':
336        """
337        Read data from the provided `.msp` or `msp.tsv` `file` and construct a 
338        `snputils.ancestry.genobj.LocalAncestryObject`.
339
340        **Expected MSP content:**
341
342        The `.msp` file should contain local ancestry assignments for each haplotype across genomic windows.
343        Each row should correspond to a genomic window and include the following columns:
344
345        - `#chm`: Chromosome numbers corresponding to each genomic window.
346        - `spos`: Start physical position for each window.
347        - `epos`: End physical position for each window.
348        - `sgpos`: Start centimorgan position for each window.
349        - `egpos`: End centimorgan position for each window.
350        - `n snps`: Number of SNPs in each genomic window.
351        - `SampleID.0`: Local ancestry for the first haplotype of the sample for each window.
352        - `SampleID.1`: Local ancestry for the second haplotype of the sample for each window.
353
354        Returns:
355            **LocalAncestryObject:**
356                A LocalAncestryObject instance.
357        """
358        log.info(f"Reading '{self.file}'...")
359        metadata = self.read_metadata()
360        comment = metadata.comment
361        header = metadata.header
362
363        # Read the main data into a DataFrame, skipping comment lines
364        msp_df = pd.read_csv(self.file, sep="\t", comment="#", names=header)
365
366        # Extract chromosomes data
367        chromosomes = msp_df['#chm'].astype(str).to_numpy()
368
369        # Extract physical positions (if available)
370        column_counter = metadata.first_lai_col_indx
371        if metadata.has_physical_pos:
372            physical_pos = msp_df[['spos', 'epos']].to_numpy()
373        else:
374            physical_pos = None
375            log.warning("Physical positions ('spos' and 'epos') not found.")
376        
377        # Extract centimorgan positions (if available)
378        if metadata.has_centimorgan_pos:
379            centimorgan_pos = msp_df[['sgpos', 'egpos']].to_numpy()
380        else:
381            centimorgan_pos = None
382            log.warning("Genetic (centimorgan) positions ('sgpos' and 'egpos') not found.")
383
384        # Extract window sizes (if available)
385        if metadata.has_window_sizes:
386            window_sizes = msp_df['n snps'].to_numpy()
387        else:
388            window_sizes = None
389            log.warning("Window sizes ('n snps') not found.")
390        
391        # Extract LAI data (haplotype-level)
392        lai = msp_df.iloc[:, column_counter:].to_numpy(dtype=np.uint8, copy=False)
393
394        # Extract haplotype identifiers
395        haplotypes = metadata.haplotypes
396
397        # Extract haplotype identifiers and sample identifiers
398        samples = metadata.samples
399        del msp_df
400        gc.collect()
401
402        # Validate the number of samples matches the LAI data dimensions
403        n_samples = len(samples)
404        if n_samples != int(lai.shape[1] / 2):
405            raise ValueError(
406                "Mismatch between the number of sample identifiers and the expected number of samples in the LAI array. "
407                f"Expected {int(lai.shape[1] / 2)} samples (derived from LAI data); found {n_samples}."
408            )
409        
410        # Count number of unique ancestries in the LAI data
411        n_ancestries = len(np.unique(lai))
412
413        # Parse ancestry map from the comment (if available)
414        ancestry_map = None
415        if comment is not None:
416            ancestry_map = metadata.ancestry_map
417            if len(ancestry_map) != n_ancestries:
418                warnings.warn(
419                    "Mismatch between the number of unique ancestries in the LAI data "
420                    f"({n_ancestries}) and the number of classes in the ancestry map "
421                    f"({len(ancestry_map)})."
422                )
423        else:
424            # Provide default ancestry mapping if no comment is provided
425            ancestry_map = None
426            warnings.warn(
427                "Ancestry map not found. It is recommended to provide an .msp file that contains the ancestry "
428                "map as a comment in the first line."
429            )
430
431        # Replace fully NaN attributes with None
432        window_sizes = self._replace_nan_with_none(window_sizes)
433        centimorgan_pos = self._replace_nan_with_none(centimorgan_pos)
434        chromosomes = self._replace_nan_with_none(chromosomes)
435        physical_pos = self._replace_nan_with_none(physical_pos)
436
437        return LocalAncestryObject(
438            haplotypes=haplotypes,
439            lai=lai,
440            samples=samples,
441            ancestry_map=ancestry_map,
442            window_sizes=window_sizes,
443            centimorgan_pos=centimorgan_pos,
444            chromosomes=chromosomes,
445            physical_pos=physical_pos
446        )

Read data from the provided .msp or .msp.tsv file and construct a snputils.ancestry.genobj.LocalAncestryObject.

Expected MSP content:

The .msp file should contain local ancestry assignments for each haplotype across genomic windows. Each row should correspond to a genomic window and include the following columns:

  • #chm: Chromosome numbers corresponding to each genomic window.
  • spos: Start physical position for each window.
  • epos: End physical position for each window.
  • sgpos: Start centimorgan position for each window.
  • egpos: End centimorgan position for each window.
  • n snps: Number of SNPs in each genomic window.
  • SampleID.0: Local ancestry for the first haplotype of the sample for each window.
  • SampleID.1: Local ancestry for the second haplotype of the sample for each window.
Returns:

LocalAncestryObject: A LocalAncestryObject instance.

class MSPWriter(snputils.ancestry.io.local.write.base.LAIBaseWriter):
 15class MSPWriter(LAIBaseWriter):
 16    """
 17    A writer class for exporting local ancestry data from a `snputils.ancestry.genobj.LocalAncestryObject` 
 18    into an `.msp` or `.msp.tsv` file.
 19    """
 20    def __init__(self, laiobj: LocalAncestryObject, file: Union[str, Path]) -> None:
 21        """
 22        Args:
 23            laiobj (LocalAncestryObject):
 24                A LocalAncestryObject instance.
 25            file (str or pathlib.Path): 
 26                Path to the file where the data will be saved. It should end with `.msp` or `.msp.tsv`. 
 27                If the provided path does not have one of these extensions, the `.msp` extension will be appended.
 28        """
 29        self.__laiobj = laiobj
 30        self.__file = Path(file)
 31
 32    @property
 33    def laiobj(self) -> LocalAncestryObject:
 34        """
 35        Retrieve `laiobj`. 
 36
 37        Returns:
 38            **LocalAncestryObject:** 
 39                A LocalAncestryObject instance.
 40        """
 41        return self.__laiobj
 42
 43    @property
 44    def file(self) -> Path:
 45        """
 46        Retrieve `file`.
 47
 48        Returns:
 49            **pathlib.Path:** 
 50                Path to the file where the data will be saved. It should end with `.msp` or `.msp.tsv`. 
 51                If the provided path does not have one of these extensions, the `.msp` extension will be appended.
 52        """
 53        return self.__file
 54
 55    @file.setter
 56    def file(self, x: Union[str, Path]):
 57        """
 58        Update `file`.
 59        """
 60        self.__file = Path(x)
 61    
 62    def write(self) -> None:
 63        """
 64        Write the data contained in the `laiobj` instance to the specified output `file`. 
 65        If the file already exists, it will be overwritten.
 66
 67        **Output MSP content:**
 68
 69        The output `.msp` file will contain local ancestry assignments for each haplotype across genomic windows.
 70        Each row corresponds to a genomic window and includes the following columns:
 71
 72        - `#chm`: Chromosome numbers corresponding to each genomic window.
 73        - `spos`: Start physical position for each window.
 74        - `epos`: End physical position for each window.
 75        - `sgpos`: Start centimorgan position for each window.
 76        - `egpos`: End centimorgan position for each window.
 77        - `n snps`: Number of SNPs in each genomic window.
 78        - `SampleID.0`: Local ancestry for the first haplotype of the sample for each window.
 79        - `SampleID.1`: Local ancestry for the second haplotype of the sample for each window.
 80        """
 81        log.info(f"LAI object contains: {self.laiobj.n_samples} samples, {self.laiobj.n_ancestries} ancestries.")
 82
 83        # Define the valid file extensions
 84        valid_extensions = ('.msp', '.msp.tsv')
 85
 86        # Append '.msp' extension if not already present
 87        if not self.file.name.endswith(valid_extensions):
 88            self.file = self.file.with_name(self.file.name + '.msp')
 89
 90        # Check if file already exists
 91        if self.file.exists():
 92            warnings.warn(f"File '{self.file}' already exists and will be overwritten.")
 93
 94        # Compute the number of windows and haplotypes
 95        n_windows = self.laiobj.n_windows
 96        n_haplotypes = self.laiobj.n_haplotypes
 97
 98        # Initialize attributes with NaN where they are None
 99        chromosomes = self.laiobj.chromosomes if self.laiobj.chromosomes is not None else np.full(n_windows, np.nan)
100        physical_pos = self.laiobj.physical_pos if self.laiobj.physical_pos is not None else np.full((n_windows, 2), np.nan)
101        centimorgan_pos = self.laiobj.centimorgan_pos if self.laiobj.centimorgan_pos is not None else np.full((n_windows, 2), np.nan)
102        window_sizes = self.laiobj.window_sizes if self.laiobj.window_sizes is not None else np.full(n_windows, np.nan)
103        
104        haplotypes = self.laiobj.haplotypes
105        if haplotypes is None:
106            # Generate haplotypes from samples or default identifiers
107            if self.laiobj.samples is not None:
108                haplotypes = [f"{sample}.{i}" for sample in self.laiobj.samples for i in range(2)]
109                warnings.warn(
110                    "Haplotype data is missing. Haplotypes have been automatically generated "
111                    "from the provided sample identifiers."
112                )
113            else:
114                haplotypes = [f"sample_{i//2}.{i%2}" for i in range(n_haplotypes)]
115                warnings.warn(
116                    "Haplotype data and sample identifiers are missing. Default haplotype identifiers have been generated "
117                    "as `sample_<index>.0` and `sample_<index>.1`."
118                )
119
120        # Prepare columns for the DataFrame
121        columns = ["spos", "epos", "sgpos", "egpos", "n snps"]
122        lai_dic = {
123            "#chm": chromosomes,
124            "spos": physical_pos[:, 0],
125            "epos": physical_pos[:, 1],
126            "sgpos": centimorgan_pos[:, 0],
127            "egpos": centimorgan_pos[:, 1],
128            "n snps": window_sizes,
129        }
130
131        # Populate the dictionary with haplotype data
132        for ilai, haplotype in enumerate(haplotypes):
133            lai_dic[haplotype] = self.laiobj.lai[:, ilai]
134            columns.append(haplotype)
135            
136        # Check if DataFrame is empty
137        if len(lai_dic["#chm"]) == 0:
138            raise ValueError("No data to write: all columns are empty or missing.")
139
140        # Create a DataFrame from the dictionary containing all data
141        lai_df = pd.DataFrame(lai_dic)
142
143        log.info(f"Writing MSP file to '{self.file}'...")
144
145        # Save the DataFrame to the .msp file in tab-separated format
146        lai_df.to_csv(self.file, sep="\t", index=False, header=False)
147        
148        # Construct the second line for the output file containing the column headers
149        second_line = "#chm" + "\t" + "\t".join(columns)
150        
151        # If an ancestry map is available, prepend it to the output file
152        if self.laiobj.ancestry_map is not None:
153            ancestries_codes = list(self.laiobj.ancestry_map.keys()) # Get corresponding codes
154            ancestries = list(self.laiobj.ancestry_map.values()) # Get ancestry names
155            
156            # Create the first line for the ancestry information, detailing subpopulation codes
157            first_line = "#Subpopulation order/codes: " + "\t".join(
158                f"{a}={ancestries_codes[ai]}" for ai, a in enumerate(ancestries)
159            )
160
161            # Open the file for reading and prepend the first line       
162            with open(self.__file, "r+") as f:
163                content = f.read()
164                f.seek(0,0)
165                f.write(first_line.rstrip('\r\n') + '\n' + second_line + '\n' + content)
166
167        log.info(f"Finished writing MSP file to '{self.file}'.")
168
169        return None

A writer class for exporting local ancestry data from a snputils.ancestry.genobj.LocalAncestryObject into an .msp or .msp.tsv file.

MSPWriter( laiobj: LocalAncestryObject, file: str | pathlib.Path)
20    def __init__(self, laiobj: LocalAncestryObject, file: Union[str, Path]) -> None:
21        """
22        Args:
23            laiobj (LocalAncestryObject):
24                A LocalAncestryObject instance.
25            file (str or pathlib.Path): 
26                Path to the file where the data will be saved. It should end with `.msp` or `.msp.tsv`. 
27                If the provided path does not have one of these extensions, the `.msp` extension will be appended.
28        """
29        self.__laiobj = laiobj
30        self.__file = Path(file)
Arguments:
  • laiobj (LocalAncestryObject): A LocalAncestryObject instance.
  • file (str or pathlib.Path): Path to the file where the data will be saved. It should end with .msp or .msp.tsv. If the provided path does not have one of these extensions, the .msp extension will be appended.
laiobj: LocalAncestryObject
32    @property
33    def laiobj(self) -> LocalAncestryObject:
34        """
35        Retrieve `laiobj`. 
36
37        Returns:
38            **LocalAncestryObject:** 
39                A LocalAncestryObject instance.
40        """
41        return self.__laiobj

Retrieve laiobj.

Returns:

LocalAncestryObject: A LocalAncestryObject instance.

file: pathlib.Path
43    @property
44    def file(self) -> Path:
45        """
46        Retrieve `file`.
47
48        Returns:
49            **pathlib.Path:** 
50                Path to the file where the data will be saved. It should end with `.msp` or `.msp.tsv`. 
51                If the provided path does not have one of these extensions, the `.msp` extension will be appended.
52        """
53        return self.__file

Retrieve file.

Returns:

pathlib.Path: Path to the file where the data will be saved. It should end with .msp or .msp.tsv. If the provided path does not have one of these extensions, the .msp extension will be appended.

def write(self) -> None:
 62    def write(self) -> None:
 63        """
 64        Write the data contained in the `laiobj` instance to the specified output `file`. 
 65        If the file already exists, it will be overwritten.
 66
 67        **Output MSP content:**
 68
 69        The output `.msp` file will contain local ancestry assignments for each haplotype across genomic windows.
 70        Each row corresponds to a genomic window and includes the following columns:
 71
 72        - `#chm`: Chromosome numbers corresponding to each genomic window.
 73        - `spos`: Start physical position for each window.
 74        - `epos`: End physical position for each window.
 75        - `sgpos`: Start centimorgan position for each window.
 76        - `egpos`: End centimorgan position for each window.
 77        - `n snps`: Number of SNPs in each genomic window.
 78        - `SampleID.0`: Local ancestry for the first haplotype of the sample for each window.
 79        - `SampleID.1`: Local ancestry for the second haplotype of the sample for each window.
 80        """
 81        log.info(f"LAI object contains: {self.laiobj.n_samples} samples, {self.laiobj.n_ancestries} ancestries.")
 82
 83        # Define the valid file extensions
 84        valid_extensions = ('.msp', '.msp.tsv')
 85
 86        # Append '.msp' extension if not already present
 87        if not self.file.name.endswith(valid_extensions):
 88            self.file = self.file.with_name(self.file.name + '.msp')
 89
 90        # Check if file already exists
 91        if self.file.exists():
 92            warnings.warn(f"File '{self.file}' already exists and will be overwritten.")
 93
 94        # Compute the number of windows and haplotypes
 95        n_windows = self.laiobj.n_windows
 96        n_haplotypes = self.laiobj.n_haplotypes
 97
 98        # Initialize attributes with NaN where they are None
 99        chromosomes = self.laiobj.chromosomes if self.laiobj.chromosomes is not None else np.full(n_windows, np.nan)
100        physical_pos = self.laiobj.physical_pos if self.laiobj.physical_pos is not None else np.full((n_windows, 2), np.nan)
101        centimorgan_pos = self.laiobj.centimorgan_pos if self.laiobj.centimorgan_pos is not None else np.full((n_windows, 2), np.nan)
102        window_sizes = self.laiobj.window_sizes if self.laiobj.window_sizes is not None else np.full(n_windows, np.nan)
103        
104        haplotypes = self.laiobj.haplotypes
105        if haplotypes is None:
106            # Generate haplotypes from samples or default identifiers
107            if self.laiobj.samples is not None:
108                haplotypes = [f"{sample}.{i}" for sample in self.laiobj.samples for i in range(2)]
109                warnings.warn(
110                    "Haplotype data is missing. Haplotypes have been automatically generated "
111                    "from the provided sample identifiers."
112                )
113            else:
114                haplotypes = [f"sample_{i//2}.{i%2}" for i in range(n_haplotypes)]
115                warnings.warn(
116                    "Haplotype data and sample identifiers are missing. Default haplotype identifiers have been generated "
117                    "as `sample_<index>.0` and `sample_<index>.1`."
118                )
119
120        # Prepare columns for the DataFrame
121        columns = ["spos", "epos", "sgpos", "egpos", "n snps"]
122        lai_dic = {
123            "#chm": chromosomes,
124            "spos": physical_pos[:, 0],
125            "epos": physical_pos[:, 1],
126            "sgpos": centimorgan_pos[:, 0],
127            "egpos": centimorgan_pos[:, 1],
128            "n snps": window_sizes,
129        }
130
131        # Populate the dictionary with haplotype data
132        for ilai, haplotype in enumerate(haplotypes):
133            lai_dic[haplotype] = self.laiobj.lai[:, ilai]
134            columns.append(haplotype)
135            
136        # Check if DataFrame is empty
137        if len(lai_dic["#chm"]) == 0:
138            raise ValueError("No data to write: all columns are empty or missing.")
139
140        # Create a DataFrame from the dictionary containing all data
141        lai_df = pd.DataFrame(lai_dic)
142
143        log.info(f"Writing MSP file to '{self.file}'...")
144
145        # Save the DataFrame to the .msp file in tab-separated format
146        lai_df.to_csv(self.file, sep="\t", index=False, header=False)
147        
148        # Construct the second line for the output file containing the column headers
149        second_line = "#chm" + "\t" + "\t".join(columns)
150        
151        # If an ancestry map is available, prepend it to the output file
152        if self.laiobj.ancestry_map is not None:
153            ancestries_codes = list(self.laiobj.ancestry_map.keys()) # Get corresponding codes
154            ancestries = list(self.laiobj.ancestry_map.values()) # Get ancestry names
155            
156            # Create the first line for the ancestry information, detailing subpopulation codes
157            first_line = "#Subpopulation order/codes: " + "\t".join(
158                f"{a}={ancestries_codes[ai]}" for ai, a in enumerate(ancestries)
159            )
160
161            # Open the file for reading and prepend the first line       
162            with open(self.__file, "r+") as f:
163                content = f.read()
164                f.seek(0,0)
165                f.write(first_line.rstrip('\r\n') + '\n' + second_line + '\n' + content)
166
167        log.info(f"Finished writing MSP file to '{self.file}'.")
168
169        return None

Write the data contained in the laiobj instance to the specified output file. If the file already exists, it will be overwritten.

Output MSP content:

The output .msp file will contain local ancestry assignments for each haplotype across genomic windows. Each row corresponds to a genomic window and includes the following columns:

  • #chm: Chromosome numbers corresponding to each genomic window.
  • spos: Start physical position for each window.
  • epos: End physical position for each window.
  • sgpos: Start centimorgan position for each window.
  • egpos: End centimorgan position for each window.
  • n snps: Number of SNPs in each genomic window.
  • SampleID.0: Local ancestry for the first haplotype of the sample for each window.
  • SampleID.1: Local ancestry for the second haplotype of the sample for each window.
class AdmixtureMappingVCFWriter:
 16class AdmixtureMappingVCFWriter:
 17    """
 18    A writer class for converting and writing local ancestry data into ancestry-specific 
 19    VCF/BCF files for ADMIXTURE mapping.
 20    """
 21    def __init__(
 22            self, 
 23            laiobj: LocalAncestryObject, 
 24            file: Union[str, Path], 
 25            ancestry_map: Optional[Dict[str, str]] = None
 26        ):
 27        """
 28        Args:
 29            laiobj (LocalAncestryObject): 
 30                A LocalAncestryObject instance.
 31            file (str or pathlib.Path): 
 32                Path to the file where the data will be saved. It should end with `.vcf` or `.bcf`. 
 33                If the provided path does not have one of these extensions, the `.vcf` extension will be appended.
 34            ancestry_map (dict of str to str, optional): 
 35                A dictionary mapping ancestry codes to region names. If not explicitly 
 36                provided, it will default to the `ancestry_map` from `laiobj`.
 37        """
 38        self.__laiobj = laiobj
 39        self.__file = Path(file)
 40        self.__ancestry_map = ancestry_map
 41
 42    @property
 43    def laiobj(self) -> LocalAncestryObject:
 44        """
 45        Retrieve `laiobj`. 
 46
 47        Returns:
 48            **LocalAncestryObject:** 
 49                A LocalAncestryObject instance.
 50        """
 51        return self.__laiobj
 52
 53    @property
 54    def file(self) -> Path:
 55        """
 56        Retrieve `file`.
 57
 58        Returns:
 59            **pathlib.Path:** 
 60                Path to the file where the data will be saved. It should end with `.vcf` or `.bcf`. 
 61                If the provided path does not have one of these extensions, the `.vcf` extension will be appended.
 62        """
 63        return self.__file
 64
 65    @property
 66    def ancestry_map(self) -> Dict[str, str]:
 67        """
 68        Retrieve `ancestry_map`.
 69
 70        Returns:
 71            **dict of str to str:** 
 72                A dictionary mapping ancestry codes to region names. If not explicitly 
 73                provided, it will default to the `ancestry_map` from `laiobj`.
 74        """
 75        if self.__ancestry_map is not None:
 76            return self.__ancestry_map
 77        elif self.laiobj.ancestry_map is not None:
 78            return self.laiobj.ancestry_map
 79        else:
 80            raise ValueError(
 81                "Ancestry mapping is required but missing. Provide `ancestry_map` "
 82                "during initialization or ensure `laiobj.ancestry_map` is set."
 83            )
 84
 85    def write(self) -> None:
 86        """
 87        Write VCF or BCF files for each ancestry type defined in the ancestry map.
 88        If the file already exists, it will be overwritten.
 89
 90        **Output VCF/BCF content:**
 91        
 92        For each ancestry, this method converts LAI data to SNP alleles and writes it in a VCF-compatible format.
 93        SNPs are encoded as follows:
 94
 95        - `1`: Indicates positions that match the specified ancestry.
 96        - `0`: Indicates positions that do not match the specified ancestry.
 97
 98        The VCF/BCF files will contain the following fields:
 99
100        - `CHROM`: Chromosome for each variant.
101        - `POS`: Chromosomal positions for each variant.
102        - `ID`: Unique identifier for each variant.
103        - `REF`: Reference allele for each variant.
104        - `ALT`: Alternate allele for each variant.
105        - `QUAL`: Phred-scaled quality score for each variant.
106        - `FILTER`: Status indicating whether each SNP passed control checks.
107        - `INFO`: When physical positions are available, contains `END=<end_pos>` for the segment end; otherwise `'.'`.
108        - `FORMAT`: Genotype format. Set to `'GT'`, representing the genotype as phased alleles.
109        - `<SampleID>`: One column per sample, containing the genotype data (`1|0`, `0|1`, etc.).
110
111        **Output files:**
112
113        - A separate VCF file is written for each ancestry type, with filenames formatted as:
114        `<filename>_<ancestry>.vcf` (e.g., `output_African.vcf`).
115        """
116        # Process the list of positions to include both the start and end coordinates for each window
117        # Iterate over each ancestry key in the ancestry mapping
118        for key in self.ancestry_map:
119            ancestry = int(key)
120            anc_string = self.ancestry_map[key]
121
122            # Define the output file format, ensuring it has the correct ancestry-specific suffix
123            file_extension = (".vcf", ".bcf")
124            
125            # Check if file has one of the specified extensions
126            if self.file.suffix not in file_extension:
127                # If file does not have the correct extension, default to ".vcf"
128                output_file = self.file.with_name(f"{self.file.stem}_{anc_string}.vcf")
129            else:
130                # If file has the correct extension, insert the ancestry string before the extension
131                output_file = self.file.with_name(f"{self.file.stem}_{anc_string}{self.file.suffix}")
132
133            # Check if file already exists
134            if output_file.exists():
135                warnings.warn(f"File '{output_file}' already exists and will be overwritten.")
136
137            if self.laiobj.physical_pos is not None:
138                pos_list = np.array([val1 for val1, _ in self.laiobj.physical_pos], dtype=np.int64)
139                variants_info = [f"END={val2}" for _, val2 in self.laiobj.physical_pos]
140            else:
141                pos_list = None
142                variants_info = None
143
144            # Modify LAI data values to simulate a SNP file
145            # The positions in LAI corresponding to the current ancestry key are mapped to 1, and the rest to 0
146            
147            match = (self.laiobj.lai == ancestry)
148            match = match.view(np.int8)
149            match = match.reshape(len(self.laiobj.lai),int(len(self.laiobj.lai[0])/2), 2 )
150
151
152            # Set up VCF-related data
153            calldata_gt = match
154            del match
155            gc.collect()
156            samples = np.array(self.laiobj.samples)
157            variants_chrom = self.laiobj.chromosomes
158            variants_list = [str(i+1) for i in range(len(self.laiobj.lai))]
159            variants_id = np.array(variants_list)
160            variants_ref = np.full(calldata_gt.shape[0], 'A', dtype='U5')
161            variants_alt = np.full(calldata_gt.shape[0], 'T', dtype='U1')
162
163            # Create the SNPObject
164            variant_data_obj = SNPObject(
165                calldata_gt=calldata_gt,
166                samples=samples,
167                variants_chrom=variants_chrom,
168                variants_id=variants_id,
169                variants_ref = variants_ref,
170                variants_alt = variants_alt,
171                variants_pos = pos_list,
172            )
173
174            # Log the start of the VCF file writing process
175            log.info(f"Writing VCF file for ancestry '{anc_string}' to '{output_file}'...")
176
177            vcf_writer = VCFWriter(variant_data_obj, output_file)
178            vcf_writer.write(variants_info=variants_info)
179
180            log.info(f"Finished writing VCF file for ancestry '{anc_string}' to '{output_file}'.")
181
182        return

A writer class for converting and writing local ancestry data into ancestry-specific VCF/BCF files for ADMIXTURE mapping.

AdmixtureMappingVCFWriter( laiobj: LocalAncestryObject, file: str | pathlib.Path, ancestry_map: Dict[str, str] | None = None)
21    def __init__(
22            self, 
23            laiobj: LocalAncestryObject, 
24            file: Union[str, Path], 
25            ancestry_map: Optional[Dict[str, str]] = None
26        ):
27        """
28        Args:
29            laiobj (LocalAncestryObject): 
30                A LocalAncestryObject instance.
31            file (str or pathlib.Path): 
32                Path to the file where the data will be saved. It should end with `.vcf` or `.bcf`. 
33                If the provided path does not have one of these extensions, the `.vcf` extension will be appended.
34            ancestry_map (dict of str to str, optional): 
35                A dictionary mapping ancestry codes to region names. If not explicitly 
36                provided, it will default to the `ancestry_map` from `laiobj`.
37        """
38        self.__laiobj = laiobj
39        self.__file = Path(file)
40        self.__ancestry_map = ancestry_map
Arguments:
  • laiobj (LocalAncestryObject): A LocalAncestryObject instance.
  • file (str or pathlib.Path): Path to the file where the data will be saved. It should end with .vcf or .bcf. If the provided path does not have one of these extensions, the .vcf extension will be appended.
  • ancestry_map (dict of str to str, optional): A dictionary mapping ancestry codes to region names. If not explicitly provided, it will default to the ancestry_map from laiobj.
laiobj: LocalAncestryObject
42    @property
43    def laiobj(self) -> LocalAncestryObject:
44        """
45        Retrieve `laiobj`. 
46
47        Returns:
48            **LocalAncestryObject:** 
49                A LocalAncestryObject instance.
50        """
51        return self.__laiobj

Retrieve laiobj.

Returns:

LocalAncestryObject: A LocalAncestryObject instance.

file: pathlib.Path
53    @property
54    def file(self) -> Path:
55        """
56        Retrieve `file`.
57
58        Returns:
59            **pathlib.Path:** 
60                Path to the file where the data will be saved. It should end with `.vcf` or `.bcf`. 
61                If the provided path does not have one of these extensions, the `.vcf` extension will be appended.
62        """
63        return self.__file

Retrieve file.

Returns:

pathlib.Path: Path to the file where the data will be saved. It should end with .vcf or .bcf. If the provided path does not have one of these extensions, the .vcf extension will be appended.

ancestry_map: Dict[str, str]
65    @property
66    def ancestry_map(self) -> Dict[str, str]:
67        """
68        Retrieve `ancestry_map`.
69
70        Returns:
71            **dict of str to str:** 
72                A dictionary mapping ancestry codes to region names. If not explicitly 
73                provided, it will default to the `ancestry_map` from `laiobj`.
74        """
75        if self.__ancestry_map is not None:
76            return self.__ancestry_map
77        elif self.laiobj.ancestry_map is not None:
78            return self.laiobj.ancestry_map
79        else:
80            raise ValueError(
81                "Ancestry mapping is required but missing. Provide `ancestry_map` "
82                "during initialization or ensure `laiobj.ancestry_map` is set."
83            )

Retrieve ancestry_map.

Returns:

dict of str to str: A dictionary mapping ancestry codes to region names. If not explicitly provided, it will default to the ancestry_map from laiobj.

def write(self) -> None:
 85    def write(self) -> None:
 86        """
 87        Write VCF or BCF files for each ancestry type defined in the ancestry map.
 88        If the file already exists, it will be overwritten.
 89
 90        **Output VCF/BCF content:**
 91        
 92        For each ancestry, this method converts LAI data to SNP alleles and writes it in a VCF-compatible format.
 93        SNPs are encoded as follows:
 94
 95        - `1`: Indicates positions that match the specified ancestry.
 96        - `0`: Indicates positions that do not match the specified ancestry.
 97
 98        The VCF/BCF files will contain the following fields:
 99
100        - `CHROM`: Chromosome for each variant.
101        - `POS`: Chromosomal positions for each variant.
102        - `ID`: Unique identifier for each variant.
103        - `REF`: Reference allele for each variant.
104        - `ALT`: Alternate allele for each variant.
105        - `QUAL`: Phred-scaled quality score for each variant.
106        - `FILTER`: Status indicating whether each SNP passed control checks.
107        - `INFO`: When physical positions are available, contains `END=<end_pos>` for the segment end; otherwise `'.'`.
108        - `FORMAT`: Genotype format. Set to `'GT'`, representing the genotype as phased alleles.
109        - `<SampleID>`: One column per sample, containing the genotype data (`1|0`, `0|1`, etc.).
110
111        **Output files:**
112
113        - A separate VCF file is written for each ancestry type, with filenames formatted as:
114        `<filename>_<ancestry>.vcf` (e.g., `output_African.vcf`).
115        """
116        # Process the list of positions to include both the start and end coordinates for each window
117        # Iterate over each ancestry key in the ancestry mapping
118        for key in self.ancestry_map:
119            ancestry = int(key)
120            anc_string = self.ancestry_map[key]
121
122            # Define the output file format, ensuring it has the correct ancestry-specific suffix
123            file_extension = (".vcf", ".bcf")
124            
125            # Check if file has one of the specified extensions
126            if self.file.suffix not in file_extension:
127                # If file does not have the correct extension, default to ".vcf"
128                output_file = self.file.with_name(f"{self.file.stem}_{anc_string}.vcf")
129            else:
130                # If file has the correct extension, insert the ancestry string before the extension
131                output_file = self.file.with_name(f"{self.file.stem}_{anc_string}{self.file.suffix}")
132
133            # Check if file already exists
134            if output_file.exists():
135                warnings.warn(f"File '{output_file}' already exists and will be overwritten.")
136
137            if self.laiobj.physical_pos is not None:
138                pos_list = np.array([val1 for val1, _ in self.laiobj.physical_pos], dtype=np.int64)
139                variants_info = [f"END={val2}" for _, val2 in self.laiobj.physical_pos]
140            else:
141                pos_list = None
142                variants_info = None
143
144            # Modify LAI data values to simulate a SNP file
145            # The positions in LAI corresponding to the current ancestry key are mapped to 1, and the rest to 0
146            
147            match = (self.laiobj.lai == ancestry)
148            match = match.view(np.int8)
149            match = match.reshape(len(self.laiobj.lai),int(len(self.laiobj.lai[0])/2), 2 )
150
151
152            # Set up VCF-related data
153            calldata_gt = match
154            del match
155            gc.collect()
156            samples = np.array(self.laiobj.samples)
157            variants_chrom = self.laiobj.chromosomes
158            variants_list = [str(i+1) for i in range(len(self.laiobj.lai))]
159            variants_id = np.array(variants_list)
160            variants_ref = np.full(calldata_gt.shape[0], 'A', dtype='U5')
161            variants_alt = np.full(calldata_gt.shape[0], 'T', dtype='U1')
162
163            # Create the SNPObject
164            variant_data_obj = SNPObject(
165                calldata_gt=calldata_gt,
166                samples=samples,
167                variants_chrom=variants_chrom,
168                variants_id=variants_id,
169                variants_ref = variants_ref,
170                variants_alt = variants_alt,
171                variants_pos = pos_list,
172            )
173
174            # Log the start of the VCF file writing process
175            log.info(f"Writing VCF file for ancestry '{anc_string}' to '{output_file}'...")
176
177            vcf_writer = VCFWriter(variant_data_obj, output_file)
178            vcf_writer.write(variants_info=variants_info)
179
180            log.info(f"Finished writing VCF file for ancestry '{anc_string}' to '{output_file}'.")
181
182        return

Write VCF or BCF files for each ancestry type defined in the ancestry map. If the file already exists, it will be overwritten.

Output VCF/BCF content:

For each ancestry, this method converts LAI data to SNP alleles and writes it in a VCF-compatible format. SNPs are encoded as follows:

  • 1: Indicates positions that match the specified ancestry.
  • 0: Indicates positions that do not match the specified ancestry.

The VCF/BCF files will contain the following fields:

  • CHROM: Chromosome for each variant.
  • POS: Chromosomal positions for each variant.
  • ID: Unique identifier for each variant.
  • REF: Reference allele for each variant.
  • ALT: Alternate allele for each variant.
  • QUAL: Phred-scaled quality score for each variant.
  • FILTER: Status indicating whether each SNP passed control checks.
  • INFO: When physical positions are available, contains END=<end_pos> for the segment end; otherwise '.'.
  • FORMAT: Genotype format. Set to 'GT', representing the genotype as phased alleles.
  • <SampleID>: One column per sample, containing the genotype data (1|0, 0|1, etc.).

Output files:

  • A separate VCF file is written for each ancestry type, with filenames formatted as: <filename>_<ancestry>.vcf (e.g., output_African.vcf).
class AdmixtureReader(snputils.ancestry.io.wide.read.base.WideBaseReader):
 13class AdmixtureReader(WideBaseReader):
 14    """
 15    A reader class for parsing ADMIXTURE files and constructing a `snputils.ancestry.genobj.GlobalAncestryObject`.
 16    """
 17    def __init__(
 18        self,
 19        Q_file: Union[str, Path],
 20        P_file: Optional[Union[str, Path]] = None,
 21        sample_file: Optional[Union[str, Path]] = None,
 22        snp_file: Optional[Union[str, Path]] = None,
 23        ancestry_file: Optional[Union[str, Path]] = None,
 24    ) -> None:
 25        """
 26        Args:
 27            Q_file (str or pathlib.Path):
 28                Path to the file containing the Q matrix (per-sample ancestry proportions).
 29                It should end with .Q or .txt.
 30                The file should use space (' ') as the delimiter.
 31            P_file (str or pathlib.Path, optional):
 32                Path to the file containing the P/F matrix (per-ancestry SNP frequencies).
 33                It should end with .P or .txt.
 34                The file should use space (' ') as the delimiter. If None, P is not loaded.
 35            sample_file (str or pathlib.Path, optional):
 36                Path to the single-column file containing sample identifiers. 
 37                It should end with .fam or .txt.
 38                If None, sample identifiers are not loaded.
 39            snp_file (str or pathlib.Path, optional):
 40                Path to the single-column file containing SNP identifiers. 
 41                It should end with .bim or .txt.
 42                If None, SNP identifiers are not loaded.
 43            ancestry_file (str or pathlib.Path, optional):
 44                Path to the single-column file containing ancestry labels for each sample.
 45                It should end with .map or .txt.
 46                If None, ancestries are not loaded.
 47        """
 48        self.__Q_file = Path(Q_file)
 49        self.__P_file = Path(P_file) if P_file is not None else None
 50        self.__sample_file = Path(sample_file) if sample_file is not None else None
 51        self.__snp_file = Path(snp_file) if snp_file is not None else None
 52        self.__ancestry_file = Path(ancestry_file) if ancestry_file is not None else None
 53
 54    @property
 55    def Q_file(self) -> Path:
 56        """
 57        Retrieve Q_file.
 58
 59        Returns:
 60            **pathlib.Path:** 
 61                Path to the file containing the Q matrix (per-sample ancestry proportions).
 62                It should end with .Q or .txt.
 63                The file should use space (' ') as the delimiter.
 64        """
 65        return self.__Q_file
 66
 67    @property
 68    def P_file(self) -> Optional[Path]:
 69        """
 70        Retrieve P_file.
 71
 72        Returns:
 73            **pathlib.Path or None:** 
 74                Path to the file containing the P/F matrix (per-ancestry SNP frequencies).
 75                It should end with .P or .txt.
 76                The file should use space (' ') as the delimiter. If None, P is not loaded.
 77        """
 78        return self.__P_file
 79
 80    @property
 81    def sample_file(self) -> Optional[Path]:
 82        """
 83        Retrieve sample_file.
 84
 85        Returns:
 86            **pathlib.Path:** 
 87                Path to the single-column file containing sample identifiers. 
 88                It should end with .fam or .txt.
 89                If None, sample identifiers are not loaded.
 90        """
 91        return self.__sample_file
 92    
 93    @property
 94    def snp_file(self) -> Optional[Path]:
 95        """
 96        Retrieve snp_file.
 97
 98        Returns:
 99            **pathlib.Path:** 
100                Path to the single-column file containing SNP identifiers. 
101                It should end with .bim or .txt.
102                If None, SNP identifiers are not loaded.
103        """
104        return self.__snp_file
105
106    @property
107    def ancestry_file(self) -> Optional[Path]:
108        """
109        Retrieve ancestry_file.
110
111        Returns:
112            **pathlib.Path:** 
113                Path to the single-column file containing ancestry labels for each sample.
114                It should end with .map or .txt.
115                If None, ancestries are not loaded.
116        """
117        return self.__ancestry_file
118
119    def read(self) -> 'GlobalAncestryObject':
120        """
121        Read data from the provided ADMIXTURE files and construct a 
122        snputils.ancestry.genobj.GlobalAncestryObject instance.
123
124        **Expected ADMIXTURE files content:**
125
126        - **Q_file**: 
127            A text file containing the Q matrix with per-sample ancestry proportions. 
128             Each row corresponds to a sample, and each column corresponds to an ancestry.
129        - **P_file**: 
130            A text file containing the P matrix with per-ancestry SNP frequencies.
131            Each row corresponds to a SNP, and each column corresponds to an ancestry.
132
133        Optional files (if provided):
134        - **sample_file**: A single-column text file containing sample identifiers in order.
135        - **snp_file**: A single-column text file containing SNP identifiers in order.
136        - **ancestry_file**: A single-column text file containing ancestry labels for each sample.
137
138        Returns:
139            **GlobalAncestryObject:** 
140                A GlobalAncestryObject instance.
141        """
142        log.info(f"Reading Q matrix from '{self.Q_file}'...")
143        Q_mat = np.genfromtxt(self.Q_file, delimiter=' ')
144        if self.P_file is not None:
145            log.info(f"Reading P matrix from '{self.P_file}'...")
146            P_mat = np.genfromtxt(self.P_file, delimiter=' ')
147        else:
148            P_mat = None
149
150        samples = self._read_sample_ids()
151        snps = self._read_snps()
152        ancestries = self._read_ancestries()
153
154        return GlobalAncestryObject(
155            Q_mat,
156            P_mat,
157            samples=samples,
158            snps=snps,
159            ancestries=ancestries
160        )

A reader class for parsing ADMIXTURE files and constructing a snputils.ancestry.genobj.GlobalAncestryObject.

AdmixtureReader( Q_file: str | pathlib.Path, P_file: str | pathlib.Path | None = None, sample_file: str | pathlib.Path | None = None, snp_file: str | pathlib.Path | None = None, ancestry_file: str | pathlib.Path | None = None)
17    def __init__(
18        self,
19        Q_file: Union[str, Path],
20        P_file: Optional[Union[str, Path]] = None,
21        sample_file: Optional[Union[str, Path]] = None,
22        snp_file: Optional[Union[str, Path]] = None,
23        ancestry_file: Optional[Union[str, Path]] = None,
24    ) -> None:
25        """
26        Args:
27            Q_file (str or pathlib.Path):
28                Path to the file containing the Q matrix (per-sample ancestry proportions).
29                It should end with .Q or .txt.
30                The file should use space (' ') as the delimiter.
31            P_file (str or pathlib.Path, optional):
32                Path to the file containing the P/F matrix (per-ancestry SNP frequencies).
33                It should end with .P or .txt.
34                The file should use space (' ') as the delimiter. If None, P is not loaded.
35            sample_file (str or pathlib.Path, optional):
36                Path to the single-column file containing sample identifiers. 
37                It should end with .fam or .txt.
38                If None, sample identifiers are not loaded.
39            snp_file (str or pathlib.Path, optional):
40                Path to the single-column file containing SNP identifiers. 
41                It should end with .bim or .txt.
42                If None, SNP identifiers are not loaded.
43            ancestry_file (str or pathlib.Path, optional):
44                Path to the single-column file containing ancestry labels for each sample.
45                It should end with .map or .txt.
46                If None, ancestries are not loaded.
47        """
48        self.__Q_file = Path(Q_file)
49        self.__P_file = Path(P_file) if P_file is not None else None
50        self.__sample_file = Path(sample_file) if sample_file is not None else None
51        self.__snp_file = Path(snp_file) if snp_file is not None else None
52        self.__ancestry_file = Path(ancestry_file) if ancestry_file is not None else None
Arguments:
  • Q_file (str or pathlib.Path): Path to the file containing the Q matrix (per-sample ancestry proportions). It should end with .Q or .txt. The file should use space (' ') as the delimiter.
  • P_file (str or pathlib.Path, optional): Path to the file containing the P/F matrix (per-ancestry SNP frequencies). It should end with .P or .txt. The file should use space (' ') as the delimiter. If None, P is not loaded.
  • sample_file (str or pathlib.Path, optional): Path to the single-column file containing sample identifiers. It should end with .fam or .txt. If None, sample identifiers are not loaded.
  • snp_file (str or pathlib.Path, optional): Path to the single-column file containing SNP identifiers. It should end with .bim or .txt. If None, SNP identifiers are not loaded.
  • ancestry_file (str or pathlib.Path, optional): Path to the single-column file containing ancestry labels for each sample. It should end with .map or .txt. If None, ancestries are not loaded.
Q_file: pathlib.Path
54    @property
55    def Q_file(self) -> Path:
56        """
57        Retrieve Q_file.
58
59        Returns:
60            **pathlib.Path:** 
61                Path to the file containing the Q matrix (per-sample ancestry proportions).
62                It should end with .Q or .txt.
63                The file should use space (' ') as the delimiter.
64        """
65        return self.__Q_file

Retrieve Q_file.

Returns:

pathlib.Path: Path to the file containing the Q matrix (per-sample ancestry proportions). It should end with .Q or .txt. The file should use space (' ') as the delimiter.

P_file: pathlib.Path | None
67    @property
68    def P_file(self) -> Optional[Path]:
69        """
70        Retrieve P_file.
71
72        Returns:
73            **pathlib.Path or None:** 
74                Path to the file containing the P/F matrix (per-ancestry SNP frequencies).
75                It should end with .P or .txt.
76                The file should use space (' ') as the delimiter. If None, P is not loaded.
77        """
78        return self.__P_file

Retrieve P_file.

Returns:

pathlib.Path or None: Path to the file containing the P/F matrix (per-ancestry SNP frequencies). It should end with .P or .txt. The file should use space (' ') as the delimiter. If None, P is not loaded.

sample_file: pathlib.Path | None
80    @property
81    def sample_file(self) -> Optional[Path]:
82        """
83        Retrieve sample_file.
84
85        Returns:
86            **pathlib.Path:** 
87                Path to the single-column file containing sample identifiers. 
88                It should end with .fam or .txt.
89                If None, sample identifiers are not loaded.
90        """
91        return self.__sample_file

Retrieve sample_file.

Returns:

pathlib.Path or None: Path to the single-column file containing sample identifiers. It should end with .fam or .txt. If None, sample identifiers are not loaded.

snp_file: pathlib.Path | None
 93    @property
 94    def snp_file(self) -> Optional[Path]:
 95        """
 96        Retrieve snp_file.
 97
 98        Returns:
 99            **pathlib.Path:** 
100                Path to the single-column file containing SNP identifiers. 
101                It should end with .bim or .txt.
102                If None, SNP identifiers are not loaded.
103        """
104        return self.__snp_file

Retrieve snp_file.

Returns:

pathlib.Path or None: Path to the single-column file containing SNP identifiers. It should end with .bim or .txt. If None, SNP identifiers are not loaded.

ancestry_file: pathlib.Path | None
106    @property
107    def ancestry_file(self) -> Optional[Path]:
108        """
109        Retrieve ancestry_file.
110
111        Returns:
112            **pathlib.Path:** 
113                Path to the single-column file containing ancestry labels for each sample.
114                It should end with .map or .txt.
115                If None, ancestries are not loaded.
116        """
117        return self.__ancestry_file

Retrieve ancestry_file.

Returns:

pathlib.Path or None: Path to the single-column file containing ancestry labels for each sample. It should end with .map or .txt. If None, ancestries are not loaded.

def read(self) -> GlobalAncestryObject:
119    def read(self) -> 'GlobalAncestryObject':
120        """
121        Read data from the provided ADMIXTURE files and construct a 
122        snputils.ancestry.genobj.GlobalAncestryObject instance.
123
124        **Expected ADMIXTURE files content:**
125
126        - **Q_file**: 
127            A text file containing the Q matrix with per-sample ancestry proportions. 
128             Each row corresponds to a sample, and each column corresponds to an ancestry.
129        - **P_file**: 
130            A text file containing the P matrix with per-ancestry SNP frequencies.
131            Each row corresponds to a SNP, and each column corresponds to an ancestry.
132
133        Optional files (if provided):
134        - **sample_file**: A single-column text file containing sample identifiers in order.
135        - **snp_file**: A single-column text file containing SNP identifiers in order.
136        - **ancestry_file**: A single-column text file containing ancestry labels for each sample.
137
138        Returns:
139            **GlobalAncestryObject:** 
140                A GlobalAncestryObject instance.
141        """
142        log.info(f"Reading Q matrix from '{self.Q_file}'...")
143        Q_mat = np.genfromtxt(self.Q_file, delimiter=' ')
144        if self.P_file is not None:
145            log.info(f"Reading P matrix from '{self.P_file}'...")
146            P_mat = np.genfromtxt(self.P_file, delimiter=' ')
147        else:
148            P_mat = None
149
150        samples = self._read_sample_ids()
151        snps = self._read_snps()
152        ancestries = self._read_ancestries()
153
154        return GlobalAncestryObject(
155            Q_mat,
156            P_mat,
157            samples=samples,
158            snps=snps,
159            ancestries=ancestries
160        )

Read data from the provided ADMIXTURE files and construct a snputils.ancestry.genobj.GlobalAncestryObject instance.

Expected ADMIXTURE files content:

  • Q_file: A text file containing the Q matrix with per-sample ancestry proportions. Each row corresponds to a sample, and each column corresponds to an ancestry.
  • P_file: A text file containing the P matrix with per-ancestry SNP frequencies. Each row corresponds to a SNP, and each column corresponds to an ancestry.

Optional files (if provided):

  • sample_file: A single-column text file containing sample identifiers in order.
  • snp_file: A single-column text file containing SNP identifiers in order.
  • ancestry_file: A single-column text file containing ancestry labels for each sample.
Returns:

GlobalAncestryObject: A GlobalAncestryObject instance.

class AdmixtureWriter(snputils.ancestry.io.wide.write.base.WideBaseWriter):
 13class AdmixtureWriter(WideBaseWriter):
 14    """
 15    A writer class for exporting global ancestry data from a 
 16    `snputils.ancestry.genobj.GlobalAncestryObject` into multiple ADMIXTURE files.
 17    """
 18    def __init__(
 19        self, 
 20        wideobj: GlobalAncestryObject, 
 21        file_prefix: Union[str, Path]
 22    ) -> None:
 23        """
 24        Args:
 25            wideobj (GlobalAncestryObject): 
 26                A GlobalAncestryObject instance.
 27            file_prefix (str or pathlib.Path): 
 28                Prefix for output file names, including directory path but excluding file extensions. 
 29                The prefix is used to generate specific file names for each output, with file-specific 
 30                suffixes appended as described above (e.g., `file_prefix.n_ancestries.Q` for the Q matrix file).
 31        """
 32        super(AdmixtureWriter, self).__init__(wideobj, file_prefix)
 33        self.__Q_file = self.file_prefix.with_suffix(f".{self.wideobj.n_ancestries}.Q")
 34        self.__P_file = self.file_prefix.with_suffix(f".{self.wideobj.n_ancestries}.P")
 35
 36        self.__sample_file = self.file_prefix.with_suffix(".sample_ids.txt") if self.wideobj.samples is not None else None
 37        self.__snp_file = self.file_prefix.with_suffix(".snp_ids.txt") if self.wideobj.snps is not None else None
 38        self.__ancestry_file = self.file_prefix.with_suffix(".map") if self.wideobj.ancestries is not None else None
 39
 40    @property
 41    def wideobj(self) -> GlobalAncestryObject:
 42        """
 43        Retrieve `wideobj`.
 44
 45        Returns:
 46            **GlobalAncestryObject:** A GlobalAncestryObject instance.
 47        """
 48        return self.__wideobj
 49
 50    @property
 51    def file_prefix(self) -> Path:
 52        """
 53        Retrieve `file_prefix`.
 54
 55        Returns:
 56            **pathlib.Path:** 
 57                Prefix for output file names, including directory path but excluding file extensions. 
 58                The prefix is used to generate specific file names for each output, with file-specific 
 59                suffixes appended as described above (e.g., `file_prefix.n_ancestries.Q` for the Q matrix file).
 60        """
 61        return self.__file_prefix
 62
 63    @property
 64    def Q_file(self) -> Path:
 65        """
 66        Retrieve `Q_file`.
 67
 68        Returns:
 69            **pathlib.Path:** 
 70                Path to the `.Q` file that will store the Q matrix (per-sample ancestry proportions).
 71        """
 72        return self.__Q_file
 73    
 74    @property
 75    def P_file(self) -> Path:
 76        """
 77        Retrieve `P_file`.
 78
 79        Returns:
 80            **pathlib.Path:** 
 81                Path to the `.P` file that will store the P/F matrix (per-ancestry SNP frequencies).
 82        """
 83        return self.__P_file
 84    
 85    @property
 86    def sample_file(self) -> Optional[Path]:
 87        """
 88        Retrieve `sample_file`.
 89
 90        Returns:
 91            **pathlib.Path:** 
 92                Path to the `.txt` the file that will store sample identifiers. 
 93                If None, sample identifiers are not saved.
 94        """
 95        return self.__sample_file
 96    
 97    @property
 98    def snp_file(self) -> Optional[Path]:
 99        """
100        Retrieve `snp_file`.
101
102        Returns:
103            **pathlib.Path:** 
104                Path to the `.txt` file that will store SNP identifiers. 
105                If None, SNP identifiers are not saved.
106        """
107        return self.__snp_file
108    
109    @property
110    def ancestry_file(self) -> Optional[Path]:
111        """
112        Retrieve `ancestry_file`.
113
114        Returns:
115            **pathlib.Path:** 
116                Path to the `.map` file that will store ancestry labels for each sample. 
117                If None, ancestries are not saved.
118        """
119        return self.__ancestry_file
120
121    def _write_Q(self):
122        log.info(f"Writing Q matrix to '{self.Q_file}'...")
123        np.savetxt(self.Q_file, self.wideobj.Q, delimiter=" ")
124        log.info(f"Finished writing Q matrix to '{self.Q_file}'.")
125
126    def _write_P(self):
127        log.info(f"Writing P matrix to '{self.P_file}'...")
128        np.savetxt(self.P_file, self.wideobj.P, delimiter=" ")
129        log.info(f"Finished writing P matrix to '{self.P_file}'.")
130
131    def _write_sample_ids(self):
132        if self.wideobj.samples is not None:
133            log.info(f"Writing sample IDs to '{self.sample_file}'...")
134            np.savetxt(self.sample_file, self.wideobj.samples, fmt="%s")
135            log.info(f"Finished writing sample IDs to '{self.sample_file}'.")
136
137    def _write_snps(self):
138        if self.wideobj.snps is not None:
139            log.info(f"Writing SNP IDs to '{self.snp_file}'...")
140            np.savetxt(self.snp_file, self.wideobj.snps, fmt="%s")
141            log.info(f"Finished writing SNP IDs to '{self.snp_file}'.")
142
143    def _write_ancestries(self):
144        if self.wideobj.ancestries is not None:
145            log.info(f"Writing ancestry information to '{self.ancestry_file}'...")
146            np.savetxt(self.ancestry_file, self.wideobj.ancestries, fmt="%s")
147            log.info(f"Finished writing ancestry information to '{self.ancestry_file}'.")
148
149    def write(self) -> None:
150        """
151        Write the data contained in the `wideobj` instance into the multiple ADMIXTURE files
152        with the specified `file_prefix`. If the files already exist, they will be overwritten.
153
154        **Output files:**
155
156        - `<file_prefix>.K.Q`: Q matrix file. The file uses space (' ') as the delimiter.
157        - `<file_prefix>.K.P`: P matrix file. The file uses space (' ') as the delimiter.
158        - `<file_prefix>.sample_ids.txt`: Sample IDs file (if sample IDs are available).
159        - `<file_prefix>.snp_ids.txt`: SNP IDs file (if SNP IDs are available).
160        - `<file_prefix>.map`: Ancestry file (if ancestries information is available).
161
162        where `K` is the total number of ancestries.
163        """
164        log.info(f"Preparing to write ADMIXTURE files with prefix '{self.file_prefix}'...")
165        
166        self.file_prefix.parent.mkdir(parents=True, exist_ok=True)
167        
168        self._write_Q()
169        self._write_P()
170        self._write_sample_ids()
171        self._write_snps()
172        self._write_ancestries()
173
174        log.info(f"Finished writing all ADMIXTURE files with prefix '{self.file_prefix}'.")

A writer class for exporting global ancestry data from a snputils.ancestry.genobj.GlobalAncestryObject into multiple ADMIXTURE files.

AdmixtureWriter( wideobj: GlobalAncestryObject, file_prefix: str | pathlib.Path)
18    def __init__(
19        self, 
20        wideobj: GlobalAncestryObject, 
21        file_prefix: Union[str, Path]
22    ) -> None:
23        """
24        Args:
25            wideobj (GlobalAncestryObject): 
26                A GlobalAncestryObject instance.
27            file_prefix (str or pathlib.Path): 
28                Prefix for output file names, including directory path but excluding file extensions. 
29                The prefix is used to generate specific file names for each output, with file-specific 
30                suffixes appended as described above (e.g., `file_prefix.n_ancestries.Q` for the Q matrix file).
31        """
32        super(AdmixtureWriter, self).__init__(wideobj, file_prefix)
33        self.__Q_file = self.file_prefix.with_suffix(f".{self.wideobj.n_ancestries}.Q")
34        self.__P_file = self.file_prefix.with_suffix(f".{self.wideobj.n_ancestries}.P")
35
36        self.__sample_file = self.file_prefix.with_suffix(".sample_ids.txt") if self.wideobj.samples is not None else None
37        self.__snp_file = self.file_prefix.with_suffix(".snp_ids.txt") if self.wideobj.snps is not None else None
38        self.__ancestry_file = self.file_prefix.with_suffix(".map") if self.wideobj.ancestries is not None else None
Arguments:
  • wideobj (GlobalAncestryObject): A GlobalAncestryObject instance.
  • file_prefix (str or pathlib.Path): Prefix for output file names, including directory path but excluding file extensions. The prefix is used to generate specific file names for each output, with file-specific suffixes appended as described above (e.g., file_prefix.n_ancestries.Q for the Q matrix file).
wideobj: GlobalAncestryObject
40    @property
41    def wideobj(self) -> GlobalAncestryObject:
42        """
43        Retrieve `wideobj`.
44
45        Returns:
46            **GlobalAncestryObject:** A GlobalAncestryObject instance.
47        """
48        return self.__wideobj

Retrieve wideobj.

Returns:

GlobalAncestryObject: A GlobalAncestryObject instance.

file_prefix: pathlib.Path
50    @property
51    def file_prefix(self) -> Path:
52        """
53        Retrieve `file_prefix`.
54
55        Returns:
56            **pathlib.Path:** 
57                Prefix for output file names, including directory path but excluding file extensions. 
58                The prefix is used to generate specific file names for each output, with file-specific 
59                suffixes appended as described above (e.g., `file_prefix.n_ancestries.Q` for the Q matrix file).
60        """
61        return self.__file_prefix

Retrieve file_prefix.

Returns:

pathlib.Path: Prefix for output file names, including directory path but excluding file extensions. The prefix is used to generate specific file names for each output, with file-specific suffixes appended as described above (e.g., file_prefix.n_ancestries.Q for the Q matrix file).

Q_file: pathlib.Path
63    @property
64    def Q_file(self) -> Path:
65        """
66        Retrieve `Q_file`.
67
68        Returns:
69            **pathlib.Path:** 
70                Path to the `.Q` file that will store the Q matrix (per-sample ancestry proportions).
71        """
72        return self.__Q_file

Retrieve Q_file.

Returns:

pathlib.Path: Path to the .Q file that will store the Q matrix (per-sample ancestry proportions).

P_file: pathlib.Path
74    @property
75    def P_file(self) -> Path:
76        """
77        Retrieve `P_file`.
78
79        Returns:
80            **pathlib.Path:** 
81                Path to the `.P` file that will store the P/F matrix (per-ancestry SNP frequencies).
82        """
83        return self.__P_file

Retrieve P_file.

Returns:

pathlib.Path: Path to the .P file that will store the P/F matrix (per-ancestry SNP frequencies).

sample_file: pathlib.Path | None
85    @property
86    def sample_file(self) -> Optional[Path]:
87        """
88        Retrieve `sample_file`.
89
90        Returns:
91            **pathlib.Path:** 
92                Path to the `.txt` the file that will store sample identifiers. 
93                If None, sample identifiers are not saved.
94        """
95        return self.__sample_file

Retrieve sample_file.

Returns:

pathlib.Path: Path to the .txt file that will store sample identifiers. If None, sample identifiers are not saved.

snp_file: pathlib.Path | None
 97    @property
 98    def snp_file(self) -> Optional[Path]:
 99        """
100        Retrieve `snp_file`.
101
102        Returns:
103            **pathlib.Path:** 
104                Path to the `.txt` file that will store SNP identifiers. 
105                If None, SNP identifiers are not saved.
106        """
107        return self.__snp_file

Retrieve snp_file.

Returns:

pathlib.Path: Path to the .txt file that will store SNP identifiers. If None, SNP identifiers are not saved.

ancestry_file: pathlib.Path | None
109    @property
110    def ancestry_file(self) -> Optional[Path]:
111        """
112        Retrieve `ancestry_file`.
113
114        Returns:
115            **pathlib.Path:** 
116                Path to the `.map` file that will store ancestry labels for each sample. 
117                If None, ancestries are not saved.
118        """
119        return self.__ancestry_file

Retrieve ancestry_file.

Returns:

pathlib.Path: Path to the .map file that will store ancestry labels for each sample. If None, ancestries are not saved.

def write(self) -> None:
149    def write(self) -> None:
150        """
151        Write the data contained in the `wideobj` instance into the multiple ADMIXTURE files
152        with the specified `file_prefix`. If the files already exist, they will be overwritten.
153
154        **Output files:**
155
156        - `<file_prefix>.K.Q`: Q matrix file. The file uses space (' ') as the delimiter.
157        - `<file_prefix>.K.P`: P matrix file. The file uses space (' ') as the delimiter.
158        - `<file_prefix>.sample_ids.txt`: Sample IDs file (if sample IDs are available).
159        - `<file_prefix>.snp_ids.txt`: SNP IDs file (if SNP IDs are available).
160        - `<file_prefix>.map`: Ancestry file (if ancestries information is available).
161
162        where `K` is the total number of ancestries.
163        """
164        log.info(f"Preparing to write ADMIXTURE files with prefix '{self.file_prefix}'...")
165        
166        self.file_prefix.parent.mkdir(parents=True, exist_ok=True)
167        
168        self._write_Q()
169        self._write_P()
170        self._write_sample_ids()
171        self._write_snps()
172        self._write_ancestries()
173
174        log.info(f"Finished writing all ADMIXTURE files with prefix '{self.file_prefix}'.")

Write the data contained in the wideobj instance into the multiple ADMIXTURE files with the specified file_prefix. If the files already exist, they will be overwritten.

Output files:

  • <file_prefix>.K.Q: Q matrix file. The file uses space (' ') as the delimiter.
  • <file_prefix>.K.P: P matrix file. The file uses space (' ') as the delimiter.
  • <file_prefix>.sample_ids.txt: Sample IDs file (if sample IDs are available).
  • <file_prefix>.snp_ids.txt: SNP IDs file (if SNP IDs are available).
  • <file_prefix>.map: Ancestry file (if ancestries information is available).

where K is the total number of ancestries.

def read_lai( file: str | pathlib.Path, **kwargs) -> LocalAncestryObject:
 8def read_lai(file: Union[str, Path], **kwargs) -> LocalAncestryObject:
 9    """
10    Automatically detect the local ancestry data file format from the file's extension and 
11    read it into a `snputils.ancestry.genobj.LocalAncestryObject`.
12
13    **Supported formats:**
14
15    - `.msp`: Text-based MSP format.
16    - `.msp.tsv`: Text-based MSP format with TSV extension.
17    
18    Args:
19        file (str or pathlib.Path): 
20            Path to the file to be read. It should end with `.msp` or `.msp.tsv`.
21        **kwargs: Additional arguments passed to the reader method.
22    """
23    from snputils.ancestry.io.local.read.auto import LAIReader
24
25    return LAIReader(file).read(**kwargs)

Automatically detect the local ancestry data file format from the file's extension and read it into a snputils.ancestry.genobj.LocalAncestryObject.

Supported formats:

  • .msp: Text-based MSP format.
  • .msp.tsv: Text-based MSP format with TSV extension.
Arguments:
  • file (str or pathlib.Path): Path to the file to be read. It should end with .msp or .msp.tsv.
  • **kwargs: Additional arguments passed to the reader method.
def read_msp( file: str | pathlib.Path) -> LocalAncestryObject:
def read_msp(file: Union[str, Path]) -> 'LocalAncestryObject':
    """
    Load an `.msp` or `.msp.tsv` file as a `snputils.ancestry.genobj.LocalAncestryObject`.

    Args:
        file (str or pathlib.Path):
            Path to the file to be read. It should end with `.msp` or `.msp.tsv`.

    Returns:
        **LocalAncestryObject:**
            A LocalAncestryObject instance.
    """
    # Imported lazily inside the function, as in the other `read_*` helpers.
    from snputils.ancestry.io.local.read.msp import MSPReader

    reader = MSPReader(file)
    return reader.read()

Read data from an .msp or .msp.tsv file and construct a snputils.ancestry.genobj.LocalAncestryObject.

Arguments:
  • file (str or pathlib.Path): Path to the file to be read. It should end with .msp or .msp.tsv.
Returns:

LocalAncestryObject: A LocalAncestryObject instance.

def read_adm( Q_file: str | pathlib.Path, P_file: str | pathlib.Path | None = None, sample_file: str | pathlib.Path | None = None, snp_file: str | pathlib.Path | None = None, ancestry_file: str | pathlib.Path | None = None) -> GlobalAncestryObject:
 8def read_admixture(
 9    Q_file: Union[str, Path],
10    P_file: Optional[Union[str, Path]] = None,
11    sample_file: Optional[Union[str, Path]] = None,
12    snp_file: Optional[Union[str, Path]] = None,
13    ancestry_file: Optional[Union[str, Path]] = None,
14) -> 'GlobalAncestryObject':
15    """
16    Read ADMIXTURE files into a `snputils.ancestry.genobj.GlobalAncestryObject`.
17
18    Args:
19        Q_file (str or pathlib.Path):
20            Path to the file containing the Q matrix (per-sample ancestry proportions).
21            It should end with .Q or .txt.
22            The file should use space (' ') as the delimiter.
23        P_file (str or pathlib.Path, optional):
24            Path to the file containing the P/F matrix (per-ancestry SNP frequencies).
25            It should end with .P or .txt.
26            The file should use space (' ') as the delimiter. If None, P is not loaded.
27        sample_file (str or pathlib.Path, optional):
28            Path to the single-column file containing sample identifiers. 
29            It should end with .fam or .txt.
30            If None, sample identifiers are not loaded.
31        snp_file (str or pathlib.Path, optional):
32            Path to the single-column file containing SNP identifiers. 
33            It should end with .bim or .txt.
34            If None, SNP identifiers are not loaded.
35        ancestry_file (str or pathlib.Path, optional):
36            Path to the single-column file containing ancestry labels for each sample.
37            It should end with .map or .txt.
38            If None, ancestries are not loaded.
39
40    Returns:
41            **GlobalAncestryObject:** 
42                A GlobalAncestryObject instance.
43    """
44    from snputils.ancestry.io.wide.read.admixture import AdmixtureReader
45
46    return AdmixtureReader(
47        Q_file=Q_file,
48        P_file=P_file,
49        sample_file=sample_file,
50        snp_file=snp_file,
51        ancestry_file=ancestry_file
52    ).read()

Read ADMIXTURE files into a snputils.ancestry.genobj.GlobalAncestryObject.

Arguments:
  • Q_file (str or pathlib.Path): Path to the file containing the Q matrix (per-sample ancestry proportions). It should end with .Q or .txt. The file should use space (' ') as the delimiter.
  • P_file (str or pathlib.Path, optional): Path to the file containing the P/F matrix (per-ancestry SNP frequencies). It should end with .P or .txt. The file should use space (' ') as the delimiter. If None, P is not loaded.
  • sample_file (str or pathlib.Path, optional): Path to the single-column file containing sample identifiers. It should end with .fam or .txt. If None, sample identifiers are not loaded.
  • snp_file (str or pathlib.Path, optional): Path to the single-column file containing SNP identifiers. It should end with .bim or .txt. If None, SNP identifiers are not loaded.
  • ancestry_file (str or pathlib.Path, optional): Path to the single-column file containing ancestry labels for each sample. It should end with .map or .txt. If None, ancestries are not loaded.
Returns:

GlobalAncestryObject: A GlobalAncestryObject instance.

def read_admixture( Q_file: str | pathlib.Path, P_file: str | pathlib.Path | None = None, sample_file: str | pathlib.Path | None = None, snp_file: str | pathlib.Path | None = None, ancestry_file: str | pathlib.Path | None = None) -> GlobalAncestryObject:
 8def read_admixture(
 9    Q_file: Union[str, Path],
10    P_file: Optional[Union[str, Path]] = None,
11    sample_file: Optional[Union[str, Path]] = None,
12    snp_file: Optional[Union[str, Path]] = None,
13    ancestry_file: Optional[Union[str, Path]] = None,
14) -> 'GlobalAncestryObject':
15    """
16    Read ADMIXTURE files into a `snputils.ancestry.genobj.GlobalAncestryObject`.
17
18    Args:
19        Q_file (str or pathlib.Path):
20            Path to the file containing the Q matrix (per-sample ancestry proportions).
21            It should end with .Q or .txt.
22            The file should use space (' ') as the delimiter.
23        P_file (str or pathlib.Path, optional):
24            Path to the file containing the P/F matrix (per-ancestry SNP frequencies).
25            It should end with .P or .txt.
26            The file should use space (' ') as the delimiter. If None, P is not loaded.
27        sample_file (str or pathlib.Path, optional):
28            Path to the single-column file containing sample identifiers. 
29            It should end with .fam or .txt.
30            If None, sample identifiers are not loaded.
31        snp_file (str or pathlib.Path, optional):
32            Path to the single-column file containing SNP identifiers. 
33            It should end with .bim or .txt.
34            If None, SNP identifiers are not loaded.
35        ancestry_file (str or pathlib.Path, optional):
36            Path to the single-column file containing ancestry labels for each sample.
37            It should end with .map or .txt.
38            If None, ancestries are not loaded.
39
40    Returns:
41            **GlobalAncestryObject:** 
42                A GlobalAncestryObject instance.
43    """
44    from snputils.ancestry.io.wide.read.admixture import AdmixtureReader
45
46    return AdmixtureReader(
47        Q_file=Q_file,
48        P_file=P_file,
49        sample_file=sample_file,
50        snp_file=snp_file,
51        ancestry_file=ancestry_file
52    ).read()

Read ADMIXTURE files into a snputils.ancestry.genobj.GlobalAncestryObject.

Arguments:
  • Q_file (str or pathlib.Path): Path to the file containing the Q matrix (per-sample ancestry proportions). It should end with .Q or .txt. The file should use space (' ') as the delimiter.
  • P_file (str or pathlib.Path, optional): Path to the file containing the P/F matrix (per-ancestry SNP frequencies). It should end with .P or .txt. The file should use space (' ') as the delimiter. If None, P is not loaded.
  • sample_file (str or pathlib.Path, optional): Path to the single-column file containing sample identifiers. It should end with .fam or .txt. If None, sample identifiers are not loaded.
  • snp_file (str or pathlib.Path, optional): Path to the single-column file containing SNP identifiers. It should end with .bim or .txt. If None, SNP identifiers are not loaded.
  • ancestry_file (str or pathlib.Path, optional): Path to the single-column file containing ancestry labels for each sample. It should end with .map or .txt. If None, ancestries are not loaded.
Returns:

GlobalAncestryObject: A GlobalAncestryObject instance.

class IBDObject:
 12class IBDObject:
 13    """
 14    A class for Identity-By-Descent (IBD) segment data.
 15    """
 16
 17    def __init__(
 18        self,
 19        sample_id_1: np.ndarray,
 20        haplotype_id_1: np.ndarray,
 21        sample_id_2: np.ndarray,
 22        haplotype_id_2: np.ndarray,
 23        chrom: np.ndarray,
 24        start: np.ndarray,
 25        end: np.ndarray,
 26        length_cm: Optional[np.ndarray] = None,
 27        segment_type: Optional[np.ndarray] = None,
 28    ) -> None:
 29        """
 30        Args:
 31            sample_id_1 (array of shape (n_segments,)): Sample identifiers for the first individual.
 32            haplotype_id_1 (array of shape (n_segments,)): Haplotype identifiers for the first individual (values in {1, 2}, or -1 if unknown).
 33            sample_id_2 (array of shape (n_segments,)): Sample identifiers for the second individual.
 34            haplotype_id_2 (array of shape (n_segments,)): Haplotype identifiers for the second individual (values in {1, 2}, or -1 if unknown).
 35            chrom (array of shape (n_segments,)): Chromosome identifier for each IBD segment.
 36            start (array of shape (n_segments,)): Start physical position (1-based, bp) for each IBD segment.
 37            end (array of shape (n_segments,)): End physical position (1-based, bp) for each IBD segment.
 38            length_cm (array of shape (n_segments,), optional): Genetic length (cM) for each segment, if available.
 39        """
 40        # Store attributes
 41        self.__sample_id_1 = np.asarray(sample_id_1)
 42        self.__haplotype_id_1 = np.asarray(haplotype_id_1)
 43        self.__sample_id_2 = np.asarray(sample_id_2)
 44        self.__haplotype_id_2 = np.asarray(haplotype_id_2)
 45        self.__chrom = np.asarray(chrom)
 46        self.__start = np.asarray(start)
 47        self.__end = np.asarray(end)
 48        self.__length_cm = None if length_cm is None else np.asarray(length_cm)
 49        self.__segment_type = None if segment_type is None else np.asarray(segment_type)
 50
 51        self._sanity_check()
 52
 53    def __getitem__(self, key: str) -> Any:
 54        """
 55        To access an attribute of the class using the square bracket notation,
 56        similar to a dictionary.
 57        """
 58        try:
 59            return getattr(self, key)
 60        except Exception:
 61            raise KeyError(f"Invalid key: {key}.")
 62
 63    def __setitem__(self, key: str, value: Any) -> None:
 64        """
 65        To set an attribute of the class using the square bracket notation,
 66        similar to a dictionary.
 67        """
 68        try:
 69            setattr(self, key, value)
 70        except Exception:
 71            raise KeyError(f"Invalid key: {key}.")
 72
 73    @property
 74    def sample_id_1(self) -> np.ndarray:
 75        """
 76        Retrieve `sample_id_1`.
 77
 78        Returns:
 79            **array of shape (n_segments,):** Sample identifiers for the first individual.
 80        """
 81        return self.__sample_id_1
 82
 83    @sample_id_1.setter
 84    def sample_id_1(self, x: Sequence) -> None:
 85        """
 86        Update `sample_id_1`.
 87        """
 88        self.__sample_id_1 = np.asarray(x)
 89
 90    @property
 91    def haplotype_id_1(self) -> np.ndarray:
 92        """
 93        Retrieve `haplotype_id_1`.
 94
 95        Returns:
 96            **array of shape (n_segments,):** Haplotype identifiers for the first individual (values in {1, 2}).
 97        """
 98        return self.__haplotype_id_1
 99
100    @haplotype_id_1.setter
101    def haplotype_id_1(self, x: Sequence) -> None:
102        """
103        Update `haplotype_id_1`.
104        """
105        self.__haplotype_id_1 = np.asarray(x)
106
107    @property
108    def sample_id_2(self) -> np.ndarray:
109        """
110        Retrieve `sample_id_2`.
111
112        Returns:
113            **array of shape (n_segments,):** Sample identifiers for the second individual.
114        """
115        return self.__sample_id_2
116
117    @sample_id_2.setter
118    def sample_id_2(self, x: Sequence) -> None:
119        """
120        Update `sample_id_2`.
121        """
122        self.__sample_id_2 = np.asarray(x)
123
124    @property
125    def haplotype_id_2(self) -> np.ndarray:
126        """
127        Retrieve `haplotype_id_2`.
128
129        Returns:
130            **array of shape (n_segments,):** Haplotype identifiers for the second individual (values in {1, 2}).
131        """
132        return self.__haplotype_id_2
133
134    @haplotype_id_2.setter
135    def haplotype_id_2(self, x: Sequence) -> None:
136        """
137        Update `haplotype_id_2`.
138        """
139        self.__haplotype_id_2 = np.asarray(x)
140
141    @property
142    def chrom(self) -> np.ndarray:
143        """
144        Retrieve `chrom`.
145
146        Returns:
147            **array of shape (n_segments,):** Chromosome identifier for each IBD segment.
148        """
149        return self.__chrom
150
151    @chrom.setter
152    def chrom(self, x: Sequence) -> None:
153        """
154        Update `chrom`.
155        """
156        self.__chrom = np.asarray(x)
157
158    @property
159    def start(self) -> np.ndarray:
160        """
161        Retrieve `start`.
162
163        Returns:
164            **array of shape (n_segments,):** Start physical position (1-based, bp) for each IBD segment.
165        """
166        return self.__start
167
168    @start.setter
169    def start(self, x: Sequence) -> None:
170        """
171        Update `start`.
172        """
173        self.__start = np.asarray(x)
174
175    @property
176    def end(self) -> np.ndarray:
177        """
178        Retrieve `end`.
179
180        Returns:
181            **array of shape (n_segments,):** End physical position (1-based, bp) for each IBD segment.
182        """
183        return self.__end
184
185    @end.setter
186    def end(self, x: Sequence) -> None:
187        """
188        Update `end`.
189        """
190        self.__end = np.asarray(x)
191
192    @property
193    def length_cm(self) -> Optional[np.ndarray]:
194        """
195        Retrieve `length_cm`.
196
197        Returns:
198            **array of shape (n_segments,):** Genetic length (cM) for each segment if available; otherwise None.
199        """
200        return self.__length_cm
201
202    @length_cm.setter
203    def length_cm(self, x: Optional[Sequence]) -> None:
204        """
205        Update `length_cm`.
206        """
207        self.__length_cm = None if x is None else np.asarray(x)
208
209    @property
210    def segment_type(self) -> Optional[np.ndarray]:
211        """
212        Retrieve `segment_type`.
213
214        Returns:
215            **array of shape (n_segments,):** Segment type labels (e.g., 'IBD1', 'IBD2'), or None if unavailable.
216        """
217        return self.__segment_type
218
219    @segment_type.setter
220    def segment_type(self, x: Optional[Sequence]) -> None:
221        """
222        Update `segment_type`.
223        """
224        self.__segment_type = None if x is None else np.asarray(x)
225
226    @property
227    def n_segments(self) -> int:
228        """
229        Retrieve `n_segments`.
230
231        Returns:
232            **int:** The total number of IBD segments.
233        """
234        return self.__chrom.shape[0]
235
236    @property
237    def pairs(self) -> np.ndarray:
238        """
239        Retrieve `pairs`.
240
241        Returns:
242            **array of shape (n_segments, 2):** Per-segment sample identifier pairs.
243        """
244        return np.column_stack([self.__sample_id_1, self.__sample_id_2])
245
246    @property
247    def haplotype_pairs(self) -> np.ndarray:
248        """
249        Retrieve `haplotype_pairs`.
250
251        Returns:
252            **array of shape (n_segments, 2):** Per-segment haplotype identifier pairs.
253        """
254        return np.column_stack([self.__haplotype_id_1, self.__haplotype_id_2])
255
256    def copy(self) -> 'IBDObject':
257        """
258        Create and return a copy of `self`.
259
260        Returns:
261            **IBDObject:** A new instance of the current object.
262        """
263        return copy.deepcopy(self)
264
265    def keys(self) -> List[str]:
266        """
267        Retrieve a list of public attribute names for `self`.
268
269        Returns:
270            **list of str:** A list of attribute names, with internal name-mangling removed.
271        """
272        return [attr.replace('_IBDObject__', '') for attr in vars(self)]
273
274    def filter_segments(
275        self,
276        chrom: Optional[Sequence[str]] = None,
277        samples: Optional[Sequence[str]] = None,
278        min_length_cm: Optional[float] = None,
279        segment_types: Optional[Sequence[str]] = None,
280        inplace: bool = False,
281    ) -> Optional['IBDObject']:
282        """
283        Filter IBD segments by chromosome, sample names, and/or minimum genetic length.
284
285        Args:
286            chrom (sequence of str, optional): Chromosome(s) to include.
287            samples (sequence of str, optional): Sample names to include if present in either column.
288            min_length_cm (float, optional): Minimum cM length threshold.
289            inplace (bool, default=False): If True, modifies `self` in place. If False, returns a new `IBDObject`.
290
291        Returns:
292            **Optional[IBDObject]:** A filtered IBDObject if `inplace=False`. If `inplace=True`, returns None.
293        """
294        mask = np.ones(self.n_segments, dtype=bool)
295
296        if chrom is not None:
297            chrom = np.atleast_1d(chrom)
298            mask &= np.isin(self.__chrom, chrom)
299
300        if samples is not None:
301            samples = np.atleast_1d(samples)
302            mask &= np.isin(self.__sample_id_1, samples) | np.isin(self.__sample_id_2, samples)
303
304        if min_length_cm is not None and self.__length_cm is not None:
305            mask &= self.__length_cm >= float(min_length_cm)
306
307        if segment_types is not None and self.__segment_type is not None:
308            segment_types = np.atleast_1d(segment_types)
309            mask &= np.isin(self.__segment_type, segment_types)
310
311        def _apply_mask(x: Optional[np.ndarray]) -> Optional[np.ndarray]:
312            return None if x is None else np.asarray(x)[mask]
313
314        if inplace:
315            self.__sample_id_1 = _apply_mask(self.__sample_id_1)
316            self.__haplotype_id_1 = _apply_mask(self.__haplotype_id_1)
317            self.__sample_id_2 = _apply_mask(self.__sample_id_2)
318            self.__haplotype_id_2 = _apply_mask(self.__haplotype_id_2)
319            self.__chrom = _apply_mask(self.__chrom)
320            self.__start = _apply_mask(self.__start)
321            self.__end = _apply_mask(self.__end)
322            self.__length_cm = _apply_mask(self.__length_cm)
323            self.__segment_type = _apply_mask(self.__segment_type)
324            return None
325        else:
326            return IBDObject(
327                sample_id_1=_apply_mask(self.__sample_id_1),
328                haplotype_id_1=_apply_mask(self.__haplotype_id_1),
329                sample_id_2=_apply_mask(self.__sample_id_2),
330                haplotype_id_2=_apply_mask(self.__haplotype_id_2),
331                chrom=_apply_mask(self.__chrom),
332                start=_apply_mask(self.__start),
333                end=_apply_mask(self.__end),
334                length_cm=_apply_mask(self.__length_cm),
335                segment_type=_apply_mask(self.__segment_type),
336            )
337
338    def restrict_to_ancestry(
339        self,
340        *,
341        laiobj: Any,
342        ancestry: Any,
343        require_both_haplotypes: bool = False,
344        min_bp: Optional[int] = None,
345        min_cm: Optional[float] = None,
346        inplace: bool = False,
347        method: str = 'clip',
348    ) -> Optional['IBDObject']:
349        """
350        Filter and/or trim IBD segments to intervals where both individuals carry the specified ancestry
351        according to a `LocalAncestryObject`.
352
353        This performs an interval intersection per segment against ancestry tracts:
354        - If haplotype IDs are known (e.g., Hap-IBD), ancestry is checked on the specific
355          haplotype of each individual.
356        - If haplotype IDs are unknown (e.g., ancIBD; haplotype_id_* == -1), ancestry is
357          considered present for an individual if at least one of their haplotypes matches
358          the requested ancestry (unless `require_both_haplotypes=True`).
359
360        Method 'strict':
361            Drop entire IBD segments if ANY overlapping LAI window contains non-target ancestry
362            for either individual. No trimming occurs - segments are kept whole or dropped completely.
363
364        Method 'clip':
365            Trim IBD segments to contiguous regions where both individuals have the target ancestry.
366            Resulting subsegments are clipped to LAI window boundaries and original IBD start/end,
367            with optional length filtering by bp or cM.
368
369        Args:
370            laiobj: LocalAncestryObject containing 2D `lai` of shape (n_windows, n_haplotypes),
371                `physical_pos` (n_windows, 2), and `chromosomes` (n_windows,).
372            ancestry: Target ancestry code or label. Compared as string, so both int and str work.
373            require_both_haplotypes: If True, require both haplotypes of each individual to have
374                the target ancestry within a window. When haplotypes are known per segment, this
375                only affects cases with unknown haplotypes (== -1) or IBD2 segments.
376            min_bp: Minimum base-pair length to retain a segment (strict) or subsegment (clip).
377            min_cm: Minimum centiMorgan length to retain a segment (strict) or subsegment (clip).
378            inplace: If True, replace `self` with the restricted object; else return a new object.
379            method: Method to use for filtering. 'strict' drops entire segments that overlap with
380                non-target ancestry. 'clip' trims segments to target ancestry regions.
381
382        Returns:
383            Optional[IBDObject]: A restricted IBDObject if `inplace=False`. If `inplace=True`,
384                returns None.
385        """
386        if method not in ['strict', 'clip']:
387            raise ValueError(f"Method must be 'strict' or 'clip', got '{method}'")
388
389        # Basic LAI shape/metadata checks
390        lai = getattr(laiobj, 'lai', None)
391        physical_pos = getattr(laiobj, 'physical_pos', None)
392        chromosomes = getattr(laiobj, 'chromosomes', None)
393        centimorgan_pos = getattr(laiobj, 'centimorgan_pos', None)
394        haplotypes = getattr(laiobj, 'haplotypes', None)
395
396        if lai is None or physical_pos is None or chromosomes is None or haplotypes is None:
397            raise ValueError(
398                "`laiobj` must provide `lai`, `physical_pos`, `chromosomes`, and `haplotypes`."
399            )
400
401        if lai.ndim != 2:
402            raise ValueError("`laiobj.lai` must be 2D with shape (n_windows, n_haplotypes).")
403
404        # Build haplotype label -> column index map (labels like 'Sample.0', 'Sample.1')
405        hap_to_col = {str(h): i for i, h in enumerate(haplotypes)}
406
407        # Coerce ancestry to str for robust comparisons
408        anc_str = str(ancestry)
409
410        # Coerce LAI values to str once for comparisons
411        lai_str = lai.astype(str)
412
413        # Prepare arrays for the restricted segments
414        out_sample_id_1: List[str] = []
415        out_haplotype_id_1: List[int] = []
416        out_sample_id_2: List[str] = []
417        out_haplotype_id_2: List[int] = []
418        out_chrom: List[str] = []
419        out_start: List[int] = []
420        out_end: List[int] = []
421        out_length_cm: List[float] = []
422        out_segment_type: List[str] = [] if self.__segment_type is not None else None  # type: ignore
423
424        # Vectorize chrom compare by making LAI chromosome strings
425        chr_lai = np.asarray(chromosomes).astype(str)
426
427        # Helper to compute cM length for a trimmed interval using LAI windows
428        def _approx_cm_len(chr_mask: np.ndarray, start_bp: int, end_bp: int) -> Optional[float]:
429            if centimorgan_pos is None:
430                return None
431            win_st = physical_pos[chr_mask, 0]
432            win_en = physical_pos[chr_mask, 1]
433            win_cm_st = centimorgan_pos[chr_mask, 0]
434            win_cm_en = centimorgan_pos[chr_mask, 1]
435            cm_total = 0.0
436            for ws, we, cs, ce in zip(win_st, win_en, win_cm_st, win_cm_en):
437                # Overlap with [start_bp, end_bp]
438                overlap_start = max(int(ws), int(start_bp))
439                overlap_end = min(int(we), int(end_bp))
440                if overlap_start > overlap_end:
441                    continue
442                wlen_bp = max(1, int(we) - int(ws) + 1)
443                olen_bp = int(overlap_end) - int(overlap_start) + 1
444                frac = float(olen_bp) / float(wlen_bp)
445                cm_total += frac * float(ce - cs)
446            return cm_total
447
448        # Iterate over segments
449        for i in range(self.n_segments):
450            chrom = str(self.__chrom[i])
451            seg_start = int(self.__start[i])
452            seg_end = int(self.__end[i])
453            if seg_end < seg_start:
454                continue
455
456            # Subset LAI windows on this chromosome that overlap the segment
457            idx_chr = (chr_lai == chrom)
458            if not np.any(idx_chr):
459                continue
460            lai_st = physical_pos[idx_chr, 0]
461            lai_en = physical_pos[idx_chr, 1]
462            overlaps = (lai_en >= seg_start) & (lai_st <= seg_end)
463            if not np.any(overlaps):
464                continue
465
466            # Build per-window ancestry mask for both individuals
467            s1 = str(self.__sample_id_1[i])
468            s2 = str(self.__sample_id_2[i])
469            h1 = int(self.__haplotype_id_1[i]) if self.__haplotype_id_1 is not None else -1
470            h2 = int(self.__haplotype_id_2[i]) if self.__haplotype_id_2 is not None else -1
471
472            # Resolve haplotype column indices for each sample
473            # Known haplotypes are 1-based in inputs; convert to {0,1}
474            def _get_cols(sample: str) -> Tuple[int, int]:
475                a = hap_to_col.get(f"{sample}.0")
476                b = hap_to_col.get(f"{sample}.1")
477                if a is None or b is None:
478                    raise ValueError(f"Sample '{sample}' not found in LAI haplotypes.")
479                return a, b
480
481            s1_a, s1_b = _get_cols(s1)
482            s2_a, s2_b = _get_cols(s2)
483
484            # LAI rows for this chromosome
485            lai_rows = lai_str[idx_chr, :]
486
487            # Determine ancestry presence per window for each individual
488            if h1 in (1, 2) and h2 in (1, 2):
489                # Use specific haplotypes
490                s1_col = s1_a if (h1 - 1) == 0 else s1_b
491                s2_col = s2_a if (h2 - 1) == 0 else s2_b
492                s1_mask = (lai_rows[:, s1_col] == anc_str)
493                s2_mask = (lai_rows[:, s2_col] == anc_str)
494                if require_both_haplotypes:
495                    # Additionally require the other hap of each sample to match
496                    s1_other = s1_b if s1_col == s1_a else s1_a
497                    s2_other = s2_b if s2_col == s2_a else s2_a
498                    s1_mask = s1_mask & (lai_rows[:, s1_other] == anc_str)
499                    s2_mask = s2_mask & (lai_rows[:, s2_other] == anc_str)
500            else:
501                # Unknown hap IDs: require at least one hap to match (or both if requested)
502                if require_both_haplotypes:
503                    s1_mask = (lai_rows[:, s1_a] == anc_str) & (lai_rows[:, s1_b] == anc_str)
504                    s2_mask = (lai_rows[:, s2_a] == anc_str) & (lai_rows[:, s2_b] == anc_str)
505                else:
506                    s1_mask = (lai_rows[:, s1_a] == anc_str) | (lai_rows[:, s1_b] == anc_str)
507                    s2_mask = (lai_rows[:, s2_a] == anc_str) | (lai_rows[:, s2_b] == anc_str)
508
509            keep = overlaps & s1_mask & s2_mask
510
511            if method == 'strict':
512                # In strict mode, ALL overlapping windows must have target ancestry
513                if not np.array_equal(overlaps, keep):
514                    continue  # Drop entire segment
515
516                # Apply length filters to original segment
517                if min_bp is not None and (seg_end - seg_start + 1) < int(min_bp):
518                    continue
519
520                # In strict mode, preserve original length_cm
521                cm_len = float(self.__length_cm[i]) if self.__length_cm is not None else None
522
523                if min_cm is not None:
524                    if cm_len is None or cm_len < float(min_cm):
525                        continue
526
527                # Keep entire original segment
528                out_sample_id_1.append(s1)
529                out_sample_id_2.append(s2)
530                out_haplotype_id_1.append(h1)
531                out_haplotype_id_2.append(h2)
532                out_chrom.append(chrom)
533                out_start.append(seg_start)
534                out_end.append(seg_end)
535                out_length_cm.append(float(cm_len) if cm_len is not None else float('nan'))
536                if out_segment_type is not None:
537                    out_segment_type.append(str(self.__segment_type[i]))  # type: ignore
538
539            else:  # method == 'clip'
540                if not np.any(keep):
541                    continue
542
543                # Identify contiguous windows where keep=True
544                idx_keep = np.where(keep)[0]
545                # Split into runs of consecutive indices
546                breaks = np.where(np.diff(idx_keep) > 1)[0]
547                run_starts = np.r_[0, breaks + 1]
548                run_ends = np.r_[breaks, idx_keep.size - 1]
549
550                # Create subsegments for each contiguous run
551                for rs, re in zip(run_starts, run_ends):
552                    i0 = idx_keep[rs]
553                    i1 = idx_keep[re]
554                    sub_start = int(max(seg_start, int(lai_st[i0])))
555                    sub_end = int(min(seg_end, int(lai_en[i1])))
556                    if sub_end < sub_start:
557                        continue
558
559                    # Length filters: bp first
560                    if min_bp is not None and (sub_end - sub_start + 1) < int(min_bp):
561                        continue
562
563                    # Compute cM length if possible, else approximate or None
564                    cm_len = _approx_cm_len(idx_chr, sub_start, sub_end)
565                    if cm_len is None and self.__length_cm is not None:
566                        # Scale the original segment length by bp fraction
567                        total_bp = max(1, int(seg_end - seg_start + 1))
568                        frac_bp = float(sub_end - sub_start + 1) / float(total_bp)
569                        try:
570                            cm_len = float(self.__length_cm[i]) * frac_bp
571                        except Exception:
572                            cm_len = None
573
574                    # Apply cM filter if requested (treat None as 0)
575                    if min_cm is not None:
576                        if cm_len is None or cm_len < float(min_cm):
577                            continue
578
579                    # Append trimmed segment
580                    out_sample_id_1.append(s1)
581                    out_sample_id_2.append(s2)
582                    out_haplotype_id_1.append(h1)
583                    out_haplotype_id_2.append(h2)
584                    out_chrom.append(chrom)
585                    out_start.append(sub_start)
586                    out_end.append(sub_end)
587                    out_length_cm.append(float(cm_len) if cm_len is not None else float('nan'))
588                    if out_segment_type is not None:
589                        out_segment_type.append(str(self.__segment_type[i]))  # type: ignore
590
591        # If nothing remains, return empty object with zero segments
592        if len(out_start) == 0:
593            # Build minimal arrays
594            empty = IBDObject(
595                sample_id_1=np.array([], dtype=object),
596                haplotype_id_1=np.array([], dtype=int),
597                sample_id_2=np.array([], dtype=object),
598                haplotype_id_2=np.array([], dtype=int),
599                chrom=np.array([], dtype=object),
600                start=np.array([], dtype=int),
601                end=np.array([], dtype=int),
602                length_cm=None,
603                segment_type=None if out_segment_type is None else np.array([], dtype=object),
604            )
605            if inplace:
606                self.__sample_id_1 = empty.sample_id_1
607                self.__haplotype_id_1 = empty.haplotype_id_1
608                self.__sample_id_2 = empty.sample_id_2
609                self.__haplotype_id_2 = empty.haplotype_id_2
610                self.__chrom = empty.chrom
611                self.__start = empty.start
612                self.__end = empty.end
613                self.__length_cm = empty.length_cm
614                self.__segment_type = empty.segment_type
615                return None
616            return empty
617
618        # Assemble outputs
619        out_length_array: Optional[np.ndarray]
620        if len(out_length_cm) > 0:
621            # Convert NaNs to None-equivalent by using np.array with dtype float
622            out_length_array = np.asarray(out_length_cm, dtype=float)
623        else:
624            out_length_array = None
625
626        new_obj = IBDObject(
627            sample_id_1=np.asarray(out_sample_id_1, dtype=object),
628            haplotype_id_1=np.asarray(out_haplotype_id_1, dtype=int),
629            sample_id_2=np.asarray(out_sample_id_2, dtype=object),
630            haplotype_id_2=np.asarray(out_haplotype_id_2, dtype=int),
631            chrom=np.asarray(out_chrom, dtype=object),
632            start=np.asarray(out_start, dtype=int),
633            end=np.asarray(out_end, dtype=int),
634            length_cm=out_length_array,
635            segment_type=None if out_segment_type is None else np.asarray(out_segment_type, dtype=object),
636        )
637
638        if inplace:
639            self.__sample_id_1 = new_obj.sample_id_1
640            self.__haplotype_id_1 = new_obj.haplotype_id_1
641            self.__sample_id_2 = new_obj.sample_id_2
642            self.__haplotype_id_2 = new_obj.haplotype_id_2
643            self.__chrom = new_obj.chrom
644            self.__start = new_obj.start
645            self.__end = new_obj.end
646            self.__length_cm = new_obj.length_cm
647            self.__segment_type = new_obj.segment_type
648            return None
649        return new_obj
650
651    def _sanity_check(self) -> None:
652        """
653        Perform sanity checks on the parsed data to ensure data integrity.
654        """
655        n = self.__chrom.shape[0]
656        arrays = [
657            self.__sample_id_1,
658            self.__haplotype_id_1,
659            self.__sample_id_2,
660            self.__haplotype_id_2,
661            self.__start,
662            self.__end,
663        ]
664        if any(arr.shape[0] != n for arr in arrays):
665            raise ValueError("All input arrays must have the same length.")
666
667        if self.__length_cm is not None and self.__length_cm.shape[0] != n:
668            raise ValueError("`length_cm` must have the same length as other arrays.")
669
670        if self.__segment_type is not None and self.__segment_type.shape[0] != n:
671            raise ValueError("`segment_type` must have the same length as other arrays.")
672
673        # Validate haplotype identifiers are 1 or 2, or -1 when unknown
674        valid_values = np.array([1, 2, -1])
675        if not np.isin(self.__haplotype_id_1, valid_values).all() or not np.isin(self.__haplotype_id_2, valid_values).all():
676            raise ValueError("Haplotype identifiers must be in {1, 2} or -1 if unknown.")

A class for Identity-By-Descent (IBD) segment data.

IBDObject( sample_id_1: numpy.ndarray, haplotype_id_1: numpy.ndarray, sample_id_2: numpy.ndarray, haplotype_id_2: numpy.ndarray, chrom: numpy.ndarray, start: numpy.ndarray, end: numpy.ndarray, length_cm: numpy.ndarray | None = None, segment_type: numpy.ndarray | None = None)
17    def __init__(
18        self,
19        sample_id_1: np.ndarray,
20        haplotype_id_1: np.ndarray,
21        sample_id_2: np.ndarray,
22        haplotype_id_2: np.ndarray,
23        chrom: np.ndarray,
24        start: np.ndarray,
25        end: np.ndarray,
26        length_cm: Optional[np.ndarray] = None,
27        segment_type: Optional[np.ndarray] = None,
28    ) -> None:
29        """
30        Args:
31            sample_id_1 (array of shape (n_segments,)): Sample identifiers for the first individual.
32            haplotype_id_1 (array of shape (n_segments,)): Haplotype identifiers for the first individual (values in {1, 2}, or -1 if unknown).
33            sample_id_2 (array of shape (n_segments,)): Sample identifiers for the second individual.
34            haplotype_id_2 (array of shape (n_segments,)): Haplotype identifiers for the second individual (values in {1, 2}, or -1 if unknown).
35            chrom (array of shape (n_segments,)): Chromosome identifier for each IBD segment.
36            start (array of shape (n_segments,)): Start physical position (1-based, bp) for each IBD segment.
37            end (array of shape (n_segments,)): End physical position (1-based, bp) for each IBD segment.
38            length_cm (array of shape (n_segments,), optional): Genetic length (cM) for each segment, if available.
39        """
40        # Store attributes
41        self.__sample_id_1 = np.asarray(sample_id_1)
42        self.__haplotype_id_1 = np.asarray(haplotype_id_1)
43        self.__sample_id_2 = np.asarray(sample_id_2)
44        self.__haplotype_id_2 = np.asarray(haplotype_id_2)
45        self.__chrom = np.asarray(chrom)
46        self.__start = np.asarray(start)
47        self.__end = np.asarray(end)
48        self.__length_cm = None if length_cm is None else np.asarray(length_cm)
49        self.__segment_type = None if segment_type is None else np.asarray(segment_type)
50
51        self._sanity_check()
Arguments:
  • sample_id_1 (array of shape (n_segments,)): Sample identifiers for the first individual.
  • haplotype_id_1 (array of shape (n_segments,)): Haplotype identifiers for the first individual (values in {1, 2}, or -1 if unknown).
  • sample_id_2 (array of shape (n_segments,)): Sample identifiers for the second individual.
  • haplotype_id_2 (array of shape (n_segments,)): Haplotype identifiers for the second individual (values in {1, 2}, or -1 if unknown).
  • chrom (array of shape (n_segments,)): Chromosome identifier for each IBD segment.
  • start (array of shape (n_segments,)): Start physical position (1-based, bp) for each IBD segment.
  • end (array of shape (n_segments,)): End physical position (1-based, bp) for each IBD segment.
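  • length_cm (array of shape (n_segments,), optional): Genetic length (cM) for each segment, if available.
  • segment_type (array of shape (n_segments,), optional): Segment type label (e.g., 'IBD1', 'IBD2') for each segment, if available.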
sample_id_1: numpy.ndarray
73    @property
74    def sample_id_1(self) -> np.ndarray:
75        """
76        Retrieve `sample_id_1`.
77
78        Returns:
79            **array of shape (n_segments,):** Sample identifiers for the first individual.
80        """
81        return self.__sample_id_1

Retrieve sample_id_1.

Returns:

array of shape (n_segments,): Sample identifiers for the first individual.

haplotype_id_1: numpy.ndarray
90    @property
91    def haplotype_id_1(self) -> np.ndarray:
92        """
93        Retrieve `haplotype_id_1`.
94
95        Returns:
96            **array of shape (n_segments,):** Haplotype identifiers for the first individual (values in {1, 2}).
97        """
98        return self.__haplotype_id_1

Retrieve haplotype_id_1.

Returns:

array of shape (n_segments,): Haplotype identifiers for the first individual (values in {1, 2}, or -1 if unknown).

sample_id_2: numpy.ndarray
107    @property
108    def sample_id_2(self) -> np.ndarray:
109        """
110        Retrieve `sample_id_2`.
111
112        Returns:
113            **array of shape (n_segments,):** Sample identifiers for the second individual.
114        """
115        return self.__sample_id_2

Retrieve sample_id_2.

Returns:

array of shape (n_segments,): Sample identifiers for the second individual.

haplotype_id_2: numpy.ndarray
124    @property
125    def haplotype_id_2(self) -> np.ndarray:
126        """
127        Retrieve `haplotype_id_2`.
128
129        Returns:
130            **array of shape (n_segments,):** Haplotype identifiers for the second individual (values in {1, 2}).
131        """
132        return self.__haplotype_id_2

Retrieve haplotype_id_2.

Returns:

array of shape (n_segments,): Haplotype identifiers for the second individual (values in {1, 2}, or -1 if unknown).

chrom: numpy.ndarray
141    @property
142    def chrom(self) -> np.ndarray:
143        """
144        Retrieve `chrom`.
145
146        Returns:
147            **array of shape (n_segments,):** Chromosome identifier for each IBD segment.
148        """
149        return self.__chrom

Retrieve chrom.

Returns:

array of shape (n_segments,): Chromosome identifier for each IBD segment.

start: numpy.ndarray
158    @property
159    def start(self) -> np.ndarray:
160        """
161        Retrieve `start`.
162
163        Returns:
164            **array of shape (n_segments,):** Start physical position (1-based, bp) for each IBD segment.
165        """
166        return self.__start

Retrieve start.

Returns:

array of shape (n_segments,): Start physical position (1-based, bp) for each IBD segment.

end: numpy.ndarray
175    @property
176    def end(self) -> np.ndarray:
177        """
178        Retrieve `end`.
179
180        Returns:
181            **array of shape (n_segments,):** End physical position (1-based, bp) for each IBD segment.
182        """
183        return self.__end

Retrieve end.

Returns:

array of shape (n_segments,): End physical position (1-based, bp) for each IBD segment.

length_cm: numpy.ndarray | None
192    @property
193    def length_cm(self) -> Optional[np.ndarray]:
194        """
195        Retrieve `length_cm`.
196
197        Returns:
198            **array of shape (n_segments,):** Genetic length (cM) for each segment if available; otherwise None.
199        """
200        return self.__length_cm

Retrieve length_cm.

Returns:

array of shape (n_segments,): Genetic length (cM) for each segment if available; otherwise None.

segment_type: numpy.ndarray | None
209    @property
210    def segment_type(self) -> Optional[np.ndarray]:
211        """
212        Retrieve `segment_type`.
213
214        Returns:
215            **array of shape (n_segments,):** Segment type labels (e.g., 'IBD1', 'IBD2'), or None if unavailable.
216        """
217        return self.__segment_type

Retrieve segment_type.

Returns:

array of shape (n_segments,): Segment type labels (e.g., 'IBD1', 'IBD2'), or None if unavailable.

n_segments: int
226    @property
227    def n_segments(self) -> int:
228        """
229        Retrieve `n_segments`.
230
231        Returns:
232            **int:** The total number of IBD segments.
233        """
234        return self.__chrom.shape[0]

Retrieve n_segments.

Returns:

int: The total number of IBD segments.

pairs: numpy.ndarray
236    @property
237    def pairs(self) -> np.ndarray:
238        """
239        Retrieve `pairs`.
240
241        Returns:
242            **array of shape (n_segments, 2):** Per-segment sample identifier pairs.
243        """
244        return np.column_stack([self.__sample_id_1, self.__sample_id_2])

Retrieve pairs.

Returns:

array of shape (n_segments, 2): Per-segment sample identifier pairs.

haplotype_pairs: numpy.ndarray
246    @property
247    def haplotype_pairs(self) -> np.ndarray:
248        """
249        Retrieve `haplotype_pairs`.
250
251        Returns:
252            **array of shape (n_segments, 2):** Per-segment haplotype identifier pairs.
253        """
254        return np.column_stack([self.__haplotype_id_1, self.__haplotype_id_2])

Retrieve haplotype_pairs.

Returns:

array of shape (n_segments, 2): Per-segment haplotype identifier pairs.

def copy(self) -> IBDObject:
256    def copy(self) -> 'IBDObject':
257        """
258        Create and return a copy of `self`.
259
260        Returns:
261            **IBDObject:** A new instance of the current object.
262        """
263        return copy.deepcopy(self)

Create and return a copy of self.

Returns:

IBDObject: A new instance of the current object.

def keys(self) -> List[str]:
265    def keys(self) -> List[str]:
266        """
267        Retrieve a list of public attribute names for `self`.
268
269        Returns:
270            **list of str:** A list of attribute names, with internal name-mangling removed.
271        """
272        return [attr.replace('_IBDObject__', '') for attr in vars(self)]

Retrieve a list of public attribute names for self.

Returns:

list of str: A list of attribute names, with internal name-mangling removed.

def filter_segments( self, chrom: Sequence[str] | None = None, samples: Sequence[str] | None = None, min_length_cm: float | None = None, segment_types: Sequence[str] | None = None, inplace: bool = False) -> IBDObject | None:
274    def filter_segments(
275        self,
276        chrom: Optional[Sequence[str]] = None,
277        samples: Optional[Sequence[str]] = None,
278        min_length_cm: Optional[float] = None,
279        segment_types: Optional[Sequence[str]] = None,
280        inplace: bool = False,
281    ) -> Optional['IBDObject']:
282        """
283        Filter IBD segments by chromosome, sample names, and/or minimum genetic length.
284
285        Args:
286            chrom (sequence of str, optional): Chromosome(s) to include.
287            samples (sequence of str, optional): Sample names to include if present in either column.
288            min_length_cm (float, optional): Minimum cM length threshold.
289            inplace (bool, default=False): If True, modifies `self` in place. If False, returns a new `IBDObject`.
290
291        Returns:
292            **Optional[IBDObject]:** A filtered IBDObject if `inplace=False`. If `inplace=True`, returns None.
293        """
294        mask = np.ones(self.n_segments, dtype=bool)
295
296        if chrom is not None:
297            chrom = np.atleast_1d(chrom)
298            mask &= np.isin(self.__chrom, chrom)
299
300        if samples is not None:
301            samples = np.atleast_1d(samples)
302            mask &= np.isin(self.__sample_id_1, samples) | np.isin(self.__sample_id_2, samples)
303
304        if min_length_cm is not None and self.__length_cm is not None:
305            mask &= self.__length_cm >= float(min_length_cm)
306
307        if segment_types is not None and self.__segment_type is not None:
308            segment_types = np.atleast_1d(segment_types)
309            mask &= np.isin(self.__segment_type, segment_types)
310
311        def _apply_mask(x: Optional[np.ndarray]) -> Optional[np.ndarray]:
312            return None if x is None else np.asarray(x)[mask]
313
314        if inplace:
315            self.__sample_id_1 = _apply_mask(self.__sample_id_1)
316            self.__haplotype_id_1 = _apply_mask(self.__haplotype_id_1)
317            self.__sample_id_2 = _apply_mask(self.__sample_id_2)
318            self.__haplotype_id_2 = _apply_mask(self.__haplotype_id_2)
319            self.__chrom = _apply_mask(self.__chrom)
320            self.__start = _apply_mask(self.__start)
321            self.__end = _apply_mask(self.__end)
322            self.__length_cm = _apply_mask(self.__length_cm)
323            self.__segment_type = _apply_mask(self.__segment_type)
324            return None
325        else:
326            return IBDObject(
327                sample_id_1=_apply_mask(self.__sample_id_1),
328                haplotype_id_1=_apply_mask(self.__haplotype_id_1),
329                sample_id_2=_apply_mask(self.__sample_id_2),
330                haplotype_id_2=_apply_mask(self.__haplotype_id_2),
331                chrom=_apply_mask(self.__chrom),
332                start=_apply_mask(self.__start),
333                end=_apply_mask(self.__end),
334                length_cm=_apply_mask(self.__length_cm),
335                segment_type=_apply_mask(self.__segment_type),
336            )

Filter IBD segments by chromosome, sample names, minimum genetic length, and/or segment type.

Arguments:
  • chrom (sequence of str, optional): Chromosome(s) to include.
  • samples (sequence of str, optional): Sample names to include if present in either column.
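  • min_length_cm (float, optional): Minimum cM length threshold.
  • segment_types (sequence of str, optional): Segment type label(s) to include (e.g., 'IBD1', 'IBD2').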
  • inplace (bool, default=False): If True, modifies self in place. If False, returns a new IBDObject.
Returns:

Optional[IBDObject]: A filtered IBDObject if inplace=False. If inplace=True, returns None.

def restrict_to_ancestry( self, *, laiobj: Any, ancestry: Any, require_both_haplotypes: bool = False, min_bp: int | None = None, min_cm: float | None = None, inplace: bool = False, method: str = 'clip') -> IBDObject | None:
338    def restrict_to_ancestry(
339        self,
340        *,
341        laiobj: Any,
342        ancestry: Any,
343        require_both_haplotypes: bool = False,
344        min_bp: Optional[int] = None,
345        min_cm: Optional[float] = None,
346        inplace: bool = False,
347        method: str = 'clip',
348    ) -> Optional['IBDObject']:
349        """
350        Filter and/or trim IBD segments to intervals where both individuals carry the specified ancestry
351        according to a `LocalAncestryObject`.
352
353        This performs an interval intersection per segment against ancestry tracts:
354        - If haplotype IDs are known (e.g., Hap-IBD), ancestry is checked on the specific
355          haplotype of each individual.
356        - If haplotype IDs are unknown (e.g., ancIBD; haplotype_id_* == -1), ancestry is
357          considered present for an individual if at least one of their haplotypes matches
358          the requested ancestry (unless `require_both_haplotypes=True`).
359
360        Method 'strict':
361            Drop entire IBD segments if ANY overlapping LAI window contains non-target ancestry
362            for either individual. No trimming occurs - segments are kept whole or dropped completely.
363
364        Method 'clip':
365            Trim IBD segments to contiguous regions where both individuals have the target ancestry.
366            Resulting subsegments are clipped to LAI window boundaries and original IBD start/end,
367            with optional length filtering by bp or cM.
368
369        Args:
370            laiobj: LocalAncestryObject containing 2D `lai` of shape (n_windows, n_haplotypes),
371                `physical_pos` (n_windows, 2), and `chromosomes` (n_windows,).
372            ancestry: Target ancestry code or label. Compared as string, so both int and str work.
373            require_both_haplotypes: If True, require both haplotypes of each individual to have
374                the target ancestry within a window. When haplotypes are known per segment, this
375                only affects cases with unknown haplotypes (== -1) or IBD2 segments.
376            min_bp: Minimum base-pair length to retain a segment (strict) or subsegment (clip).
377            min_cm: Minimum centiMorgan length to retain a segment (strict) or subsegment (clip).
378            inplace: If True, replace `self` with the restricted object; else return a new object.
379            method: Method to use for filtering. 'strict' drops entire segments that overlap with
380                non-target ancestry. 'clip' trims segments to target ancestry regions.
381
382        Returns:
383            Optional[IBDObject]: A restricted IBDObject if `inplace=False`. If `inplace=True`,
384                returns None.
385        """
386        if method not in ['strict', 'clip']:
387            raise ValueError(f"Method must be 'strict' or 'clip', got '{method}'")
388
389        # Basic LAI shape/metadata checks
390        lai = getattr(laiobj, 'lai', None)
391        physical_pos = getattr(laiobj, 'physical_pos', None)
392        chromosomes = getattr(laiobj, 'chromosomes', None)
393        centimorgan_pos = getattr(laiobj, 'centimorgan_pos', None)
394        haplotypes = getattr(laiobj, 'haplotypes', None)
395
396        if lai is None or physical_pos is None or chromosomes is None or haplotypes is None:
397            raise ValueError(
398                "`laiobj` must provide `lai`, `physical_pos`, `chromosomes`, and `haplotypes`."
399            )
400
401        if lai.ndim != 2:
402            raise ValueError("`laiobj.lai` must be 2D with shape (n_windows, n_haplotypes).")
403
404        # Build haplotype label -> column index map (labels like 'Sample.0', 'Sample.1')
405        hap_to_col = {str(h): i for i, h in enumerate(haplotypes)}
406
407        # Coerce ancestry to str for robust comparisons
408        anc_str = str(ancestry)
409
410        # Coerce LAI values to str once for comparisons
411        lai_str = lai.astype(str)
412
413        # Prepare arrays for the restricted segments
414        out_sample_id_1: List[str] = []
415        out_haplotype_id_1: List[int] = []
416        out_sample_id_2: List[str] = []
417        out_haplotype_id_2: List[int] = []
418        out_chrom: List[str] = []
419        out_start: List[int] = []
420        out_end: List[int] = []
421        out_length_cm: List[float] = []
422        out_segment_type: List[str] = [] if self.__segment_type is not None else None  # type: ignore
423
424        # Vectorize chrom compare by making LAI chromosome strings
425        chr_lai = np.asarray(chromosomes).astype(str)
426
427        # Helper to compute cM length for a trimmed interval using LAI windows
428        def _approx_cm_len(chr_mask: np.ndarray, start_bp: int, end_bp: int) -> Optional[float]:
429            if centimorgan_pos is None:
430                return None
431            win_st = physical_pos[chr_mask, 0]
432            win_en = physical_pos[chr_mask, 1]
433            win_cm_st = centimorgan_pos[chr_mask, 0]
434            win_cm_en = centimorgan_pos[chr_mask, 1]
435            cm_total = 0.0
436            for ws, we, cs, ce in zip(win_st, win_en, win_cm_st, win_cm_en):
437                # Overlap with [start_bp, end_bp]
438                overlap_start = max(int(ws), int(start_bp))
439                overlap_end = min(int(we), int(end_bp))
440                if overlap_start > overlap_end:
441                    continue
442                wlen_bp = max(1, int(we) - int(ws) + 1)
443                olen_bp = int(overlap_end) - int(overlap_start) + 1
444                frac = float(olen_bp) / float(wlen_bp)
445                cm_total += frac * float(ce - cs)
446            return cm_total
447
448        # Iterate over segments
449        for i in range(self.n_segments):
450            chrom = str(self.__chrom[i])
451            seg_start = int(self.__start[i])
452            seg_end = int(self.__end[i])
453            if seg_end < seg_start:
454                continue
455
456            # Subset LAI windows on this chromosome that overlap the segment
457            idx_chr = (chr_lai == chrom)
458            if not np.any(idx_chr):
459                continue
460            lai_st = physical_pos[idx_chr, 0]
461            lai_en = physical_pos[idx_chr, 1]
462            overlaps = (lai_en >= seg_start) & (lai_st <= seg_end)
463            if not np.any(overlaps):
464                continue
465
466            # Build per-window ancestry mask for both individuals
467            s1 = str(self.__sample_id_1[i])
468            s2 = str(self.__sample_id_2[i])
469            h1 = int(self.__haplotype_id_1[i]) if self.__haplotype_id_1 is not None else -1
470            h2 = int(self.__haplotype_id_2[i]) if self.__haplotype_id_2 is not None else -1
471
472            # Resolve haplotype column indices for each sample
473            # Known haplotypes are 1-based in inputs; convert to {0,1}
474            def _get_cols(sample: str) -> Tuple[int, int]:
475                a = hap_to_col.get(f"{sample}.0")
476                b = hap_to_col.get(f"{sample}.1")
477                if a is None or b is None:
478                    raise ValueError(f"Sample '{sample}' not found in LAI haplotypes.")
479                return a, b
480
481            s1_a, s1_b = _get_cols(s1)
482            s2_a, s2_b = _get_cols(s2)
483
484            # LAI rows for this chromosome
485            lai_rows = lai_str[idx_chr, :]
486
487            # Determine ancestry presence per window for each individual
488            if h1 in (1, 2) and h2 in (1, 2):
489                # Use specific haplotypes
490                s1_col = s1_a if (h1 - 1) == 0 else s1_b
491                s2_col = s2_a if (h2 - 1) == 0 else s2_b
492                s1_mask = (lai_rows[:, s1_col] == anc_str)
493                s2_mask = (lai_rows[:, s2_col] == anc_str)
494                if require_both_haplotypes:
495                    # Additionally require the other hap of each sample to match
496                    s1_other = s1_b if s1_col == s1_a else s1_a
497                    s2_other = s2_b if s2_col == s2_a else s2_a
498                    s1_mask = s1_mask & (lai_rows[:, s1_other] == anc_str)
499                    s2_mask = s2_mask & (lai_rows[:, s2_other] == anc_str)
500            else:
501                # Unknown hap IDs: require at least one hap to match (or both if requested)
502                if require_both_haplotypes:
503                    s1_mask = (lai_rows[:, s1_a] == anc_str) & (lai_rows[:, s1_b] == anc_str)
504                    s2_mask = (lai_rows[:, s2_a] == anc_str) & (lai_rows[:, s2_b] == anc_str)
505                else:
506                    s1_mask = (lai_rows[:, s1_a] == anc_str) | (lai_rows[:, s1_b] == anc_str)
507                    s2_mask = (lai_rows[:, s2_a] == anc_str) | (lai_rows[:, s2_b] == anc_str)
508
509            keep = overlaps & s1_mask & s2_mask
510
511            if method == 'strict':
512                # In strict mode, ALL overlapping windows must have target ancestry
513                if not np.array_equal(overlaps, keep):
514                    continue  # Drop entire segment
515
516                # Apply length filters to original segment
517                if min_bp is not None and (seg_end - seg_start + 1) < int(min_bp):
518                    continue
519
520                # In strict mode, preserve original length_cm
521                cm_len = float(self.__length_cm[i]) if self.__length_cm is not None else None
522
523                if min_cm is not None:
524                    if cm_len is None or cm_len < float(min_cm):
525                        continue
526
527                # Keep entire original segment
528                out_sample_id_1.append(s1)
529                out_sample_id_2.append(s2)
530                out_haplotype_id_1.append(h1)
531                out_haplotype_id_2.append(h2)
532                out_chrom.append(chrom)
533                out_start.append(seg_start)
534                out_end.append(seg_end)
535                out_length_cm.append(float(cm_len) if cm_len is not None else float('nan'))
536                if out_segment_type is not None:
537                    out_segment_type.append(str(self.__segment_type[i]))  # type: ignore
538
539            else:  # method == 'clip'
540                if not np.any(keep):
541                    continue
542
543                # Identify contiguous windows where keep=True
544                idx_keep = np.where(keep)[0]
545                # Split into runs of consecutive indices
546                breaks = np.where(np.diff(idx_keep) > 1)[0]
547                run_starts = np.r_[0, breaks + 1]
548                run_ends = np.r_[breaks, idx_keep.size - 1]
549
550                # Create subsegments for each contiguous run
551                for rs, re in zip(run_starts, run_ends):
552                    i0 = idx_keep[rs]
553                    i1 = idx_keep[re]
554                    sub_start = int(max(seg_start, int(lai_st[i0])))
555                    sub_end = int(min(seg_end, int(lai_en[i1])))
556                    if sub_end < sub_start:
557                        continue
558
559                    # Length filters: bp first
560                    if min_bp is not None and (sub_end - sub_start + 1) < int(min_bp):
561                        continue
562
563                    # Compute cM length if possible, else approximate or None
564                    cm_len = _approx_cm_len(idx_chr, sub_start, sub_end)
565                    if cm_len is None and self.__length_cm is not None:
566                        # Scale the original segment length by bp fraction
567                        total_bp = max(1, int(seg_end - seg_start + 1))
568                        frac_bp = float(sub_end - sub_start + 1) / float(total_bp)
569                        try:
570                            cm_len = float(self.__length_cm[i]) * frac_bp
571                        except Exception:
572                            cm_len = None
573
574                    # Apply cM filter if requested (treat None as 0)
575                    if min_cm is not None:
576                        if cm_len is None or cm_len < float(min_cm):
577                            continue
578
579                    # Append trimmed segment
580                    out_sample_id_1.append(s1)
581                    out_sample_id_2.append(s2)
582                    out_haplotype_id_1.append(h1)
583                    out_haplotype_id_2.append(h2)
584                    out_chrom.append(chrom)
585                    out_start.append(sub_start)
586                    out_end.append(sub_end)
587                    out_length_cm.append(float(cm_len) if cm_len is not None else float('nan'))
588                    if out_segment_type is not None:
589                        out_segment_type.append(str(self.__segment_type[i]))  # type: ignore
590
591        # If nothing remains, return empty object with zero segments
592        if len(out_start) == 0:
593            # Build minimal arrays
594            empty = IBDObject(
595                sample_id_1=np.array([], dtype=object),
596                haplotype_id_1=np.array([], dtype=int),
597                sample_id_2=np.array([], dtype=object),
598                haplotype_id_2=np.array([], dtype=int),
599                chrom=np.array([], dtype=object),
600                start=np.array([], dtype=int),
601                end=np.array([], dtype=int),
602                length_cm=None,
603                segment_type=None if out_segment_type is None else np.array([], dtype=object),
604            )
605            if inplace:
606                self.__sample_id_1 = empty.sample_id_1
607                self.__haplotype_id_1 = empty.haplotype_id_1
608                self.__sample_id_2 = empty.sample_id_2
609                self.__haplotype_id_2 = empty.haplotype_id_2
610                self.__chrom = empty.chrom
611                self.__start = empty.start
612                self.__end = empty.end
613                self.__length_cm = empty.length_cm
614                self.__segment_type = empty.segment_type
615                return None
616            return empty
617
618        # Assemble outputs
619        out_length_array: Optional[np.ndarray]
620        if len(out_length_cm) > 0:
621            # Convert NaNs to None-equivalent by using np.array with dtype float
622            out_length_array = np.asarray(out_length_cm, dtype=float)
623        else:
624            out_length_array = None
625
626        new_obj = IBDObject(
627            sample_id_1=np.asarray(out_sample_id_1, dtype=object),
628            haplotype_id_1=np.asarray(out_haplotype_id_1, dtype=int),
629            sample_id_2=np.asarray(out_sample_id_2, dtype=object),
630            haplotype_id_2=np.asarray(out_haplotype_id_2, dtype=int),
631            chrom=np.asarray(out_chrom, dtype=object),
632            start=np.asarray(out_start, dtype=int),
633            end=np.asarray(out_end, dtype=int),
634            length_cm=out_length_array,
635            segment_type=None if out_segment_type is None else np.asarray(out_segment_type, dtype=object),
636        )
637
638        if inplace:
639            self.__sample_id_1 = new_obj.sample_id_1
640            self.__haplotype_id_1 = new_obj.haplotype_id_1
641            self.__sample_id_2 = new_obj.sample_id_2
642            self.__haplotype_id_2 = new_obj.haplotype_id_2
643            self.__chrom = new_obj.chrom
644            self.__start = new_obj.start
645            self.__end = new_obj.end
646            self.__length_cm = new_obj.length_cm
647            self.__segment_type = new_obj.segment_type
648            return None
649        return new_obj

Filter and/or trim IBD segments to intervals where both individuals carry the specified ancestry according to a LocalAncestryObject.

This performs an interval intersection per segment against ancestry tracts:

  • If haplotype IDs are known (e.g., Hap-IBD), ancestry is checked on the specific haplotype of each individual.
  • If haplotype IDs are unknown (e.g., ancIBD; haplotype_id_* == -1), ancestry is considered present for an individual if at least one of their haplotypes matches the requested ancestry (unless require_both_haplotypes=True).

Method 'strict': Drop entire IBD segments if ANY overlapping LAI window contains non-target ancestry for either individual. No trimming occurs - segments are kept whole or dropped completely.

Method 'clip': Trim IBD segments to contiguous regions where both individuals have the target ancestry. Resulting subsegments are clipped to LAI window boundaries and original IBD start/end, with optional length filtering by bp or cM.

Arguments:
  • laiobj: LocalAncestryObject containing 2D lai of shape (n_windows, n_haplotypes), physical_pos (n_windows, 2), and chromosomes (n_windows,).
  • ancestry: Target ancestry code or label. Compared as string, so both int and str work.
  • require_both_haplotypes: If True, require both haplotypes of each individual to have the target ancestry within a window. This applies whether or not the haplotype IDs of the segment are known.
  • min_bp: Minimum base-pair length to retain a segment (strict) or subsegment (clip).
  • min_cm: Minimum centiMorgan length to retain a segment (strict) or subsegment (clip).
  • inplace: If True, replace self with the restricted object; else return a new object.
  • method: Method to use for filtering. 'strict' drops entire segments that overlap with non-target ancestry. 'clip' trims segments to target ancestry regions.
Returns:

Optional[IBDObject]: A restricted IBDObject if inplace=False. If inplace=True, returns None.

def read_ibd( file: str | pathlib.Path, **kwargs) -> IBDObject:
 8def read_ibd(file: Union[str, Path], **kwargs) -> IBDObject:
 9    """
10    Automatically detect the IBD data file format from the file's extension and read it into an `IBDObject`.
11
12    Supported formats:
13    - Hap-IBD (no standard extension; defaults to tab-delimited columns without header).
14    - ancIBD (template only).
15
16    Args:
17        file (str or pathlib.Path): Path to the file to be read.
18        **kwargs: Additional arguments passed to the reader method.
19    """
20    from snputils.ibd.io.read.auto import IBDReader
21
22    return IBDReader(file).read(**kwargs)

Automatically detect the IBD data file format from the file's extension and read it into an IBDObject.

Supported formats:

  • Hap-IBD (no standard extension; defaults to tab-delimited columns without header).
  • ancIBD (template only).
Arguments:
  • file (str or pathlib.Path): Path to the file to be read.
  • **kwargs: Additional arguments passed to the reader method.
class HapIBDReader(snputils.ibd.io.read.base.IBDBaseReader):
 18class HapIBDReader(IBDBaseReader):
 19    """
 20    Reads an IBD file in Hap-IBD format and processes it into an `IBDObject`.
 21    """
 22
 23    def read(self, separator: Optional[str] = None) -> IBDObject:
 24        """
 25        Read a Hap-IBD file into an `IBDObject`.
 26
 27        The Hap-IBD format is a delimited text without a header with columns:
 28        sample_id_1, haplotype_id_1, sample_id_2, haplotype_id_2, chromosome, start, end, length_cm
 29
 30        Notes:
 31        - Haplotype identifiers are 1-based and take values in {1, 2}.
 32
 33        Args:
 34            separator (str, optional): Field delimiter. If None, whitespace (any number of spaces or tabs) is assumed.
 35
 36        Returns:
 37            **IBDObject**: An IBDObject instance.
 38        """
 39        log.info(f"Reading {self.file}")
 40
 41        # Column names for Hap-IBD files (no header present in input)
 42        col_names = [
 43            'sample_id_1', 'haplotype_id_1', 'sample_id_2', 'haplotype_id_2',
 44            'chrom', 'start', 'end', 'length_cm'
 45        ]
 46
 47        # Detect gzip by extension
 48        is_gz = str(self.file).endswith('.gz')
 49
 50        # If separator is None, treat as whitespace-delimited (any spaces or tabs)
 51        if separator is None:
 52            # Polars doesn't support regex separators; normalize whitespace to single tabs before parsing
 53            if is_gz:
 54                with gzip.open(self.file, 'rt') as f:
 55                    lines = [re.sub(r"\s+", "\t", line.strip()) for line in f if line.strip()]
 56            else:
 57                with open(self.file, 'r') as f:
 58                    lines = [re.sub(r"\s+", "\t", line.strip()) for line in f if line.strip()]
 59
 60            data = StringIO("\n".join(lines))
 61            df = pl.read_csv(
 62                source=data,
 63                has_header=False,
 64                separator='\t',
 65                new_columns=col_names,
 66                schema_overrides={
 67                    'sample_id_1': pl.Utf8,
 68                    'haplotype_id_1': pl.Int8,
 69                    'sample_id_2': pl.Utf8,
 70                    'haplotype_id_2': pl.Int8,
 71                    'chrom': pl.Utf8,
 72                    'start': pl.Int64,
 73                    'end': pl.Int64,
 74                    'length_cm': pl.Float64,
 75                },
 76            )
 77        else:
 78            df = pl.read_csv(
 79                source=str(self.file),
 80                has_header=False,
 81                separator=separator,
 82                new_columns=col_names,
 83                schema_overrides={
 84                    'sample_id_1': pl.Utf8,
 85                    'haplotype_id_1': pl.Int8,
 86                    'sample_id_2': pl.Utf8,
 87                    'haplotype_id_2': pl.Int8,
 88                    'chrom': pl.Utf8,
 89                    'start': pl.Int64,
 90                    'end': pl.Int64,
 91                    'length_cm': pl.Float64,
 92                },
 93            )
 94
 95        ibdobj = IBDObject(
 96            sample_id_1=df['sample_id_1'].to_numpy(),
 97            haplotype_id_1=df['haplotype_id_1'].to_numpy(),
 98            sample_id_2=df['sample_id_2'].to_numpy(),
 99            haplotype_id_2=df['haplotype_id_2'].to_numpy(),
100            chrom=df['chrom'].to_numpy(),
101            start=df['start'].to_numpy(),
102            end=df['end'].to_numpy(),
103            length_cm=df['length_cm'].to_numpy(),
104            segment_type=np.array(["IBD1"] * df.height),  # hap-IBD does not distinguish; treat as IBD1
105        )
106
107        log.info(f"Finished reading {self.file}")
108
109        return ibdobj

Reads an IBD file in Hap-IBD format and processes it into an IBDObject.

def read( self, separator: str | None = None) -> IBDObject:
 23    def read(self, separator: Optional[str] = None) -> IBDObject:
 24        """
 25        Read a Hap-IBD file into an `IBDObject`.
 26
 27        The Hap-IBD format is a delimited text without a header with columns:
 28        sample_id_1, haplotype_id_1, sample_id_2, haplotype_id_2, chromosome, start, end, length_cm
 29
 30        Notes:
 31        - Haplotype identifiers are 1-based and take values in {1, 2}.
 32
 33        Args:
 34            separator (str, optional): Field delimiter. If None, whitespace (any number of spaces or tabs) is assumed.
 35
 36        Returns:
 37            **IBDObject**: An IBDObject instance.
 38        """
 39        log.info(f"Reading {self.file}")
 40
 41        # Column names for Hap-IBD files (no header present in input)
 42        col_names = [
 43            'sample_id_1', 'haplotype_id_1', 'sample_id_2', 'haplotype_id_2',
 44            'chrom', 'start', 'end', 'length_cm'
 45        ]
 46
 47        # Detect gzip by extension
 48        is_gz = str(self.file).endswith('.gz')
 49
 50        # If separator is None, treat as whitespace-delimited (any spaces or tabs)
 51        if separator is None:
 52            # Polars doesn't support regex separators; normalize whitespace to single tabs before parsing
 53            if is_gz:
 54                with gzip.open(self.file, 'rt') as f:
 55                    lines = [re.sub(r"\s+", "\t", line.strip()) for line in f if line.strip()]
 56            else:
 57                with open(self.file, 'r') as f:
 58                    lines = [re.sub(r"\s+", "\t", line.strip()) for line in f if line.strip()]
 59
 60            data = StringIO("\n".join(lines))
 61            df = pl.read_csv(
 62                source=data,
 63                has_header=False,
 64                separator='\t',
 65                new_columns=col_names,
 66                schema_overrides={
 67                    'sample_id_1': pl.Utf8,
 68                    'haplotype_id_1': pl.Int8,
 69                    'sample_id_2': pl.Utf8,
 70                    'haplotype_id_2': pl.Int8,
 71                    'chrom': pl.Utf8,
 72                    'start': pl.Int64,
 73                    'end': pl.Int64,
 74                    'length_cm': pl.Float64,
 75                },
 76            )
 77        else:
 78            df = pl.read_csv(
 79                source=str(self.file),
 80                has_header=False,
 81                separator=separator,
 82                new_columns=col_names,
 83                schema_overrides={
 84                    'sample_id_1': pl.Utf8,
 85                    'haplotype_id_1': pl.Int8,
 86                    'sample_id_2': pl.Utf8,
 87                    'haplotype_id_2': pl.Int8,
 88                    'chrom': pl.Utf8,
 89                    'start': pl.Int64,
 90                    'end': pl.Int64,
 91                    'length_cm': pl.Float64,
 92                },
 93            )
 94
 95        ibdobj = IBDObject(
 96            sample_id_1=df['sample_id_1'].to_numpy(),
 97            haplotype_id_1=df['haplotype_id_1'].to_numpy(),
 98            sample_id_2=df['sample_id_2'].to_numpy(),
 99            haplotype_id_2=df['haplotype_id_2'].to_numpy(),
100            chrom=df['chrom'].to_numpy(),
101            start=df['start'].to_numpy(),
102            end=df['end'].to_numpy(),
103            length_cm=df['length_cm'].to_numpy(),
104            segment_type=np.array(["IBD1"] * df.height),  # hap-IBD does not distinguish; treat as IBD1
105        )
106
107        log.info(f"Finished reading {self.file}")
108
109        return ibdobj

Read a Hap-IBD file into an IBDObject.

The Hap-IBD format is a delimited text without a header with columns: sample_id_1, haplotype_id_1, sample_id_2, haplotype_id_2, chromosome, start, end, length_cm

Notes:

  • Haplotype identifiers are 1-based and take values in {1, 2}.
Arguments:
  • separator (str, optional): Field delimiter. If None, whitespace (any number of spaces or tabs) is assumed.
Returns:

IBDObject: An IBDObject instance.

class AncIBDReader(snputils.ibd.io.read.base.IBDBaseReader):
 17class AncIBDReader(IBDBaseReader):
 18    """
 19    Reads IBD data from ancIBD outputs (TSV), accepting a file (`ch_all.tsv` or `ch*.tsv`) or a directory.
 20    """
 21
 22    def read(
 23        self,
 24        path: Optional[Union[str, Path]] = None,
 25        include_segment_types: Optional[Sequence[str]] = ("IBD1", "IBD2"),
 26    ) -> IBDObject:
 27        """
 28        Read ancIBD outputs and convert to `IBDObject`.
 29
 30        Inputs accepted:
 31        - A single TSV (optionally gzipped), e.g. `ch_all.tsv[.gz]` or `ch{CHR}.tsv[.gz]`.
 32        - A directory containing per-chromosome TSVs or `ch_all.tsv`.
 33
 34        Column schema (tab-separated with header):
 35        iid1, iid2, ch, Start, End, length, StartM, EndM, lengthM, StartBP, EndBP, segment_type
 36
 37        Notes:
 38        - Haplotype indices are not provided by ancIBD; set to -1.
 39        - Positions in IBDObject use base-pair StartBP/EndBP.
 40        - Length uses centiMorgan as `lengthM * 100`.
 41
 42        Args:
 43            path (str or Path, optional): Override input path. Defaults to `self.file`.
 44            include_segment_types (sequence of str, optional): Filter by `segment_type` (e.g., IBD1, IBD2). None to disable.
 45
 46        Returns:
 47            **IBDObject**: An IBDObject instance.
 48        """
 49        p = Path(path) if path is not None else Path(self.file)
 50        log.info(f"Reading ancIBD from {p}")
 51
 52        files: list[Path]
 53        if p.is_dir():
 54            # Prefer combined file if present, else gather per-chromosome files
 55            combined = p / "ch_all.tsv"
 56            combined_gz = p / "ch_all.tsv.gz"
 57            if combined.exists():
 58                files = [combined]
 59            elif combined_gz.exists():
 60                files = [combined_gz]
 61            else:
 62                files = sorted(list(p.glob("ch*.tsv")) + list(p.glob("ch*.tsv.gz")))
 63                if not files:
 64                    raise FileNotFoundError("No ancIBD output files found in directory.")
 65        else:
 66            files = [p]
 67
 68        frames = []
 69        schema_overrides = {
 70            "iid1": pl.Utf8,
 71            "iid2": pl.Utf8,
 72            "ch": pl.Utf8,
 73            "Start": pl.Int64,
 74            "End": pl.Int64,
 75            "length": pl.Int64,  # marker span; not used
 76            "StartM": pl.Float64,
 77            "EndM": pl.Float64,
 78            "lengthM": pl.Float64,
 79            "StartBP": pl.Int64,
 80            "EndBP": pl.Int64,
 81            "segment_type": pl.Utf8,
 82        }
 83
 84        for f in files:
 85            frame = pl.read_csv(str(f), separator="\t", has_header=True, schema_overrides=schema_overrides)
 86            frames.append(frame)
 87
 88        df = pl.concat(frames, how="vertical") if len(frames) > 1 else frames[0]
 89
 90        if include_segment_types is not None:
 91            df = df.filter(pl.col("segment_type").is_in(list(include_segment_types)))
 92
 93        # Map columns to IBDObject schema
 94        sample_id_1 = df["iid1"].to_numpy()
 95        sample_id_2 = df["iid2"].to_numpy()
 96        chrom = df["ch"].to_numpy()
 97        start_bp = df["StartBP"].to_numpy()
 98        end_bp = df["EndBP"].to_numpy()
 99        length_cm = (df["lengthM"] * 100.0).to_numpy()
100
101        # ancIBD doesn't include haplotype indices; set to -1
102        hap1 = np.full(sample_id_1.shape[0], -1, dtype=np.int8)
103        hap2 = np.full(sample_id_2.shape[0], -1, dtype=np.int8)
104
105        ibdobj = IBDObject(
106            sample_id_1=sample_id_1,
107            haplotype_id_1=hap1,
108            sample_id_2=sample_id_2,
109            haplotype_id_2=hap2,
110            chrom=chrom,
111            start=start_bp,
112            end=end_bp,
113            length_cm=length_cm,
114            segment_type=df["segment_type"].to_numpy(),
115        )
116
117        log.info(f"Finished reading ancIBD from {p}")
118        return ibdobj

Reads IBD data from ancIBD outputs (TSV), accepting a file (ch_all.tsv or ch*.tsv) or a directory.

def read( self, path: str | pathlib.Path | None = None, include_segment_types: Sequence[str] | None = ('IBD1', 'IBD2')) -> IBDObject:
 22    def read(
 23        self,
 24        path: Optional[Union[str, Path]] = None,
 25        include_segment_types: Optional[Sequence[str]] = ("IBD1", "IBD2"),
 26    ) -> IBDObject:
 27        """
 28        Read ancIBD outputs and convert to `IBDObject`.
 29
 30        Inputs accepted:
 31        - A single TSV (optionally gzipped), e.g. `ch_all.tsv[.gz]` or `ch{CHR}.tsv[.gz]`.
 32        - A directory containing per-chromosome TSVs or `ch_all.tsv`.
 33
 34        Column schema (tab-separated with header):
 35        iid1, iid2, ch, Start, End, length, StartM, EndM, lengthM, StartBP, EndBP, segment_type
 36
 37        Notes:
 38        - Haplotype indices are not provided by ancIBD; set to -1.
 39        - Positions in IBDObject use base-pair StartBP/EndBP.
 40        - Length uses centiMorgan as `lengthM * 100`.
 41
 42        Args:
 43            path (str or Path, optional): Override input path. Defaults to `self.file`.
 44            include_segment_types (sequence of str, optional): Filter by `segment_type` (e.g., IBD1, IBD2). None to disable.
 45
 46        Returns:
 47            **IBDObject**: An IBDObject instance.
 48        """
 49        p = Path(path) if path is not None else Path(self.file)
 50        log.info(f"Reading ancIBD from {p}")
 51
 52        files: list[Path]
 53        if p.is_dir():
 54            # Prefer combined file if present, else gather per-chromosome files
 55            combined = p / "ch_all.tsv"
 56            combined_gz = p / "ch_all.tsv.gz"
 57            if combined.exists():
 58                files = [combined]
 59            elif combined_gz.exists():
 60                files = [combined_gz]
 61            else:
 62                files = sorted(list(p.glob("ch*.tsv")) + list(p.glob("ch*.tsv.gz")))
 63                if not files:
 64                    raise FileNotFoundError("No ancIBD output files found in directory.")
 65        else:
 66            files = [p]
 67
 68        frames = []
 69        schema_overrides = {
 70            "iid1": pl.Utf8,
 71            "iid2": pl.Utf8,
 72            "ch": pl.Utf8,
 73            "Start": pl.Int64,
 74            "End": pl.Int64,
 75            "length": pl.Int64,  # marker span; not used
 76            "StartM": pl.Float64,
 77            "EndM": pl.Float64,
 78            "lengthM": pl.Float64,
 79            "StartBP": pl.Int64,
 80            "EndBP": pl.Int64,
 81            "segment_type": pl.Utf8,
 82        }
 83
 84        for f in files:
 85            frame = pl.read_csv(str(f), separator="\t", has_header=True, schema_overrides=schema_overrides)
 86            frames.append(frame)
 87
 88        df = pl.concat(frames, how="vertical") if len(frames) > 1 else frames[0]
 89
 90        if include_segment_types is not None:
 91            df = df.filter(pl.col("segment_type").is_in(list(include_segment_types)))
 92
 93        # Map columns to IBDObject schema
 94        sample_id_1 = df["iid1"].to_numpy()
 95        sample_id_2 = df["iid2"].to_numpy()
 96        chrom = df["ch"].to_numpy()
 97        start_bp = df["StartBP"].to_numpy()
 98        end_bp = df["EndBP"].to_numpy()
 99        length_cm = (df["lengthM"] * 100.0).to_numpy()
100
101        # ancIBD doesn't include haplotype indices; set to -1
102        hap1 = np.full(sample_id_1.shape[0], -1, dtype=np.int8)
103        hap2 = np.full(sample_id_2.shape[0], -1, dtype=np.int8)
104
105        ibdobj = IBDObject(
106            sample_id_1=sample_id_1,
107            haplotype_id_1=hap1,
108            sample_id_2=sample_id_2,
109            haplotype_id_2=hap2,
110            chrom=chrom,
111            start=start_bp,
112            end=end_bp,
113            length_cm=length_cm,
114            segment_type=df["segment_type"].to_numpy(),
115        )
116
117        log.info(f"Finished reading ancIBD from {p}")
118        return ibdobj

Read ancIBD outputs and convert to IBDObject.

Inputs accepted:

  • A single TSV (optionally gzipped), e.g. ch_all.tsv[.gz] or ch{CHR}.tsv[.gz].
  • A directory containing per-chromosome TSVs or ch_all.tsv.

Column schema (tab-separated with header): iid1, iid2, ch, Start, End, length, StartM, EndM, lengthM, StartBP, EndBP, segment_type

Notes:

  • Haplotype indices are not provided by ancIBD; set to -1.
  • Positions in IBDObject use base-pair StartBP/EndBP.
  • Length uses centiMorgan as lengthM * 100.
Arguments:
  • path (str or Path, optional): Override input path. Defaults to self.file.
  • include_segment_types (sequence of str, optional): Filter by segment_type (e.g., IBD1, IBD2). None to disable.
Returns:

IBDObject: An IBDObject instance.

class IBDReader:
 8class IBDReader:
 9    def __new__(
10        cls,
11        file: Union[str, Path]
12    ) -> object:
13        """
14        A factory class that attempts to detect the IBD file format and returns the corresponding reader.
15
16        Supported detections:
17        - Hap-IBD: *.ibd or *.ibd.gz (headerless, 8 columns)
18        - ancIBD: directories with `ch_all.tsv`/`ch*.tsv` or files *.tsv / *.tsv.gz with ancIBD schema
19        """
20        file = Path(file)
21        suffixes = [s.lower() for s in file.suffixes]
22
23        # Directory-based detection for ancIBD
24        if file.is_dir():
25            if (file / 'ch_all.tsv').exists() or (file / 'ch_all.tsv.gz').exists():
26                from snputils.ibd.io.read.anc_ibd import AncIBDReader
27                return AncIBDReader(file)
28            has_chr_files = list(file.glob('ch*.tsv')) or list(file.glob('ch*.tsv.gz'))
29            if has_chr_files:
30                from snputils.ibd.io.read.anc_ibd import AncIBDReader
31                return AncIBDReader(file)
32            # Fallback to HapIBD if nothing matches
33            from snputils.ibd.io.read.hap_ibd import HapIBDReader
34            return HapIBDReader(file)
35
36        # File-based detection
37        if suffixes[-2:] == ['.ibd', '.gz'] or suffixes[-1:] == ['.ibd']:
38            from snputils.ibd.io.read.hap_ibd import HapIBDReader
39            return HapIBDReader(file)
40        if suffixes[-2:] == ['.tsv', '.gz'] or suffixes[-1:] == ['.tsv']:
41            from snputils.ibd.io.read.anc_ibd import AncIBDReader
42            return AncIBDReader(file)
43
44        # Default to HapIBDReader (most tools use .ibd[.gz])
45        from snputils.ibd.io.read.hap_ibd import HapIBDReader
46        return HapIBDReader(file)
IBDReader(file: str | pathlib.Path)
 9    def __new__(
10        cls,
11        file: Union[str, Path]
12    ) -> object:
13        """
14        A factory class that attempts to detect the IBD file format and returns the corresponding reader.
15
16        Supported detections:
17        - Hap-IBD: *.ibd or *.ibd.gz (headerless, 8 columns)
18        - ancIBD: directories with `ch_all.tsv`/`ch*.tsv` or files *.tsv / *.tsv.gz with ancIBD schema
19        """
20        file = Path(file)
21        suffixes = [s.lower() for s in file.suffixes]
22
23        # Directory-based detection for ancIBD
24        if file.is_dir():
25            if (file / 'ch_all.tsv').exists() or (file / 'ch_all.tsv.gz').exists():
26                from snputils.ibd.io.read.anc_ibd import AncIBDReader
27                return AncIBDReader(file)
28            has_chr_files = list(file.glob('ch*.tsv')) or list(file.glob('ch*.tsv.gz'))
29            if has_chr_files:
30                from snputils.ibd.io.read.anc_ibd import AncIBDReader
31                return AncIBDReader(file)
32            # Fallback to HapIBD if nothing matches
33            from snputils.ibd.io.read.hap_ibd import HapIBDReader
34            return HapIBDReader(file)
35
36        # File-based detection
37        if suffixes[-2:] == ['.ibd', '.gz'] or suffixes[-1:] == ['.ibd']:
38            from snputils.ibd.io.read.hap_ibd import HapIBDReader
39            return HapIBDReader(file)
40        if suffixes[-2:] == ['.tsv', '.gz'] or suffixes[-1:] == ['.tsv']:
41            from snputils.ibd.io.read.anc_ibd import AncIBDReader
42            return AncIBDReader(file)
43
44        # Default to HapIBDReader (most tools use .ibd[.gz])
45        from snputils.ibd.io.read.hap_ibd import HapIBDReader
46        return HapIBDReader(file)

A factory class that attempts to detect the IBD file format and returns the corresponding reader.

Supported detections:

  • Hap-IBD: *.ibd or *.ibd.gz (headerless, 8 columns)
  • ancIBD: directories with ch_all.tsv/ch*.tsv or files *.tsv / *.tsv.gz with ancIBD schema
class MultiPhenotypeObject:
  9class MultiPhenotypeObject():
 10    """
 11    A class for multi-phenotype data.
 12
 13    This class serves as a container for phenotype data, allowing for
 14    operations such as filtering samples and accessing phenotype information.
 15    It uses a DataFrame to store the data, with the first column reserved for the sample identifers.
 16    """
 17    def __init__(
 18        self,
 19        phen_df: pd.DataFrame
 20    ) -> None:
 21        """
 22        Args:
 23            phen_df (pd.DataFrame): 
 24                A Pandas DataFrame containing phenotype data, with the first column 
 25                representing sample identifiers.
 26        """
 27        self.__phen_df = phen_df
 28
 29    def __getitem__(self, key):
 30        """
 31        To access an attribute of the class using the square bracket notation,
 32        similar to a dictionary.
 33        """
 34        try:
 35            return getattr(self, key)
 36        except:
 37            raise KeyError(f'Invalid key: {key}')
 38
 39    def __setitem__(self, key, value):
 40        """
 41        To set an attribute of the class using the square bracket notation,
 42        similar to a dictionary.
 43        """
 44        try:
 45            setattr(self, key, value)
 46        except AttributeError:
 47            raise KeyError(f'Invalid key: {key}')
 48
 49    @property
 50    def phen_df(self) -> pd.DataFrame:
 51        """
 52        Retrieve `phen_df`.
 53
 54        Returns:
 55            pd.DataFrame: 
 56                A Pandas DataFrame containing phenotype data, with the first column 
 57                representing sample identifiers.
 58        """
 59        return self.__phen_df
 60    
 61    @phen_df.setter
 62    def phen_df(self, x: pd.DataFrame):
 63        """
 64        Update `phen_df`.
 65        """
 66        self.__phen_df = x
 67    
 68    @property
 69    def n_samples(self) -> int:
 70        """
 71        Retrieve `n_samples`.
 72
 73        Returns:
 74            int: The total number of samples.
 75        """
 76        return len(self.phen_df)
 77
 78    def copy(self):
 79        """
 80        Create and return a copy of the current `MultiPhenotypeObject` instance.
 81
 82        Returns:
 83            MultiPhenotypeObject: A new instance of the current object.
 84        """
 85        return copy.copy(self)
 86    
 87    def filter_samples(
 88            self, 
 89            samples: Optional[Union[str, Sequence[str], np.ndarray]] = None, 
 90            indexes: Optional[Union[int, Sequence[int], np.ndarray]] = None, 
 91            include: bool = True, 
 92            reorder: bool = False, 
 93            inplace: bool = False
 94        ) -> Optional['MultiPhenotypeObject']:
 95        """
 96        Filter samples in the `MultiPhenotypeObject` based on sample names or indexes.
 97
 98        This method allows you to include or exclude specific samples by their names,
 99        indexes, or both. When both samples and indexes are provided, the union of
100        the specified samples is used. Negative indexes are supported and follow NumPy's indexing 
101        conventions. Set `reorder=True` to match the ordering of the provided `samples` and/or
102        `indexes` lists when including.
103
104        Args:
105            samples (str or array_like of str, optional): 
106                 Names of the samples to include or exclude. Can be a single sample name or a
107                 sequence of sample names. Default is None.
108            indexes (int or array_like of int, optional):
109                Indexes of the samples to include or exclude. Can be a single index or a sequence
110                of indexes. Negative indexes are supported. Default is None.
111            include (bool, default=True): 
112                If True, includes only the specified samples. If False, excludes the specified
113                samples. Default is True.
114            inplace (bool, default=False): 
115                If True, modifies the object in place. If False, returns a new
116                `MultiPhenotypeObject` with the samples filtered. Default is False.
117
118        Returns:
119            Optional[MultiPhenotypeObject]: Returns a new MultiPhenotypeObject with the specified samples 
120            filtered if `inplace=False`. If `inplace=True`, modifies the object in place and returns None.
121        """
122        # Ensure at least one of samples or indexes is provided
123        if samples is None and indexes is None:
124            raise ValueError("At least one of 'samples' or 'indexes' must be provided.")
125
126        n_samples = self.n_samples
127
128        # Create mask based on sample names
129        if samples is not None:
130            samples = np.asarray(samples).ravel()
131            # Extract sample names from the DataFrame
132            sample_names = self.__phen_df.iloc[:, 0].values
133            # Create mask for samples belonging to specified names
134            mask_samples = np.isin(sample_names, samples)
135        else:
136            mask_samples = np.zeros(n_samples, dtype=bool)
137
138        # Create mask based on sample indexes
139        if indexes is not None:
140            indexes = np.asarray(indexes).ravel()
141            # Adjust negative indexes
142            indexes = np.mod(indexes, n_samples)
143            if np.any((indexes < 0) | (indexes >= n_samples)):
144                raise IndexError("One or more sample indexes are out of bounds.")
145            # Create mask for samples at specified indexes
146            mask_indexes = np.zeros(n_samples, dtype=bool)
147            mask_indexes[indexes] = True
148        else:
149            mask_indexes = np.zeros(n_samples, dtype=bool)
150
151        # Combine masks using logical OR (union of samples)
152        mask_combined = mask_samples | mask_indexes
153
154        if not include:
155            # Invert mask if excluding samples
156            mask_combined = ~mask_combined
157
158        # If requested, compute an ordering of selected rows that follows the provided lists
159        ordered_indices = None
160        if include and reorder:
161            sel_indices = np.where(mask_combined)[0]
162            sample_names = self.__phen_df.iloc[:, 0].values
163            ordered_list = []
164            added = np.zeros(n_samples, dtype=bool)
165
166            # Respect the order provided in `samples` (supports duplicate sample names)
167            if samples is not None:
168                for s in samples:
169                    matches = np.where(sample_names == s)[0]
170                    for idx in matches:
171                        if mask_combined[idx] and not added[idx]:
172                            ordered_list.append(int(idx))
173                            added[idx] = True
174
175            # Then respect the order in `indexes`
176            if indexes is not None:
177                adj_idx = np.mod(np.atleast_1d(indexes), n_samples)
178                for idx in adj_idx:
179                    if mask_combined[idx] and not added[idx]:
180                        ordered_list.append(int(idx))
181                        added[idx] = True
182
183            # Finally, append any remaining selected rows in their original order
184            for idx in sel_indices:
185                if not added[idx]:
186                    ordered_list.append(int(idx))
187
188            ordered_indices = np.asarray(ordered_list, dtype=int)
189
190        # Filter the phenotype DataFrame
191        if inplace:
192            if ordered_indices is not None:
193                self['phen_df'] = self['phen_df'].iloc[ordered_indices].reset_index(drop=True)
194            else:
195                self['phen_df'] = self['phen_df'][mask_combined].reset_index(drop=True)
196            return None
197        else:
198            phen_obj = self.copy()
199            if ordered_indices is not None:
200                phen_obj['phen_df'] = phen_obj['phen_df'].iloc[ordered_indices].reset_index(drop=True)
201            else:
202                phen_obj['phen_df'] = phen_obj['phen_df'][mask_combined].reset_index(drop=True)
203            return phen_obj

A class for multi-phenotype data.

This class serves as a container for phenotype data, allowing for operations such as filtering samples and accessing phenotype information. It uses a DataFrame to store the data, with the first column reserved for the sample identifiers.

MultiPhenotypeObject(phen_df: pandas.DataFrame)
    def __init__(
        self,
        phen_df: pd.DataFrame
    ) -> None:
        """
        Store the phenotype table backing this object.

        Args:
            phen_df (pd.DataFrame):
                A Pandas DataFrame containing phenotype data, with the first column
                representing sample identifiers. The frame is stored as given
                (no copy is made here).
        """
        self.__phen_df = phen_df
Arguments:
  • phen_df (pd.DataFrame): A Pandas DataFrame containing phenotype data, with the first column representing sample identifiers.
phen_df: pandas.DataFrame
    @property
    def phen_df(self) -> pd.DataFrame:
        """
        Retrieve `phen_df`.

        Returns:
            pd.DataFrame:
                The stored Pandas DataFrame containing phenotype data, with the first
                column representing sample identifiers. The stored object itself is
                returned, not a copy.
        """
        return self.__phen_df

Retrieve phen_df.

Returns:

pd.DataFrame: A Pandas DataFrame containing phenotype data, with the first column representing sample identifiers.

n_samples: int
68    @property
69    def n_samples(self) -> int:
70        """
71        Retrieve `n_samples`.
72
73        Returns:
74            int: The total number of samples.
75        """
76        return len(self.phen_df)

Retrieve n_samples.

Returns:

int: The total number of samples.

def copy(self):
78    def copy(self):
79        """
80        Create and return a copy of the current `MultiPhenotypeObject` instance.
81
82        Returns:
83            MultiPhenotypeObject: A new instance of the current object.
84        """
85        return copy.copy(self)

Create and return a copy of the current MultiPhenotypeObject instance.

Returns:

MultiPhenotypeObject: A new instance of the current object.

def filter_samples( self, samples: str | Sequence[str] | numpy.ndarray | None = None, indexes: int | Sequence[int] | numpy.ndarray | None = None, include: bool = True, reorder: bool = False, inplace: bool = False) -> MultiPhenotypeObject | None:
 87    def filter_samples(
 88            self, 
 89            samples: Optional[Union[str, Sequence[str], np.ndarray]] = None, 
 90            indexes: Optional[Union[int, Sequence[int], np.ndarray]] = None, 
 91            include: bool = True, 
 92            reorder: bool = False, 
 93            inplace: bool = False
 94        ) -> Optional['MultiPhenotypeObject']:
 95        """
 96        Filter samples in the `MultiPhenotypeObject` based on sample names or indexes.
 97
 98        This method allows you to include or exclude specific samples by their names,
 99        indexes, or both. When both samples and indexes are provided, the union of
100        the specified samples is used. Negative indexes are supported and follow NumPy's indexing 
101        conventions. Set `reorder=True` to match the ordering of the provided `samples` and/or
102        `indexes` lists when including.
103
104        Args:
105            samples (str or array_like of str, optional): 
106                 Names of the samples to include or exclude. Can be a single sample name or a
107                 sequence of sample names. Default is None.
108            indexes (int or array_like of int, optional):
109                Indexes of the samples to include or exclude. Can be a single index or a sequence
110                of indexes. Negative indexes are supported. Default is None.
111            include (bool, default=True): 
112                If True, includes only the specified samples. If False, excludes the specified
113                samples. Default is True.
114            inplace (bool, default=False): 
115                If True, modifies the object in place. If False, returns a new
116                `MultiPhenotypeObject` with the samples filtered. Default is False.
117
118        Returns:
119            Optional[MultiPhenotypeObject]: Returns a new MultiPhenotypeObject with the specified samples 
120            filtered if `inplace=False`. If `inplace=True`, modifies the object in place and returns None.
121        """
122        # Ensure at least one of samples or indexes is provided
123        if samples is None and indexes is None:
124            raise ValueError("At least one of 'samples' or 'indexes' must be provided.")
125
126        n_samples = self.n_samples
127
128        # Create mask based on sample names
129        if samples is not None:
130            samples = np.asarray(samples).ravel()
131            # Extract sample names from the DataFrame
132            sample_names = self.__phen_df.iloc[:, 0].values
133            # Create mask for samples belonging to specified names
134            mask_samples = np.isin(sample_names, samples)
135        else:
136            mask_samples = np.zeros(n_samples, dtype=bool)
137
138        # Create mask based on sample indexes
139        if indexes is not None:
140            indexes = np.asarray(indexes).ravel()
141            # Adjust negative indexes
142            indexes = np.mod(indexes, n_samples)
143            if np.any((indexes < 0) | (indexes >= n_samples)):
144                raise IndexError("One or more sample indexes are out of bounds.")
145            # Create mask for samples at specified indexes
146            mask_indexes = np.zeros(n_samples, dtype=bool)
147            mask_indexes[indexes] = True
148        else:
149            mask_indexes = np.zeros(n_samples, dtype=bool)
150
151        # Combine masks using logical OR (union of samples)
152        mask_combined = mask_samples | mask_indexes
153
154        if not include:
155            # Invert mask if excluding samples
156            mask_combined = ~mask_combined
157
158        # If requested, compute an ordering of selected rows that follows the provided lists
159        ordered_indices = None
160        if include and reorder:
161            sel_indices = np.where(mask_combined)[0]
162            sample_names = self.__phen_df.iloc[:, 0].values
163            ordered_list = []
164            added = np.zeros(n_samples, dtype=bool)
165
166            # Respect the order provided in `samples` (supports duplicate sample names)
167            if samples is not None:
168                for s in samples:
169                    matches = np.where(sample_names == s)[0]
170                    for idx in matches:
171                        if mask_combined[idx] and not added[idx]:
172                            ordered_list.append(int(idx))
173                            added[idx] = True
174
175            # Then respect the order in `indexes`
176            if indexes is not None:
177                adj_idx = np.mod(np.atleast_1d(indexes), n_samples)
178                for idx in adj_idx:
179                    if mask_combined[idx] and not added[idx]:
180                        ordered_list.append(int(idx))
181                        added[idx] = True
182
183            # Finally, append any remaining selected rows in their original order
184            for idx in sel_indices:
185                if not added[idx]:
186                    ordered_list.append(int(idx))
187
188            ordered_indices = np.asarray(ordered_list, dtype=int)
189
190        # Filter the phenotype DataFrame
191        if inplace:
192            if ordered_indices is not None:
193                self['phen_df'] = self['phen_df'].iloc[ordered_indices].reset_index(drop=True)
194            else:
195                self['phen_df'] = self['phen_df'][mask_combined].reset_index(drop=True)
196            return None
197        else:
198            phen_obj = self.copy()
199            if ordered_indices is not None:
200                phen_obj['phen_df'] = phen_obj['phen_df'].iloc[ordered_indices].reset_index(drop=True)
201            else:
202                phen_obj['phen_df'] = phen_obj['phen_df'][mask_combined].reset_index(drop=True)
203            return phen_obj

Filter samples in the MultiPhenotypeObject based on sample names or indexes.

This method allows you to include or exclude specific samples by their names, indexes, or both. When both samples and indexes are provided, the union of the specified samples is used. Negative indexes are supported and follow NumPy's indexing conventions. Set reorder=True to match the ordering of the provided samples and/or indexes lists when including.

Arguments:
  • samples (str or array_like of str, optional): Names of the samples to include or exclude. Can be a single sample name or a sequence of sample names. Default is None.
  • indexes (int or array_like of int, optional): Indexes of the samples to include or exclude. Can be a single index or a sequence of indexes. Negative indexes are supported. Default is None.
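  • include (bool, default=True): If True, includes only the specified samples. If False, excludes the specified samples. Default is True.
  • reorder (bool, default=False): Only used when include=True. If True, the selected rows are ordered by their position in samples, then by their position in indexes, followed by any remaining selected rows in their original order. Default is False.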
  • inplace (bool, default=False): If True, modifies the object in place. If False, returns a new MultiPhenotypeObject with the samples filtered. Default is False.
Returns:

Optional[MultiPhenotypeObject]: Returns a new MultiPhenotypeObject with the specified samples filtered if inplace=False. If inplace=True, modifies the object in place and returns None.

class PhenotypeObject:
  8class PhenotypeObject:
  9    """
 10    Generic phenotype container for single-trait analyses.
 11
 12    The object stores sample IDs, normalized phenotype values, inferred/declared
 13    trait type, and binary case/control convenience attributes.
 14    """
 15
 16    def __init__(
 17        self,
 18        samples: Sequence[str],
 19        values: Sequence[float],
 20        phenotype_name: str = "PHENO",
 21        quantitative: Optional[bool] = None,
 22    ) -> None:
 23        sample_ids = [str(sample) for sample in samples]
 24        if len(sample_ids) == 0:
 25            raise ValueError("Phenotype file contains no samples.")
 26        if len(set(sample_ids)) != len(sample_ids):
 27            raise ValueError("Phenotype sample IDs must be unique.")
 28
 29        try:
 30            values_f64 = np.asarray(values, dtype=np.float64)
 31        except (TypeError, ValueError) as exc:
 32            raise ValueError("Phenotype values must be numeric.") from exc
 33
 34        if values_f64.ndim != 1:
 35            raise ValueError("Phenotype values must be a 1-dimensional array.")
 36        if values_f64.size != len(sample_ids):
 37            raise ValueError(
 38                "Phenotype sample/value length mismatch: "
 39                f"{len(sample_ids)} samples but {values_f64.size} values."
 40            )
 41        if not np.all(np.isfinite(values_f64)):
 42            raise ValueError("Phenotype contains non-finite values (NaN/Inf).")
 43
 44        trait_is_quantitative = (
 45            self._infer_quantitative(values_f64)
 46            if quantitative is None
 47            else bool(quantitative)
 48        )
 49
 50        if trait_is_quantitative:
 51            if float(np.var(values_f64)) <= 0.0:
 52                raise ValueError("Quantitative phenotype has zero variance.")
 53            normalized_values = values_f64
 54            cases: List[str] = []
 55            controls: List[str] = sample_ids.copy()
 56        else:
 57            normalized_values = self._normalize_binary(values_f64)
 58            case_mask = normalized_values == 1
 59            control_mask = normalized_values == 0
 60            cases = [sample_ids[idx] for idx in np.where(case_mask)[0].tolist()]
 61            controls = [sample_ids[idx] for idx in np.where(control_mask)[0].tolist()]
 62            if len(cases) == 0:
 63                raise ValueError("No case data available.")
 64            if len(controls) == 0:
 65                raise ValueError("No control data available.")
 66
 67        self._samples = sample_ids
 68        self._values = normalized_values
 69        self._phenotype_name = str(phenotype_name)
 70        self._is_quantitative = trait_is_quantitative
 71
 72        self._cases = cases
 73        self._controls = controls
 74        self._all_haplotypes = [f"{sample}.0" for sample in sample_ids] + [
 75            f"{sample}.1" for sample in sample_ids
 76        ]
 77        self._cases_haplotypes = [f"{sample}.0" for sample in cases] + [
 78            f"{sample}.1" for sample in cases
 79        ]
 80        self._controls_haplotypes = [f"{sample}.0" for sample in controls] + [
 81            f"{sample}.1" for sample in controls
 82        ]
 83
 84    @staticmethod
 85    def _matches_binary_encoding(values_f64: np.ndarray, encoding: Sequence[float]) -> bool:
 86        unique_vals = np.unique(values_f64)
 87        if unique_vals.size != 2:
 88            return False
 89        target = np.asarray(sorted(float(v) for v in encoding), dtype=np.float64)
 90        observed = np.asarray(sorted(unique_vals.tolist()), dtype=np.float64)
 91        return bool(np.allclose(observed, target, rtol=0.0, atol=1e-8))
 92
 93    @staticmethod
 94    def _infer_quantitative(values_f64: np.ndarray) -> bool:
 95        return not (
 96            PhenotypeObject._matches_binary_encoding(values_f64, (0.0, 1.0))
 97            or PhenotypeObject._matches_binary_encoding(values_f64, (1.0, 2.0))
 98        )
 99
100    @staticmethod
101    def _normalize_binary(values_f64: np.ndarray) -> np.ndarray:
102        unique_vals = np.unique(values_f64)
103        if PhenotypeObject._matches_binary_encoding(values_f64, (1.0, 2.0)):
104            return np.isclose(values_f64, 2.0, rtol=0.0, atol=1e-8).astype(np.int8)
105        if PhenotypeObject._matches_binary_encoding(values_f64, (0.0, 1.0)):
106            return values_f64.astype(np.int8)
107        raise ValueError(
108            "Binary phenotype must use exactly two levels encoded as {1,2} or {0,1}. "
109            f"Observed unique values: {sorted(unique_vals.tolist())}"
110        )
111
112    def __getitem__(self, key):
113        try:
114            return getattr(self, key)
115        except AttributeError as exc:
116            raise KeyError(f"Invalid key: {key}") from exc
117
118    def __setitem__(self, key, value):
119        try:
120            setattr(self, key, value)
121        except AttributeError as exc:
122            raise KeyError(f"Invalid key: {key}") from exc
123
124    @property
125    def samples(self) -> List[str]:
126        return self._samples
127
128    @property
129    def n_samples(self) -> int:
130        return len(self._samples)
131
132    @property
133    def values(self) -> np.ndarray:
134        return self._values
135
136    @property
137    def y(self) -> np.ndarray:
138        return self._values
139
140    @property
141    def phenotype_name(self) -> str:
142        return self._phenotype_name
143
144    @property
145    def is_quantitative(self) -> bool:
146        return self._is_quantitative
147
    @property
    def quantitative(self) -> bool:
        """Alias of `is_quantitative`: returns the same stored flag."""
        return self._is_quantitative
151
    @property
    def cases(self) -> List[str]:
        """Sample IDs whose normalized value is 1; an empty list for a quantitative trait."""
        return self._cases
155
    @property
    def n_cases(self) -> int:
        """Number of entries in `cases`."""
        return len(self._cases)
159
    @property
    def controls(self) -> List[str]:
        """Sample IDs whose normalized value is 0; for a quantitative trait, a copy of all sample IDs."""
        return self._controls
163
    @property
    def n_controls(self) -> int:
        """Number of entries in `controls`."""
        return len(self._controls)
167
    @property
    def all_haplotypes(self) -> List[str]:
        """Haplotype labels for all samples: every `"<sample>.0"` label, followed by every `"<sample>.1"` label."""
        return self._all_haplotypes
171
    @property
    def cases_haplotypes(self) -> List[str]:
        """Haplotype labels for `cases`: every `"<sample>.0"` label, followed by every `"<sample>.1"` label."""
        return self._cases_haplotypes
175
    @property
    def controls_haplotypes(self) -> List[str]:
        """Haplotype labels for `controls`: every `"<sample>.0"` label, followed by every `"<sample>.1"` label."""
        return self._controls_haplotypes
179
    def copy(self):
        """
        Return a shallow copy of this object (via `copy.copy`).

        The copy is shallow, so attribute values such as the sample lists and
        the values array are shared with the original rather than duplicated.
        """
        return copy.copy(self)
182
183    def keys(self) -> List[str]:
184        return [
185            "samples",
186            "n_samples",
187            "values",
188            "y",
189            "phenotype_name",
190            "is_quantitative",
191            "quantitative",
192            "cases",
193            "n_cases",
194            "controls",
195            "n_controls",
196            "all_haplotypes",
197            "cases_haplotypes",
198            "controls_haplotypes",
199        ]

Generic phenotype container for single-trait analyses.

The object stores sample IDs, normalized phenotype values, inferred/declared trait type, and binary case/control convenience attributes.

PhenotypeObject( samples: Sequence[str], values: Sequence[float], phenotype_name: str = 'PHENO', quantitative: bool | None = None)
16    def __init__(
17        self,
18        samples: Sequence[str],
19        values: Sequence[float],
20        phenotype_name: str = "PHENO",
21        quantitative: Optional[bool] = None,
22    ) -> None:
23        sample_ids = [str(sample) for sample in samples]
24        if len(sample_ids) == 0:
25            raise ValueError("Phenotype file contains no samples.")
26        if len(set(sample_ids)) != len(sample_ids):
27            raise ValueError("Phenotype sample IDs must be unique.")
28
29        try:
30            values_f64 = np.asarray(values, dtype=np.float64)
31        except (TypeError, ValueError) as exc:
32            raise ValueError("Phenotype values must be numeric.") from exc
33
34        if values_f64.ndim != 1:
35            raise ValueError("Phenotype values must be a 1-dimensional array.")
36        if values_f64.size != len(sample_ids):
37            raise ValueError(
38                "Phenotype sample/value length mismatch: "
39                f"{len(sample_ids)} samples but {values_f64.size} values."
40            )
41        if not np.all(np.isfinite(values_f64)):
42            raise ValueError("Phenotype contains non-finite values (NaN/Inf).")
43
44        trait_is_quantitative = (
45            self._infer_quantitative(values_f64)
46            if quantitative is None
47            else bool(quantitative)
48        )
49
50        if trait_is_quantitative:
51            if float(np.var(values_f64)) <= 0.0:
52                raise ValueError("Quantitative phenotype has zero variance.")
53            normalized_values = values_f64
54            cases: List[str] = []
55            controls: List[str] = sample_ids.copy()
56        else:
57            normalized_values = self._normalize_binary(values_f64)
58            case_mask = normalized_values == 1
59            control_mask = normalized_values == 0
60            cases = [sample_ids[idx] for idx in np.where(case_mask)[0].tolist()]
61            controls = [sample_ids[idx] for idx in np.where(control_mask)[0].tolist()]
62            if len(cases) == 0:
63                raise ValueError("No case data available.")
64            if len(controls) == 0:
65                raise ValueError("No control data available.")
66
67        self._samples = sample_ids
68        self._values = normalized_values
69        self._phenotype_name = str(phenotype_name)
70        self._is_quantitative = trait_is_quantitative
71
72        self._cases = cases
73        self._controls = controls
74        self._all_haplotypes = [f"{sample}.0" for sample in sample_ids] + [
75            f"{sample}.1" for sample in sample_ids
76        ]
77        self._cases_haplotypes = [f"{sample}.0" for sample in cases] + [
78            f"{sample}.1" for sample in cases
79        ]
80        self._controls_haplotypes = [f"{sample}.0" for sample in controls] + [
81            f"{sample}.1" for sample in controls
82        ]
samples: List[str]
124    @property
125    def samples(self) -> List[str]:
126        return self._samples
n_samples: int
128    @property
129    def n_samples(self) -> int:
130        return len(self._samples)
values: numpy.ndarray
132    @property
133    def values(self) -> np.ndarray:
134        return self._values
y: numpy.ndarray
136    @property
137    def y(self) -> np.ndarray:
138        return self._values
phenotype_name: str
140    @property
141    def phenotype_name(self) -> str:
142        return self._phenotype_name
is_quantitative: bool
144    @property
145    def is_quantitative(self) -> bool:
146        return self._is_quantitative
quantitative: bool
148    @property
149    def quantitative(self) -> bool:
150        return self._is_quantitative
cases: List[str]
152    @property
153    def cases(self) -> List[str]:
154        return self._cases
n_cases: int
156    @property
157    def n_cases(self) -> int:
158        return len(self._cases)
controls: List[str]
160    @property
161    def controls(self) -> List[str]:
162        return self._controls
n_controls: int
164    @property
165    def n_controls(self) -> int:
166        return len(self._controls)
all_haplotypes: List[str]
168    @property
169    def all_haplotypes(self) -> List[str]:
170        return self._all_haplotypes
cases_haplotypes: List[str]
172    @property
173    def cases_haplotypes(self) -> List[str]:
174        return self._cases_haplotypes
controls_haplotypes: List[str]
176    @property
177    def controls_haplotypes(self) -> List[str]:
178        return self._controls_haplotypes
def copy(self):
180    def copy(self):
181        return copy.copy(self)
def keys(self) -> List[str]:
183    def keys(self) -> List[str]:
184        return [
185            "samples",
186            "n_samples",
187            "values",
188            "y",
189            "phenotype_name",
190            "is_quantitative",
191            "quantitative",
192            "cases",
193            "n_cases",
194            "controls",
195            "n_controls",
196            "all_haplotypes",
197            "cases_haplotypes",
198            "controls_haplotypes",
199        ]
class MultiPhenReader(snputils.phenotype.io.read.base.PhenotypeBaseReader):
 17class MultiPhenReader(PhenotypeBaseReader):
 18    """
 19    Reader for multi-phenotype data from file (.xlsx, .csv, .tsv, .txt, .phe, .pheno, .map, .smap, .phen),
 20    constructing a `MultiPhenotypeObject`.
 21    """
 22    def __init__(self, file: Union[str, Path]) -> None:
 23        """
 24        Args:
 25            file (str or pathlib.Path):
 26                Path to the file containing phenotype data. Accepted formats: .xlsx, .csv, .tsv, .txt, .phe, .pheno, .map, .smap, .phen.
 27        """
 28        self.__file = file
 29
 30    @property
 31    def file(self) -> Path:
 32        """
 33        Retrieve `file`.
 34
 35        Returns:
 36            pathlib.Path:
 37                Path to the file containing phenotype data. Accepted formats: .xlsx, .csv, .tsv, .txt, .phe, .pheno, .map, .smap, .phen.
 38        """
 39        return self.__file
 40
 41    def read(
 42            self,
 43            samples_idx: int = 0,
 44            phen_names: Optional[List] = None,
 45            sep: str = ',',
 46            header: int = 0,
 47            drop: bool = False
 48        ) -> 'MultiPhenotypeObject':
 49        """
 50        Read data from `file` and construct a `MultiPhenotypeObject`.
 51
 52        Args:
 53            samples_idx (int, default=0): Index of the column containing sample identifiers.
 54                Default is 0, assuming the first column contains sample identifiers.
 55            phen_names (list of str, optional): List of phenotype column names. If provided,
 56                these columns will be renamed to the specified names.
 57            sep (str, default=','): The delimiter for separating values in `.csv`, `.tsv`,
 58                `.txt`, `.phe`, `.pheno`, or `.map` files. Default is ','; use `sep=r'\\s+'` for whitespace-delimited.
 59            header (int, default=0): Row index to use as the column names. By default,
 60                uses the first row (`header=0`). Set to `None` if column names are provided
 61                explicitly.
 62            drop (bool, default=False): If True, removes columns not listed in `phen_names`
 63                (except the samples column).
 64
 65        Returns:
 66            MultiPhenotypeObject:
 67                A multi-phenotype object instance.
 68        """
 69        file_extension = os.path.splitext(self.file)[1]
 70
 71        log.info(f"Reading '{file_extension}' file from '{self.file}'...")
 72
 73        if file_extension == '.xlsx':
 74            phen_df = pd.read_excel(self.file, header=0, index_col=None)
 75        elif file_extension == '.csv':
 76            phen_df = pd.read_csv(self.file, sep=sep, header=header)
 77        elif file_extension in ['.map', '.smap']:
 78            phen_df = pd.read_csv(self.file, sep=sep, header=header)
 79        elif file_extension == '.tsv':
 80            phen_df = pd.read_csv(self.file, sep='\t')
 81        elif file_extension in ['.txt', '.phe', '.pheno']:
 82            phen_df = pd.read_csv(self.file, sep=r'\s+', header=header)
 83        elif file_extension == '.phen':
 84            with open(self.file, 'r') as f:
 85                contents = f.readlines()
 86            phen_dict = {line.split()[0]: line.split()[1].strip() for line in contents[1:]}
 87            phen_df = pd.DataFrame({'samples': list(phen_dict.keys()), 'phenotype': list(phen_dict.values())})
 88        else:
 89            raise ValueError(
 90                f"Unsupported file extension {file_extension}. Supported extensions: {SUPPORTED_EXTENSIONS}."
 91            )
 92
 93        phen_df.rename(columns={phen_df.columns[samples_idx]: 'samples'}, inplace=True)
 94
 95        if samples_idx != 0:
 96            cols = ['samples'] + [col for col in phen_df.columns if col != 'samples']
 97            phen_df = phen_df[cols]
 98
 99        if phen_names is not None:
100            if drop:
101                non_phen_columns = list(set(phen_df.columns) - set(['samples']+phen_names))
102                phen_df = phen_df.drop(non_phen_columns, axis=1)
103
104            phenotype_col_count = phen_df.shape[1] - 1
105            if phenotype_col_count == len(phen_names):
106                phen_df.columns.values[1:] = phen_names
107            else:
108                raise ValueError(f"Mismatch between number of phenotype columns ({phenotype_col_count}) "
109                                 f"and length of `phen_names` ({len(phen_names)}).")
110
111        return MultiPhenotypeObject(phen_df=phen_df)

Reader for multi-phenotype data from file (.xlsx, .csv, .tsv, .txt, .phe, .pheno, .map, .smap, .phen), constructing a MultiPhenotypeObject.

MultiPhenReader(file: str | pathlib.Path)
22    def __init__(self, file: Union[str, Path]) -> None:
23        """
24        Args:
25            file (str or pathlib.Path):
26                Path to the file containing phenotype data. Accepted formats: .xlsx, .csv, .tsv, .txt, .phe, .pheno, .map, .smap, .phen.
27        """
28        self.__file = file
Arguments:
  • file (str or pathlib.Path): Path to the file containing phenotype data. Accepted formats: .xlsx, .csv, .tsv, .txt, .phe, .pheno, .map, .smap, .phen.
file: pathlib.Path
30    @property
31    def file(self) -> Path:
32        """
33        Retrieve `file`.
34
35        Returns:
36            pathlib.Path:
37                Path to the file containing phenotype data. Accepted formats: .xlsx, .csv, .tsv, .txt, .phe, .pheno, .map, .smap, .phen.
38        """
39        return self.__file

Retrieve file.

Returns:

pathlib.Path: Path to the file containing phenotype data. Accepted formats: .xlsx, .csv, .tsv, .txt, .phe, .pheno, .map, .smap, .phen.

def read( self, samples_idx: int = 0, phen_names: List | None = None, sep: str = ',', header: int = 0, drop: bool = False) -> MultiPhenotypeObject:
 41    def read(
 42            self,
 43            samples_idx: int = 0,
 44            phen_names: Optional[List] = None,
 45            sep: str = ',',
 46            header: int = 0,
 47            drop: bool = False
 48        ) -> 'MultiPhenotypeObject':
 49        """
 50        Read data from `file` and construct a `MultiPhenotypeObject`.
 51
 52        Args:
 53            samples_idx (int, default=0): Index of the column containing sample identifiers.
 54                Default is 0, assuming the first column contains sample identifiers.
 55            phen_names (list of str, optional): List of phenotype column names. If provided,
 56                these columns will be renamed to the specified names.
 57            sep (str, default=','): The delimiter for separating values in `.csv`, `.tsv`,
 58                `.txt`, `.phe`, `.pheno`, or `.map` files. Default is ','; use `sep=r'\\s+'` for whitespace-delimited.
 59            header (int, default=0): Row index to use as the column names. By default,
 60                uses the first row (`header=0`). Set to `None` if column names are provided
 61                explicitly.
 62            drop (bool, default=False): If True, removes columns not listed in `phen_names`
 63                (except the samples column).
 64
 65        Returns:
 66            MultiPhenotypeObject:
 67                A multi-phenotype object instance.
 68        """
 69        file_extension = os.path.splitext(self.file)[1]
 70
 71        log.info(f"Reading '{file_extension}' file from '{self.file}'...")
 72
 73        if file_extension == '.xlsx':
 74            phen_df = pd.read_excel(self.file, header=0, index_col=None)
 75        elif file_extension == '.csv':
 76            phen_df = pd.read_csv(self.file, sep=sep, header=header)
 77        elif file_extension in ['.map', '.smap']:
 78            phen_df = pd.read_csv(self.file, sep=sep, header=header)
 79        elif file_extension == '.tsv':
 80            phen_df = pd.read_csv(self.file, sep='\t')
 81        elif file_extension in ['.txt', '.phe', '.pheno']:
 82            phen_df = pd.read_csv(self.file, sep=r'\s+', header=header)
 83        elif file_extension == '.phen':
 84            with open(self.file, 'r') as f:
 85                contents = f.readlines()
 86            phen_dict = {line.split()[0]: line.split()[1].strip() for line in contents[1:]}
 87            phen_df = pd.DataFrame({'samples': list(phen_dict.keys()), 'phenotype': list(phen_dict.values())})
 88        else:
 89            raise ValueError(
 90                f"Unsupported file extension {file_extension}. Supported extensions: {SUPPORTED_EXTENSIONS}."
 91            )
 92
 93        phen_df.rename(columns={phen_df.columns[samples_idx]: 'samples'}, inplace=True)
 94
 95        if samples_idx != 0:
 96            cols = ['samples'] + [col for col in phen_df.columns if col != 'samples']
 97            phen_df = phen_df[cols]
 98
 99        if phen_names is not None:
100            if drop:
101                non_phen_columns = list(set(phen_df.columns) - set(['samples']+phen_names))
102                phen_df = phen_df.drop(non_phen_columns, axis=1)
103
104            phenotype_col_count = phen_df.shape[1] - 1
105            if phenotype_col_count == len(phen_names):
106                phen_df.columns.values[1:] = phen_names
107            else:
108                raise ValueError(f"Mismatch between number of phenotype columns ({phenotype_col_count}) "
109                                 f"and length of `phen_names` ({len(phen_names)}).")
110
111        return MultiPhenotypeObject(phen_df=phen_df)

Read data from file and construct a MultiPhenotypeObject.

Arguments:
  • samples_idx (int, default=0): Index of the column containing sample identifiers. Default is 0, assuming the first column contains sample identifiers.
  • phen_names (list of str, optional): List of phenotype column names. If provided, these columns will be renamed to the specified names.
  • sep (str, default=','): The delimiter used for .csv, .map, and .smap files. .tsv files are always read as tab-delimited, and .txt, .phe, and .pheno files as whitespace-delimited, regardless of sep.
  • header (int, default=0): Row index to use as the column names. By default, uses the first row (header=0). Set to None if column names are provided explicitly.
  • drop (bool, default=False): If True, removes columns not listed in phen_names (except the samples column).
Returns:

MultiPhenotypeObject: A multi-phenotype object instance.

class PhenotypeReader(snputils.phenotype.io.read.base.PhenotypeBaseReader):
 11class PhenotypeReader(PhenotypeBaseReader):
 12    """
 13    Reader for single-trait phenotype files (any extension; common: .txt, .phe, .pheno).
 14
 15    Expected format (headered, whitespace-delimited):
 16      - Must include `IID` (optionally preceded by `FID`)
 17      - First phenotype column after `IID` is used by default
 18    """
 19
 20    def __init__(self, file: Union[str, Path]) -> None:
 21        super().__init__(file)
 22
 23    @property
 24    def file(self) -> Path:
 25        return Path(self._file)
 26
 27    @staticmethod
 28    def _has_header_with_iid(file_path: Path) -> bool:
 29        with open(file_path, "r", encoding="utf-8") as handle:
 30            for raw_line in handle:
 31                line = raw_line.strip()
 32                if not line:
 33                    continue
 34                tokens = line.split()
 35                return any(token.lstrip("#").upper() == "IID" for token in tokens)
 36        raise ValueError("Empty phenotype file.")
 37
 38    @staticmethod
 39    def _resolve_column(columns, normalized_columns, requested: str) -> Optional[str]:
 40        requested_norm = str(requested).lstrip("#").upper()
 41        for col, col_norm in zip(columns, normalized_columns):
 42            if str(col) == str(requested) or col_norm == requested_norm:
 43                return str(col)
 44        return None
 45
 46    def read(
 47        self,
 48        phenotype_col: Optional[str] = None,
 49        quantitative: Optional[bool] = None,
 50    ) -> PhenotypeObject:
 51        file_path = self.file
 52        if not file_path.exists():
 53            raise FileNotFoundError(f"Phenotype file not found: '{file_path}'")
 54
 55        has_iid_header = self._has_header_with_iid(file_path)
 56        if has_iid_header:
 57            phen_df = pd.read_csv(file_path, sep=r"\s+", dtype=str)
 58        else:
 59            warnings.warn(
 60                (
 61                    "Phenotype file has no header/IID column. Legacy 3-column parsing "
 62                    "(FID IID PHENO) is deprecated; please switch to a headered format."
 63                ),
 64                UserWarning,
 65                stacklevel=2,
 66            )
 67            legacy = pd.read_csv(file_path, header=None, sep=r"\s+", dtype=str)
 68            if legacy.shape[1] < 3:
 69                raise ValueError(
 70                    "Legacy phenotype parsing expects at least 3 columns: FID IID PHENO."
 71                )
 72            phen_df = legacy.iloc[:, :3].copy()
 73            phen_df.columns = ["FID", "IID", "PHENO"]
 74
 75        if phen_df.empty:
 76            raise ValueError("Empty phenotype file.")
 77
 78        columns = [str(col) for col in phen_df.columns]
 79        normalized_columns = [col.lstrip("#").upper() for col in columns]
 80        if "IID" not in normalized_columns:
 81            raise ValueError("Phenotype file must include an IID column in the header.")
 82        iid_col = columns[normalized_columns.index("IID")]
 83
 84        iid_series = phen_df[iid_col].astype(str).str.strip()
 85        if iid_series.eq("").any():
 86            raise ValueError("Phenotype IID column contains empty values.")
 87        if iid_series.duplicated().any():
 88            raise ValueError("Phenotype IID values must be unique.")
 89
 90        if phenotype_col is not None:
 91            resolved = self._resolve_column(columns, normalized_columns, phenotype_col)
 92            if resolved is None:
 93                raise ValueError(
 94                    f"Phenotype column '{phenotype_col}' not found in header: {columns}"
 95                )
 96            target_col = resolved
 97        else:
 98            iid_idx = normalized_columns.index("IID")
 99            if iid_idx + 1 >= len(columns):
100                raise ValueError(
101                    "Phenotype file must include at least one phenotype column after IID."
102                )
103            target_col = columns[iid_idx + 1]
104
105        values = pd.to_numeric(phen_df[target_col], errors="coerce")
106        if values.isna().any():
107            bad_examples = phen_df.loc[values.isna(), target_col].astype(str).head(5).tolist()
108            raise ValueError(
109                f"Phenotype column '{target_col}' contains non-numeric or missing values: "
110                f"{bad_examples}"
111            )
112
113        phenotype_name = str(target_col).lstrip("#")
114        return PhenotypeObject(
115            samples=iid_series.tolist(),
116            values=values.to_numpy(),
117            phenotype_name=phenotype_name,
118            quantitative=quantitative,
119        )

Reader for single-trait phenotype files (any extension; common: .txt, .phe, .pheno).

Expected format (headered, whitespace-delimited):

  • Must include IID (optionally preceded by FID)
  • First phenotype column after IID is used by default
PhenotypeReader(file: str | pathlib.Path)
20    def __init__(self, file: Union[str, Path]) -> None:
21        super().__init__(file)
file: pathlib.Path
23    @property
24    def file(self) -> Path:
25        return Path(self._file)

Retrieve file.

Returns:

pathlib.Path: Path to the file containing phenotype data.

def read( self, phenotype_col: str | None = None, quantitative: bool | None = None) -> PhenotypeObject:
 46    def read(
 47        self,
 48        phenotype_col: Optional[str] = None,
 49        quantitative: Optional[bool] = None,
 50    ) -> PhenotypeObject:
 51        file_path = self.file
 52        if not file_path.exists():
 53            raise FileNotFoundError(f"Phenotype file not found: '{file_path}'")
 54
 55        has_iid_header = self._has_header_with_iid(file_path)
 56        if has_iid_header:
 57            phen_df = pd.read_csv(file_path, sep=r"\s+", dtype=str)
 58        else:
 59            warnings.warn(
 60                (
 61                    "Phenotype file has no header/IID column. Legacy 3-column parsing "
 62                    "(FID IID PHENO) is deprecated; please switch to a headered format."
 63                ),
 64                UserWarning,
 65                stacklevel=2,
 66            )
 67            legacy = pd.read_csv(file_path, header=None, sep=r"\s+", dtype=str)
 68            if legacy.shape[1] < 3:
 69                raise ValueError(
 70                    "Legacy phenotype parsing expects at least 3 columns: FID IID PHENO."
 71                )
 72            phen_df = legacy.iloc[:, :3].copy()
 73            phen_df.columns = ["FID", "IID", "PHENO"]
 74
 75        if phen_df.empty:
 76            raise ValueError("Empty phenotype file.")
 77
 78        columns = [str(col) for col in phen_df.columns]
 79        normalized_columns = [col.lstrip("#").upper() for col in columns]
 80        if "IID" not in normalized_columns:
 81            raise ValueError("Phenotype file must include an IID column in the header.")
 82        iid_col = columns[normalized_columns.index("IID")]
 83
 84        iid_series = phen_df[iid_col].astype(str).str.strip()
 85        if iid_series.eq("").any():
 86            raise ValueError("Phenotype IID column contains empty values.")
 87        if iid_series.duplicated().any():
 88            raise ValueError("Phenotype IID values must be unique.")
 89
 90        if phenotype_col is not None:
 91            resolved = self._resolve_column(columns, normalized_columns, phenotype_col)
 92            if resolved is None:
 93                raise ValueError(
 94                    f"Phenotype column '{phenotype_col}' not found in header: {columns}"
 95                )
 96            target_col = resolved
 97        else:
 98            iid_idx = normalized_columns.index("IID")
 99            if iid_idx + 1 >= len(columns):
100                raise ValueError(
101                    "Phenotype file must include at least one phenotype column after IID."
102                )
103            target_col = columns[iid_idx + 1]
104
105        values = pd.to_numeric(phen_df[target_col], errors="coerce")
106        if values.isna().any():
107            bad_examples = phen_df.loc[values.isna(), target_col].astype(str).head(5).tolist()
108            raise ValueError(
109                f"Phenotype column '{target_col}' contains non-numeric or missing values: "
110                f"{bad_examples}"
111            )
112
113        phenotype_name = str(target_col).lstrip("#")
114        return PhenotypeObject(
115            samples=iid_series.tolist(),
116            values=values.to_numpy(),
117            phenotype_name=phenotype_name,
118            quantitative=quantitative,
119        )

Abstract method to read data from the provided file.

Subclasses must implement this method to read and parse the data. The implementation should construct an instance of snputils.phenotype.genobj.MultiPhenotypeObject or snputils.phenotype.genobj.PhenotypeObject based on the read data.

def load_dataset( name: str, chromosomes: List[str] | List[int] | str | int, variants_ids: List[str] | None = None, sample_ids: List[str] | None = None, verbose: bool = True, **read_kwargs) -> SNPObject:
 34def load_dataset(
 35        name: str,
 36        chromosomes: Union[List[str], List[int], str, int],
 37        variants_ids: Optional[List[str]] = None,
 38        sample_ids: Optional[List[str]] = None,
 39        verbose: bool = True,
 40        **read_kwargs
 41) -> SNPObject:
 42    """
 43    Load a genome dataset.
 44
 45    Args:
 46        name (str): Name of the dataset to load. Call `available_datasets_list()` to get the list of available datasets.
 47        chromosomes (List[str] | List[int] | str | int): Chromosomes to load.
 48        variants_ids (List[str]): List of variant IDs to load.
 49        sample_ids (List[str]): List of sample IDs to load.
 50        verbose (bool): Whether to show progress.
 51        **read_kwargs: Keyword arguments to pass to `PGENReader.read()`.
 52
 53    Returns:
 54        SNPObject: SNPObject containing the loaded dataset.
 55    """
 56    if isinstance(chromosomes, (str, int)):
 57        chromosomes = [chromosomes]
 58    chromosomes = [str(chr).lower().replace("chr", "") for chr in chromosomes]
 59
 60    if variants_ids is not None:
 61        variants_ids_txt = tempfile.NamedTemporaryFile(mode='w')
 62        variants_ids_txt.write("\n".join(variants_ids))
 63        variants_ids_txt.flush()
 64
 65    if sample_ids is not None:
 66        sample_ids_txt = tempfile.NamedTemporaryFile(mode='w')
 67        sample_ids_txt.write("\n".join(sample_ids))
 68        sample_ids_txt.flush()
 69
 70    merge_list_txt = tempfile.NamedTemporaryFile(mode='w')
 71
 72    data_home = get_data_home()
 73
 74    if name == "1kgp":
 75        data_path = data_home / name
 76        data_path.mkdir(parents=True, exist_ok=True)
 77        for chr in chromosomes:
 78            chr_path = data_path / chr_urls[name][chr]
 79            if not Path(chr_path).exists():
 80                log.info(f"Downloading chromosome {chr}...")
 81                download_url(f"{base_urls[name]}/{chr_urls[name][chr]}", chr_path, show_progress=verbose)
 82            else:
 83                log.info(f"Chromosome {chr} already exists. Skipping download.")
 84
 85            # Filter and convert to PGEN
 86            log.info(f"Processing chromosome {chr}...")
 87            out_file = chr_urls[name][chr].replace('.vcf.gz', '')
 88            execute_plink_cmd(
 89                ["--vcf", f"{chr_urls[name][chr]}"]
 90                + (["--keep", sample_ids_txt.name] if sample_ids is not None else [])
 91                + (["--extract", variants_ids_txt.name] if variants_ids is not None else [])
 92                + [
 93                    "--set-missing-var-ids", "@:#",
 94                    "--make-pgen",
 95                    "--out", out_file,
 96                ], cwd=data_path)
 97            merge_list_txt.write(f"{out_file}\n")
 98
 99        if len(chromosomes) > 1:
100            # Merge the PGEN files into single PGEN fileset
101            log.info("Merging PGEN files...")
102            merge_list_txt.flush()
103            print(f"Merge list file contents: {open(merge_list_txt.name, 'r').read()}")
104            execute_plink_cmd(["--pmerge-list", merge_list_txt.name, "--make-pgen", "--out", "1kgp"],
105                              cwd=data_path)
106        else:
107            # Rename the single PGEN file
108            for ext in ["pgen", "psam", "pvar"]:
109                Path(data_path / f"{out_file}.{ext}").rename(data_path / f"1kgp.{ext}")
110
111        # Read PGEN fileset with PGENReader into SNPObject
112        log.info("Reading PGEN fileset...")
113        snpobj = PGENReader(data_path / "1kgp").read(**read_kwargs)
114    else:
115        raise NotImplementedError(f"Dataset {name} not implemented.")
116
117    if variants_ids is not None:
118        variants_ids_txt.close()
119    if sample_ids is not None:
120        sample_ids_txt.close()
121    merge_list_txt.close()
122
123    return snpobj

Load a genome dataset.

Arguments:
  • name (str): Name of the dataset to load. Call available_datasets_list() to get the list of available datasets.
  • chromosomes (List[str] | List[int] | str | int): Chromosomes to load.
  • variants_ids (List[str]): List of variant IDs to load.
  • sample_ids (List[str]): List of sample IDs to load.
  • verbose (bool): Whether to show progress.
  • **read_kwargs: Keyword arguments to pass to PGENReader.read().
Returns:

SNPObject: SNPObject containing the loaded dataset.