snputils
from importlib import import_module
from importlib.metadata import PackageNotFoundError, version
from typing import Dict, Tuple

# Report the installed distribution's version; when no package metadata is
# registered for "snputils", fall back to a placeholder string.
try:
    __version__ = version("snputils")
except PackageNotFoundError:
    __version__ = "unknown"

# Lookup table for lazily resolved public names:
#     public name -> (relative submodule, attribute name inside that submodule)
# Every entry generated below is exported under the same name it has in its
# submodule. Group order and name order determine the order of `__all__`.
_LAZY_ATTRS: Dict[str, Tuple[str, str]] = {
    public_name: (submodule, public_name)
    for submodule, public_names in (
        (
            ".snp",
            (
                "SNPObject",
                "GRGObject",
                "SNPReader",
                "BEDReader",
                "GRGReader",
                "GRGWriter",
                "PGENReader",
                "VCFReader",
                "BEDWriter",
                "PGENWriter",
                "VCFWriter",
                "read_snp",
                "read_bed",
                "read_pgen",
                "read_vcf",
                "read_grg",
            ),
        ),
        (
            ".ancestry",
            (
                "LocalAncestryObject",
                "GlobalAncestryObject",
                "MSPReader",
                "MSPWriter",
                "AdmixtureMappingVCFWriter",
                "AdmixtureReader",
                "AdmixtureWriter",
                "read_lai",
                "read_msp",
                "read_adm",
                "read_admixture",
            ),
        ),
        (
            ".ibd",
            (
                "IBDObject",
                "read_ibd",
                "HapIBDReader",
                "AncIBDReader",
                "IBDReader",
            ),
        ),
        (
            ".phenotype",
            (
                "MultiPhenotypeObject",
                "PhenotypeObject",
                "MultiPhenReader",
                "PhenotypeReader",
            ),
        ),
        (
            ".datasets",
            ("load_dataset",),
        ),
    )
    for public_name in public_names
}
# An empty attribute name marks an entry whose value is the submodule itself.
_LAZY_ATTRS["viz"] = (".visualization", "")

__all__ = list(_LAZY_ATTRS.keys())


def __getattr__(name):
    """
    Resolve a lazily exported name the first time it is accessed on the package.

    Args:
        name (str):
            The attribute name that was not found in the module namespace.

    Returns:
        The attribute fetched from the mapped submodule, or the submodule itself
        when the mapped attribute name is the empty string.

    Raises:
        AttributeError: If `name` is not listed in `_LAZY_ATTRS`.
    """
    if name not in _LAZY_ATTRS:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    submodule_path, member = _LAZY_ATTRS[name]
    submodule = import_module(submodule_path, package=__name__)

    if member:
        resolved = getattr(submodule, member)
    else:
        resolved = submodule

    # Store the result in the module namespace so that later lookups find it
    # directly and this hook is not invoked again for the same name.
    globals()[name] = resolved
    return resolved


def __dir__():
    """
    List the names available on the package.

    Returns:
        **list of str:**
            The sorted union of the current module globals and the names in `__all__`.
    """
    return sorted({*globals(), *__all__})
21class SNPObject: 22 """ 23 A class for Single Nucleotide Polymorphism (SNP) data, with optional support for 24 SNP-level Local Ancestry Information (LAI). 25 """ 26 def __init__( 27 self, 28 calldata_gt: Optional[np.ndarray] = None, 29 samples: Optional[np.ndarray] = None, 30 variants_ref: Optional[np.ndarray] = None, 31 variants_alt: Optional[np.ndarray] = None, 32 variants_chrom: Optional[np.ndarray] = None, 33 variants_filter_pass: Optional[np.ndarray] = None, 34 variants_id: Optional[np.ndarray] = None, 35 variants_pos: Optional[np.ndarray] = None, 36 variants_qual: Optional[np.ndarray] = None, 37 calldata_lai: Optional[np.ndarray] = None, 38 ancestry_map: Optional[Dict[str, str]] = None 39 ) -> None: 40 """ 41 Args: 42 calldata_gt (array, optional): 43 An array containing genotype data for each sample. This array can be either 2D with shape 44 `(n_snps, n_samples)` if the paternal and maternal strands are summed, or 3D with shape 45 `(n_snps, n_samples, 2)` if the strands are kept separate. 46 samples (array of shape (n_samples,), optional): 47 An array containing unique sample identifiers. 48 variants_ref (array of shape (n_snps,), optional): 49 An array containing the reference allele for each SNP. 50 variants_alt (array of shape (n_snps,), optional): 51 An array containing the alternate allele for each SNP. 52 variants_chrom (array of shape (n_snps,), optional): 53 An array containing the chromosome for each SNP. 54 variants_filter_pass (array of shape (n_snps,), optional): 55 An array indicating whether each SNP passed control checks. 56 variants_id (array of shape (n_snps,), optional): 57 An array containing unique identifiers (IDs) for each SNP. 58 variants_pos (array of shape (n_snps,), optional): 59 An array containing the chromosomal positions for each SNP. 60 variants_qual (array of shape (n_snps,), optional): 61 An array containing the Phred-scaled quality score for each SNP. 
62 calldata_lai (array, optional): 63 An array containing the ancestry for each SNP. This array can be either 2D with shape 64 `(n_snps, n_samples*2)`, or 3D with shape (n_snps, n_samples, 2). 65 ancestry_map (dict of str to str, optional): 66 A dictionary mapping ancestry codes to region names. 67 """ 68 self.__calldata_gt = calldata_gt 69 self.__samples = samples 70 self.__variants_ref = variants_ref 71 self.__variants_alt = variants_alt 72 self.__variants_chrom = variants_chrom 73 self.__variants_filter_pass = variants_filter_pass 74 self.__variants_id = variants_id 75 self.__variants_pos = variants_pos 76 self.__variants_qual = variants_qual 77 self.__calldata_lai = calldata_lai 78 self.__ancestry_map = ancestry_map 79 80 self._sanity_check() 81 82 def __getitem__(self, key: str) -> Any: 83 """ 84 To access an attribute of the class using the square bracket notation, 85 similar to a dictionary. 86 """ 87 try: 88 return getattr(self, key) 89 except: 90 raise KeyError(f'Invalid key: {key}.') 91 92 def __setitem__(self, key: str, value: Any): 93 """ 94 To set an attribute of the class using the square bracket notation, 95 similar to a dictionary. 96 """ 97 try: 98 setattr(self, key, value) 99 except: 100 raise KeyError(f'Invalid key: {key}.') 101 102 @property 103 def calldata_gt(self) -> np.ndarray: 104 """ 105 Retrieve `calldata_gt`. 106 107 Returns: 108 **array:** 109 An array containing genotype data for each sample. This array can be either 2D with shape 110 `(n_snps, n_samples)` if the paternal and maternal strands are summed, or 3D with shape 111 `(n_snps, n_samples, 2)` if the strands are kept separate. 112 """ 113 return self.__calldata_gt 114 115 @calldata_gt.setter 116 def calldata_gt(self, x: np.ndarray): 117 """ 118 Update `calldata_gt`. 119 """ 120 self.__calldata_gt = x 121 122 @property 123 def samples(self) -> Optional[np.ndarray]: 124 """ 125 Retrieve `samples`. 
126 127 Returns: 128 **array of shape (n_samples,):** 129 An array containing unique sample identifiers. 130 """ 131 return self.__samples 132 133 @samples.setter 134 def samples(self, x: Union[List, np.ndarray]): 135 """ 136 Update `samples`. 137 """ 138 self.__samples = np.asarray(x) 139 140 @property 141 def variants_ref(self) -> Optional[np.ndarray]: 142 """ 143 Retrieve `variants_ref`. 144 145 Returns: 146 **array of shape (n_snps,):** An array containing the reference allele for each SNP. 147 """ 148 return self.__variants_ref 149 150 @variants_ref.setter 151 def variants_ref(self, x: np.ndarray): 152 """ 153 Update `variants_ref`. 154 """ 155 self.__variants_ref = x 156 157 @property 158 def variants_alt(self) -> Optional[np.ndarray]: 159 """ 160 Retrieve `variants_alt`. 161 162 Returns: 163 **array of shape (n_snps,):** An array containing the alternate allele for each SNP. 164 """ 165 return self.__variants_alt 166 167 @variants_alt.setter 168 def variants_alt(self, x: np.ndarray): 169 """ 170 Update `variants_alt`. 171 """ 172 self.__variants_alt = x 173 174 @property 175 def variants_chrom(self) -> Optional[np.ndarray]: 176 """ 177 Retrieve `variants_chrom`. 178 179 Returns: 180 **array of shape (n_snps,):** An array containing the chromosome for each SNP. 181 """ 182 return self.__variants_chrom 183 184 @variants_chrom.setter 185 def variants_chrom(self, x: np.ndarray): 186 """ 187 Update `variants_chrom`. 188 """ 189 self.__variants_chrom = x 190 191 @property 192 def variants_filter_pass(self) -> Optional[np.ndarray]: 193 """ 194 Retrieve `variants_filter_pass`. 195 196 Returns: 197 **array of shape (n_snps,):** An array indicating whether each SNP passed control checks. 198 """ 199 return self.__variants_filter_pass 200 201 @variants_filter_pass.setter 202 def variants_filter_pass(self, x: np.ndarray): 203 """ 204 Update `variants_filter_pass`. 
205 """ 206 self.__variants_filter_pass = x 207 208 @property 209 def variants_id(self) -> Optional[np.ndarray]: 210 """ 211 Retrieve `variants_id`. 212 213 Returns: 214 **array of shape (n_snps,):** An array containing unique identifiers (IDs) for each SNP. 215 """ 216 return self.__variants_id 217 218 @variants_id.setter 219 def variants_id(self, x: np.ndarray): 220 """ 221 Update `variants_id`. 222 """ 223 self.__variants_id = x 224 225 @property 226 def variants_pos(self) -> Optional[np.ndarray]: 227 """ 228 Retrieve `variants_pos`. 229 230 Returns: 231 **array of shape (n_snps,):** An array containing the chromosomal positions for each SNP. 232 """ 233 return self.__variants_pos 234 235 @variants_pos.setter 236 def variants_pos(self, x: np.ndarray): 237 """ 238 Update `variants_pos`. 239 """ 240 self.__variants_pos = x 241 242 @property 243 def variants_qual(self) -> Optional[np.ndarray]: 244 """ 245 Retrieve `variants_qual`. 246 247 Returns: 248 **array of shape (n_snps,):** An array containing the Phred-scaled quality score for each SNP. 249 """ 250 return self.__variants_qual 251 252 @variants_qual.setter 253 def variants_qual(self, x: np.ndarray): 254 """ 255 Update `variants_qual`. 256 """ 257 self.__variants_qual = x 258 259 @property 260 def calldata_lai(self) -> Optional[np.ndarray]: 261 """ 262 Retrieve `calldata_lai`. 263 264 Returns: 265 **array:** 266 An array containing the ancestry for each SNP. This array can be either 2D with shape 267 `(n_snps, n_samples*2)`, or 3D with shape (n_snps, n_samples, 2). 268 """ 269 return self.__calldata_lai 270 271 @calldata_lai.setter 272 def calldata_lai(self, x: np.ndarray): 273 """ 274 Update `calldata_lai`. 275 """ 276 self.__calldata_lai = x 277 278 @property 279 def ancestry_map(self) -> Optional[Dict[str, str]]: 280 """ 281 Retrieve `ancestry_map`. 282 283 Returns: 284 **dict of str to str:** A dictionary mapping ancestry codes to region names. 
285 """ 286 return self.__ancestry_map 287 288 @ancestry_map.setter 289 def ancestry_map(self, x): 290 """ 291 Update `ancestry_map`. 292 """ 293 self.__ancestry_map = x 294 295 @property 296 def n_samples(self) -> int: 297 """ 298 Retrieve `n_samples`. 299 300 Returns: 301 **int:** The total number of samples. 302 """ 303 if self.__samples is not None: 304 return len(self.__samples) 305 elif self.__calldata_gt is not None: 306 return self.__calldata_gt.shape[1] 307 elif self.__calldata_lai is not None: 308 if self.__calldata_lai.ndim == 2: 309 return self.__calldata_lai.shape[1] // 2 310 elif self.__calldata_lai.ndim == 3: 311 return self.__calldata_lai.shape[1] 312 else: 313 raise ValueError("Unable to determine the total number of samples: no relevant data is available.") 314 315 @property 316 def n_snps(self) -> int: 317 """ 318 Retrieve `n_snps`. 319 320 Returns: 321 **int:** The total number of SNPs. 322 """ 323 # List of attributes that can indicate the number of SNPs 324 potential_attributes = [ 325 self.__calldata_gt, 326 self.__variants_ref, 327 self.__variants_alt, 328 self.__variants_chrom, 329 self.__variants_filter_pass, 330 self.__variants_id, 331 self.__variants_pos, 332 self.__variants_qual, 333 self.__calldata_lai 334 ] 335 336 # Check each attribute for its first dimension, which corresponds to `n_snps` 337 for attr in potential_attributes: 338 if attr is not None: 339 return attr.shape[0] 340 341 raise ValueError("Unable to determine the total number of SNPs: no relevant data is available.") 342 343 @property 344 def n_chrom(self) -> Optional[int]: 345 """ 346 Retrieve `n_chrom`. 347 348 Returns: 349 **int:** The total number of unique chromosomes in `variants_chrom`. 350 """ 351 if self.variants_chrom is None: 352 warnings.warn("Chromosome data `variants_chrom` is None.") 353 return None 354 355 return len(self.unique_chrom) 356 357 @property 358 def n_ancestries(self) -> int: 359 """ 360 Retrieve `n_ancestries`. 
361 362 Returns: 363 **int:** The total number of unique ancestries. 364 """ 365 if self.__calldata_lai is not None: 366 return len(np.unique(self.__calldata_lai)) 367 else: 368 raise ValueError("Unable to determine the total number of ancestries: no relevant data is available.") 369 370 @property 371 def unique_chrom(self) -> Optional[np.ndarray]: 372 """ 373 Retrieve `unique_chrom`. 374 375 Returns: 376 **array:** The unique chromosome names in `variants_chrom`, preserving their order of appearance. 377 """ 378 if self.variants_chrom is None: 379 warnings.warn("Chromosome data `variants_chrom` is None.") 380 return None 381 382 # Identify unique chromosome names and their first indexes of occurrence 383 _, idx = np.unique(self.variants_chrom, return_index=True) 384 # Return chromosome names sorted by their first occurrence to maintain original order 385 return self.variants_chrom[np.sort(idx)] 386 387 @property 388 def are_strands_summed(self) -> bool: 389 """ 390 Retrieve `are_strands_summed`. 391 392 Returns: 393 **bool:** 394 True if the maternal and paternal strands have been summed together, which is indicated by 395 `calldata_gt` having shape `(n_samples, n_snps)`. False if the strands are stored separately, 396 indicated by `calldata_gt` having shape `(n_samples, n_snps, 2)`. 397 """ 398 if self.calldata_gt is None: 399 warnings.warn("Genotype data `calldata_gt` is None.") 400 return None 401 402 return self.calldata_gt.ndim == 2 403 404 def copy(self) -> SNPObject: 405 """ 406 Create and return a copy of `self`. 407 408 Returns: 409 **SNPObject:** 410 A new instance of the current object. 411 """ 412 return copy.deepcopy(self) 413 414 def keys(self) -> List[str]: 415 """ 416 Retrieve a list of public attribute names for `self`. 417 418 Returns: 419 **list of str:** 420 A list of attribute names, with internal name-mangling removed, 421 for easier reference to public attributes in the instance. 
422 """ 423 return [attr.replace('_SNPObject__', '') for attr in vars(self)] 424 425 def allele_freq( 426 self, 427 sample_labels: Optional[Sequence[Any]] = None, 428 ancestry: Optional[Union[str, int]] = None, 429 laiobj: Optional["LocalAncestryObject"] = None, 430 return_counts: bool = False, 431 as_dataframe: bool = False, 432 ) -> Any: 433 """ 434 Compute per-SNP alternate allele frequencies from `calldata_gt`. 435 436 Args: 437 sample_labels (sequence, optional): 438 Population label per sample. If None, computes cohort-level frequencies. 439 ancestry (str or int, optional): 440 If provided, compute ancestry-masked frequencies using SNP-level LAI. 441 laiobj (LocalAncestryObject, optional): 442 Optional LAI object used when `self.calldata_lai` is not set. 443 return_counts (bool, default=False): 444 If True, also return called-allele counts with the same shape as frequencies. 445 as_dataframe (bool, default=False): 446 If True, return pandas DataFrame output. 447 448 Returns: 449 Frequencies as a NumPy array (or DataFrame if `as_dataframe=True`). 450 If `return_counts=True`, returns `(freq, counts)`. 451 """ 452 if self.calldata_gt is None: 453 raise ValueError("Genotype data `calldata_gt` is None.") 454 455 gt = np.asarray(self.calldata_gt) 456 if gt.ndim not in (2, 3): 457 raise ValueError("'calldata_gt' must be 2D or 3D array") 458 459 n_samples = gt.shape[1] 460 461 grouped_output = sample_labels is not None 462 if sample_labels is None: 463 labels = np.repeat("__all__", n_samples) 464 else: 465 labels = np.asarray(sample_labels) 466 if labels.ndim != 1: 467 labels = labels.ravel() 468 if labels.shape[0] != n_samples: 469 raise ValueError( 470 "'sample_labels' must have length equal to the number of samples in `calldata_gt`." 
471 ) 472 473 calldata_lai = None 474 if ancestry is not None: 475 if self.calldata_lai is not None: 476 calldata_lai = self.calldata_lai 477 elif laiobj is not None: 478 try: 479 converted_lai = laiobj.convert_to_snp_level(snpobject=self, lai_format="3D") 480 calldata_lai = getattr(converted_lai, "calldata_lai", None) 481 except Exception: 482 calldata_lai = None 483 484 if calldata_lai is None: 485 raise ValueError( 486 "Ancestry-specific masking requires SNP-level LAI " 487 "(provide a LocalAncestryObject via 'laiobj' or ensure 'self.calldata_lai' is set)." 488 ) 489 490 afs, counts, pops = aggregate_pop_allele_freq( 491 calldata_gt=gt, 492 sample_labels=labels, 493 ancestry=ancestry, 494 calldata_lai=calldata_lai, 495 ) 496 497 if grouped_output: 498 freq_out = afs 499 count_out = counts 500 if as_dataframe: 501 import pandas as pd 502 503 freq_out = pd.DataFrame(afs, columns=pops) 504 count_out = pd.DataFrame(counts, columns=pops) 505 else: 506 freq_out = afs[:, 0] 507 count_out = counts[:, 0] 508 if as_dataframe: 509 import pandas as pd 510 511 freq_out = pd.DataFrame({"allele_freq": freq_out}) 512 count_out = pd.DataFrame({"called_alleles": count_out}) 513 514 if return_counts: 515 return freq_out, count_out 516 return freq_out 517 518 def sum_strands(self, inplace: bool = False) -> Optional['SNPObject']: 519 """ 520 Sum paternal and maternal strands. 521 522 Args: 523 inplace (bool, default=False): 524 If True, modifies `self` in place. If False, returns a new `SNPObject` with the variants 525 filtered. Default is False. 526 527 Returns: 528 **Optional[SNPObject]:** 529 A new `SNPObject` with summed strands if `inplace=False`. 530 If `inplace=True`, modifies `self` in place and returns None. 
531 """ 532 if self.calldata_gt is None: 533 warnings.warn("Genotype data `calldata_gt` is None.") 534 return None if not inplace else self 535 536 if self.are_strands_summed: 537 warnings.warn("Genotype data `calldata_gt` is already summed.") 538 return self if inplace else self.copy() 539 540 if inplace: 541 self.calldata_gt = self.calldata_gt.sum(axis=2, dtype=np.int8) 542 return self 543 else: 544 snpobj = self.copy() 545 snpobj.calldata_gt = self.calldata_gt.sum(axis=2, dtype=np.int8) 546 return snpobj 547 548 def filter_variants( 549 self, 550 chrom: Optional[Union[str, Sequence[str], np.ndarray, None]] = None, 551 pos: Optional[Union[int, Sequence[int], np.ndarray, None]] = None, 552 indexes: Optional[Union[int, Sequence[int], np.ndarray, None]] = None, 553 include: bool = True, 554 inplace: bool = False 555 ) -> Optional['SNPObject']: 556 """ 557 Filter variants based on specified chromosome names, variant positions, or variant indexes. 558 559 This method updates the `calldata_gt`, `variants_ref`, `variants_alt`, 560 `variants_chrom`, `variants_filter_pass`, `variants_id`, `variants_pos`, 561 `variants_qual`, and `lai` attributes to include or exclude the specified variants. The filtering 562 criteria can be based on chromosome names, variant positions, or indexes. If multiple 563 criteria are provided, their union is used for filtering. The order of the variants is preserved. 564 565 Negative indexes are supported and follow 566 [NumPy's indexing conventions](https://numpy.org/doc/stable/user/basics.indexing.html). 567 568 Args: 569 chrom (str or array_like of str, optional): 570 Chromosome(s) to filter variants by. Can be a single chromosome as a string or a sequence 571 of chromosomes. If both `chrom` and `pos` are provided, they must either have matching lengths 572 (pairing each chromosome with a position) or `chrom` should be a single value that applies to 573 all positions in `pos`. Default is None. 
574 pos (int or array_like of int, optional): 575 Position(s) to filter variants by. Can be a single position as an integer or a sequence of positions. 576 If `chrom` is also provided, `pos` should either match `chrom` in length or `chrom` should be a 577 single value. Default is None. 578 indexes (int or array_like of int, optional): 579 Index(es) of the variants to include or exclude. Can be a single index or a sequence 580 of indexes. Negative indexes are supported. Default is None. 581 include (bool, default=True): 582 If True, includes only the specified variants. If False, excludes the specified 583 variants. Default is True. 584 inplace (bool, default=False): 585 If True, modifies `self` in place. If False, returns a new `SNPObject` with the variants 586 filtered. Default is False. 587 588 Returns: 589 **Optional[SNPObject]:** 590 A new `SNPObject` with the specified variants filtered if `inplace=False`. 591 If `inplace=True`, modifies `self` in place and returns None. 592 """ 593 if chrom is None and pos is None and indexes is None: 594 raise ValueError("At least one of 'chrom', 'pos', or 'indexes' must be provided.") 595 596 n_snps = self.n_snps 597 598 # Convert inputs to arrays for consistency 599 chrom = np.atleast_1d(chrom) if chrom is not None else None 600 pos = np.atleast_1d(pos) if pos is not None else None 601 indexes = np.atleast_1d(indexes) if indexes is not None else None 602 603 # Validate chrom and pos lengths if both are provided 604 if chrom is not None and pos is not None: 605 if len(chrom) != len(pos) and len(chrom) > 1: 606 raise ValueError( 607 "When both 'chrom' and 'pos' are provided, they must either be of the same length " 608 "or 'chrom' must be a single value." 
609 ) 610 611 # Create a mask for chromosome and position filtering 612 mask_combined = np.zeros(n_snps, dtype=bool) 613 if chrom is not None and pos is not None: 614 if len(chrom) == 1: 615 # Apply single chromosome to all positions in `pos` 616 mask_combined = (self['variants_chrom'] == chrom[0]) & np.isin(self['variants_pos'], pos) 617 else: 618 # Vectorized pair matching for chrom and pos 619 query_pairs = np.array( 620 list(zip(chrom, pos)), 621 dtype=[ 622 ('chrom', self['variants_chrom'].dtype), 623 ('pos', self['variants_pos'].dtype) 624 ] 625 ) 626 data_pairs = np.array( 627 list(zip(self['variants_chrom'], self['variants_pos'])), 628 dtype=[ 629 ('chrom', self['variants_chrom'].dtype), 630 ('pos', self['variants_pos'].dtype) 631 ] 632 ) 633 mask_combined = np.isin(data_pairs, query_pairs) 634 635 elif chrom is not None: 636 # Only chromosome filtering 637 mask_combined = np.isin(self['variants_chrom'], chrom) 638 elif pos is not None: 639 # Only position filtering 640 mask_combined = np.isin(self['variants_pos'], pos) 641 642 # Create mask based on indexes if provided 643 if indexes is not None: 644 # Validate indexes, allowing negative indexes 645 out_of_bounds_indexes = indexes[(indexes < -n_snps) | (indexes >= n_snps)] 646 if out_of_bounds_indexes.size > 0: 647 raise ValueError(f"One or more sample indexes are out of bounds.") 648 649 # Handle negative indexes and check for out-of-bounds indexes 650 adjusted_indexes = np.mod(indexes, n_snps) 651 652 # Create mask for specified indexes 653 mask_indexes = np.zeros(n_snps, dtype=bool) 654 mask_indexes[adjusted_indexes] = True 655 656 # Combine with `chrom` and `pos` mask using logical OR (union of all specified criteria) 657 mask_combined = mask_combined | mask_indexes 658 659 # Invert mask if `include` is False 660 if not include: 661 mask_combined = ~mask_combined 662 663 # Define keys to filter 664 keys = [ 665 'calldata_gt', 'variants_ref', 'variants_alt', 'variants_chrom', 'variants_filter_pass', 666 
'variants_id', 'variants_pos', 'variants_qual', 'calldata_lai' 667 ] 668 669 # Apply filtering based on inplace parameter 670 if inplace: 671 for key in keys: 672 if self[key] is not None: 673 if self[key].ndim > 1: 674 self[key] = np.asarray(self[key])[mask_combined, ...] 675 else: 676 self[key] = np.asarray(self[key])[mask_combined] 677 678 return None 679 else: 680 # Create A new `SNPObject` with filtered data 681 snpobj = self.copy() 682 for key in keys: 683 if snpobj[key] is not None: 684 if snpobj[key].ndim > 1: 685 snpobj[key] = np.asarray(snpobj[key])[mask_combined, ...] 686 else: 687 snpobj[key] = np.asarray(snpobj[key])[mask_combined] 688 689 return snpobj 690 691 def filter_samples( 692 self, 693 samples: Optional[Union[str, Sequence[str], np.ndarray, None]] = None, 694 indexes: Optional[Union[int, Sequence[int], np.ndarray, None]] = None, 695 include: bool = True, 696 reorder: bool = False, 697 inplace: bool = False 698 ) -> Optional['SNPObject']: 699 """ 700 Filter samples based on specified names or indexes. 701 702 This method updates the `samples` and `calldata_gt` attributes to include or exclude the specified 703 samples. The order of the samples is preserved. Set `reorder=True` to match the ordering of the 704 provided `samples` and/or `indexes` lists when including. 705 706 If both samples and indexes are provided, any sample matching either a name in samples or an index in 707 indexes will be included or excluded. 708 709 This method allows inclusion or exclusion of specific samples by their names or 710 indexes. When both sample names and indexes are provided, the union of the specified samples 711 is used. Negative indexes are supported and follow 712 [NumPy's indexing conventions](https://numpy.org/doc/stable/user/basics.indexing.html). 713 714 Args: 715 samples (str or array_like of str, optional): 716 Name(s) of the samples to include or exclude. Can be a single sample name or a 717 sequence of sample names. Default is None. 
718 indexes (int or array_like of int, optional): 719 Index(es) of the samples to include or exclude. Can be a single index or a sequence 720 of indexes. Negative indexes are supported. Default is None. 721 include (bool, default=True): 722 If True, includes only the specified samples. If False, excludes the specified 723 samples. Default is True. 724 inplace (bool, default=False): 725 If True, modifies `self` in place. If False, returns a new `SNPObject` with the samples 726 filtered. Default is False. 727 728 Returns: 729 **Optional[SNPObject]:** 730 A new `SNPObject` with the specified samples filtered if `inplace=False`. 731 If `inplace=True`, modifies `self` in place and returns None. 732 """ 733 if samples is None and indexes is None: 734 raise ValueError("At least one of 'samples' or 'indexes' must be provided.") 735 736 n_samples = self.n_samples 737 sample_names = np.array(self['samples']) 738 739 # Create mask based on sample names 740 if samples is not None: 741 samples = np.asarray(samples).ravel() 742 mask_samples = np.isin(sample_names, samples) 743 missing_samples = samples[~np.isin(samples, sample_names)] 744 if missing_samples.size > 0: 745 raise ValueError(f"The following specified samples were not found: {missing_samples.tolist()}") 746 else: 747 mask_samples = np.zeros(n_samples, dtype=bool) 748 749 # Create mask based on sample indexes 750 if indexes is not None: 751 indexes = np.asarray(indexes).ravel() 752 753 # Validate indexes, allowing negative indexes 754 out_of_bounds_indexes = indexes[(indexes < -n_samples) | (indexes >= n_samples)] 755 if out_of_bounds_indexes.size > 0: 756 raise ValueError(f"One or more sample indexes are out of bounds.") 757 758 # Handle negative indexes 759 adjusted_indexes = np.mod(indexes, n_samples) 760 761 mask_indexes = np.zeros(n_samples, dtype=bool) 762 mask_indexes[adjusted_indexes] = True 763 else: 764 mask_indexes = np.zeros(n_samples, dtype=bool) 765 766 # Combine masks using logical OR (union of samples) 
767 mask_combined = mask_samples | mask_indexes 768 769 if not include: 770 mask_combined = ~mask_combined 771 772 # If requested, compute an ordering of selected samples that follows the provided lists. 773 ordered_indices = None 774 if include and reorder: 775 sel_indices = np.where(mask_combined)[0] 776 ordered_list: List[int] = [] 777 added = np.zeros(n_samples, dtype=bool) 778 779 # Prioritize the order in `samples` 780 if samples is not None: 781 name_to_idx = {name: idx for idx, name in enumerate(sample_names)} 782 for s in samples: 783 idx = name_to_idx.get(s) 784 if idx is not None and mask_combined[idx] and not added[idx]: 785 ordered_list.append(idx) 786 added[idx] = True 787 788 # Then respect the order in `indexes` 789 if indexes is not None: 790 adj_idx = np.mod(np.atleast_1d(indexes), n_samples) 791 for idx in adj_idx: 792 if mask_combined[idx] and not added[idx]: 793 ordered_list.append(int(idx)) 794 added[idx] = True 795 796 # Finally, append any remaining selected samples in their original order 797 for idx in sel_indices: 798 if not added[idx]: 799 ordered_list.append(int(idx)) 800 801 ordered_indices = np.asarray(ordered_list, dtype=int) 802 803 # Define keys to filter 804 keys = ['samples', 'calldata_gt', 'calldata_lai'] 805 806 # Apply filtering based on inplace parameter 807 if inplace: 808 for key in keys: 809 if self[key] is not None: 810 arr = np.asarray(self[key]) 811 if ordered_indices is not None: 812 if key == 'calldata_lai' and arr.ndim == 2: 813 # Haplotype-aware reordering for 2D LAI (n_snps, n_samples*2) 814 hap_idx = np.concatenate([2*ordered_indices, 2*ordered_indices + 1]) 815 self[key] = arr[:, hap_idx] 816 elif arr.ndim > 1: 817 self[key] = arr[:, ordered_indices, ...] 818 else: 819 self[key] = arr[ordered_indices] 820 else: 821 if arr.ndim > 1: 822 self[key] = arr[:, mask_combined, ...] 
823 else: 824 self[key] = arr[mask_combined] 825 826 return None 827 else: 828 # Create A new `SNPObject` with filtered data 829 snpobj = self.copy() 830 for key in keys: 831 if snpobj[key] is not None: 832 arr = np.asarray(snpobj[key]) 833 if ordered_indices is not None: 834 if key == 'calldata_lai' and arr.ndim == 2: 835 hap_idx = np.concatenate([2*ordered_indices, 2*ordered_indices + 1]) 836 snpobj[key] = arr[:, hap_idx] 837 elif arr.ndim > 1: 838 snpobj[key] = arr[:, ordered_indices, ...] 839 else: 840 snpobj[key] = arr[ordered_indices] 841 else: 842 if arr.ndim > 1: 843 snpobj[key] = arr[:, mask_combined, ...] 844 else: 845 snpobj[key] = arr[mask_combined] 846 return snpobj 847 848 def detect_chromosome_format(self) -> str: 849 """ 850 Detect the chromosome naming convention in `variants_chrom` based on the prefix 851 of the first chromosome identifier in `unique_chrom`. 852 853 **Recognized formats:** 854 855 - `'chr'`: Format with 'chr' prefix, e.g., 'chr1', 'chr2', ..., 'chrX', 'chrY', 'chrM'. 856 - `'chm'`: Format with 'chm' prefix, e.g., 'chm1', 'chm2', ..., 'chmX', 'chmY', 'chmM'. 857 - `'chrom'`: Format with 'chrom' prefix, e.g., 'chrom1', 'chrom2', ..., 'chromX', 'chromY', 'chromM'. 858 - `'plain'`: Plain format without a prefix, e.g., '1', '2', ..., 'X', 'Y', 'M'. 859 860 If the format does not match any recognized pattern, `'Unknown format'` is returned. 861 862 Returns: 863 **str:** 864 A string indicating the detected chromosome format (`'chr'`, `'chm'`, `'chrom'`, or `'plain'`). 865 If no recognized format is matched, returns `'Unknown format'`. 
866 """ 867 # Select the first unique chromosome identifier for format detection 868 chromosome_str = self.unique_chrom[0] 869 870 # Define regular expressions to match each recognized chromosome format 871 patterns = { 872 'chr': r'^chr(\d+|X|Y|M)$', # Matches 'chr' prefixed format 873 'chm': r'^chm(\d+|X|Y|M)$', # Matches 'chm' prefixed format 874 'chrom': r'^chrom(\d+|X|Y|M)$', # Matches 'chrom' prefixed format 875 'plain': r'^(\d+|X|Y|M)$' # Matches plain format without prefix 876 } 877 878 # Iterate through the patterns to identify the chromosome format 879 for prefix, pattern in patterns.items(): 880 if re.match(pattern, chromosome_str): 881 return prefix # Return the recognized format prefix 882 883 # If no pattern matches, return 'Unknown format' 884 return 'Unknown format' 885 886 def convert_chromosome_format( 887 self, 888 from_format: str, 889 to_format: str, 890 inplace: bool = False 891 ) -> Optional['SNPObject']: 892 """ 893 Convert the chromosome format from one naming convention to another in `variants_chrom`. 894 895 **Supported formats:** 896 897 - `'chr'`: Format with 'chr' prefix, e.g., 'chr1', 'chr2', ..., 'chrX', 'chrY', 'chrM'. 898 - `'chm'`: Format with 'chm' prefix, e.g., 'chm1', 'chm2', ..., 'chmX', 'chmY', 'chmM'. 899 - `'chrom'`: Format with 'chrom' prefix, e.g., 'chrom1', 'chrom2', ..., 'chromX', 'chromY', 'chromM'. 900 - `'plain'`: Plain format without a prefix, e.g., '1', '2', ..., 'X', 'Y', 'M'. 901 902 Args: 903 from_format (str): 904 The current chromosome format. Acceptable values are `'chr'`, `'chm'`, `'chrom'`, or `'plain'`. 905 to_format (str): 906 The target format for chromosome data conversion. Acceptable values match `from_format` options. 907 inplace (bool, default=False): 908 If True, modifies `self` in place. If False, returns a new `SNPObject` with the converted format. 909 Default is False. 910 911 Returns: 912 **Optional[SNPObject]:** A new `SNPObject` with the converted chromosome format if `inplace=False`. 
def match_chromosome_format(self, snpobj: 'SNPObject', inplace: bool = False) -> Optional['SNPObject']:
    """
    Rewrite `variants_chrom` of `self` so that it uses the same chromosome naming
    convention as a reference `snpobj`.

    **Recognized formats:**

    - `'chr'`: Format with 'chr' prefix, e.g., 'chr1', 'chr2', ..., 'chrX', 'chrY', 'chrM'.
    - `'chm'`: Format with 'chm' prefix, e.g., 'chm1', 'chm2', ..., 'chmX', 'chmY', 'chmM'.
    - `'chrom'`: Format with 'chrom' prefix, e.g., 'chrom1', 'chrom2', ..., 'chromX', 'chromY', 'chromM'.
    - `'plain'`: Plain format without a prefix, e.g., '1', '2', ..., 'X', 'Y', 'M'.

    Args:
        snpobj (SNPObject):
            The reference SNPObject whose chromosome format will be matched.
        inplace (bool, default=False):
            If True, modifies `self` in place. If False, returns a new `SNPObject` with the
            chromosome format matching that of `snpobj`. Default is False.

    Returns:
        **Optional[SNPObject]:**
            A new `SNPObject` with matched chromosome format if `inplace=False`.
            If `inplace=True`, modifies `self` in place and returns None.

    Raises:
        ValueError: If the format of `self` or of `snpobj` is reported as `'Unknown format'`.
    """
    # `self` is examined first, so its failure is reported before the reference is inspected
    checks = (
        (self, "The chromosome format of the current SNPObject is unrecognized."),
        (snpobj, "The chromosome format of the reference SNPObject is unrecognized."),
    )

    detected = []
    for candidate, failure_message in checks:
        fmt = candidate.detect_chromosome_format()
        if fmt == 'Unknown format':
            raise ValueError(failure_message)
        detected.append(fmt)

    current_format, reference_format = detected

    # Delegate the actual renaming to the format converter
    return self.convert_chromosome_format(current_format, reference_format, inplace=inplace)
def rename_chrom(
    self,
    to_replace: Optional[Union[Dict[str, str], str, List[str]]] = None,
    value: Optional[Union[str, List[str]]] = None,
    regex: bool = True,
    inplace: bool = False
) -> Optional['SNPObject']:
    """
    Replace chromosome values in `variants_chrom` using patterns or exact matches.

    This method allows flexible chromosome replacements, using regex or exact matches, useful
    for non-standard chromosome formats. For standard conversions (e.g., 'chr1' to '1'),
    consider `convert_chromosome_format`.

    Args:
        to_replace (dict, str, or list of str, optional):
            Pattern(s) or exact value(s) to be replaced in chromosome names. If None (default),
            the mapping `{'^([0-9]+)$': 'chr\\1', '^chr([0-9]+)$': '\\1'}` is used, which
            transforms `<chrom_num>` to `chr<chrom_num>` or vice versa.
            - If str or list of str: Matches will be replaced with `value`.
            - If dict: Keys define the values to replace, with corresponding replacements as values.
        value (str or list of str, optional):
            Replacement value(s) if `to_replace` is a string or list. A single value is applied
            to every entry of `to_replace`; a list must have the same length as `to_replace`.
            Must be None if `to_replace` is a dictionary.
        regex (bool, default=True):
            If True, interprets `to_replace` keys as regex patterns.
        inplace (bool, default=False):
            If True, modifies `self` in place. If False, returns a new `SNPObject` with the chromosomes
            renamed. Default is False.

    Returns:
        **Optional[SNPObject]:** A new `SNPObject` with the renamed chromosome format if `inplace=False`.
        If `inplace=True`, modifies `self` in place and returns None.

    Raises:
        ValueError: If `to_replace` and `value` are of incompatible types, or if they are lists
            of different lengths.
    """
    # The default mapping is built per call so no dict object is shared between calls
    if to_replace is None:
        to_replace = {r'^([0-9]+)$': r'chr\1', r'^chr([0-9]+)$': r'\1'}

    # Standardize input format: convert `to_replace` and `value` to a dictionary if needed
    if isinstance(to_replace, (str, int)):
        to_replace = [to_replace]
    if isinstance(value, (str, int)):
        # A single replacement is applied to every pattern rather than only the first one
        value = [value] * len(to_replace) if isinstance(to_replace, list) else [value]

    if isinstance(to_replace, list) and isinstance(value, list):
        # `zip` would silently drop the unmatched tail, so a length mismatch is an error
        if len(to_replace) != len(value):
            raise ValueError(
                f"Invalid input: `to_replace` has {len(to_replace)} entries but `value` has {len(value)}."
            )
        dictionary = dict(zip(to_replace, value))
    elif isinstance(to_replace, dict) and value is None:
        dictionary = to_replace
    else:
        raise ValueError(
            "Invalid input: `to_replace` and `value` must be compatible types (both str, list of str, or dict)."
        )

    target = self if inplace else self.copy()

    # `np.vectorize` cannot be called on size-0 input, and there is nothing to rename anyway
    if np.size(self.variants_chrom) > 0:
        # Apply the per-value replacement helper to every chromosome entry
        vec_replace_values = np.vectorize(self._match_to_replace)
        target.variants_chrom = vec_replace_values(self.variants_chrom, dictionary, regex)

    return None if inplace else target
def rename_missings(
    self,
    before: Union[int, float, str] = -1,
    after: Union[int, float, str] = '.',
    inplace: bool = False
) -> Optional['SNPObject']:
    """
    Replace missing values in the `calldata_gt` attribute.

    This method identifies missing values in 'calldata_gt' and replaces them with a specified
    value. By default, it replaces occurrences of `-1` (often used to signify missing data) with `'.'`.

    Args:
        before (int, float, or str, default=-1):
            The current representation of missing values in `calldata_gt`. Common values might be -1, '.', or NaN.
            When `before` is NaN, missing entries are located with `np.isnan`, because NaN never
            compares equal to itself. Default is -1.
        after (int, float, or str, default='.'):
            The value that will replace `before`. Default is '.'.
        inplace (bool, default=False):
            If True, modifies `self` in place. If False, returns a new `SNPObject` with the applied
            replacements. Default is False.

    Returns:
        **Optional[SNPObject]:**
            A new `SNPObject` with the renamed missing values if `inplace=False`.
            If `inplace=True`, modifies `self` in place and returns None.
    """
    target = self if inplace else self.copy()
    genotypes = target['calldata_gt']

    if isinstance(before, (float, np.floating)) and np.isnan(before):
        # `genotypes == nan` is False everywhere, so NaN needs a dedicated test.
        # Only floating-point arrays can hold NaN; other dtypes have nothing to replace.
        values = np.asarray(genotypes)
        if np.issubdtype(values.dtype, np.floating):
            missing = np.isnan(values)
        else:
            missing = np.zeros(values.shape, dtype=bool)
    else:
        missing = genotypes == before

    # The dtype of the result follows NumPy promotion between `after` and the genotype array
    target['calldata_gt'] = np.where(missing, after, genotypes)
    return None if inplace else target
def get_common_variants_intersection(
    self,
    snpobj: 'SNPObject',
    index_by: str = 'pos'
) -> Tuple[List[str], np.ndarray, np.ndarray]:
    """
    Identify common variants between `self` and the `snpobj` instance based on the specified `index_by` criterion,
    which may match based on chromosome and position (`variants_chrom`, `variants_pos`), ID (`variants_id`), or both.

    This method returns the identifiers of common variants and their corresponding indices in both objects.

    **Note:** Each index array is in ascending order within its own object. The k-th entries of the
    two arrays refer to the same variant only when both objects list the common variants in the same
    order and without duplicated identifiers.

    Args:
        snpobj (SNPObject):
            The reference SNPObject to compare against.
        index_by (str, default='pos'):
            Criteria for matching variants. Options:
            - `'pos'`: Matches by chromosome and position (`variants_chrom`, `variants_pos`), e.g., 'chr1-12345'.
            - `'id'`: Matches by variant ID alone (`variants_id`), e.g., 'rs123'.
            - `'pos+id'`: Matches by chromosome, position, and ID (`variants_chrom`, `variants_pos`, `variants_id`), e.g., 'chr1-12345-rs123'.
            Default is 'pos'.

    Returns:
        Tuple containing:
        - **list of str:** A list of common variant identifiers (as strings), each listed once,
          in order of first appearance in `self`.
        - **array:** An integer array of indices in `self` where common variants are located.
        - **array:** An integer array of indices in `snpobj` where common variants are located.

    Raises:
        ValueError: If `index_by` is not one of `'pos'`, `'id'`, or `'pos+id'`.
    """
    if index_by not in ('pos', 'id', 'pos+id'):
        raise ValueError("`index_by` must be one of 'pos', 'id', or 'pos+id'.")

    def build_identifiers(obj) -> list:
        # One identifier per variant of `obj`, built according to `index_by`
        if index_by == 'pos':
            return [f"{chrom}-{pos}" for chrom, pos in zip(obj['variants_chrom'], obj['variants_pos'])]
        if index_by == 'id':
            return obj['variants_id'].tolist()
        return [
            f"{chrom}-{pos}-{ids}"
            for chrom, pos, ids in zip(obj['variants_chrom'], obj['variants_pos'], obj['variants_id'])
        ]

    query_identifiers = build_identifiers(self)
    reference_identifiers = build_identifiers(snpobj)

    # Set membership gives O(1) lookups while scanning each identifier list
    common = set(query_identifiers).intersection(reference_identifiers)

    # Collect indices for common identifiers
    query_idx = [i for i, identifier in enumerate(query_identifiers) if identifier in common]
    reference_idx = [i for i, identifier in enumerate(reference_identifiers) if identifier in common]

    # De-duplicate while keeping the order of `self`, so the returned list is deterministic
    common_ids = list(dict.fromkeys(query_identifiers[i] for i in query_idx))

    # An explicit integer dtype keeps empty results usable as index arrays
    # (`np.array([])` would otherwise be float64)
    return common_ids, np.array(query_idx, dtype=np.intp), np.array(reference_idx, dtype=np.intp)
def get_common_markers_intersection(
    self,
    snpobj: 'SNPObject'
) -> Tuple[List[str], np.ndarray, np.ndarray]:
    """
    Identify common markers between `self` and the `snpobj` instance. Common markers are identified
    based on matching chromosome (`variants_chrom`), position (`variants_pos`), reference (`variants_ref`),
    and alternate (`variants_alt`) alleles.

    This method returns the identifiers of common markers and their corresponding indices in both objects.

    **Note:** Each index array is in ascending order within its own object. The k-th entries of the
    two arrays refer to the same marker only when both objects list the common markers in the same
    order and without duplicated identifiers.

    Args:
        snpobj (SNPObject):
            The reference SNPObject to compare against.

    Returns:
        Tuple containing:
        - **list of str:** A list of common marker identifiers (as strings, `'<chrom>-<pos>-<ref>-<alt>'`),
          each listed once, in order of first appearance in `self`.
        - **array:** An integer array of indices in `self` where common markers are located.
        - **array:** An integer array of indices in `snpobj` where common markers are located.
    """
    def build_identifiers(obj) -> list:
        # One 'chrom-pos-ref-alt' identifier per variant of `obj`
        return [
            f"{chrom}-{pos}-{ref}-{alt}" for chrom, pos, ref, alt in
            zip(obj['variants_chrom'], obj['variants_pos'], obj['variants_ref'], obj['variants_alt'])
        ]

    query_identifiers = build_identifiers(self)
    reference_identifiers = build_identifiers(snpobj)

    # Set membership gives O(1) lookups while scanning each identifier list
    common = set(query_identifiers).intersection(reference_identifiers)

    # Collect indices for common identifiers in both SNPObjects
    query_idx = [i for i, identifier in enumerate(query_identifiers) if identifier in common]
    reference_idx = [i for i, identifier in enumerate(reference_identifiers) if identifier in common]

    # De-duplicate while keeping the order of `self`, so the returned list is deterministic
    common_ids = list(dict.fromkeys(query_identifiers[i] for i in query_idx))

    # An explicit integer dtype keeps empty results usable as index arrays
    # (`np.array([])` would otherwise be float64)
    return common_ids, np.array(query_idx, dtype=np.intp), np.array(reference_idx, dtype=np.intp)
def subset_to_common_variants(
    self,
    snpobj: 'SNPObject',
    index_by: str = 'pos',
    common_variants_intersection: Optional[Tuple[np.ndarray, np.ndarray]] = None,
    inplace: bool = False
) -> Optional['SNPObject']:
    """
    Keep only the variants of `self` that are also present in a reference `snpobj`, where
    presence is decided by the `index_by` criterion: chromosome and position
    (`variants_chrom`, `variants_pos`), ID (`variants_id`), or both.

    Args:
        snpobj (SNPObject):
            The reference SNPObject to compare against.
        index_by (str, default='pos'):
            Criteria for matching variants. Options:
            - `'pos'`: Matches by chromosome and position (`variants_chrom`, `variants_pos`), e.g., 'chr1-12345'.
            - `'id'`: Matches by variant ID alone (`variants_id`), e.g., 'rs123'.
            - `'pos+id'`: Matches by chromosome, position, and ID (`variants_chrom`, `variants_pos`, `variants_id`), e.g., 'chr1-12345-rs123'.
            Default is 'pos'. Ignored when `common_variants_intersection` is provided.
        common_variants_intersection (Tuple[np.ndarray, np.ndarray], optional):
            Precomputed indices of common variants, as a pair (indices in `self`, indices in `snpobj`).
            If None, the intersection is computed within the function.
        inplace (bool, default=False):
            If True, modifies `self` in place. If False, returns a new `SNPObject` with the common variants
            subsetted. Default is False.

    Returns:
        **Optional[SNPObject]:**
            The result of `filter_variants` for the common indices: a new `SNPObject` if
            `inplace=False`; if `inplace=True`, `self` is modified in place and None is returned.
    """
    if common_variants_intersection is not None:
        # Precomputed pair supplied by the caller; only the indices into `self` are needed
        shared_idx, _ = common_variants_intersection
    else:
        _, shared_idx, _ = self.get_common_variants_intersection(snpobj, index_by=index_by)

    # Keep exactly the shared variants, honouring the requested `inplace` mode
    return self.filter_variants(indexes=shared_idx, include=True, inplace=inplace)
def subset_to_common_markers(
    self,
    snpobj: 'SNPObject',
    common_markers_intersection: Optional[Tuple[np.ndarray, np.ndarray]] = None,
    inplace: bool = False
) -> Optional['SNPObject']:
    """
    Keep only the markers of `self` that are also present in a reference `snpobj`. Two markers
    are the same when chromosome (`variants_chrom`), position (`variants_pos`), reference
    (`variants_ref`) and alternate (`variants_alt`) alleles all match.

    Args:
        snpobj (SNPObject):
            The reference SNPObject to compare against.
        common_markers_intersection (tuple of arrays, optional):
            Precomputed indices of common markers, as a pair (indices in `self`, indices in `snpobj`).
            If None, the intersection is computed within the function.
        inplace (bool, default=False):
            If True, modifies `self` in place. If False, returns a new `SNPObject` with the common markers
            subsetted. Default is False.

    Returns:
        **Optional[SNPObject]:**
            The result of `filter_variants` for the common indices: a new `SNPObject` if
            `inplace=False`; if `inplace=True`, `self` is modified in place and None is returned.
    """
    if common_markers_intersection is not None:
        # Precomputed pair supplied by the caller; only the indices into `self` are needed
        shared_idx, _ = common_markers_intersection
    else:
        _, shared_idx, _ = self.get_common_markers_intersection(snpobj)

    # Keep exactly the shared markers, honouring the requested `inplace` mode
    return self.filter_variants(indexes=shared_idx, include=True, inplace=inplace)
def merge(
    self,
    snpobj: 'SNPObject',
    force_samples: bool = False,
    prefix: str = '2',
    inplace: bool = False
) -> Optional['SNPObject']:
    """
    Merge `self` with `snpobj` along the sample axis.

    Both SNPObjects are expected to contain the same set of SNPs in the same order. Genotype
    (`calldata_gt`) and LAI (`calldata_lai`) arrays are concatenated along axis 1, with the
    samples of `snpobj` appended after those of `self`. A component (`calldata_gt`, `samples`,
    `calldata_lai`) that is missing in either object is None in the result.

    Args:
        snpobj (SNPObject):
            The SNPObject to merge samples with.
        force_samples (bool, default=False):
            If True, duplicate sample names are resolved by prepending the `prefix` to duplicate sample names in
            `snpobj`. Otherwise, merging fails when duplicate sample names are found. Default is False.
        prefix (str, default='2'):
            A string prepended to duplicate sample names in `snpobj` when `force_samples=True`.
            Duplicates are renamed from `<sample_name>` to `<prefix>:<sample_name>`. For instance,
            if `prefix='2'` and there is a conflict with a sample called "sample_1", it becomes "2:sample_1".
        inplace (bool, default=False):
            If True, modifies `self` in place. If False, returns a new `SNPObject` with the merged samples.
            Default is False.

    Returns:
        **Optional[SNPObject]**: A new SNPObject containing the merged sample data if `inplace=False`.
        If `inplace=True`, `self` is updated and returned.

    Raises:
        ValueError: If the number of SNPs differs, if only one object has summed strands, if
            sample names overlap while `force_samples=False`, or if the `calldata_lai` arrays
            differ in number of dimensions.
    """
    def merged_genotypes():
        # Genotypes are only merged when both objects provide them
        if self.calldata_gt is None or snpobj.calldata_gt is None:
            return None
        if self.calldata_gt.shape[0] != snpobj.calldata_gt.shape[0]:
            raise ValueError(
                f"Cannot merge SNPObjects: Mismatch in the number of SNPs in `calldata_gt`.\n"
                f"`self.calldata_gt` has {self.calldata_gt.shape[0]} SNPs, "
                f"while `snpobj.calldata_gt` has {snpobj.calldata_gt.shape[0]} SNPs."
            )
        # Both objects must agree on whether strands have been summed
        self_summed, other_summed = self.are_strands_summed, snpobj.are_strands_summed
        if self_summed and not other_summed:
            raise ValueError(
                "Cannot merge SNPObjects: `self` has summed strands, but `snpobj` does not.\n"
                "Ensure both objects have the same genotype summation state before merging."
            )
        elif other_summed and not self_summed:
            raise ValueError(
                "Cannot merge SNPObjects: `snpobj` has summed strands, but `self` does not.\n"
                "Ensure both objects have the same genotype summation state before merging."
            )
        return np.concatenate([self.calldata_gt, snpobj.calldata_gt], axis=1)

    def merged_samples():
        # Sample names are only merged when both objects provide them
        if self.samples is None or snpobj.samples is None:
            return None
        incoming = snpobj.samples
        overlapping_samples = set(self.samples).intersection(set(snpobj.samples))
        if overlapping_samples:
            if not force_samples:
                raise ValueError(
                    f"Cannot merge SNPObjects: Found overlapping sample names {overlapping_samples}.\n"
                    "Samples must be strictly non-overlapping. To allow merging with renaming, set `force_samples=True`."
                )
            # Disambiguate clashing names coming from `snpobj` with the prefix
            incoming = [
                f"{prefix}:{sample}" if sample in overlapping_samples else sample
                for sample in snpobj.samples
            ]
        return np.concatenate([self.samples, incoming], axis=0)

    def merged_lai():
        # LAI data is only merged when both objects provide it
        if self.calldata_lai is None or snpobj.calldata_lai is None:
            return None
        if self.calldata_lai.ndim != snpobj.calldata_lai.ndim:
            raise ValueError(
                f"Cannot merge SNPObjects: Mismatch in `calldata_lai` dimensions.\n"
                f"`self.calldata_lai` has {self.calldata_lai.ndim} dimensions, "
                f"while `snpobj.calldata_lai` has {snpobj.calldata_lai.ndim} dimensions."
            )
        if self.calldata_lai.shape[0] != snpobj.calldata_lai.shape[0]:
            raise ValueError(
                f"Cannot merge SNPObjects: Mismatch in the number of SNPs in `calldata_lai`.\n"
                f"`self.calldata_lai` has {self.calldata_lai.shape[0]} SNPs, "
                f"while `snpobj.calldata_lai` has {snpobj.calldata_lai.shape[0]} SNPs."
            )
        return np.concatenate([self.calldata_lai, snpobj.calldata_lai], axis=1)

    # Validation order matters for which error surfaces first: genotypes, samples, then LAI
    calldata_gt = merged_genotypes()
    samples = merged_samples()
    calldata_lai = merged_lai()

    if not inplace:
        # Variant-level attributes are taken from `self` unchanged
        return SNPObject(
            calldata_gt=calldata_gt,
            samples=samples,
            variants_ref=self.variants_ref,
            variants_alt=self.variants_alt,
            variants_chrom=self.variants_chrom,
            variants_filter_pass=self.variants_filter_pass,
            variants_id=self.variants_id,
            variants_pos=self.variants_pos,
            variants_qual=self.variants_qual,
            calldata_lai=calldata_lai,
            ancestry_map=self.ancestry_map
        )

    self.calldata_gt = calldata_gt
    self.calldata_lai = calldata_lai
    self.samples = samples
    return self
def concat(
    self,
    snpobj: 'SNPObject',
    inplace: bool = False
) -> Optional['SNPObject']:
    """
    Concatenate self with snpobj along the SNP axis.

    Both SNPObjects are expected to contain the same set of samples in the same order, and
    the chromosome(s) in snpobj are expected to follow (i.e. have higher numeric identifiers
    than) those in self. Genotype, LAI and per-variant arrays are stacked along axis 0, with
    the SNPs of `snpobj` appended after those of `self`. Any array that is missing in either
    object is None in the result.

    Args:
        snpobj (SNPObject):
            The SNPObject to concatenate SNPs with.
        inplace (bool, default=False):
            If True, modifies `self` in place. If False, returns a new `SNPObject` with the concatenated SNPs.
            Default is False.

    Returns:
        **Optional[SNPObject]**: A new SNPObject containing the concatenated SNP data if `inplace=False`.
        If `inplace=True`, `self` is updated and returned.

    Raises:
        ValueError: If the number of samples differs, if only one object has summed strands, or
            if the `calldata_lai` arrays differ in number of dimensions.
    """
    def stacked_or_none(first, second):
        # Stack two per-variant arrays along the SNP axis; None when either one is absent
        if first is None or second is None:
            return None
        return np.concatenate([first, second], axis=0)

    def concatenated_genotypes():
        # Genotypes are only concatenated when both objects provide them
        if self.calldata_gt is None or snpobj.calldata_gt is None:
            return None
        if self.calldata_gt.shape[1] != snpobj.calldata_gt.shape[1]:
            raise ValueError(
                f"Cannot merge SNPObjects: Mismatch in the number of samples in `calldata_gt`.\n"
                f"`self.calldata_gt` has {self.calldata_gt.shape[1]} samples, "
                f"while `snpobj.calldata_gt` has {snpobj.calldata_gt.shape[1]} samples."
            )
        # Both objects must agree on whether strands have been summed
        self_summed, other_summed = self.are_strands_summed, snpobj.are_strands_summed
        if self_summed and not other_summed:
            raise ValueError(
                "Cannot merge SNPObjects: `self` has summed strands, but `snpobj` does not.\n"
                "Ensure both objects have the same genotype summation state before merging."
            )
        elif other_summed and not self_summed:
            raise ValueError(
                "Cannot merge SNPObjects: `snpobj` has summed strands, but `self` does not.\n"
                "Ensure both objects have the same genotype summation state before merging."
            )
        return np.concatenate([self.calldata_gt, snpobj.calldata_gt], axis=0)

    def concatenated_lai():
        # LAI data is only concatenated when both objects provide it
        if self.calldata_lai is None or snpobj.calldata_lai is None:
            return None
        if self.calldata_lai.ndim != snpobj.calldata_lai.ndim:
            raise ValueError(
                f"Cannot merge SNPObjects: Mismatch in `calldata_lai` dimensions.\n"
                f"`self.calldata_lai` has {self.calldata_lai.ndim} dimensions, "
                f"while `snpobj.calldata_lai` has {snpobj.calldata_lai.ndim} dimensions."
            )
        if self.calldata_lai.shape[1] != snpobj.calldata_lai.shape[1]:
            raise ValueError(
                f"Cannot merge SNPObjects: Mismatch in the number of samples in `calldata_lai`.\n"
                f"`self.calldata_lai` has {self.calldata_lai.shape[1]} samples, "
                f"while `snpobj.calldata_lai` has {snpobj.calldata_lai.shape[1]} samples."
            )
        return np.concatenate([self.calldata_lai, snpobj.calldata_lai], axis=0)

    # Per-variant attributes that grow together with the SNP axis
    attributes = [
        'variants_ref', 'variants_alt', 'variants_chrom', 'variants_filter_pass', 'variants_id', 'variants_pos', 'variants_qual'
    ]

    # Validation order matters for which error surfaces first: genotypes, variant attributes, then LAI
    calldata_gt = concatenated_genotypes()
    merged_attrs = {
        attr: stacked_or_none(getattr(self, attr, None), getattr(snpobj, attr, None))
        for attr in attributes
    }
    calldata_lai = concatenated_lai()

    if not inplace:
        # Samples and the ancestry map are taken from `self` unchanged
        return SNPObject(
            calldata_gt=calldata_gt,
            calldata_lai=calldata_lai,
            samples=self.samples,
            variants_ref=merged_attrs['variants_ref'],
            variants_alt=merged_attrs['variants_alt'],
            variants_chrom=merged_attrs['variants_chrom'],
            variants_id=merged_attrs['variants_id'],
            variants_pos=merged_attrs['variants_pos'],
            variants_qual=merged_attrs['variants_qual'],
            variants_filter_pass=merged_attrs['variants_filter_pass'],
            ancestry_map=self.ancestry_map
        )

    self.calldata_gt = calldata_gt
    self.calldata_lai = calldata_lai
    for attr in attributes:
        self[attr] = merged_attrs[attr]
    return self
def remove_strand_ambiguous_variants(self, inplace: bool = False) -> Optional['SNPObject']:
    """
    Drop strand-ambiguous variants.

    A strand-ambiguous variant has reference (`variants_ref`) and alternate (`variants_alt`) alleles
    in the pairs A/T, T/A, C/G, or G/C, where both alleles are complementary and thus indistinguishable
    in terms of strand orientation. The number of variants of each type is logged before filtering.

    Args:
        inplace (bool, default=False):
            If True, modifies `self` in place. If False, returns a new `SNPObject` with the
            strand-ambiguous variants removed. Default is False.

    Returns:
        **Optional[SNPObject]:** A new `SNPObject` with non-ambiguous variants only if `inplace=False`.
        If `inplace=True`, modifies `self` in place and returns None.
    """
    def pair_mask(ref_base: str, alt_base: str):
        # True where a variant has exactly this ref/alt allele combination
        return (self['variants_ref'] == ref_base) & (self['variants_alt'] == alt_base)

    # One boolean mask per complementary allele pair
    is_AT = pair_mask('A', 'T')
    is_TA = pair_mask('T', 'A')
    is_CG = pair_mask('C', 'G')
    is_GC = pair_mask('G', 'C')

    # Variants to keep are those matching none of the four pairs
    ambiguous_mask = is_AT | is_TA | is_CG | is_GC
    keep_idx = np.where(~ambiguous_mask)[0]

    # Per-type tallies for the log output
    A_T_count, T_A_count = np.sum(is_AT), np.sum(is_TA)
    C_G_count, G_C_count = np.sum(is_CG), np.sum(is_GC)
    total_ambiguous = A_T_count + T_A_count + C_G_count + G_C_count

    log.info(f'{A_T_count} ambiguities of A-T type.')
    log.info(f'{T_A_count} ambiguities of T-A type.')
    log.info(f'{C_G_count} ambiguities of C-G type.')
    log.info(f'{G_C_count} ambiguities of G-C type.')

    # Keep only the non-ambiguous variants
    log.debug(f'Removing {total_ambiguous} strand-ambiguous variants...')
    return self.filter_variants(indexes=keep_idx, include=True, inplace=inplace)
def correct_flipped_variants(
    self,
    snpobj: 'SNPObject',
    check_complement: bool = True,
    index_by: str = 'pos',
    common_variants_intersection: Optional[Tuple[np.ndarray, np.ndarray]] = None,
    log_stats: bool = True,
    inplace: bool = False
) -> Optional['SNPObject']:
    """
    Correct flipped variants between `self` and a reference `snpobj`, where reference (`variants_ref`)
    and alternate (`variants_alt`) alleles are swapped.

    **Flip Detection Based on `check_complement`:**

    - If `check_complement=False`, only direct allele swaps are considered:
        1. **Direct Swap:** `self.variants_ref == snpobj.variants_alt` and `self.variants_alt == snpobj.variants_ref`.

    - If `check_complement=True`, both direct and complementary swaps are considered, with four possible cases:
        1. **Direct Swap:** `self.variants_ref == snpobj.variants_alt` and `self.variants_alt == snpobj.variants_ref`.
        2. **Complement Swap of Ref:** `complement(self.variants_ref) == snpobj.variants_alt` and `self.variants_alt == snpobj.variants_ref`.
        3. **Complement Swap of Alt:** `self.variants_ref == snpobj.variants_alt` and `complement(self.variants_alt) == snpobj.variants_ref`.
        4. **Complement Swap of both Ref and Alt:** `complement(self.variants_ref) == snpobj.variants_alt` and `complement(self.variants_alt) == snpobj.variants_ref`.

    **Note:** Variants where `self.variants_ref == self.variants_alt` are ignored as they are ambiguous.

    **Correction Process:**
    - Swaps `variants_ref` and `variants_alt` alleles in `self` to align with `snpobj`.
    - Flips `calldata_gt` values (0 becomes 1, and 1 becomes 0) to match the updated allele configuration.
      Negative genotype values (e.g. `-1` used for missing calls) are left unchanged.

    Args:
        snpobj (SNPObject):
            The reference SNPObject to compare against.
        check_complement (bool, default=True):
            If True, also checks for complementary base pairs (A/T, T/A, C/G, and G/C) when identifying swapped variants.
            Default is True.
        index_by (str, default='pos'):
            Criteria for matching variants. Options:
            - `'pos'`: Matches by chromosome and position (`variants_chrom`, `variants_pos`), e.g., 'chr1-12345'.
            - `'id'`: Matches by variant ID alone (`variants_id`), e.g., 'rs123'.
            - `'pos+id'`: Matches by chromosome, position, and ID (`variants_chrom`, `variants_pos`, `variants_id`), e.g., 'chr1-12345-rs123'.
            Default is 'pos'.
        common_variants_intersection (tuple of arrays, optional):
            Precomputed indices of common variants, as a pair (indices in `self`, indices in `snpobj`).
            The k-th entries of both arrays must refer to the same variant. If None, intersection is
            computed within the function.
        log_stats (bool, default=True):
            If True, logs statistical information about matching and ambiguous alleles. Default is True.
        inplace (bool, default=False):
            If True, modifies `self` in place. If False, returns a new `SNPObject` with corrected
            flips. Default is False.

    Returns:
        **Optional[SNPObject]**:
            A new `SNPObject` with corrected flips if `inplace=False` (a copy of `self` when no flip
            is found). If `inplace=True`, modifies `self` in place and returns None.
    """
    # Define complement mappings for nucleotides
    complement_map = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}

    def complement(alleles: np.ndarray) -> np.ndarray:
        # Element-wise complement; alleles other than A/C/G/T are left unchanged.
        # Masks are taken from the untouched input so no entry is translated twice.
        complemented = alleles.copy()
        for base, paired_base in complement_map.items():
            complemented[alleles == base] = paired_base
        return complemented

    # Get common variant indices if not provided
    if common_variants_intersection is not None:
        query_idx, reference_idx = common_variants_intersection
    else:
        _, query_idx, reference_idx = self.get_common_variants_intersection(snpobj, index_by=index_by)

    # Index arrays must be integer-typed; an empty `np.array([])` would be float64
    query_idx = np.asarray(query_idx, dtype=np.intp)
    reference_idx = np.asarray(reference_idx, dtype=np.intp)

    # Alleles of the common variants, position-aligned between the two objects
    query_ref = self['variants_ref'][query_idx]
    query_alt = self['variants_alt'][query_idx]
    reference_ref = snpobj['variants_ref'][reference_idx]
    reference_alt = snpobj['variants_alt'][reference_idx]

    # Log statistics on matching alleles if enabled
    if log_stats:
        matching_ref = np.sum(query_ref == reference_ref)
        matching_alt = np.sum(query_alt == reference_alt)
        ambiguous = np.sum(query_ref == query_alt)
        log.info(f"Matching reference alleles (ref=ref'): {matching_ref}, Matching alternate alleles (alt=alt'): {matching_alt}.")
        log.info(f"Number of ambiguous alleles (ref=alt): {ambiguous}.")

    # Identify where `ref` and `alt` alleles are swapped (optionally up to complement)
    swapped_ref = (query_ref == reference_alt)
    swapped_alt = (query_alt == reference_ref)
    if check_complement:
        swapped_ref = swapped_ref | (complement(query_ref) == reference_alt)
        swapped_alt = swapped_alt | (complement(query_alt) == reference_ref)

    # Filter out ambiguous variants where `ref` and `alt` alleles match (ref=alt)
    not_ambiguous = (query_ref != query_alt)

    # Indices in `self` of flipped variants
    flip_idx_query = query_idx[swapped_ref & swapped_alt & not_ambiguous]

    # A fresh object is always produced when `inplace=False`, even if nothing is flipped
    target = self if inplace else self.copy()

    if len(flip_idx_query) > 0:
        log.info(f'Correcting {len(flip_idx_query)} variant flips...')

        # Fancy indexing returns copies, so these are unaffected by the assignments below
        new_refs = self['variants_alt'][flip_idx_query]
        new_alts = self['variants_ref'][flip_idx_query]
        target['variants_ref'][flip_idx_query] = new_refs
        target['variants_alt'][flip_idx_query] = new_alts

        if target['calldata_gt'] is not None:
            genotypes = target['calldata_gt'][flip_idx_query]
            # Flip 0 <-> 1 but keep negative sentinels (missing calls) as they are
            target['calldata_gt'][flip_idx_query] = np.where(genotypes < 0, genotypes, 1 - genotypes)
    else:
        log.info('No variant flips found to correct.')

    return None if inplace else target
Check for swapped or complementary-swapped alleles 1591 swapped_ref = ( 1592 (self['variants_ref'][query_idx] == snpobj['variants_alt'][reference_idx]) | 1593 (np.vectorize(get_complement)(self['variants_ref'][query_idx]) == snpobj['variants_alt'][reference_idx]) 1594 ) 1595 swapped_alt = ( 1596 (self['variants_alt'][query_idx] == snpobj['variants_ref'][reference_idx]) | 1597 (np.vectorize(get_complement)(self['variants_alt'][query_idx]) == snpobj['variants_ref'][reference_idx]) 1598 ) 1599 1600 # Filter out ambiguous variants where `ref` and `alt` alleles match (ref=alt) 1601 not_ambiguous = (self['variants_ref'][query_idx] != self['variants_alt'][query_idx]) 1602 1603 # Indices in `self` of flipped variants 1604 flip_idx_query = query_idx[swapped_ref & swapped_alt & not_ambiguous] 1605 1606 # Correct the identified variant flips 1607 if len(flip_idx_query) > 0: 1608 log.info(f'Correcting {len(flip_idx_query)} variant flips...') 1609 1610 temp_alts = self['variants_alt'][flip_idx_query] 1611 temp_refs = self['variants_ref'][flip_idx_query] 1612 1613 # Correct the variant flips based on whether the operation is in-place or not 1614 if inplace: 1615 self['variants_alt'][flip_idx_query] = temp_refs 1616 self['variants_ref'][flip_idx_query] = temp_alts 1617 self['calldata_gt'][flip_idx_query] = 1 - self['calldata_gt'][flip_idx_query] 1618 return None 1619 else: 1620 snpobj = self.copy() 1621 snpobj['variants_alt'][flip_idx_query] = temp_refs 1622 snpobj['variants_ref'][flip_idx_query] = temp_alts 1623 snpobj['calldata_gt'][flip_idx_query] = 1 - snpobj['calldata_gt'][flip_idx_query] 1624 return snpobj 1625 else: 1626 log.info('No variant flips found to correct.') 1627 return self if not inplace else None 1628 1629 def remove_mismatching_variants( 1630 self, 1631 snpobj: 'SNPObject', 1632 index_by: str = 'pos', 1633 common_variants_intersection: Optional[Tuple[np.ndarray, np.ndarray]] = None, 1634 inplace: bool = False 1635 ) -> Optional['SNPObject']: 1636 """ 1637 
def remove_mismatching_variants(
    self,
    snpobj: 'SNPObject',
    index_by: str = 'pos',
    common_variants_intersection: Optional[Tuple[np.ndarray, np.ndarray]] = None,
    inplace: bool = False
) -> Optional['SNPObject']:
    """
    Remove variants from `self` whose reference (`variants_ref`) and/or alternate
    (`variants_alt`) alleles do not match those of a reference `snpobj`.

    Only variants common to both objects are compared. A common variant is
    dropped when its `ref` allele, its `alt` allele, or both differ from the
    reference object.

    Args:
        snpobj (SNPObject):
            The reference SNPObject to compare against.
        index_by (str, default='pos'):
            Criteria for matching variants. Options:
            - `'pos'`: Matches by chromosome and position (`variants_chrom`, `variants_pos`), e.g., 'chr1-12345'.
            - `'id'`: Matches by variant ID alone (`variants_id`), e.g., 'rs123'.
            - `'pos+id'`: Matches by chromosome, position, and ID (`variants_chrom`, `variants_pos`, `variants_id`), e.g., 'chr1-12345-rs123'.
            Default is 'pos'.
        common_variants_intersection (tuple of arrays, optional):
            Precomputed `(query_idx, reference_idx)` indices of common variants between
            `self` and `snpobj`. If None, the intersection is computed within the function.
        inplace (bool, default=False):
            Forwarded to `filter_variants`. If True, `self` is modified in place;
            if False, a new `SNPObject` without mismatching variants is produced.

    Returns:
        **Optional[SNPObject]:**
            Whatever `filter_variants` returns for the given `inplace` flag.
    """
    # Get common variant indices if not provided
    if common_variants_intersection is not None:
        query_idx, reference_idx = common_variants_intersection
    else:
        _, query_idx, reference_idx = self.get_common_variants_intersection(snpobj, index_by=index_by)

    # Vectorized comparison of `ref` and `alt` alleles over the common variants
    ref_mismatch = self['variants_ref'][query_idx] != snpobj['variants_ref'][reference_idx]
    alt_mismatch = self['variants_alt'][query_idx] != snpobj['variants_alt'][reference_idx]
    mismatch_mask = ref_mismatch | alt_mismatch

    # Indices in `self` of the mismatching variants
    mismatch_idx = query_idx[mismatch_mask]
    total_mismatches = np.sum(mismatch_mask)

    log.debug(f'Removing {total_mismatches} mismatching variants...')

    # The mismatching variants must be EXCLUDED. The previous `include=True`
    # kept only the mismatches and discarded everything else.
    # NOTE(review): assumes `include=False` means "drop the listed indexes",
    # as the flag name suggests - confirm against `filter_variants`.
    return self.filter_variants(indexes=mismatch_idx, include=False, inplace=inplace)
def shuffle_variants(self, inplace: bool = False) -> Optional['SNPObject']:
    """
    Randomly shuffle the order of variants in the SNPObject, applying the same
    permutation to every SNP-indexed attribute so they remain aligned.

    The permutation is applied along axis 0 of `calldata_gt` and `calldata_lai`
    and to every attribute whose name contains `'variant'`. Attributes that are
    None are left untouched.

    Args:
        inplace (bool, default=False):
            If True, modifies `self` in place. If False, returns a new `SNPObject` with
            shuffled variants. Default is False.

    Returns:
        **Optional[SNPObject]:**
            A new `SNPObject` with shuffled variant positions if `inplace=False`.
            If `inplace=True`, modifies `self` in place and returns None.
    """
    # Draw the permutation once so that every attribute is reordered identically
    shuffle_index = np.random.permutation(self.n_snps)

    target = self if inplace else self.copy()
    for key in target.keys():
        value = target[key]
        if value is None:
            continue
        if key in ('calldata_gt', 'calldata_lai'):
            # Multi-dimensional arrays whose first axis is the SNP axis.
            # `calldata_lai` was previously skipped, which left SNP-level
            # ancestry out of step with the shuffled genotypes.
            target[key] = value[shuffle_index, ...]
        elif 'variant' in key:
            # Per-variant attributes are 1D arrays
            target[key] = np.asarray(value)[shuffle_index]

    return None if inplace else target
1743 if self.variants_chrom is not None: 1744 self.variants_chrom = self.variants_chrom.astype(str) 1745 self.variants_chrom[self.variants_chrom == ''] = '.' 1746 if self.variants_filter_pass is not None: 1747 self.variants_filter_pass[self.variants_filter_pass == ''] = '.' 1748 if self.variants_id is not None: 1749 self.variants_id[self.variants_id == ''] = '.' 1750 return self 1751 else: 1752 snpobj = self.copy() 1753 if snpobj.variants_alt is not None: 1754 snpobj.variants_alt[snpobj.variants_alt == ''] = '.' 1755 if snpobj.variants_ref is not None: 1756 snpobj.variants_ref[snpobj.variants_ref == ''] = '.' 1757 if snpobj.variants_qual is not None: 1758 snpobj.variants_qual = snpobj.variants_qual.astype(str) 1759 snpobj.variants_qual[(snpobj.variants_qual == '') | (snpobj.variants_qual == 'nan')] = '.' 1760 if snpobj.variants_chrom is not None: 1761 snpobj.variants_chrom[snpobj.variants_chrom == ''] = '.' 1762 if snpobj.variants_filter_pass is not None: 1763 snpobj.variants_filter_pass[snpobj.variants_filter_pass == ''] = '.' 1764 if snpobj.variants_id is not None: 1765 snpobj.variants_id[snpobj.variants_id == ''] = '.' 1766 return snpobj 1767 1768 def convert_to_window_level( 1769 self, 1770 window_size: Optional[int] = None, 1771 physical_pos: Optional[np.ndarray] = None, 1772 chromosomes: Optional[np.ndarray] = None, 1773 window_sizes: Optional[np.ndarray] = None, 1774 laiobj: Optional['LocalAncestryObject'] = None 1775 ) -> 'LocalAncestryObject': 1776 """ 1777 Aggregate the `calldata_lai` attribute into genomic windows within a 1778 `snputils.ancestry.genobj.LocalAncestryObject`. 1779 1780 **Options for defining windows (in order of precedence):** 1781 1782 1. **Fixed window size**: 1783 - Use `window_size` to specify how many SNPs go into each window. The last window on each 1784 chromosome may be larger if SNPs are not evenly divisible by the size. 1785 1786 2. 
def convert_to_window_level(
    self,
    window_size: Optional[int] = None,
    physical_pos: Optional[np.ndarray] = None,
    chromosomes: Optional[np.ndarray] = None,
    window_sizes: Optional[np.ndarray] = None,
    laiobj: Optional['LocalAncestryObject'] = None
) -> 'LocalAncestryObject':
    """
    Aggregate the `calldata_lai` attribute into genomic windows within a
    `snputils.ancestry.genobj.LocalAncestryObject`.

    Each window takes, per haplotype, the most frequent ancestry among the SNPs that
    fall on the window's chromosome and within its [start, end] positions. Windows
    that contain no SNP are filled with NaN.

    **Options for defining windows (in order of precedence):**

    1. **Fixed window size**:
    - Use `window_size` to specify how many SNPs go into each window. Windows are built
      separately for each chromosome. The last window on each chromosome absorbs the
      remaining SNPs and may therefore be larger than `window_size`; a chromosome with
      fewer SNPs than `window_size` yields a single, smaller window.

    2. **Custom start and end positions**:
    - Provide `physical_pos` (2D array of shape (n_windows, 2)) as the [start, end] base-pair
      coordinates for each window.
    - If `chromosomes` is not provided and `self` has exactly one chromosome, all windows are
      assumed to belong to that chromosome.
    - If multiple chromosomes exist but `chromosomes` is missing, an error will be raised.
    - Optionally, provide `window_sizes` to store the SNP count per-window.

    3. **Matching existing windows**:
    - Reuse window definitions (`physical_pos`, `chromosomes`, `window_sizes`) from an existing `laiobj`.

    Args:
        window_size (int, optional):
            Number of SNPs in each window if defining fixed-size windows.
        physical_pos (array of shape (n_windows, 2), optional):
            A 2D array containing the start and end physical positions for each window.
        chromosomes (array of shape (n_windows,), optional):
            An array with chromosome numbers corresponding to each genomic window.
        window_sizes (array of shape (n_windows,), optional):
            An array specifying the number of SNPs in each genomic window.
        laiobj (LocalAncestryObject, optional):
            A reference `LocalAncestryObject` from which to copy existing window definitions.

    Returns:
        **LocalAncestryObject:**
            A LocalAncestryObject containing window-level ancestry data.

    Raises:
        ValueError: If no window definition is given, if `window_size` is smaller than 1,
            if `calldata_lai` is None, or if `physical_pos` is given without `chromosomes`
            while `self` spans several chromosomes.
    """
    from snputils.ancestry.genobj.local import LocalAncestryObject

    if window_size is None and physical_pos is None and laiobj is None:
        raise ValueError("One of `window_size`, `physical_pos`, or `laiobj` must be provided.")
    if self.calldata_lai is None:
        raise ValueError("Local ancestry data `calldata_lai` is None.")

    if window_size is not None:
        # Fixed window size
        if window_size < 1:
            raise ValueError("`window_size` must be a positive integer.")

        physical_pos = []   # Boundaries [start, end] of each window
        chromosomes = []    # Chromosome for each window
        window_sizes = []   # Number of SNPs for each window
        for chrom in self.unique_chrom:
            # Positions of the SNPs on this chromosome only. All indexing below is
            # relative to this subset; indexing the global `variants_pos` instead
            # gave wrong boundaries for every chromosome after the first.
            pos_chrom = self.variants_pos[self.variants_chrom == chrom]
            n_snps_chrom = pos_chrom.size

            # At least one window per chromosome; the last one absorbs the remainder
            n_windows_chrom = max(n_snps_chrom // window_size, 1)
            for w in range(n_windows_chrom):
                first = w * window_size
                if w == n_windows_chrom - 1:
                    last = n_snps_chrom - 1
                else:
                    last = first + window_size - 1
                physical_pos.append([pos_chrom[first], pos_chrom[last]])
                chromosomes.append(chrom)
                window_sizes.append(last - first + 1)

        physical_pos = np.array(physical_pos)
        chromosomes = np.array(chromosomes)
        window_sizes = np.array(window_sizes)

    elif physical_pos is not None:
        # Custom start and end positions (accept any array-like)
        physical_pos = np.asarray(physical_pos)
        if chromosomes is None:
            unique_chrom = self.unique_chrom
            if len(unique_chrom) != 1:
                raise ValueError("Multiple chromosomes detected, but `chromosomes` was not provided.")
            # All windows belong to the single chromosome present
            chromosomes = np.array([unique_chrom[0]] * physical_pos.shape[0])

    else:
        # Match existing windows of a reference laiobj
        physical_pos = laiobj.physical_pos
        chromosomes = laiobj.chromosomes
        window_sizes = laiobj.window_sizes

    # Allocate the output LAI array with the same layout as the SNP-level data
    n_windows = physical_pos.shape[0]
    n_samples = self.n_samples
    lai_is_3d = self.calldata_lai.ndim == 3
    if lai_is_3d:
        lai = np.zeros((n_windows, n_samples, 2))
    else:
        lai = np.zeros((n_windows, n_samples * 2))

    # For each window, find the relevant SNPs and compute the mode of the ancestries
    for i, ((start, end), chrom) in enumerate(zip(physical_pos, chromosomes)):
        snps_mask = (
            (self.variants_chrom == chrom) &
            (self.variants_pos >= start) &
            (self.variants_pos <= end)
        )
        if np.any(snps_mask):
            lai_mask = self.calldata_lai[snps_mask, ...]
            lai[i] = mode(lai_mask, axis=0, nan_policy='omit').mode
        else:
            lai[i] = np.nan

    # Generate haplotype labels, e.g. "Sample1.0", "Sample1.1"
    haplotypes = [f"{sample}.{i}" for sample in self.samples for i in range(2)]

    # If original data was (n_snps, n_samples, 2), flatten to (n_windows, n_samples*2)
    if lai_is_3d:
        lai = lai.reshape(n_windows, -1)

    return LocalAncestryObject(
        haplotypes=haplotypes,
        lai=lai,
        samples=self.samples,
        ancestry_map=self.ancestry_map,
        window_sizes=window_sizes,
        physical_pos=physical_pos,
        chromosomes=chromosomes
    )
def save(self, file: Union[str, Path]) -> None:
    """
    Save the data stored in `self` to a file, choosing the writer from the
    (case-insensitive) extension of `file`.

    **Supported formats:**

    - `.bed`: Binary PED (Plink) format.
    - `.pgen`: Plink2 binary genotype format.
    - `.vcf`: Variant Call Format.
    - `.pkl`: Pickle format for saving `self` in serialized form.

    Args:
        file (str or pathlib.Path):
            Destination path. Its extension must be one of `.bed`, `.pgen`, `.vcf`, `.pkl`.

    Raises:
        ValueError: If the extension is not one of the supported formats.
    """
    ext = Path(file).suffix.lower()

    # Extension -> name of the method that handles it
    saver_names = {
        '.bed': 'save_bed',
        '.pgen': 'save_pgen',
        '.vcf': 'save_vcf',
        '.pkl': 'save_pickle',
    }
    saver_name = saver_names.get(ext)
    if saver_name is None:
        raise ValueError(f"Unsupported file extension: {ext}")
    getattr(self, saver_name)(file)

def save_bed(self, file: Union[str, Path]) -> None:
    """
    Save the data stored in `self` to a `.bed` file via `BEDWriter`.

    Args:
        file (str or pathlib.Path):
            Path to the file where the data will be saved. It should end with `.bed`;
            the path is passed to `BEDWriter` unchanged.
    """
    from snputils.snp.io.write.bed import BEDWriter
    BEDWriter(snpobj=self, filename=file).write()

def save_pgen(self, file: Union[str, Path]) -> None:
    """
    Save the data stored in `self` to a `.pgen` file via `PGENWriter`.

    Args:
        file (str or pathlib.Path):
            Path to the file where the data will be saved. It should end with `.pgen`;
            the path is passed to `PGENWriter` unchanged.
    """
    from snputils.snp.io.write.pgen import PGENWriter
    PGENWriter(snpobj=self, filename=file).write()

def save_vcf(self, file: Union[str, Path]) -> None:
    """
    Save the data stored in `self` to a `.vcf` file via `VCFWriter`.

    Args:
        file (str or pathlib.Path):
            Path to the file where the data will be saved. It should end with `.vcf`;
            the path is passed to `VCFWriter` unchanged.
    """
    from snputils.snp.io.write.vcf import VCFWriter
    VCFWriter(snpobj=self, filename=file).write()
def save_pickle(self, file: Union[str, Path]) -> None:
    """
    Save `self` in serialized (pickle) form to `file`.

    Args:
        file (str or pathlib.Path):
            Path to the file where the data will be saved. It should end with `.pkl`.
            The path is used exactly as given; no extension is appended.
    """
    import pickle

    # Use a distinct name for the handle so the `file` argument is not rebound
    with open(file, 'wb') as handle:
        pickle.dump(self, handle)
2017 """ 2018 if regex: 2019 # Use regular expression matching to find replacements 2020 for key, value in dictionary.items(): 2021 if isinstance(key, str): 2022 match = re.match(key, val) 2023 if match: 2024 # Replace using the first matching regex pattern 2025 return re.sub(key, value, val) 2026 # Return the original value if no regex match is found 2027 return val 2028 else: 2029 # Return the value for `val` if present in `dictionary`; otherwise, return `val` 2030 return dictionary.get(val, val) 2031 2032 @staticmethod 2033 def _get_chromosome_number(chrom_string: str) -> Union[int, str]: 2034 """ 2035 Extracts the chromosome number from the given chromosome string. 2036 2037 Args: 2038 chrom_string (str): 2039 The chromosome identifier. 2040 2041 Returns: 2042 int or str: 2043 The numeric representation of the chromosome if detected. 2044 Returns 10001 for 'X' or 'chrX', 10002 for 'Y' or 'chrY', 2045 and the original `chrom_string` if unrecognized. 2046 """ 2047 if chrom_string.isdigit(): 2048 return int(chrom_string) 2049 else: 2050 chrom_num = re.search(r'\d+', chrom_string) 2051 if chrom_num: 2052 return int(chrom_num.group()) 2053 elif chrom_string.lower() in ['x', 'chrx']: 2054 return 10001 2055 elif chrom_string.lower() in ['y', 'chry']: 2056 return 10002 2057 else: 2058 log.warning(f"Chromosome nomenclature not standard. Chromosome: {chrom_string}") 2059 return chrom_string 2060 2061 def _sanity_check(self) -> None: 2062 """ 2063 Perform sanity checks to ensure LAI and ancestry map consistency. 2064 2065 This method checks that all unique ancestries in the LAI data are represented 2066 in the ancestry map if it is provided. 
2067 """ 2068 if self.__calldata_lai is not None and self.__ancestry_map is not None: 2069 unique_ancestries = np.unique(self.__calldata_lai) 2070 missing_ancestries = [anc for anc in unique_ancestries if str(anc) not in self.__ancestry_map] 2071 if missing_ancestries: 2072 warnings.warn(f"Missing ancestries in ancestry_map: {missing_ancestries}")
A class for Single Nucleotide Polymorphism (SNP) data, with optional support for SNP-level Local Ancestry Information (LAI).
def __init__(
    self,
    calldata_gt: Optional[np.ndarray] = None,
    samples: Optional[np.ndarray] = None,
    variants_ref: Optional[np.ndarray] = None,
    variants_alt: Optional[np.ndarray] = None,
    variants_chrom: Optional[np.ndarray] = None,
    variants_filter_pass: Optional[np.ndarray] = None,
    variants_id: Optional[np.ndarray] = None,
    variants_pos: Optional[np.ndarray] = None,
    variants_qual: Optional[np.ndarray] = None,
    calldata_lai: Optional[np.ndarray] = None,
    ancestry_map: Optional[Dict[str, str]] = None
) -> None:
    """
    Store the provided arrays on the instance and run a consistency check
    between `calldata_lai` and `ancestry_map`.

    All arguments are optional and are stored as given (no copy, no conversion).

    Args:
        calldata_gt (array, optional):
            An array containing genotype data for each sample. This array can be either 2D with shape
            `(n_snps, n_samples)` if the paternal and maternal strands are summed, or 3D with shape
            `(n_snps, n_samples, 2)` if the strands are kept separate.
        samples (array of shape (n_samples,), optional):
            An array containing unique sample identifiers.
        variants_ref (array of shape (n_snps,), optional):
            An array containing the reference allele for each SNP.
        variants_alt (array of shape (n_snps,), optional):
            An array containing the alternate allele for each SNP.
        variants_chrom (array of shape (n_snps,), optional):
            An array containing the chromosome for each SNP.
        variants_filter_pass (array of shape (n_snps,), optional):
            An array indicating whether each SNP passed control checks.
        variants_id (array of shape (n_snps,), optional):
            An array containing unique identifiers (IDs) for each SNP.
        variants_pos (array of shape (n_snps,), optional):
            An array containing the chromosomal positions for each SNP.
        variants_qual (array of shape (n_snps,), optional):
            An array containing the Phred-scaled quality score for each SNP.
        calldata_lai (array, optional):
            An array containing the ancestry for each SNP. This array can be either 2D with shape
            `(n_snps, n_samples*2)`, or 3D with shape `(n_snps, n_samples, 2)`.
        ancestry_map (dict of str to str, optional):
            A dictionary mapping ancestry codes to region names.
    """
    # Keep this assignment order: `keys()` lists attributes in `vars(self)`
    # order, so reordering these lines changes the order callers observe.
    self.__calldata_gt = calldata_gt
    self.__samples = samples
    self.__variants_ref = variants_ref
    self.__variants_alt = variants_alt
    self.__variants_chrom = variants_chrom
    self.__variants_filter_pass = variants_filter_pass
    self.__variants_id = variants_id
    self.__variants_pos = variants_pos
    self.__variants_qual = variants_qual
    self.__calldata_lai = calldata_lai
    self.__ancestry_map = ancestry_map

    # Must run last: it reads the LAI array and the ancestry map set above
    self._sanity_check()
Arguments:
- calldata_gt (array, optional): An array containing genotype data for each sample. This array can be either 2D with shape
(n_snps, n_samples) if the paternal and maternal strands are summed, or 3D with shape (n_snps, n_samples, 2) if the strands are kept separate.
- samples (array of shape (n_samples,), optional): An array containing unique sample identifiers.
- variants_ref (array of shape (n_snps,), optional): An array containing the reference allele for each SNP.
- variants_alt (array of shape (n_snps,), optional): An array containing the alternate allele for each SNP.
- variants_chrom (array of shape (n_snps,), optional): An array containing the chromosome for each SNP.
- variants_filter_pass (array of shape (n_snps,), optional): An array indicating whether each SNP passed control checks.
- variants_id (array of shape (n_snps,), optional): An array containing unique identifiers (IDs) for each SNP.
- variants_pos (array of shape (n_snps,), optional): An array containing the chromosomal positions for each SNP.
- variants_qual (array of shape (n_snps,), optional): An array containing the Phred-scaled quality score for each SNP.
- calldata_lai (array, optional): An array containing the ancestry for each SNP. This array can be either 2D with shape
(n_snps, n_samples*2), or 3D with shape (n_snps, n_samples, 2).
- ancestry_map (dict of str to str, optional): A dictionary mapping ancestry codes to region names.
102 @property 103 def calldata_gt(self) -> np.ndarray: 104 """ 105 Retrieve `calldata_gt`. 106 107 Returns: 108 **array:** 109 An array containing genotype data for each sample. This array can be either 2D with shape 110 `(n_snps, n_samples)` if the paternal and maternal strands are summed, or 3D with shape 111 `(n_snps, n_samples, 2)` if the strands are kept separate. 112 """ 113 return self.__calldata_gt
Retrieve calldata_gt.
Returns:
array: An array containing genotype data for each sample. This array can be either 2D with shape
(n_snps, n_samples) if the paternal and maternal strands are summed, or 3D with shape (n_snps, n_samples, 2) if the strands are kept separate.
122 @property 123 def samples(self) -> Optional[np.ndarray]: 124 """ 125 Retrieve `samples`. 126 127 Returns: 128 **array of shape (n_samples,):** 129 An array containing unique sample identifiers. 130 """ 131 return self.__samples
Retrieve samples.
Returns:
array of shape (n_samples,): An array containing unique sample identifiers.
140 @property 141 def variants_ref(self) -> Optional[np.ndarray]: 142 """ 143 Retrieve `variants_ref`. 144 145 Returns: 146 **array of shape (n_snps,):** An array containing the reference allele for each SNP. 147 """ 148 return self.__variants_ref
Retrieve variants_ref.
Returns:
array of shape (n_snps,): An array containing the reference allele for each SNP.
157 @property 158 def variants_alt(self) -> Optional[np.ndarray]: 159 """ 160 Retrieve `variants_alt`. 161 162 Returns: 163 **array of shape (n_snps,):** An array containing the alternate allele for each SNP. 164 """ 165 return self.__variants_alt
Retrieve variants_alt.
Returns:
array of shape (n_snps,): An array containing the alternate allele for each SNP.
174 @property 175 def variants_chrom(self) -> Optional[np.ndarray]: 176 """ 177 Retrieve `variants_chrom`. 178 179 Returns: 180 **array of shape (n_snps,):** An array containing the chromosome for each SNP. 181 """ 182 return self.__variants_chrom
Retrieve variants_chrom.
Returns:
array of shape (n_snps,): An array containing the chromosome for each SNP.
191 @property 192 def variants_filter_pass(self) -> Optional[np.ndarray]: 193 """ 194 Retrieve `variants_filter_pass`. 195 196 Returns: 197 **array of shape (n_snps,):** An array indicating whether each SNP passed control checks. 198 """ 199 return self.__variants_filter_pass
Retrieve variants_filter_pass.
Returns:
array of shape (n_snps,): An array indicating whether each SNP passed control checks.
208 @property 209 def variants_id(self) -> Optional[np.ndarray]: 210 """ 211 Retrieve `variants_id`. 212 213 Returns: 214 **array of shape (n_snps,):** An array containing unique identifiers (IDs) for each SNP. 215 """ 216 return self.__variants_id
Retrieve variants_id.
Returns:
array of shape (n_snps,): An array containing unique identifiers (IDs) for each SNP.
225 @property 226 def variants_pos(self) -> Optional[np.ndarray]: 227 """ 228 Retrieve `variants_pos`. 229 230 Returns: 231 **array of shape (n_snps,):** An array containing the chromosomal positions for each SNP. 232 """ 233 return self.__variants_pos
Retrieve variants_pos.
Returns:
array of shape (n_snps,): An array containing the chromosomal positions for each SNP.
242 @property 243 def variants_qual(self) -> Optional[np.ndarray]: 244 """ 245 Retrieve `variants_qual`. 246 247 Returns: 248 **array of shape (n_snps,):** An array containing the Phred-scaled quality score for each SNP. 249 """ 250 return self.__variants_qual
Retrieve variants_qual.
Returns:
array of shape (n_snps,): An array containing the Phred-scaled quality score for each SNP.
259 @property 260 def calldata_lai(self) -> Optional[np.ndarray]: 261 """ 262 Retrieve `calldata_lai`. 263 264 Returns: 265 **array:** 266 An array containing the ancestry for each SNP. This array can be either 2D with shape 267 `(n_snps, n_samples*2)`, or 3D with shape (n_snps, n_samples, 2). 268 """ 269 return self.__calldata_lai
Retrieve calldata_lai.
Returns:
array: An array containing the ancestry for each SNP. This array can be either 2D with shape
(n_snps, n_samples*2), or 3D with shape (n_snps, n_samples, 2).
278 @property 279 def ancestry_map(self) -> Optional[Dict[str, str]]: 280 """ 281 Retrieve `ancestry_map`. 282 283 Returns: 284 **dict of str to str:** A dictionary mapping ancestry codes to region names. 285 """ 286 return self.__ancestry_map
Retrieve ancestry_map.
Returns:
dict of str to str: A dictionary mapping ancestry codes to region names.
295 @property 296 def n_samples(self) -> int: 297 """ 298 Retrieve `n_samples`. 299 300 Returns: 301 **int:** The total number of samples. 302 """ 303 if self.__samples is not None: 304 return len(self.__samples) 305 elif self.__calldata_gt is not None: 306 return self.__calldata_gt.shape[1] 307 elif self.__calldata_lai is not None: 308 if self.__calldata_lai.ndim == 2: 309 return self.__calldata_lai.shape[1] // 2 310 elif self.__calldata_lai.ndim == 3: 311 return self.__calldata_lai.shape[1] 312 else: 313 raise ValueError("Unable to determine the total number of samples: no relevant data is available.")
315 @property 316 def n_snps(self) -> int: 317 """ 318 Retrieve `n_snps`. 319 320 Returns: 321 **int:** The total number of SNPs. 322 """ 323 # List of attributes that can indicate the number of SNPs 324 potential_attributes = [ 325 self.__calldata_gt, 326 self.__variants_ref, 327 self.__variants_alt, 328 self.__variants_chrom, 329 self.__variants_filter_pass, 330 self.__variants_id, 331 self.__variants_pos, 332 self.__variants_qual, 333 self.__calldata_lai 334 ] 335 336 # Check each attribute for its first dimension, which corresponds to `n_snps` 337 for attr in potential_attributes: 338 if attr is not None: 339 return attr.shape[0] 340 341 raise ValueError("Unable to determine the total number of SNPs: no relevant data is available.")
343 @property 344 def n_chrom(self) -> Optional[int]: 345 """ 346 Retrieve `n_chrom`. 347 348 Returns: 349 **int:** The total number of unique chromosomes in `variants_chrom`. 350 """ 351 if self.variants_chrom is None: 352 warnings.warn("Chromosome data `variants_chrom` is None.") 353 return None 354 355 return len(self.unique_chrom)
357 @property 358 def n_ancestries(self) -> int: 359 """ 360 Retrieve `n_ancestries`. 361 362 Returns: 363 **int:** The total number of unique ancestries. 364 """ 365 if self.__calldata_lai is not None: 366 return len(np.unique(self.__calldata_lai)) 367 else: 368 raise ValueError("Unable to determine the total number of ancestries: no relevant data is available.")
370 @property 371 def unique_chrom(self) -> Optional[np.ndarray]: 372 """ 373 Retrieve `unique_chrom`. 374 375 Returns: 376 **array:** The unique chromosome names in `variants_chrom`, preserving their order of appearance. 377 """ 378 if self.variants_chrom is None: 379 warnings.warn("Chromosome data `variants_chrom` is None.") 380 return None 381 382 # Identify unique chromosome names and their first indexes of occurrence 383 _, idx = np.unique(self.variants_chrom, return_index=True) 384 # Return chromosome names sorted by their first occurrence to maintain original order 385 return self.variants_chrom[np.sort(idx)]
Retrieve unique_chrom.
Returns:
array: The unique chromosome names in
variants_chrom, preserving their order of appearance.
387 @property 388 def are_strands_summed(self) -> bool: 389 """ 390 Retrieve `are_strands_summed`. 391 392 Returns: 393 **bool:** 394 True if the maternal and paternal strands have been summed together, which is indicated by 395 `calldata_gt` having shape `(n_samples, n_snps)`. False if the strands are stored separately, 396 indicated by `calldata_gt` having shape `(n_samples, n_snps, 2)`. 397 """ 398 if self.calldata_gt is None: 399 warnings.warn("Genotype data `calldata_gt` is None.") 400 return None 401 402 return self.calldata_gt.ndim == 2
Retrieve are_strands_summed.
Returns:
bool: True if the maternal and paternal strands have been summed together, which is indicated by
calldata_gt having shape (n_snps, n_samples). False if the strands are stored separately, indicated by calldata_gt having shape (n_snps, n_samples, 2).
404 def copy(self) -> SNPObject: 405 """ 406 Create and return a copy of `self`. 407 408 Returns: 409 **SNPObject:** 410 A new instance of the current object. 411 """ 412 return copy.deepcopy(self)
Create and return a copy of self.
Returns:
SNPObject: A new instance of the current object.
414 def keys(self) -> List[str]: 415 """ 416 Retrieve a list of public attribute names for `self`. 417 418 Returns: 419 **list of str:** 420 A list of attribute names, with internal name-mangling removed, 421 for easier reference to public attributes in the instance. 422 """ 423 return [attr.replace('_SNPObject__', '') for attr in vars(self)]
Retrieve a list of public attribute names for self.
Returns:
list of str: A list of attribute names, with internal name-mangling removed, for easier reference to public attributes in the instance.
def allele_freq(
    self,
    sample_labels: Optional[Sequence[Any]] = None,
    ancestry: Optional[Union[str, int]] = None,
    laiobj: Optional["LocalAncestryObject"] = None,
    return_counts: bool = False,
    as_dataframe: bool = False,
) -> Any:
    """
    Compute per-SNP alternate allele frequencies from `calldata_gt`.

    The aggregation itself is delegated to `aggregate_pop_allele_freq`; this
    method validates the inputs, resolves the LAI array to use for
    ancestry masking, and shapes the output.

    Args:
        sample_labels (sequence, optional):
            Population label per sample. If None, computes cohort-level frequencies.
        ancestry (str or int, optional):
            If provided, compute ancestry-masked frequencies using SNP-level LAI.
        laiobj (LocalAncestryObject, optional):
            Optional LAI object used when `self.calldata_lai` is not set.
        return_counts (bool, default=False):
            If True, also return called-allele counts with the same shape as frequencies.
        as_dataframe (bool, default=False):
            If True, return pandas DataFrame output.

    Returns:
        Frequencies as a NumPy array (or DataFrame if `as_dataframe=True`).
        If `return_counts=True`, returns `(freq, counts)`.

    Raises:
        ValueError: If `calldata_gt` is None or is not 2D/3D, if
            `sample_labels` does not have one entry per sample, or if
            `ancestry` is given but no SNP-level LAI could be obtained. In
            the last case, a failure raised while converting `laiobj` is
            attached as the exception's `__cause__`.
    """
    if self.calldata_gt is None:
        raise ValueError("Genotype data `calldata_gt` is None.")

    gt = np.asarray(self.calldata_gt)
    if gt.ndim not in (2, 3):
        raise ValueError("'calldata_gt' must be 2D or 3D array")

    # Samples are indexed along the second axis of the genotype array.
    n_samples = gt.shape[1]

    grouped_output = sample_labels is not None
    if grouped_output:
        labels = np.asarray(sample_labels).ravel()
        if labels.shape[0] != n_samples:
            raise ValueError(
                "'sample_labels' must have length equal to the number of samples in `calldata_gt`."
            )
    else:
        # A single synthetic label puts every sample into one cohort.
        labels = np.repeat("__all__", n_samples)

    calldata_lai = None
    if ancestry is not None:
        conversion_error = None
        if self.calldata_lai is not None:
            calldata_lai = self.calldata_lai
        elif laiobj is not None:
            try:
                converted_lai = laiobj.convert_to_snp_level(snpobject=self, lai_format="3D")
                calldata_lai = getattr(converted_lai, "calldata_lai", None)
            except Exception as err:
                # Keep the best-effort behaviour (fall through to the
                # ValueError below) but remember why the conversion failed.
                conversion_error = err
                calldata_lai = None

        if calldata_lai is None:
            raise ValueError(
                "Ancestry-specific masking requires SNP-level LAI "
                "(provide a LocalAncestryObject via 'laiobj' or ensure 'self.calldata_lai' is set)."
            ) from conversion_error

    afs, counts, pops = aggregate_pop_allele_freq(
        calldata_gt=gt,
        sample_labels=labels,
        ancestry=ancestry,
        calldata_lai=calldata_lai,
    )

    if grouped_output:
        freq_out = afs
        count_out = counts
        if as_dataframe:
            import pandas as pd

            freq_out = pd.DataFrame(afs, columns=pops)
            count_out = pd.DataFrame(counts, columns=pops)
    else:
        # Cohort-level output: only the single synthetic group's column.
        freq_out = afs[:, 0]
        count_out = counts[:, 0]
        if as_dataframe:
            import pandas as pd

            freq_out = pd.DataFrame({"allele_freq": freq_out})
            count_out = pd.DataFrame({"called_alleles": count_out})

    if return_counts:
        return freq_out, count_out
    return freq_out
Compute per-SNP alternate allele frequencies from calldata_gt.
Arguments:
- sample_labels (sequence, optional): Population label per sample. If None, computes cohort-level frequencies.
- ancestry (str or int, optional): If provided, compute ancestry-masked frequencies using SNP-level LAI.
- laiobj (LocalAncestryObject, optional): Optional LAI object used when `self.calldata_lai` is not set.
- return_counts (bool, default=False): If True, also return called-allele counts with the same shape as frequencies.
- as_dataframe (bool, default=False): If True, return pandas DataFrame output.
Returns:
Frequencies as a NumPy array (or DataFrame if `as_dataframe=True`). If `return_counts=True`, returns `(freq, counts)`.
def sum_strands(self, inplace: bool = False) -> Optional['SNPObject']:
    """
    Collapse the strand axis of `calldata_gt` by adding the strands together.

    The sum is taken over axis 2 of `calldata_gt` and stored as `np.int8`.

    Args:
        inplace (bool, default=False):
            If True, the summed genotypes are written back to `self`. If False,
            they are stored on a copy obtained from `self.copy()`.

    Returns:
        **Optional[SNPObject]:**
            The object holding the summed genotypes: `self` when `inplace=True`,
            otherwise the copy. If `calldata_gt` is None a warning is issued and
            the result is `self` for `inplace=True` and None otherwise. If
            `are_strands_summed` is already true a warning is issued, nothing is
            summed, and `self` (in place) or a fresh copy is returned.
    """
    if self.calldata_gt is None:
        warnings.warn("Genotype data `calldata_gt` is None.")
        return self if inplace else None

    if self.are_strands_summed:
        warnings.warn("Genotype data `calldata_gt` is already summed.")
        return self if inplace else self.copy()

    # Pick the object that receives the result, then sum from `self`.
    destination = self if inplace else self.copy()
    destination.calldata_gt = self.calldata_gt.sum(axis=2, dtype=np.int8)
    return destination
Sum paternal and maternal strands.
Arguments:
- inplace (bool, default=False): If True, modifies `self` in place. If False, returns a new `SNPObject` with the strands summed. Default is False.
Returns:
Optional[SNPObject]: A new `SNPObject` with summed strands if `inplace=False`. If `inplace=True`, modifies `self` in place and returns `self`.
def filter_variants(
    self,
    chrom: Optional[Union[str, Sequence[str], np.ndarray, None]] = None,
    pos: Optional[Union[int, Sequence[int], np.ndarray, None]] = None,
    indexes: Optional[Union[int, Sequence[int], np.ndarray, None]] = None,
    include: bool = True,
    inplace: bool = False
) -> Optional['SNPObject']:
    """
    Filter variants based on specified chromosome names, variant positions, or variant indexes.

    This method updates the `calldata_gt`, `variants_ref`, `variants_alt`,
    `variants_chrom`, `variants_filter_pass`, `variants_id`, `variants_pos`,
    `variants_qual`, and `calldata_lai` attributes (those that are not None) to
    include or exclude the specified variants. `chrom` and `pos` given together
    select variants matching both; the result is then combined with `indexes`
    by union. The order of the variants is preserved.

    Negative indexes are supported and follow
    [NumPy's indexing conventions](https://numpy.org/doc/stable/user/basics.indexing.html).

    Args:
        chrom (str or array_like of str, optional):
            Chromosome(s) to filter variants by. Can be a single chromosome as a string or a sequence
            of chromosomes. If both `chrom` and `pos` are provided, they must either have matching lengths
            (pairing each chromosome with a position) or `chrom` should be a single value that applies to
            all positions in `pos`. Default is None.
        pos (int or array_like of int, optional):
            Position(s) to filter variants by. Can be a single position as an integer or a sequence of positions.
            If `chrom` is also provided, `pos` should either match `chrom` in length or `chrom` should be a
            single value. Default is None.
        indexes (int or array_like of int, optional):
            Index(es) of the variants to include or exclude. Can be a single index or a sequence
            of indexes. Negative indexes are supported. An empty sequence selects no variant.
            Default is None.
        include (bool, default=True):
            If True, includes only the specified variants. If False, excludes the specified
            variants. Default is True.
        inplace (bool, default=False):
            If True, modifies `self` in place. If False, returns a new `SNPObject` with the variants
            filtered. Default is False.

    Returns:
        **Optional[SNPObject]:**
            A new `SNPObject` with the specified variants filtered if `inplace=False`.
            If `inplace=True`, modifies `self` in place and returns None.

    Raises:
        ValueError: If no criterion is given, if `chrom` and `pos` have
            incompatible lengths, or if any index is out of bounds.
    """
    if chrom is None and pos is None and indexes is None:
        raise ValueError("At least one of 'chrom', 'pos', or 'indexes' must be provided.")

    n_snps = self.n_snps

    # Convert inputs to arrays for consistency
    chrom = np.atleast_1d(chrom) if chrom is not None else None
    pos = np.atleast_1d(pos) if pos is not None else None
    indexes = np.atleast_1d(indexes) if indexes is not None else None

    # Validate chrom and pos lengths if both are provided
    if chrom is not None and pos is not None:
        if len(chrom) != len(pos) and len(chrom) > 1:
            raise ValueError(
                "When both 'chrom' and 'pos' are provided, they must either be of the same length "
                "or 'chrom' must be a single value."
            )

    # Create a mask for chromosome and position filtering
    mask_combined = np.zeros(n_snps, dtype=bool)
    if chrom is not None and pos is not None:
        if len(chrom) == 1:
            # Apply single chromosome to all positions in `pos`
            mask_combined = (self['variants_chrom'] == chrom[0]) & np.isin(self['variants_pos'], pos)
        else:
            # Pair matching: pack (chrom, pos) into structured records so that
            # membership is tested on the pair, not on each field separately.
            pair_dtype = [
                ('chrom', self['variants_chrom'].dtype),
                ('pos', self['variants_pos'].dtype)
            ]
            query_pairs = np.array(list(zip(chrom, pos)), dtype=pair_dtype)
            data_pairs = np.array(
                list(zip(self['variants_chrom'], self['variants_pos'])),
                dtype=pair_dtype
            )
            mask_combined = np.isin(data_pairs, query_pairs)
    elif chrom is not None:
        # Only chromosome filtering
        mask_combined = np.isin(self['variants_chrom'], chrom)
    elif pos is not None:
        # Only position filtering
        mask_combined = np.isin(self['variants_pos'], pos)

    # Create mask based on indexes if provided
    if indexes is not None:
        # Validate indexes, allowing negative indexes
        out_of_bounds = (indexes < -n_snps) | (indexes >= n_snps)
        if out_of_bounds.any():
            raise ValueError("One or more variant indexes are out of bounds.")

        mask_indexes = np.zeros(n_snps, dtype=bool)
        # An empty sequence becomes a float array that cannot be used for
        # fancy indexing; it selects nothing, so skip the assignment.
        if indexes.size:
            # Map negative indexes onto their non-negative equivalents
            mask_indexes[np.mod(indexes, n_snps)] = True

        # Combine with `chrom` and `pos` mask using logical OR (union of all specified criteria)
        mask_combined = mask_combined | mask_indexes

    # Invert mask if `include` is False
    if not include:
        mask_combined = ~mask_combined

    # Per-variant attributes; all are indexed by variant along their first axis
    keys = [
        'calldata_gt', 'variants_ref', 'variants_alt', 'variants_chrom', 'variants_filter_pass',
        'variants_id', 'variants_pos', 'variants_qual', 'calldata_lai'
    ]

    # Filter either `self` or a copy, depending on `inplace`
    target = self if inplace else self.copy()
    for key in keys:
        if target[key] is not None:
            target[key] = np.asarray(target[key])[mask_combined]

    return None if inplace else target
Filter variants based on specified chromosome names, variant positions, or variant indexes.
This method updates the calldata_gt, variants_ref, variants_alt,
variants_chrom, variants_filter_pass, variants_id, variants_pos,
variants_qual, and lai attributes to include or exclude the specified variants. The filtering
criteria can be based on chromosome names, variant positions, or indexes. If multiple
criteria are provided, their union is used for filtering. The order of the variants is preserved.
Negative indexes are supported and follow NumPy's indexing conventions.
Arguments:
- chrom (str or array_like of str, optional): Chromosome(s) to filter variants by. Can be a single chromosome as a string or a sequence of chromosomes. If both `chrom` and `pos` are provided, they must either have matching lengths (pairing each chromosome with a position) or `chrom` should be a single value that applies to all positions in `pos`. Default is None.
- pos (int or array_like of int, optional): Position(s) to filter variants by. Can be a single position as an integer or a sequence of positions. If `chrom` is also provided, `pos` should either match `chrom` in length or `chrom` should be a single value. Default is None.
- indexes (int or array_like of int, optional): Index(es) of the variants to include or exclude. Can be a single index or a sequence of indexes. Negative indexes are supported. Default is None.
- include (bool, default=True): If True, includes only the specified variants. If False, excludes the specified variants. Default is True.
- inplace (bool, default=False): If True, modifies `self` in place. If False, returns a new `SNPObject` with the variants filtered. Default is False.
Returns:
Optional[SNPObject]: A new `SNPObject` with the specified variants filtered if `inplace=False`. If `inplace=True`, modifies `self` in place and returns None.
def filter_samples(
    self,
    samples: Optional[Union[str, Sequence[str], np.ndarray, None]] = None,
    indexes: Optional[Union[int, Sequence[int], np.ndarray, None]] = None,
    include: bool = True,
    reorder: bool = False,
    inplace: bool = False
) -> Optional['SNPObject']:
    """
    Filter samples based on specified names or indexes.

    This method updates the `samples`, `calldata_gt` and `calldata_lai` attributes
    (those that are not None) to include or exclude the specified samples. By
    default the original sample order is preserved; set `reorder=True` to follow
    the ordering of the provided `samples` and/or `indexes` when including.

    When both sample names and indexes are provided, the union of the specified
    samples is used. Negative indexes are supported and follow
    [NumPy's indexing conventions](https://numpy.org/doc/stable/user/basics.indexing.html).

    A 2D `calldata_lai` is treated as haplotype-level data in which sample `i`
    owns columns `2*i` and `2*i + 1`; both columns of a sample are kept or
    dropped together and stay adjacent in the output.

    Args:
        samples (str or array_like of str, optional):
            Name(s) of the samples to include or exclude. Can be a single sample name or a
            sequence of sample names. Default is None.
        indexes (int or array_like of int, optional):
            Index(es) of the samples to include or exclude. Can be a single index or a sequence
            of indexes. Negative indexes are supported. An empty sequence selects no sample.
            Default is None.
        include (bool, default=True):
            If True, includes only the specified samples. If False, excludes the specified
            samples. Default is True.
        reorder (bool, default=False):
            Only used when `include=True`. If True, the selected samples are ordered by their
            first appearance in `samples`, then in `indexes`, and any remaining selected sample
            keeps its original relative order. Default is False.
        inplace (bool, default=False):
            If True, modifies `self` in place. If False, returns a new `SNPObject` with the samples
            filtered. Default is False.

    Returns:
        **Optional[SNPObject]:**
            A new `SNPObject` with the specified samples filtered if `inplace=False`.
            If `inplace=True`, modifies `self` in place and returns None.

    Raises:
        ValueError: If neither `samples` nor `indexes` is given, if a requested
            sample name is not present, or if any index is out of bounds.
    """
    if samples is None and indexes is None:
        raise ValueError("At least one of 'samples' or 'indexes' must be provided.")

    n_samples = self.n_samples
    sample_names = np.array(self['samples'])

    # Create mask based on sample names
    if samples is not None:
        samples = np.asarray(samples).ravel()
        mask_samples = np.isin(sample_names, samples)
        missing_samples = samples[~np.isin(samples, sample_names)]
        if missing_samples.size > 0:
            raise ValueError(f"The following specified samples were not found: {missing_samples.tolist()}")
    else:
        mask_samples = np.zeros(n_samples, dtype=bool)

    # Create mask based on sample indexes
    mask_indexes = np.zeros(n_samples, dtype=bool)
    if indexes is not None:
        indexes = np.asarray(indexes).ravel()

        # Validate indexes, allowing negative indexes
        if ((indexes < -n_samples) | (indexes >= n_samples)).any():
            raise ValueError("One or more sample indexes are out of bounds.")

        # An empty sequence becomes a float array that cannot be used for
        # fancy indexing; it selects nothing, so skip the assignment.
        if indexes.size:
            # Map negative indexes onto their non-negative equivalents
            mask_indexes[np.mod(indexes, n_samples)] = True

    # Combine masks using logical OR (union of samples)
    mask_combined = mask_samples | mask_indexes

    if not include:
        mask_combined = ~mask_combined

    # If requested, compute an ordering of selected samples that follows the provided lists.
    ordered_indices = None
    if include and reorder:
        ordered_list = []
        added = np.zeros(n_samples, dtype=bool)

        # Prioritize the order in `samples`
        if samples is not None:
            name_to_idx = {name: idx for idx, name in enumerate(sample_names)}
            for s in samples:
                idx = name_to_idx.get(s)
                if idx is not None and mask_combined[idx] and not added[idx]:
                    ordered_list.append(idx)
                    added[idx] = True

        # Then respect the order in `indexes`
        if indexes is not None:
            for idx in np.mod(np.atleast_1d(indexes), n_samples):
                if mask_combined[idx] and not added[idx]:
                    ordered_list.append(int(idx))
                    added[idx] = True

        # Finally, append any remaining selected samples in their original order
        for idx in np.where(mask_combined)[0]:
            if not added[idx]:
                ordered_list.append(int(idx))

        ordered_indices = np.asarray(ordered_list, dtype=int)

    # Attributes indexed by sample (second axis for multi-dimensional arrays)
    keys = ['samples', 'calldata_gt', 'calldata_lai']

    # Filter either `self` or a copy, depending on `inplace`
    target = self if inplace else self.copy()
    for key in keys:
        if target[key] is None:
            continue
        arr = np.asarray(target[key])
        hap_level_lai = key == 'calldata_lai' and arr.ndim == 2

        if ordered_indices is not None:
            if hap_level_lai:
                # Keep each sample's two haplotype columns adjacent:
                # [2*i0, 2*i0 + 1, 2*i1, 2*i1 + 1, ...]
                hap_idx = np.column_stack(
                    (2 * ordered_indices, 2 * ordered_indices + 1)
                ).ravel()
                target[key] = arr[:, hap_idx]
            elif arr.ndim > 1:
                target[key] = arr[:, ordered_indices, ...]
            else:
                target[key] = arr[ordered_indices]
        elif hap_level_lai and arr.shape[1] == 2 * n_samples:
            # Expand the per-sample mask to one entry per haplotype column.
            target[key] = arr[:, np.repeat(mask_combined, 2)]
        elif arr.ndim > 1:
            target[key] = arr[:, mask_combined, ...]
        else:
            target[key] = arr[mask_combined]

    return None if inplace else target
Filter samples based on specified names or indexes.
This method updates the samples and calldata_gt attributes to include or exclude the specified
samples. The order of the samples is preserved. Set reorder=True to match the ordering of the
provided samples and/or indexes lists when including.
If both samples and indexes are provided, any sample matching either a name in samples or an index in indexes will be included or excluded.
This method allows inclusion or exclusion of specific samples by their names or indexes. When both sample names and indexes are provided, the union of the specified samples is used. Negative indexes are supported and follow NumPy's indexing conventions.
Arguments:
- samples (str or array_like of str, optional): Name(s) of the samples to include or exclude. Can be a single sample name or a sequence of sample names. Default is None.
- indexes (int or array_like of int, optional): Index(es) of the samples to include or exclude. Can be a single index or a sequence of indexes. Negative indexes are supported. Default is None.
- include (bool, default=True): If True, includes only the specified samples. If False, excludes the specified samples. Default is True.
- inplace (bool, default=False): If True, modifies `self` in place. If False, returns a new `SNPObject` with the samples filtered. Default is False.
Returns:
Optional[SNPObject]: A new `SNPObject` with the specified samples filtered if `inplace=False`. If `inplace=True`, modifies `self` in place and returns None.
def detect_chromosome_format(self) -> str:
    """
    Report which chromosome naming convention the first entry of `unique_chrom` follows.

    Only `self.unique_chrom[0]` is inspected; the remaining identifiers are
    not checked.

    **Recognized formats:**

    - `'chr'`: Format with 'chr' prefix, e.g., 'chr1', 'chr2', ..., 'chrX', 'chrY', 'chrM'.
    - `'chm'`: Format with 'chm' prefix, e.g., 'chm1', 'chm2', ..., 'chmX', 'chmY', 'chmM'.
    - `'chrom'`: Format with 'chrom' prefix, e.g., 'chrom1', 'chrom2', ..., 'chromX', 'chromY', 'chromM'.
    - `'plain'`: Plain format without a prefix, e.g., '1', '2', ..., 'X', 'Y', 'M'.

    Returns:
        **str:**
            The name of the first matching format (`'chr'`, `'chm'`, `'chrom'`, or `'plain'`),
            or `'Unknown format'` when none of them matches.
    """
    first_chrom = self.unique_chrom[0]

    # (format name, pattern) pairs, tried in this order
    known_formats = (
        ('chr', r'^chr(\d+|X|Y|M)$'),
        ('chm', r'^chm(\d+|X|Y|M)$'),
        ('chrom', r'^chrom(\d+|X|Y|M)$'),
        ('plain', r'^(\d+|X|Y|M)$'),
    )

    detected = next(
        (label for label, regex in known_formats if re.match(regex, first_chrom)),
        None,
    )
    return 'Unknown format' if detected is None else detected
Detect the chromosome naming convention in variants_chrom based on the prefix
of the first chromosome identifier in unique_chrom.
Recognized formats:
- 'chr': Format with 'chr' prefix, e.g., 'chr1', 'chr2', ..., 'chrX', 'chrY', 'chrM'.
- 'chm': Format with 'chm' prefix, e.g., 'chm1', 'chm2', ..., 'chmX', 'chmY', 'chmM'.
- 'chrom': Format with 'chrom' prefix, e.g., 'chrom1', 'chrom2', ..., 'chromX', 'chromY', 'chromM'.
- 'plain': Plain format without a prefix, e.g., '1', '2', ..., 'X', 'Y', 'M'.
If the format does not match any recognized pattern, 'Unknown format' is returned.
Returns:
str: A string indicating the detected chromosome format ('chr', 'chm', 'chrom', or 'plain'). If no recognized format is matched, returns 'Unknown format'.
def convert_chromosome_format(
    self,
    from_format: str,
    to_format: str,
    inplace: bool = False
) -> Optional['SNPObject']:
    """
    Rewrite `variants_chrom` from one chromosome naming convention to another.

    Only the identifiers 1-22, X, Y and M are known to this method; every
    value in `variants_chrom` must be one of them written in `from_format`.

    **Supported formats:**

    - `'chr'`: Format with 'chr' prefix, e.g., 'chr1', 'chr2', ..., 'chrX', 'chrY', 'chrM'.
    - `'chm'`: Format with 'chm' prefix, e.g., 'chm1', 'chm2', ..., 'chmX', 'chmY', 'chmM'.
    - `'chrom'`: Format with 'chrom' prefix, e.g., 'chrom1', 'chrom2', ..., 'chromX', 'chromY', 'chromM'.
    - `'plain'`: Plain format without a prefix, e.g., '1', '2', ..., 'X', 'Y', 'M'.

    Args:
        from_format (str):
            The convention currently used. One of `'chr'`, `'chm'`, `'chrom'`, or `'plain'`.
        to_format (str):
            The convention to convert to. Same options as `from_format`.
        inplace (bool, default=False):
            If True, modifies `self` in place. If False, returns a new `SNPObject` with the converted format.
            Default is False.

    Returns:
        **Optional[SNPObject]:** A new `SNPObject` with the converted chromosome format if `inplace=False`.
        If `inplace=True`, modifies `self` in place and returns None.

    Raises:
        ValueError: If either format name is unsupported, or if `variants_chrom`
            contains a value that is not a known identifier in `from_format`.
    """
    # Every convention is the same base identifier behind a fixed prefix.
    prefix_by_format = {'chr': 'chr', 'chm': 'chm', 'chrom': 'chrom', 'plain': ''}
    base_names = [str(number) for number in range(1, 23)] + ['X', 'Y', 'M']  # M for mitochondrial
    names_by_format = {
        fmt: [prefix + name for name in base_names]
        for fmt, prefix in prefix_by_format.items()
    }

    if not (from_format in names_by_format and to_format in names_by_format):
        raise ValueError(f"Invalid format: {from_format} or {to_format}. Must be one of {list(names_by_format.keys())}.")

    source_names = names_by_format[from_format]

    # Compare as strings regardless of the stored dtype
    current_chroms = self['variants_chrom'].astype(str)

    unexpected = set(current_chroms) - set(source_names)
    if unexpected:
        raise ValueError(f"The following chromosomes do not match the `from_format` '{from_format}': {unexpected}.")

    # One boolean mask per known identifier, aligned with the target names
    is_source_name = [current_chroms == name for name in source_names]

    destination = self if inplace else self.copy()
    destination['variants_chrom'] = np.select(
        is_source_name, names_by_format[to_format], default='unknown'
    )
    return None if inplace else destination
Convert the chromosome format from one naming convention to another in variants_chrom.
Supported formats:
- 'chr': Format with 'chr' prefix, e.g., 'chr1', 'chr2', ..., 'chrX', 'chrY', 'chrM'.
- 'chm': Format with 'chm' prefix, e.g., 'chm1', 'chm2', ..., 'chmX', 'chmY', 'chmM'.
- 'chrom': Format with 'chrom' prefix, e.g., 'chrom1', 'chrom2', ..., 'chromX', 'chromY', 'chromM'.
- 'plain': Plain format without a prefix, e.g., '1', '2', ..., 'X', 'Y', 'M'.
Arguments:
- from_format (str): The current chromosome format. Acceptable values are 'chr', 'chm', 'chrom', or 'plain'.
- to_format (str): The target format for chromosome data conversion. Acceptable values match `from_format` options.
- inplace (bool, default=False): If True, modifies `self` in place. If False, returns a new `SNPObject` with the converted format. Default is False.
Returns:
Optional[SNPObject]: A new `SNPObject` with the converted chromosome format if `inplace=False`. If `inplace=True`, modifies `self` in place and returns None.
def match_chromosome_format(self, snpobj: 'SNPObject', inplace: bool = False) -> Optional['SNPObject']:
    """
    Convert `variants_chrom` of `self` to the naming convention used by a reference `snpobj`.

    Both conventions are obtained from `detect_chromosome_format` (first on
    `self`, then on `snpobj`) and the conversion is delegated to
    `convert_chromosome_format`.

    **Recognized formats:**

    - `'chr'`: Format with 'chr' prefix, e.g., 'chr1', 'chr2', ..., 'chrX', 'chrY', 'chrM'.
    - `'chm'`: Format with 'chm' prefix, e.g., 'chm1', 'chm2', ..., 'chmX', 'chmY', 'chmM'.
    - `'chrom'`: Format with 'chrom' prefix, e.g., 'chrom1', 'chrom2', ..., 'chromX', 'chromY', 'chromM'.
    - `'plain'`: Plain format without a prefix, e.g., '1', '2', ..., 'X', 'Y', 'M'.

    Args:
        snpobj (SNPObject):
            The reference SNPObject whose chromosome format will be matched.
        inplace (bool, default=False):
            Forwarded to `convert_chromosome_format`. If True, modifies `self` in place. If False,
            returns a new `SNPObject` with the chromosome format matching that of `snpobj`.
            Default is False.

    Returns:
        **Optional[SNPObject]:**
            Whatever `convert_chromosome_format` returns: a new `SNPObject` with matched
            chromosome format if `inplace=False`, or None after modifying `self` if `inplace=True`.

    Raises:
        ValueError: If the format of `self` or of `snpobj` is reported as `'Unknown format'`.
    """
    # Each object is probed only after the previous one has been validated.
    probes = (
        (self, "The chromosome format of the current SNPObject is unrecognized."),
        (snpobj, "The chromosome format of the reference SNPObject is unrecognized."),
    )
    detected_formats = []
    for candidate, failure_message in probes:
        detected = candidate.detect_chromosome_format()
        if detected == 'Unknown format':
            raise ValueError(failure_message)
        detected_formats.append(detected)

    own_format, reference_format = detected_formats
    return self.convert_chromosome_format(own_format, reference_format, inplace=inplace)
Convert the chromosome format in variants_chrom from self to match the format of a reference snpobj.
Recognized formats:
- 'chr': Format with 'chr' prefix, e.g., 'chr1', 'chr2', ..., 'chrX', 'chrY', 'chrM'.
- 'chm': Format with 'chm' prefix, e.g., 'chm1', 'chm2', ..., 'chmX', 'chmY', 'chmM'.
- 'chrom': Format with 'chrom' prefix, e.g., 'chrom1', 'chrom2', ..., 'chromX', 'chromY', 'chromM'.
- 'plain': Plain format without a prefix, e.g., '1', '2', ..., 'X', 'Y', 'M'.
Arguments:
- snpobj (SNPObject): The reference SNPObject whose chromosome format will be matched.
- inplace (bool, default=False): If True, modifies `self` in place. If False, returns a new `SNPObject` with the chromosome format matching that of `snpobj`. Default is False.
Returns:
Optional[SNPObject]: A new `SNPObject` with matched chromosome format if `inplace=False`. If `inplace=True`, modifies `self` in place and returns None.
def rename_chrom(
    self,
    to_replace: Union[Dict[str, str], str, List[str]] = {'^([0-9]+)$': r'chr\1', r'^chr([0-9]+)$': r'\1'},
    value: Optional[Union[str, List[str]]] = None,
    regex: bool = True,
    inplace: bool = False
) -> Optional['SNPObject']:
    """
    Replace chromosome values in `variants_chrom` using patterns or exact matches.

    The inputs are normalised into a single `{pattern: replacement}` mapping,
    which is applied to every element of `variants_chrom` through
    `self._match_to_replace`. For standard conversions (e.g., 'chr1' to '1'),
    consider `convert_chromosome_format`.

    Args:
        to_replace (dict, str, or list of str):
            What to replace. The default mapping turns `<chrom_num>` into
            `chr<chrom_num>` and vice versa.
            - If str or list of str: paired positionally with `value`.
            - If dict: keys are the values/patterns to replace and the dict
              values are their replacements; `value` must then be None.
        value (str or list of str, optional):
            Replacement(s) for a str or list `to_replace`. Must be left as None
            when `to_replace` is a dictionary.
        regex (bool, default=True):
            Forwarded to `self._match_to_replace`; if True, the mapping keys are
            meant to be interpreted as regex patterns.
        inplace (bool, default=False):
            If True, modifies `self` in place. If False, returns a new `SNPObject` with the chromosomes
            renamed. Default is False.

    Returns:
        **Optional[SNPObject]:** A new `SNPObject` with the renamed chromosome format if `inplace=False`.
        If `inplace=True`, modifies `self` in place and returns None.

    Raises:
        ValueError: If `to_replace` and `value` are not a (str/list, str/list)
            pair or a (dict, None) pair.
    """
    # Scalars are promoted to one-element lists so that only two shapes remain.
    patterns = [to_replace] if isinstance(to_replace, (str, int)) else to_replace
    replacements = [value] if isinstance(value, (str, int)) else value

    if isinstance(patterns, dict) and replacements is None:
        mapping = patterns
    elif isinstance(patterns, list) and isinstance(replacements, list):
        mapping = dict(zip(patterns, replacements))
    else:
        raise ValueError(
            "Invalid input: `to_replace` and `value` must be compatible types (both str, list of str, or dict)."
        )

    # Element-wise application of the per-value replacement routine
    rename_each = np.vectorize(self._match_to_replace)

    destination = self if inplace else self.copy()
    destination.variants_chrom = rename_each(self.variants_chrom, mapping, regex)
    return None if inplace else destination
Replace chromosome values in variants_chrom using patterns or exact matches.
This method allows flexible chromosome replacements, using regex or exact matches, useful
for non-standard chromosome formats. For standard conversions (e.g., 'chr1' to '1'),
consider convert_chromosome_format.
Arguments:
- to_replace (dict, str, or list of str): Pattern(s) or exact value(s) to be replaced in chromosome names. Default behavior transforms `<chrom_num>` to `chr<chrom_num>` or vice versa. Non-matching values remain unchanged.
    - If str or list of str: Matches will be replaced with `value`.
    - If regex (bool), then any regex matches will be replaced with `value`.
    - If dict: Keys define values to replace, with corresponding replacements as values.
- value (str or list of str, optional): Replacement value(s) if `to_replace` is a string or list. Must be None if `to_replace` is a dictionary.
- regex (bool, default=True): If True, interprets `to_replace` keys as regex patterns.
- inplace (bool, default=False): If True, modifies `self` in place. If False, returns a new `SNPObject` with the chromosomes renamed. Default is False.
Returns:
Optional[SNPObject]: A new `SNPObject` with the renamed chromosome format if `inplace=False`. If `inplace=True`, modifies `self` in place and returns None.
def rename_missings(
    self,
    before: Union[int, float, str] = -1,
    after: Union[int, float, str] = '.',
    inplace: bool = False
) -> Optional['SNPObject']:
    """
    Replace missing values in the `calldata_gt` attribute.

    Every element of `calldata_gt` equal to `before` is replaced by `after`
    using `np.where`. By default, occurrences of `-1` (often used to signify
    missing data) are replaced with `'.'`. A NaN `before` is supported: since
    NaN never compares equal to anything, NaN entries are located by testing
    each element for inequality with itself.

    Args:
        before (int, float, or str, default=-1):
            The current representation of missing values in `calldata_gt`. Common values might be -1, '.', or NaN.
            Default is -1.
        after (int, float, or str, default='.'):
            The value that will replace `before`. Default is '.'.
        inplace (bool, default=False):
            If True, modifies `self` in place. If False, returns a new `SNPObject` with the applied
            replacements. Default is False.

    Returns:
        **Optional[SNPObject]:**
            A new `SNPObject` with the renamed missing values if `inplace=False`.
            If `inplace=True`, modifies `self` in place and returns None.
    """
    target = self if inplace else self.copy()
    genotypes = target['calldata_gt']

    if isinstance(before, (float, np.floating)) and before != before:
        # `before` is NaN: `== NaN` is always False, so detect NaN entries
        # as the elements that differ from themselves.
        as_array = np.asarray(genotypes)
        is_missing = as_array != as_array
    else:
        is_missing = genotypes == before

    target['calldata_gt'] = np.where(is_missing, after, genotypes)
    return None if inplace else target
Replace missing values in the calldata_gt attribute.
This method identifies missing values in 'calldata_gt' and replaces them with a specified
value. By default, it replaces occurrences of -1 (often used to signify missing data) with '.'.
Arguments:
- before (int, float, or str, default=-1): The current representation of missing values in `calldata_gt`. Common values might be -1, '.', or NaN. Default is -1.
- after (int, float, or str, default='.'): The value that will replace `before`. Default is '.'.
- inplace (bool, default=False): If True, modifies `self` in place. If False, returns a new `SNPObject` with the applied replacements. Default is False.
Returns:
Optional[SNPObject]: A new `SNPObject` with the renamed missing values if `inplace=False`. If `inplace=True`, modifies `self` in place and returns None.
def get_common_variants_intersection(
    self,
    snpobj: 'SNPObject',
    index_by: str = 'pos'
) -> Tuple[List[str], np.ndarray, np.ndarray]:
    """
    Identify common variants between `self` and the `snpobj` instance based on the specified `index_by` criterion,
    which may match based on chromosome and position (`variants_chrom`, `variants_pos`), ID (`variants_id`), or both.

    This method returns the identifiers of common variants and their corresponding indices in both objects.

    Args:
        snpobj (SNPObject):
            The reference SNPObject to compare against.
        index_by (str, default='pos'):
            Criteria for matching variants. Options:
            - `'pos'`: Matches by chromosome and position (`variants_chrom`, `variants_pos`), e.g., 'chr1-12345'.
            - `'id'`: Matches by variant ID alone (`variants_id`), e.g., 'rs123'.
            - `'pos+id'`: Matches by chromosome, position, and ID (`variants_chrom`, `variants_pos`, `variants_id`), e.g., 'chr1-12345-rs123'.
            Default is 'pos'.

    Returns:
        Tuple containing:
        - **list of str:** The common variant identifiers, in order of first appearance in `self`.
        - **array:** An integer array of indices in `self` where common variants are located.
        - **array:** An integer array of indices in `snpobj` where common variants are located.

    Raises:
        ValueError: If `index_by` is not one of 'pos', 'id', or 'pos+id'.
    """
    if index_by not in ('pos', 'id', 'pos+id'):
        raise ValueError("`index_by` must be one of 'pos', 'id', or 'pos+id'.")

    def identifiers_of(obj) -> list:
        # Build one identifier per variant of `obj` according to `index_by`
        if index_by == 'id':
            return obj['variants_id'].tolist()
        if index_by == 'pos':
            return [f"{chrom}-{pos}" for chrom, pos in zip(obj['variants_chrom'], obj['variants_pos'])]
        return [
            f"{chrom}-{pos}-{ids}"
            for chrom, pos, ids in zip(obj['variants_chrom'], obj['variants_pos'], obj['variants_id'])
        ]

    query_identifiers = identifiers_of(self)
    reference_identifiers = identifiers_of(snpobj)

    # Set intersection gives O(1) membership tests below
    common_ids = set(query_identifiers).intersection(reference_identifiers)

    query_idx = [i for i, ident in enumerate(query_identifiers) if ident in common_ids]
    reference_idx = [i for i, ident in enumerate(reference_identifiers) if ident in common_ids]

    # Report identifiers in a reproducible order (first appearance in `self`) rather
    # than in arbitrary set-iteration order.
    ordered_common = [ident for ident in dict.fromkeys(query_identifiers) if ident in common_ids]

    # An explicit integer dtype keeps the result usable as an index even when it is
    # empty; `np.array([])` would otherwise default to float64.
    return ordered_common, np.array(query_idx, dtype=int), np.array(reference_idx, dtype=int)
Identify common variants between self and the snpobj instance based on the specified index_by criterion,
which may match based on chromosome and position (variants_chrom, variants_pos), ID (variants_id), or both.
This method returns the identifiers of common variants and their corresponding indices in both objects.
Arguments:
- snpobj (SNPObject): The reference SNPObject to compare against.
- index_by (str, default='pos'): Criteria for matching variants. Options:
  - 'pos': Matches by chromosome and position (`variants_chrom`, `variants_pos`), e.g., 'chr1-12345'.
  - 'id': Matches by variant ID alone (`variants_id`), e.g., 'rs123'.
  - 'pos+id': Matches by chromosome, position, and ID (`variants_chrom`, `variants_pos`, `variants_id`), e.g., 'chr1-12345-rs123'.
  Default is 'pos'.
Returns:
Tuple containing:
- list of str: A list of common variant identifiers (as strings).
- array: An array of indices in `self` where common variants are located.
- array: An array of indices in `snpobj` where common variants are located.
def get_common_markers_intersection(
    self,
    snpobj: 'SNPObject'
) -> Tuple[List[str], np.ndarray, np.ndarray]:
    """
    Identify common markers between `self` and the `snpobj` instance. Common markers are identified
    based on matching chromosome (`variants_chrom`), position (`variants_pos`), reference (`variants_ref`),
    and alternate (`variants_alt`) alleles.

    This method returns the identifiers of common markers and their corresponding indices in both objects.

    Args:
        snpobj (SNPObject):
            The reference SNPObject to compare against.

    Returns:
        Tuple containing:
        - **list of str:** The common marker identifiers (`'chrom-pos-ref-alt'`), in order of first appearance in `self`.
        - **array:** An integer array of indices in `self` where common markers are located.
        - **array:** An integer array of indices in `snpobj` where common markers are located.
    """
    def identifiers_of(obj) -> list:
        # One 'chrom-pos-ref-alt' identifier per variant of `obj`
        return [
            f"{chrom}-{pos}-{ref}-{alt}" for chrom, pos, ref, alt in
            zip(obj['variants_chrom'], obj['variants_pos'], obj['variants_ref'], obj['variants_alt'])
        ]

    query_identifiers = identifiers_of(self)
    reference_identifiers = identifiers_of(snpobj)

    # Set intersection gives O(1) membership tests below
    common_ids = set(query_identifiers).intersection(reference_identifiers)

    query_idx = [i for i, ident in enumerate(query_identifiers) if ident in common_ids]
    reference_idx = [i for i, ident in enumerate(reference_identifiers) if ident in common_ids]

    # Report identifiers in a reproducible order (first appearance in `self`) rather
    # than in arbitrary set-iteration order.
    ordered_common = [ident for ident in dict.fromkeys(query_identifiers) if ident in common_ids]

    # An explicit integer dtype keeps the result usable as an index even when it is
    # empty; `np.array([])` would otherwise default to float64.
    return ordered_common, np.array(query_idx, dtype=int), np.array(reference_idx, dtype=int)
Identify common markers between self and the snpobj instance. Common markers are identified
based on matching chromosome (variants_chrom), position (variants_pos), reference (variants_ref),
and alternate (variants_alt) alleles.
This method returns the identifiers of common markers and their corresponding indices in both objects.
Arguments:
- snpobj (SNPObject): The reference SNPObject to compare against.
Returns:
Tuple containing:
- list of str: A list of common variant identifiers (as strings).
- array: An array of indices in `self` where common variants are located.
- array: An array of indices in `snpobj` where common variants are located.
def subset_to_common_variants(
    self,
    snpobj: 'SNPObject',
    index_by: str = 'pos',
    common_variants_intersection: Optional[Tuple[np.ndarray, np.ndarray]] = None,
    inplace: bool = False
) -> Optional['SNPObject']:
    """
    Keep only the variants of `self` that are also present in a reference `snpobj`.

    Variants are matched according to `index_by`: chromosome and position
    (`variants_chrom`, `variants_pos`), ID (`variants_id`), or both.

    Args:
        snpobj (SNPObject):
            The reference SNPObject to compare against.
        index_by (str, default='pos'):
            Criteria for matching variants. Options:
            - `'pos'`: Matches by chromosome and position (`variants_chrom`, `variants_pos`), e.g., 'chr1-12345'.
            - `'id'`: Matches by variant ID alone (`variants_id`), e.g., 'rs123'.
            - `'pos+id'`: Matches by chromosome, position, and ID (`variants_chrom`, `variants_pos`, `variants_id`), e.g., 'chr1-12345-rs123'.
            Default is 'pos'.
        common_variants_intersection (Tuple[np.ndarray, np.ndarray], optional):
            Precomputed `(indices in self, indices in snpobj)` of the common variants. If None, the
            intersection is computed here via `get_common_variants_intersection`.
        inplace (bool, default=False):
            If True, modifies `self` in place. If False, returns a new `SNPObject` with the common variants
            subsetted. Default is False.

    Returns:
        **Optional[SNPObject]:**
            Whatever `filter_variants` returns for the selected indices: a new `SNPObject` with the common
            variants subsetted if `inplace=False`; None if `inplace=True`.
    """
    if common_variants_intersection is not None:
        # Caller supplied the index pair; only the `self` side is needed here
        keep_idx, _reference_idx = common_variants_intersection
    else:
        _common_ids, keep_idx, _reference_idx = self.get_common_variants_intersection(
            snpobj, index_by=index_by
        )

    # Delegate the actual subsetting (and the inplace handling) to filter_variants
    return self.filter_variants(indexes=keep_idx, include=True, inplace=inplace)
Subset self to include only the common variants with a reference snpobj based on
the specified index_by criterion, which may match based on chromosome and position
(variants_chrom, variants_pos), ID (variants_id), or both.
Arguments:
- snpobj (SNPObject): The reference SNPObject to compare against.
- index_by (str, default='pos'): Criteria for matching variants. Options:
  - 'pos': Matches by chromosome and position (`variants_chrom`, `variants_pos`), e.g., 'chr1-12345'.
  - 'id': Matches by variant ID alone (`variants_id`), e.g., 'rs123'.
  - 'pos+id': Matches by chromosome, position, and ID (`variants_chrom`, `variants_pos`, `variants_id`), e.g., 'chr1-12345-rs123'.
  Default is 'pos'.
- common_variants_intersection (Tuple[np.ndarray, np.ndarray], optional): Precomputed indices of common variants between `self` and `snpobj`. If None, intersection is computed within the function.
- inplace (bool, default=False): If True, modifies `self` in place. If False, returns a new `SNPObject` with the common variants subsetted. Default is False.
Returns:
Optional[SNPObject]: A new `SNPObject` with the common variants subsetted if `inplace=False`. If `inplace=True`, modifies `self` in place and returns None.
def subset_to_common_markers(
    self,
    snpobj: 'SNPObject',
    common_markers_intersection: Optional[Tuple[np.ndarray, np.ndarray]] = None,
    inplace: bool = False
) -> Optional['SNPObject']:
    """
    Keep only the markers of `self` that are also present in a reference `snpobj`.

    Markers are matched on chromosome (`variants_chrom`), position (`variants_pos`),
    reference (`variants_ref`), and alternate (`variants_alt`) alleles.

    Args:
        snpobj (SNPObject):
            The reference SNPObject to compare against.
        common_markers_intersection (tuple of arrays, optional):
            Precomputed `(indices in self, indices in snpobj)` of the common markers. If None, the
            intersection is computed here via `get_common_markers_intersection`.
        inplace (bool, default=False):
            If True, modifies `self` in place. If False, returns a new `SNPObject` with the common markers
            subsetted. Default is False.

    Returns:
        **Optional[SNPObject]:**
            Whatever `filter_variants` returns for the selected indices: a new `SNPObject` with the common
            markers subsetted if `inplace=False`; None if `inplace=True`.
    """
    if common_markers_intersection is not None:
        # Caller supplied the index pair; only the `self` side is needed here
        keep_idx, _reference_idx = common_markers_intersection
    else:
        _common_ids, keep_idx, _reference_idx = self.get_common_markers_intersection(snpobj)

    # Delegate the actual subsetting (and the inplace handling) to filter_variants
    return self.filter_variants(indexes=keep_idx, include=True, inplace=inplace)
Subset self to include only the common markers with a reference snpobj. Common markers are identified
based on matching chromosome (variants_chrom), position (variants_pos), reference (variants_ref),
and alternate (variants_alt) alleles.
Arguments:
- snpobj (SNPObject): The reference SNPObject to compare against.
- common_markers_intersection (tuple of arrays, optional): Precomputed indices of common markers between `self` and `snpobj`. If None, intersection is computed within the function.
- inplace (bool, default=False): If True, modifies `self` in place. If False, returns a new `SNPObject` with the common markers subsetted. Default is False.
Returns:
Optional[SNPObject]: A new `SNPObject` with the common markers subsetted if `inplace=False`. If `inplace=True`, modifies `self` in place and returns None.
def merge(
    self,
    snpobj: 'SNPObject',
    force_samples: bool = False,
    prefix: str = '2',
    inplace: bool = False
) -> Optional['SNPObject']:
    """
    Merge `self` with `snpobj` along the sample axis.

    Both SNPObjects are expected to hold the same SNPs in the same order; only the SNP
    counts are actually checked. Genotypes (`calldata_gt`) and LAI (`calldata_lai`) are
    concatenated along axis 1, with the samples of `snpobj` appended after those of `self`.
    A field that is None in either object is None in the result.

    Args:
        snpobj (SNPObject):
            The SNPObject to merge samples with.
        force_samples (bool, default=False):
            If True, duplicate sample names are resolved by prepending the `prefix` to duplicate sample names in
            `snpobj`. Otherwise, merging fails when duplicate sample names are found. Default is False.
        prefix (str, default='2'):
            A string prepended to duplicate sample names in `snpobj` when `force_samples=True`.
            Duplicates are renamed from `<sample_name>` to `<prefix>:<sample_name>`. For instance,
            if `prefix='2'` and there is a conflict with a sample called "sample_1", it becomes "2:sample_1".
        inplace (bool, default=False):
            If True, modifies `self` in place. If False, returns a new `SNPObject` with the merged samples.
            Default is False.

    Returns:
        **Optional[SNPObject]**: A new SNPObject containing the merged sample data if `inplace=False`.
        If `inplace=True`, `self` is updated and returned.

    Raises:
        ValueError: On mismatching SNP counts, mismatching strand-summation state, mismatching
            `calldata_lai` dimensionality, or overlapping sample names when `force_samples=False`.
    """
    # --- Genotypes: validate, then stack along the sample axis ---
    merged_gt = None
    if self.calldata_gt is not None and snpobj.calldata_gt is not None:
        if self.calldata_gt.shape[0] != snpobj.calldata_gt.shape[0]:
            raise ValueError(
                f"Cannot merge SNPObjects: Mismatch in the number of SNPs in `calldata_gt`.\n"
                f"`self.calldata_gt` has {self.calldata_gt.shape[0]} SNPs, "
                f"while `snpobj.calldata_gt` has {snpobj.calldata_gt.shape[0]} SNPs."
            )
        self_summed = self.are_strands_summed
        other_summed = snpobj.are_strands_summed
        if self_summed and not other_summed:
            raise ValueError(
                "Cannot merge SNPObjects: `self` has summed strands, but `snpobj` does not.\n"
                "Ensure both objects have the same genotype summation state before merging."
            )
        if other_summed and not self_summed:
            raise ValueError(
                "Cannot merge SNPObjects: `snpobj` has summed strands, but `self` does not.\n"
                "Ensure both objects have the same genotype summation state before merging."
            )
        merged_gt = np.concatenate([self.calldata_gt, snpobj.calldata_gt], axis=1)

    # --- Sample names: reject or rename clashes, then append ---
    merged_samples = None
    if self.samples is not None and snpobj.samples is not None:
        overlapping_samples = set(self.samples).intersection(set(snpobj.samples))
        if overlapping_samples and not force_samples:
            raise ValueError(
                f"Cannot merge SNPObjects: Found overlapping sample names {overlapping_samples}.\n"
                "Samples must be strictly non-overlapping. To allow merging with renaming, set `force_samples=True`."
            )
        incoming = snpobj.samples
        if overlapping_samples:
            # Only clashing names from `snpobj` receive the prefix
            incoming = [
                f"{prefix}:{name}" if name in overlapping_samples else name
                for name in snpobj.samples
            ]
        merged_samples = np.concatenate([self.samples, incoming], axis=0)

    # --- Local ancestry: validate, then stack along the sample axis ---
    merged_lai = None
    if self.calldata_lai is not None and snpobj.calldata_lai is not None:
        if self.calldata_lai.ndim != snpobj.calldata_lai.ndim:
            raise ValueError(
                f"Cannot merge SNPObjects: Mismatch in `calldata_lai` dimensions.\n"
                f"`self.calldata_lai` has {self.calldata_lai.ndim} dimensions, "
                f"while `snpobj.calldata_lai` has {snpobj.calldata_lai.ndim} dimensions."
            )
        if self.calldata_lai.shape[0] != snpobj.calldata_lai.shape[0]:
            raise ValueError(
                f"Cannot merge SNPObjects: Mismatch in the number of SNPs in `calldata_lai`.\n"
                f"`self.calldata_lai` has {self.calldata_lai.shape[0]} SNPs, "
                f"while `snpobj.calldata_lai` has {snpobj.calldata_lai.shape[0]} SNPs."
            )
        merged_lai = np.concatenate([self.calldata_lai, snpobj.calldata_lai], axis=1)

    if not inplace:
        # Variant-level metadata is taken from `self` unchanged
        return SNPObject(
            calldata_gt=merged_gt,
            samples=merged_samples,
            variants_ref=self.variants_ref,
            variants_alt=self.variants_alt,
            variants_chrom=self.variants_chrom,
            variants_filter_pass=self.variants_filter_pass,
            variants_id=self.variants_id,
            variants_pos=self.variants_pos,
            variants_qual=self.variants_qual,
            calldata_lai=merged_lai,
            ancestry_map=self.ancestry_map
        )

    self.calldata_gt = merged_gt
    self.calldata_lai = merged_lai
    self.samples = merged_samples
    return self
Merge self with snpobj along the sample axis.
This method expects both SNPObjects to contain the same set of SNPs in the same order,
then combines their genotype (calldata_gt) and LAI (calldata_lai) arrays by
concatenating the sample dimension. Samples from snpobj are appended to those in self.
Arguments:
- snpobj (SNPObject): The SNPObject to merge samples with.
- force_samples (bool, default=False): If True, duplicate sample names are resolved by prepending the `prefix` to duplicate sample names in `snpobj`. Otherwise, merging fails when duplicate sample names are found. Default is False.
- prefix (str, default='2'): A string prepended to duplicate sample names in `snpobj` when `force_samples=True`. Duplicates are renamed from `<sample_name>` to `<prefix>:<sample_name>`. For instance, if `prefix='2'` and there is a conflict with a sample called "sample_1", it becomes "2:sample_1".
- inplace (bool, default=False): If True, modifies `self` in place. If False, returns a new `SNPObject` with the merged samples. Default is False.
Returns:
Optional[SNPObject]: A new SNPObject containing the merged sample data.
def concat(
    self,
    snpobj: 'SNPObject',
    inplace: bool = False
) -> Optional['SNPObject']:
    """
    Concatenate self with snpobj along the SNP axis.

    Both SNPObjects are expected to hold the same samples in the same order (only the
    sample counts are actually checked), and the chromosome(s) in snpobj are expected to
    follow (i.e. have higher numeric identifiers than) those in self. Genotypes, LAI and
    all per-variant attributes are concatenated along axis 0; a field that is None in
    either object is None in the result.

    Args:
        snpobj (SNPObject):
            The SNPObject to concatenate SNPs with.
        inplace (bool, default=False):
            If True, modifies `self` in place. If False, returns a new `SNPObject` with the concatenated SNPs.
            Default is False.

    Returns:
        **Optional[SNPObject]**: A new SNPObject containing the concatenated SNP data if `inplace=False`.
        If `inplace=True`, `self` is updated and returned.

    Raises:
        ValueError: On mismatching sample counts, mismatching strand-summation state, or
            mismatching `calldata_lai` dimensionality.
    """
    variant_fields = [
        'variants_ref', 'variants_alt', 'variants_chrom', 'variants_filter_pass', 'variants_id', 'variants_pos', 'variants_qual'
    ]

    def stack_field(name):
        # Concatenate one per-variant attribute, or yield None when either side lacks it
        left = getattr(self, name, None)
        right = getattr(snpobj, name, None)
        if left is None or right is None:
            return None
        return np.concatenate([left, right], axis=0)

    # --- Genotypes: validate, then stack along the SNP axis ---
    stacked_gt = None
    if self.calldata_gt is not None and snpobj.calldata_gt is not None:
        if self.calldata_gt.shape[1] != snpobj.calldata_gt.shape[1]:
            raise ValueError(
                f"Cannot merge SNPObjects: Mismatch in the number of samples in `calldata_gt`.\n"
                f"`self.calldata_gt` has {self.calldata_gt.shape[1]} samples, "
                f"while `snpobj.calldata_gt` has {snpobj.calldata_gt.shape[1]} samples."
            )
        self_summed = self.are_strands_summed
        other_summed = snpobj.are_strands_summed
        if self_summed and not other_summed:
            raise ValueError(
                "Cannot merge SNPObjects: `self` has summed strands, but `snpobj` does not.\n"
                "Ensure both objects have the same genotype summation state before merging."
            )
        if other_summed and not self_summed:
            raise ValueError(
                "Cannot merge SNPObjects: `snpobj` has summed strands, but `self` does not.\n"
                "Ensure both objects have the same genotype summation state before merging."
            )
        stacked_gt = np.concatenate([self.calldata_gt, snpobj.calldata_gt], axis=0)

    # --- Per-variant attributes ---
    stacked_fields = {name: stack_field(name) for name in variant_fields}

    # --- Local ancestry: validate, then stack along the SNP axis ---
    stacked_lai = None
    if self.calldata_lai is not None and snpobj.calldata_lai is not None:
        if self.calldata_lai.ndim != snpobj.calldata_lai.ndim:
            raise ValueError(
                f"Cannot merge SNPObjects: Mismatch in `calldata_lai` dimensions.\n"
                f"`self.calldata_lai` has {self.calldata_lai.ndim} dimensions, "
                f"while `snpobj.calldata_lai` has {snpobj.calldata_lai.ndim} dimensions."
            )
        if self.calldata_lai.shape[1] != snpobj.calldata_lai.shape[1]:
            raise ValueError(
                f"Cannot merge SNPObjects: Mismatch in the number of samples in `calldata_lai`.\n"
                f"`self.calldata_lai` has {self.calldata_lai.shape[1]} samples, "
                f"while `snpobj.calldata_lai` has {snpobj.calldata_lai.shape[1]} samples."
            )
        stacked_lai = np.concatenate([self.calldata_lai, snpobj.calldata_lai], axis=0)

    if not inplace:
        # Samples and ancestry map are taken from `self` unchanged
        return SNPObject(
            calldata_gt=stacked_gt,
            calldata_lai=stacked_lai,
            samples=self.samples,
            variants_ref=stacked_fields['variants_ref'],
            variants_alt=stacked_fields['variants_alt'],
            variants_chrom=stacked_fields['variants_chrom'],
            variants_id=stacked_fields['variants_id'],
            variants_pos=stacked_fields['variants_pos'],
            variants_qual=stacked_fields['variants_qual'],
            variants_filter_pass=stacked_fields['variants_filter_pass'],
            ancestry_map=self.ancestry_map
        )

    self.calldata_gt = stacked_gt
    self.calldata_lai = stacked_lai
    for name in variant_fields:
        self[name] = stacked_fields[name]
    return self
Concatenate self with snpobj along the SNP axis.
This method expects both SNPObjects to contain the same set of samples in the same order, and that the chromosome(s) in snpobj follow (i.e. have higher numeric identifiers than) those in self.
Arguments:
- snpobj (SNPObject): The SNPObject to concatenate SNPs with.
- inplace (bool, default=False): If True, modifies `self` in place. If False, returns a new `SNPObject` with the concatenated SNPs. Default is False.
Returns:
Optional[SNPObject]: A new SNPObject containing the concatenated SNP data.
def remove_strand_ambiguous_variants(self, inplace: bool = False) -> Optional['SNPObject']:
    """
    Drop strand-ambiguous variants.

    A strand-ambiguous variant has reference (`variants_ref`) and alternate (`variants_alt`) alleles
    in the pairs A/T, T/A, C/G, or G/C, where both alleles are complementary and thus indistinguishable
    in terms of strand orientation. The number of variants of each type is logged before filtering.

    Args:
        inplace (bool, default=False):
            If True, modifies `self` in place. If False, returns a new `SNPObject` with the
            strand-ambiguous variants removed. Default is False.

    Returns:
        **Optional[SNPObject]:** Whatever `filter_variants` returns for the retained indices: a new
        `SNPObject` with non-ambiguous variants only if `inplace=False`; None if `inplace=True`.
    """
    ref_alleles = self['variants_ref']
    alt_alleles = self['variants_alt']

    # Complementary (ref, alt) combinations, in the order they are reported
    complementary_pairs = (('A', 'T'), ('T', 'A'), ('C', 'G'), ('G', 'C'))
    at_mask, ta_mask, cg_mask, gc_mask = (
        (ref_alleles == ref) & (alt_alleles == alt) for ref, alt in complementary_pairs
    )

    # Anything matching none of the four patterns is kept
    keep_idx = np.where(~(at_mask | ta_mask | cg_mask | gc_mask))[0]

    pair_counts = [np.sum(mask) for mask in (at_mask, ta_mask, cg_mask, gc_mask)]
    for (ref, alt), count in zip(complementary_pairs, pair_counts):
        log.info(f'{count} ambiguities of {ref}-{alt} type.')

    log.debug(f'Removing {sum(pair_counts)} strand-ambiguous variants...')
    return self.filter_variants(indexes=keep_idx, include=True, inplace=inplace)
A strand-ambiguous variant has reference (variants_ref) and alternate (variants_alt) alleles
in the pairs A/T, T/A, C/G, or G/C, where both alleles are complementary and thus indistinguishable
in terms of strand orientation.
Arguments:
- inplace (bool, default=False): If True, modifies `self` in place. If False, returns a new `SNPObject` with the strand-ambiguous variants removed. Default is False.
Returns:
Optional[SNPObject]: A new `SNPObject` with non-ambiguous variants only if `inplace=False`. If `inplace=True`, modifies `self` in place and returns None.
def correct_flipped_variants(
    self,
    snpobj: 'SNPObject',
    check_complement: bool = True,
    index_by: str = 'pos',
    common_variants_intersection: Optional[Tuple[np.ndarray, np.ndarray]] = None,
    log_stats: bool = True,
    inplace: bool = False
) -> Optional['SNPObject']:
    """
    Correct flipped variants between `self` and a reference `snpobj`, where reference (`variants_ref`)
    and alternate (`variants_alt`) alleles are swapped.

    **Flip Detection Based on `check_complement`:**

    - If `check_complement=False`, only direct allele swaps are considered:
        1. **Direct Swap:** `self.variants_ref == snpobj.variants_alt` and `self.variants_alt == snpobj.variants_ref`.

    - If `check_complement=True`, both direct and complementary swaps are considered, with four possible cases:
        1. **Direct Swap:** `self.variants_ref == snpobj.variants_alt` and `self.variants_alt == snpobj.variants_ref`.
        2. **Complement Swap of Ref:** `complement(self.variants_ref) == snpobj.variants_alt` and `self.variants_alt == snpobj.variants_ref`.
        3. **Complement Swap of Alt:** `self.variants_ref == snpobj.variants_alt` and `complement(self.variants_alt) == snpobj.variants_ref`.
        4. **Complement Swap of both Ref and Alt:** `complement(self.variants_ref) == snpobj.variants_alt` and `complement(self.variants_alt) == snpobj.variants_ref`.

    **Note:** Variants where `self.variants_ref == self.variants_alt` are ignored as they are ambiguous.

    **Correction Process:**
    - Swaps `variants_ref` and `variants_alt` alleles in `self` to align with `snpobj`.
    - Flips `calldata_gt` allele counts to match the updated allele configuration: 0 <-> 1 for
      per-strand genotypes, and `g -> 2 - g` when strands are summed (`are_strands_summed`).
      Values outside the valid range (e.g. `-1` used for missing data) are left untouched.

    Args:
        snpobj (SNPObject):
            The reference SNPObject to compare against.
        check_complement (bool, default=True):
            If True, also checks for complementary base pairs (A/T, T/A, C/G, and G/C) when identifying swapped variants.
            Default is True.
        index_by (str, default='pos'):
            Criteria for matching variants. Options:
            - `'pos'`: Matches by chromosome and position (`variants_chrom`, `variants_pos`), e.g., 'chr1-12345'.
            - `'id'`: Matches by variant ID alone (`variants_id`), e.g., 'rs123'.
            - `'pos+id'`: Matches by chromosome, position, and ID (`variants_chrom`, `variants_pos`, `variants_id`), e.g., 'chr1-12345-rs123'.
            Default is 'pos'.
        common_variants_intersection (tuple of arrays, optional):
            Precomputed `(indices in self, indices in snpobj)` of common variants. If None, intersection is
            computed within the function.
        log_stats (bool, default=True):
            If True, logs statistical information about matching and ambiguous alleles. Default is True.
        inplace (bool, default=False):
            If True, modifies `self` in place. If False, returns a new `SNPObject` with corrected
            flips. Default is False.

    Returns:
        **Optional[SNPObject]**:
            A new `SNPObject` with corrected flips if `inplace=False` (a copy is returned even when
            no flip was found). If `inplace=True`, modifies `self` in place and returns None.
    """
    # Define complement mappings for nucleotides
    complement_map = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}

    def complement(alleles: np.ndarray) -> np.ndarray:
        # Element-wise complement; anything that is not a single A/C/G/T base is kept as is.
        # A comprehension is used instead of np.vectorize, which raises on size-0 input.
        return np.array([complement_map.get(allele, allele) for allele in alleles], dtype=alleles.dtype)

    # Get common variant indices if not provided
    if common_variants_intersection is not None:
        query_idx, reference_idx = common_variants_intersection
    else:
        _, query_idx, reference_idx = self.get_common_variants_intersection(snpobj, index_by=index_by)

    # Empty index arrays may arrive as float64 (NumPy's default for `np.array([])`),
    # which cannot be used for indexing; give them an integer dtype.
    query_idx = np.asarray(query_idx)
    reference_idx = np.asarray(reference_idx)
    if query_idx.size == 0:
        query_idx = query_idx.astype(int)
    if reference_idx.size == 0:
        reference_idx = reference_idx.astype(int)

    # Alleles of the common variants on both sides
    query_ref = self['variants_ref'][query_idx]
    query_alt = self['variants_alt'][query_idx]
    target_ref = snpobj['variants_ref'][reference_idx]
    target_alt = snpobj['variants_alt'][reference_idx]

    # Log statistics on matching alleles if enabled
    if log_stats:
        matching_ref = np.sum(query_ref == target_ref)
        matching_alt = np.sum(query_alt == target_alt)
        ambiguous = np.sum(query_ref == query_alt)
        log.info(f"Matching reference alleles (ref=ref'): {matching_ref}, Matching alternate alleles (alt=alt'): {matching_alt}.")
        log.info(f"Number of ambiguous alleles (ref=alt): {ambiguous}.")

    # Identify variants whose `ref` and `alt` alleles are swapped relative to the reference
    swapped_ref = query_ref == target_alt
    swapped_alt = query_alt == target_ref
    if check_complement:
        # NOTE(review): a strand-ambiguous variant (A/T, C/G) that already matches the
        # reference also satisfies the complement test and is therefore flipped; consider
        # calling `remove_strand_ambiguous_variants` beforehand.
        swapped_ref = swapped_ref | (complement(query_ref) == target_alt)
        swapped_alt = swapped_alt | (complement(query_alt) == target_ref)

    # Filter out ambiguous variants where `ref` and `alt` alleles match (ref=alt)
    not_ambiguous = query_ref != query_alt

    # Indices in `self` of flipped variants
    flip_idx_query = query_idx[swapped_ref & swapped_alt & not_ambiguous]

    # Object receiving the corrections (never the `snpobj` reference argument)
    target = self if inplace else self.copy()

    if len(flip_idx_query) > 0:
        log.info(f'Correcting {len(flip_idx_query)} variant flips...')

        # Fancy indexing returns copies, so the two assignments below form a safe swap
        old_refs = target['variants_ref'][flip_idx_query]
        old_alts = target['variants_alt'][flip_idx_query]
        target['variants_ref'][flip_idx_query] = old_alts
        target['variants_alt'][flip_idx_query] = old_refs

        genotypes = target['calldata_gt']
        if genotypes is not None:
            # Highest allele count per entry: 2 when both strands are summed, else 1
            max_count = 2 if target.are_strands_summed else 1
            flipped = genotypes[flip_idx_query]
            valid = (flipped >= 0) & (flipped <= max_count)
            # Only valid counts are mirrored; missing-data sentinels such as -1 stay untouched
            genotypes[flip_idx_query] = np.where(valid, max_count - flipped, flipped)
    else:
        log.info('No variant flips found to correct.')

    return None if inplace else target
Correct flipped variants between self and a reference snpobj, where reference (variants_ref)
and alternate (variants_alt) alleles are swapped.
Flip Detection Based on check_complement:
- If check_complement=False, only direct allele swaps are considered:
  1. Direct Swap: self.variants_ref == snpobj.variants_alt and self.variants_alt == snpobj.variants_ref.
- If check_complement=True, both direct and complementary swaps are considered, with four possible cases:
  1. Direct Swap: self.variants_ref == snpobj.variants_alt and self.variants_alt == snpobj.variants_ref.
  2. Complement Swap of Ref: complement(self.variants_ref) == snpobj.variants_alt and self.variants_alt == snpobj.variants_ref.
  3. Complement Swap of Alt: self.variants_ref == snpobj.variants_alt and complement(self.variants_alt) == snpobj.variants_ref.
  4. Complement Swap of both Ref and Alt: complement(self.variants_ref) == snpobj.variants_alt and complement(self.variants_alt) == snpobj.variants_ref.
Note: Variants where self.variants_ref == self.variants_alt are ignored as they are ambiguous.
Correction Process:
- Swaps variants_ref and variants_alt alleles in self to align with snpobj.
- Flips calldata_gt values (0 becomes 1, and 1 becomes 0) to match the updated allele configuration.
Arguments:
- snpobj (SNPObject): The reference SNPObject to compare against.
- check_complement (bool, default=True): If True, also checks for complementary base pairs (A/T, T/A, C/G, and G/C) when identifying swapped variants. Default is True.
- index_by (str, default='pos'): Criteria for matching variants. Options:
  - 'pos': Matches by chromosome and position (variants_chrom, variants_pos), e.g., 'chr1-12345'.
  - 'id': Matches by variant ID alone (variants_id), e.g., 'rs123'.
  - 'pos+id': Matches by chromosome, position, and ID (variants_chrom, variants_pos, variants_id), e.g., 'chr1-12345-rs123'.
  Default is 'pos'.
- common_variants_intersection (tuple of arrays, optional): Precomputed indices of common variants between self and snpobj. If None, intersection is computed within the function.
- log_stats (bool, default=True): If True, logs statistical information about matching and ambiguous alleles. Default is True.
- inplace (bool, default=False): If True, modifies self in place. If False, returns a new SNPObject with corrected flips. Default is False.
Returns:
Optional[SNPObject]: A new SNPObject with corrected flips if inplace=False. If inplace=True, modifies self in place and returns None.
def remove_mismatching_variants(
    self,
    snpobj: 'SNPObject',
    index_by: str = 'pos',
    common_variants_intersection: Optional[Tuple[np.ndarray, np.ndarray]] = None,
    inplace: bool = False
) -> Optional['SNPObject']:
    """
    Remove variants from `self`, where reference (`variants_ref`) and/or alternate (`variants_alt`) alleles
    do not match with a reference `snpobj`.

    Only variants shared by both objects are compared; a shared variant is treated as mismatching
    when its `ref` allele, its `alt` allele, or both differ from the reference object.

    Args:
        snpobj (SNPObject):
            The reference SNPObject to compare against.
        index_by (str, default='pos'):
            Criteria for matching variants. Options:
            - `'pos'`: Matches by chromosome and position (`variants_chrom`, `variants_pos`), e.g., 'chr1-12345'.
            - `'id'`: Matches by variant ID alone (`variants_id`), e.g., 'rs123'.
            - `'pos+id'`: Matches by chromosome, position, and ID (`variants_chrom`, `variants_pos`, `variants_id`), e.g., 'chr1-12345-rs123'.
            Default is 'pos'. Only used when `common_variants_intersection` is None.
        common_variants_intersection (tuple of arrays, optional):
            Precomputed `(query_idx, reference_idx)` indices of common variants between `self` and the
            reference `snpobj`. If None, the intersection is computed within the function.
        inplace (bool, default=False):
            If True, modifies `self` in place. If False, returns a new `SNPObject` without
            mismatching variants. Default is False.

    Returns:
        **Optional[SNPObject]:**
            Whatever `filter_variants` returns for the given `inplace` flag: a new `SNPObject` without
            mismatching variants if `inplace=False`, otherwise `self` is modified in place.
    """
    # Get common variant indices if not provided
    if common_variants_intersection is not None:
        query_idx, reference_idx = common_variants_intersection
    else:
        _, query_idx, reference_idx = self.get_common_variants_intersection(snpobj, index_by=index_by)

    # Vectorized comparison of `ref` and `alt` alleles over the shared variants
    ref_mismatch = self['variants_ref'][query_idx] != snpobj['variants_ref'][reference_idx]
    alt_mismatch = self['variants_alt'][query_idx] != snpobj['variants_alt'][reference_idx]
    mismatch_mask = ref_mismatch | alt_mismatch

    # Identify indices in `self` of mismatching variants
    mismatch_idx = query_idx[mismatch_mask]

    # Compute total number of variant mismatches
    total_mismatches = np.sum(mismatch_mask)

    # Filter out mismatching variants.
    # NOTE(review): `include=False` asks `filter_variants` to *exclude* the listed indexes;
    # the previous `include=True` selected them, i.e. kept only the mismatches.
    # `filter_variants` is defined outside this block -- confirm the flag semantics.
    log.debug(f'Removing {total_mismatches} mismatching variants...')
    return self.filter_variants(indexes=mismatch_idx, include=False, inplace=inplace)
Remove variants from self, where reference (variants_ref) and/or alternate (variants_alt) alleles
do not match with a reference snpobj.
Arguments:
- snpobj (SNPObject): The reference SNPObject to compare against.
- index_by (str, default='pos'): Criteria for matching variants. Options:
  - 'pos': Matches by chromosome and position (variants_chrom, variants_pos), e.g., 'chr1-12345'.
  - 'id': Matches by variant ID alone (variants_id), e.g., 'rs123'.
  - 'pos+id': Matches by chromosome, position, and ID (variants_chrom, variants_pos, variants_id), e.g., 'chr1-12345-rs123'.
  Default is 'pos'.
- common_variants_intersection (tuple of arrays, optional): Precomputed indices of common variants between self and the reference snpobj. If None, the intersection is computed within the function.
- inplace (bool, default=False): If True, modifies self in place. If False, returns a new SNPObject without mismatching variants. Default is False.
Returns:
Optional[SNPObject]: A new SNPObject without mismatching variants if inplace=False. If inplace=True, modifies self in place and returns None.
def shuffle_variants(self, inplace: bool = False) -> Optional['SNPObject']:
    """
    Randomly permute the order of variants, applying the same permutation to every
    per-variant attribute so that `calldata_gt` and the `variants_*` arrays stay aligned.

    Attributes that are None, and attributes that are neither `calldata_gt` nor contain
    `'variant'` in their name, are left untouched.

    Args:
        inplace (bool, default=False):
            If True, modifies `self` in place. If False, returns a new `SNPObject` with
            shuffled variants. Default is False.

    Returns:
        **Optional[SNPObject]:**
            A new `SNPObject` with shuffled variant positions if `inplace=False`.
            If `inplace=True`, modifies `self` in place and returns None.
    """
    # Draw the permutation from the original object before any copy is made
    permutation = np.random.permutation(self.n_snps)

    # Work on `self` directly, or on a copy when the caller wants a new object
    target = self if inplace else self.copy()

    for name in target.keys():
        current = target[name]
        if current is None:
            continue
        if name == 'calldata_gt':
            # Genotypes may have extra trailing axes; only axis 0 (variants) is permuted
            target[name] = current[permutation, ...]
        elif 'variant' in name:
            # Per-variant attributes are indexed directly along their first axis
            target[name] = np.asarray(current)[permutation]

    return None if inplace else target
Randomly shuffle the positions of variants in the SNPObject, ensuring that all associated
data (e.g., calldata_gt and variant-specific attributes) remain aligned.
Arguments:
- inplace (bool, default=False): If True, modifies self in place. If False, returns a new SNPObject with shuffled variants. Default is False.
Returns:
Optional[SNPObject]: A new SNPObject with shuffled variant positions if inplace=False. If inplace=True, modifies self in place and returns None.
def set_empty_to_missing(self, inplace: bool = False) -> Optional['SNPObject']:
    """
    Replace empty strings `''` with missing values `'.'` in attributes of `self`.

    The attributes processed are `variants_alt`, `variants_ref`, `variants_qual`,
    `variants_chrom`, `variants_filter_pass` and `variants_id`; attributes that are None
    are skipped. `variants_qual` and `variants_chrom` are converted to string arrays first,
    and for `variants_qual` the string `'nan'` is also replaced by `'.'`.

    Args:
        inplace (bool, default=False):
            If True, modifies `self` in place. If False, works on a copy and leaves `self`
            untouched. Default is False.

    Returns:
        **Optional[SNPObject]:**
            The object that was modified: a new `SNPObject` if `inplace=False`, or `self`
            if `inplace=True` (the in-place call has always returned `self`, which is kept
            for backward compatibility).
    """
    # Both modes share one code path so they cannot drift apart
    target = self if inplace else self.copy()

    if target.variants_alt is not None:
        target.variants_alt[target.variants_alt == ''] = '.'
    if target.variants_ref is not None:
        target.variants_ref[target.variants_ref == ''] = '.'
    if target.variants_qual is not None:
        # Numeric qualities become strings so that '.' can be stored; NaN turns into 'nan'
        target.variants_qual = target.variants_qual.astype(str)
        target.variants_qual[(target.variants_qual == '') | (target.variants_qual == 'nan')] = '.'
    if target.variants_chrom is not None:
        # Cast first: assigning '.' into a non-string chromosome array would fail.
        # Previously only the in-place branch performed this cast.
        target.variants_chrom = target.variants_chrom.astype(str)
        target.variants_chrom[target.variants_chrom == ''] = '.'
    if target.variants_filter_pass is not None:
        target.variants_filter_pass[target.variants_filter_pass == ''] = '.'
    if target.variants_id is not None:
        target.variants_id[target.variants_id == ''] = '.'

    return target
Replace empty strings '' with missing values '.' in attributes of self.
Arguments:
- inplace (bool, default=False): If True, modifies self in place. If False, returns a new SNPObject with empty strings '' replaced by missing values '.'. Default is False.
Returns:
Optional[SNPObject]: A new SNPObject with empty strings replaced if inplace=False. If inplace=True, modifies self in place and returns None.
def convert_to_window_level(
    self,
    window_size: Optional[int] = None,
    physical_pos: Optional[np.ndarray] = None,
    chromosomes: Optional[np.ndarray] = None,
    window_sizes: Optional[np.ndarray] = None,
    laiobj: Optional['LocalAncestryObject'] = None
) -> 'LocalAncestryObject':
    """
    Aggregate the `calldata_lai` attribute into genomic windows within a
    `snputils.ancestry.genobj.LocalAncestryObject`.

    Each window is assigned, per haplotype, the most frequent (mode) ancestry among the SNPs
    that fall on the window's chromosome and within its inclusive `[start, end]` range.
    Windows that contain no SNPs are filled with NaN.

    **Options for defining windows (in order of precedence):**

    1. **Fixed window size**:
    - Use `window_size` to specify how many SNPs go into each window. The last window on each
    chromosome absorbs the remainder, so it may be larger than `window_size`; a chromosome with
    fewer than `window_size` SNPs yields a single, smaller window.

    2. **Custom start and end positions**:
    - Provide `physical_pos` (2D array of shape (n_windows, 2)) as the [start, end] base-pair
    coordinates for each window.
    - If `chromosomes` is not provided and `self` has exactly one chromosome, all windows are
    assumed to belong to that chromosome.
    - If multiple chromosomes exist but `chromosomes` is missing, an error will be raised.
    - Optionally, provide `window_sizes` to store the SNP count per-window.

    3. **Matching existing windows**:
    - Reuse window definitions (`physical_pos`, `chromosomes`, `window_sizes`) from an existing `laiobj`.

    Args:
        window_size (int, optional):
            Number of SNPs in each window if defining fixed-size windows. Must be positive.
        physical_pos (array of shape (n_windows, 2), optional):
            A 2D array containing the start and end physical positions for each window.
        chromosomes (array of shape (n_windows,), optional):
            An array with chromosome numbers corresponding to each genomic window.
        window_sizes (array of shape (n_windows,), optional):
            An array specifying the number of SNPs in each genomic window.
        laiobj (LocalAncestryObject, optional):
            A reference `LocalAncestryObject` from which to copy existing window definitions.

    Returns:
        **LocalAncestryObject:**
            A LocalAncestryObject containing window-level ancestry data.

    Raises:
        ValueError: If none of `window_size`, `physical_pos`, `laiobj` is given, if `window_size`
            is not positive, or if `physical_pos` is given without `chromosomes` while `self`
            spans several chromosomes.
    """
    from snputils.ancestry.genobj.local import LocalAncestryObject

    if window_size is None and physical_pos is None and laiobj is None:
        raise ValueError("One of `window_size`, `physical_pos`, or `laiobj` must be provided.")

    # Fixed window size
    if window_size is not None:
        if window_size <= 0:
            raise ValueError("`window_size` must be a positive integer.")

        physical_pos = []  # Boundaries [start, end] of each window
        chromosomes = []   # Chromosome for each window
        window_sizes = []  # Number of SNPs for each window
        for chrom in self.unique_chrom:
            # Positions of the SNPs on this chromosome only. All boundaries below are taken
            # from this subset; indexing the full `variants_pos` array would pick positions
            # from other chromosomes.
            # NOTE(review): boundaries follow the stored SNP order -- presumably positions
            # are sorted within each chromosome; confirm against the readers.
            pos_chrom = self.variants_pos[self.variants_chrom == chrom]
            n_snps_chrom = pos_chrom.size

            # At least one window per chromosome, even with fewer than `window_size` SNPs
            n_windows_chrom = max(n_snps_chrom // window_size, 1)

            # All but the last window hold exactly `window_size` SNPs
            for w in range(n_windows_chrom - 1):
                first = w * window_size
                last = first + window_size - 1
                physical_pos.append([pos_chrom[first], pos_chrom[last]])
                chromosomes.append(chrom)
                window_sizes.append(window_size)

            # The last window runs to the final SNP and absorbs any remainder
            first = (n_windows_chrom - 1) * window_size
            physical_pos.append([pos_chrom[first], pos_chrom[-1]])
            chromosomes.append(chrom)
            window_sizes.append(n_snps_chrom - first)

        physical_pos = np.array(physical_pos)
        chromosomes = np.array(chromosomes)
        window_sizes = np.array(window_sizes)

    # Custom start and end positions
    elif physical_pos is not None:
        # Accept list-like input; `.shape` is needed below
        physical_pos = np.asarray(physical_pos)
        # Check if there is exactly one chromosome
        if chromosomes is None:
            unique_chrom = self.unique_chrom
            if len(unique_chrom) == 1:
                # We assume all windows belong to this single chromosome
                single_chrom = unique_chrom[0]
                chromosomes = np.array([single_chrom] * physical_pos.shape[0])
            else:
                raise ValueError("Multiple chromosomes detected, but `chromosomes` was not provided.")

    # Match existing windows to a reference laiobj
    elif laiobj is not None:
        physical_pos = laiobj.physical_pos
        chromosomes = laiobj.chromosomes
        window_sizes = laiobj.window_sizes

    # Allocate an output LAI array matching the layout of `calldata_lai`
    n_windows = physical_pos.shape[0]
    n_samples = self.n_samples
    if self.calldata_lai.ndim == 3:
        lai = np.zeros((n_windows, n_samples, 2))
    else:
        lai = np.zeros((n_windows, n_samples*2))

    # For each window, find the relevant SNPs and compute the mode of the ancestries
    for i, ((start, end), chrom) in enumerate(zip(physical_pos, chromosomes)):
        snps_mask = (
            (self.variants_chrom == chrom) &
            (self.variants_pos >= start) &
            (self.variants_pos <= end)
        )
        if np.any(snps_mask):
            lai_mask = self.calldata_lai[snps_mask, ...]
            mode_ancestries = mode(lai_mask, axis=0, nan_policy='omit').mode
            lai[i] = mode_ancestries
        else:
            lai[i] = np.nan

    # Generate haplotype labels, e.g. "Sample1.0", "Sample1.1"
    haplotypes = [f"{sample}.{i}" for sample in self.samples for i in range(2)]

    # If original data was (n_snps, n_samples, 2), flatten to (n_windows, n_samples*2)
    if self.calldata_lai.ndim == 3:
        lai = lai.reshape(n_windows, -1)

    # Aggregate into a LocalAncestryObject
    return LocalAncestryObject(
        haplotypes=haplotypes,
        lai=lai,
        samples=self.samples,
        ancestry_map=self.ancestry_map,
        window_sizes=window_sizes,
        physical_pos=physical_pos,
        chromosomes=chromosomes
    )
Aggregate the calldata_lai attribute into genomic windows within a
snputils.ancestry.genobj.LocalAncestryObject.
Options for defining windows (in order of precedence):
1. Fixed window size:
   - Use window_size to specify how many SNPs go into each window. The last window on each chromosome may be larger if SNPs are not evenly divisible by the size.
2. Custom start and end positions:
   - Provide physical_pos (2D array of shape (n_windows, 2)) as the [start, end] base-pair coordinates for each window.
   - If chromosomes is not provided and self has exactly one chromosome, all windows are assumed to belong to that chromosome.
   - If multiple chromosomes exist but chromosomes is missing, an error will be raised.
   - Optionally, provide window_sizes to store the SNP count per-window.
3. Matching existing windows:
   - Reuse window definitions (physical_pos, chromosomes, window_sizes) from an existing laiobj.
Arguments:
- window_size (int, optional): Number of SNPs in each window if defining fixed-size windows. If the total number of SNPs in a chromosome is not evenly divisible by the window size, the last window on that chromosome will include all remaining SNPs and therefore be larger than the specified size.
- physical_pos (array of shape (n_windows, 2), optional): A 2D array containing the start and end physical positions for each window.
- chromosomes (array of shape (n_windows,), optional): An array with chromosome numbers corresponding to each genomic window.
- window_sizes (array of shape (n_windows,), optional): An array specifying the number of SNPs in each genomic window.
- laiobj (LocalAncestryObject, optional): A reference LocalAncestryObject from which to copy existing window definitions.
Returns:
LocalAncestryObject: A LocalAncestryObject containing window-level ancestry data.
def save(self, file: Union[str, Path]) -> None:
    """
    Save the data stored in `self` to a specified file.

    The output format is chosen from the (case-insensitive) extension of `file`, and the
    call is forwarded to the matching `save_*` method.

    **Supported formats:**

    - `.bed`: Binary PED (Plink) format.
    - `.pgen`: Plink2 binary genotype format.
    - `.vcf`: Variant Call Format.
    - `.pkl`: Pickle format for saving `self` in serialized form.

    Args:
        file (str or pathlib.Path):
            Path to the file where the data will be saved. The extension of the file determines the save format.
            Supported extensions: `.bed`, `.pgen`, `.vcf`, `.pkl`.

    Raises:
        ValueError: If the extension is not one of the supported ones.
    """
    ext = Path(file).suffix.lower()

    # Extension -> name of the method that writes that format
    saver_names = {
        '.bed': 'save_bed',
        '.pgen': 'save_pgen',
        '.vcf': 'save_vcf',
        '.pkl': 'save_pickle',
    }
    saver_name = saver_names.get(ext)
    if saver_name is None:
        raise ValueError(f"Unsupported file extension: {ext}")

    # Resolve the method only after the extension is known to be supported
    getattr(self, saver_name)(file)
Save the data stored in self to a specified file.
The format of the saved file is determined by the file extension provided in the file
argument.
Supported formats:
- .bed: Binary PED (Plink) format.
- .pgen: Plink2 binary genotype format.
- .vcf: Variant Call Format.
- .pkl: Pickle format for saving self in serialized form.
Arguments:
- file (str or pathlib.Path): Path to the file where the data will be saved. The extension of the file determines the save format.
Supported extensions: .bed, .pgen, .vcf, .pkl.
def save_bed(self, file: Union[str, Path]) -> None:
    """
    Save the data stored in `self` to a `.bed` file.

    The work is delegated to `BEDWriter`, which is imported lazily inside this method.

    Args:
        file (str or pathlib.Path):
            Path to the file where the data will be saved. It should end with `.bed`.
            The path is handed to `BEDWriter` unchanged; any extension handling happens there.
    """
    from snputils.snp.io.write.bed import BEDWriter

    BEDWriter(snpobj=self, filename=file).write()
Save the data stored in self to a .bed file.
Arguments:
- file (str or pathlib.Path): Path to the file where the data will be saved. It should end with .bed. If the provided path does not have this extension, it will be appended.
def save_pgen(self, file: Union[str, Path]) -> None:
    """
    Save the data stored in `self` to a `.pgen` file.

    The work is delegated to `PGENWriter`, which is imported lazily inside this method.

    Args:
        file (str or pathlib.Path):
            Path to the file where the data will be saved. It should end with `.pgen`.
            The path is handed to `PGENWriter` unchanged; any extension handling happens there.
    """
    from snputils.snp.io.write.pgen import PGENWriter

    PGENWriter(snpobj=self, filename=file).write()
Save the data stored in self to a .pgen file.
Arguments:
- file (str or pathlib.Path): Path to the file where the data will be saved. It should end with .pgen. If the provided path does not have this extension, it will be appended.
def save_vcf(self, file: Union[str, Path]) -> None:
    """
    Save the data stored in `self` to a `.vcf` file.

    The work is delegated to `VCFWriter`, which is imported lazily inside this method.

    Args:
        file (str or pathlib.Path):
            Path to the file where the data will be saved. It should end with `.vcf`.
            The path is handed to `VCFWriter` unchanged; any extension handling happens there.
    """
    from snputils.snp.io.write.vcf import VCFWriter

    VCFWriter(snpobj=self, filename=file).write()
Save the data stored in self to a .vcf file.
Arguments:
- file (str or pathlib.Path): Path to the file where the data will be saved. It should end with .vcf. If the provided path does not have this extension, it will be appended.
def save_pickle(self, file: Union[str, Path]) -> None:
    """
    Save `self` in serialized form to a `.pkl` file.

    Args:
        file (str or pathlib.Path):
            Path to the file where the data will be saved. It should end with `.pkl`.
            The path is used exactly as given; no extension is appended.
    """
    import pickle

    # A separate name for the handle keeps the `file` argument intact
    # (the handle used to shadow the parameter).
    with open(file, 'wb') as handle:
        pickle.dump(self, handle)
Save self in serialized form to a .pkl file.
Arguments:
- file (str or pathlib.Path): Path to the file where the data will be saved. It should end with .pkl. If the provided path does not have this extension, it will be appended.
class GRGObject:
    """
    A class for genotype data stored as a Genotype Representation Graph (GRG).

    The wrapped graph is kept in `calldata_gt`; attributes can also be read and written
    with dictionary-style access (`obj['calldata_gt']`).
    """
    def __init__(
        self,
        calldata_gt: Optional[GRGType] = None,
        filename: Optional[str] = None,
        mutable: Optional[bool] = None
    ) -> None:
        """
        Args:
            calldata_gt (GRG | MutableGRG, optional):
                A Genotype Representation Graph containing genotype data for each sample.
            filename (str, optional):
                File storing the GRG.
            mutable (bool, optional):
                Stored as given and exposed read-only through the `mutable` property.
                Presumably tells whether `calldata_gt` is a `MutableGRG` -- confirm with the reader.
        """
        self.__calldata_gt = calldata_gt
        self.__filename = filename
        self.__mutable = mutable
        # Not read anywhere in this class; kept because `keys()` reports it.
        self.__latest = False

    def __getitem__(self, key: str) -> Any:
        """
        To access an attribute of the class using the square bracket notation,
        similar to a dictionary.

        Raises:
            KeyError: If `key` is not an attribute of the instance.
        """
        try:
            return getattr(self, key)
        except AttributeError:
            raise KeyError(f'Invalid key: {key}.')

    def __setitem__(self, key: str, value: Any):
        """
        To set an attribute of the class using the square bracket notation,
        similar to a dictionary.

        Raises:
            KeyError: If the attribute cannot be set (e.g. the read-only `mutable` property).
        """
        try:
            setattr(self, key, value)
        except AttributeError:
            raise KeyError(f'Invalid key: {key}.')

    @property
    def calldata_gt(self) -> Optional[GRGType]:
        """
        Retrieve `calldata_gt`.

        Returns:
            **GRG | MutableGRG:**
                A GRG containing genotype data for all samples, or None if none was set.
        """
        return self.__calldata_gt

    @calldata_gt.setter
    def calldata_gt(self, x: GRGType):
        """
        Update `calldata_gt`.
        """
        self.__calldata_gt = x

    @property
    def filename(self) -> Optional[str]:
        """
        Retrieve `filename`.

        Returns:
            **str:**
                A string containing the file name, or None if none was set.
        """
        return self.__filename

    @filename.setter
    def filename(self, x: str):
        """
        Update `filename`.
        """
        self.__filename = x

    @property
    def mutable(self) -> Optional[bool]:
        """
        Retrieve `mutable` as passed to the constructor (read-only).
        """
        return self.__mutable

    def allele_freq(self) -> np.ndarray:
        """
        Run an upward `pyg.dot_product` over the GRG with a uniform `1 / num_samples`
        weight for every sample, and return its result.
        """
        # Uniform per-sample weights that sum to one
        al_freq = np.ones(self.calldata_gt.num_samples) / self.calldata_gt.num_samples
        return pyg.dot_product(self.calldata_gt, al_freq, pyg.TraversalDirection.UP)

    def dot_product(self, array: np.ndarray, traversal_direction: pyg.TraversalDirection):
        """
        Forward `array` and `traversal_direction` to `pyg.dot_product` on the stored GRG.
        """
        return pyg.dot_product(self.calldata_gt, array, traversal_direction)

    # TODO: consider moving this elsewhere.
    def allele_freq_from_file(self, filename: Optional[str] = None) -> pd.DataFrame:
        """
        Run the external command `grg process freq <file>` and parse its standard output
        as a tab-separated table.

        Args:
            filename (str, optional):
                GRG file to process. Defaults to the stored `filename`.

        Returns:
            **pandas.DataFrame:** The parsed command output.

        Raises:
            ValueError: If no filename is passed and none is stored.
            subprocess.CalledProcessError: If the command exits with a non-zero status.
        """
        newfile = filename if filename is not None else self.__filename
        if newfile is None:
            raise ValueError("Either pass in a filename, or store an existing GRG filename.")

        with tempfile.NamedTemporaryFile() as fp:
            subprocess.run(["grg", "process", "freq", f"{newfile}"], stdout=fp, check=True)
            fp.seek(0)  # set the file cursor
            return pd.read_csv(fp.name, sep="\t")

    def gwas(self, phenotype_file: str, filename: Optional[str] = None) -> pd.DataFrame:
        """
        Run the external command `grapp assoc` on a GRG file and parse the tab-separated
        results it writes.

        Args:
            phenotype_file (str):
                Path passed to `grapp assoc` through `-p`.
            filename (str, optional):
                GRG file to analyse. Defaults to the stored `filename`.

        Returns:
            **pandas.DataFrame:** The parsed results table.

        Raises:
            ValueError: If no GRG filename is passed and none is stored.
            ImportError: If the `grapp` executable cannot be found.
            subprocess.CalledProcessError: If the command exits with a non-zero status.
        """
        grg_file = filename if filename is not None else self.__filename
        if grg_file is None:
            raise ValueError("Either pass in a GRG filename, or store an existing GRG filename.")

        with tempfile.NamedTemporaryFile(suffix=".tsv") as fp:
            try:
                subprocess.run(
                    ["grapp", "assoc", "-p", f"{phenotype_file}", "-o", fp.name, f"{grg_file}"],
                    check=True,
                )
            except FileNotFoundError as exc:
                raise ImportError(
                    "GWAS support requires the optional dependency 'grapp'. "
                    "Install it with: pip install grapp"
                ) from exc
            return pd.read_csv(fp.name, sep="\t")

    def merge(self, combine_nodes: bool = False, *args) -> None:
        """
        Merge other GRG files into the stored GRG by calling its `merge` method.

        Args:
            combine_nodes (bool, default=False):
                Forwarded unchanged as the second argument of the GRG's `merge`.
            *args (str):
                Paths of the GRG files to merge.

        Raises:
            TypeError: If any merge input is not a string.

        Note:
            Because `combine_nodes` precedes `*args`, `obj.merge("a.grg", "b.grg")` binds
            the first path to `combine_nodes`. A string in that position is therefore
            treated as the first file, with `combine_nodes` falling back to False.
        """
        if isinstance(combine_nodes, str):
            args = (combine_nodes, *args)
            combine_nodes = False

        # NOTE(review): the GRG is not checked for mutability here.
        for arg in args:
            if not isinstance(arg, str):
                raise TypeError("All merge inputs must be strings.")
        # list of files, and combine_nodes
        self.__calldata_gt.merge(list(args), combine_nodes)

    def n_samples(self, ploidy=2) -> int:
        """
        Get number of samples from GRG. Diploid by default.

        The GRG's `num_samples` is divided by `ploidy` and rounded down.
        """
        return int(self.__calldata_gt.num_samples // ploidy)

    def n_snps(self) -> int:
        """
        Return the GRG's `num_mutations`.
        """
        return self.__calldata_gt.num_mutations

    def _sample_ids(self, n_samples: int, sample_prefix: str) -> np.ndarray:
        """
        Build `n_samples` unique sample identifiers.

        Identifiers stored in the GRG are used when it reports `has_individual_ids` and
        `num_individuals` equals `n_samples`; empty or unreadable ones fall back to
        `"{sample_prefix}_{index}"`, which is also used for every sample otherwise.
        Repeated identifiers get a `_{k}` suffix from their second occurrence on.
        """
        default_ids = [f"{sample_prefix}_{idx}" for idx in range(n_samples)]
        if self.__calldata_gt is None:
            return np.asarray(default_ids, dtype=object)

        has_individual_ids = bool(getattr(self.__calldata_gt, "has_individual_ids", False))
        num_individuals = int(getattr(self.__calldata_gt, "num_individuals", 0))
        if has_individual_ids and n_samples == num_individuals:
            ids = []
            for idx in range(n_samples):
                try:
                    sample_id = str(self.__calldata_gt.get_individual_id(idx))
                except RuntimeError:
                    sample_id = ""
                ids.append(sample_id if sample_id else default_ids[idx])
        else:
            ids = default_ids

        # Keep IDs unique for downstream writers.
        seen = {}
        unique_ids = []
        for sample_id in ids:
            count = seen.get(sample_id, 0)
            unique_ids.append(sample_id if count == 0 else f"{sample_id}_{count}")
            seen[sample_id] = count + 1

        return np.asarray(unique_ids, dtype=object)

    def to_snpobject(
        self,
        sum_strands: bool = False,
        chrom: str = ".",
        sample_prefix: str = "sample",
    ):
        """
        Convert the GRG to a dense SNPObject.

        Args:
            sum_strands (bool, default=False):
                If True, return per-individual allele counts instead of phased haplotypes.
            chrom (str, default="."):
                Chromosome label assigned to every variant and used in the variant IDs.
            sample_prefix (str, default="sample"):
                Prefix for generated sample IDs when the GRG provides none.

        Returns:
            **SNPObject:** Dense genotypes plus variant metadata read from the GRG mutations.

        Raises:
            ValueError: If `calldata_gt` is None, the ploidy is invalid, the haplotype count
                is not divisible by the ploidy, or phased output is requested for a
                non-diploid GRG.

        Notes:
            - This materializes the full genotype matrix, so memory usage scales with
              `num_mutations * num_samples`.
            - For diploid GRGs and `sum_strands=False`, output has shape
              `(n_snps, n_samples, 2)`.
            - For `sum_strands=True`, output has shape `(n_snps, n_samples)` with
              per-individual allele counts.
        """
        from snputils.snp.genobj.snpobj import SNPObject

        if self.__calldata_gt is None:
            raise ValueError("Cannot convert to SNPObject: `calldata_gt` is None.")

        grg = self.__calldata_gt
        n_mutations = int(grg.num_mutations)
        n_haplotypes = int(grg.num_samples)
        ploidy = int(getattr(grg, "ploidy", 2))

        if ploidy <= 0:
            raise ValueError(f"Invalid ploidy in GRG: {ploidy}")
        if n_haplotypes % ploidy != 0:
            raise ValueError(
                f"GRG has {n_haplotypes} haplotypes, not divisible by ploidy {ploidy}."
            )

        n_individuals = n_haplotypes // ploidy
        chrom = str(chrom)

        def _empty(shape):
            return np.empty(shape, dtype=np.int8)

        if sum_strands:
            if n_mutations == 0:
                calldata_gt = _empty((0, n_individuals))
            elif ploidy == 1:
                # One identity column per mutation, pushed down the graph
                mutation_eye = np.eye(n_mutations, dtype=np.float64)
                hap_matrix = pyg.matmul(grg, mutation_eye, pyg.TraversalDirection.DOWN)
                calldata_gt = np.rint(hap_matrix).astype(np.int8, copy=False)
            else:
                mutation_eye = np.eye(n_mutations, dtype=np.float64)
                diploid_matrix = pyg.matmul(
                    grg, mutation_eye, pyg.TraversalDirection.DOWN, by_individual=True
                )
                calldata_gt = np.rint(diploid_matrix).astype(np.int8, copy=False)
        else:
            if ploidy != 2:
                raise ValueError(
                    "Phased SNPObject output requires diploid GRGs. "
                    "Use `sum_strands=True` for non-diploid data."
                )
            if n_mutations == 0:
                calldata_gt = _empty((0, n_individuals, ploidy))
            else:
                mutation_eye = np.eye(n_mutations, dtype=np.float64)
                hap_matrix = pyg.matmul(grg, mutation_eye, pyg.TraversalDirection.DOWN)
                hap_matrix = np.rint(hap_matrix).astype(np.int8, copy=False)
                calldata_gt = hap_matrix.reshape(n_mutations, n_individuals, ploidy)

        # Identical in both modes, so computed once after the genotype matrix
        sample_ids = self._sample_ids(n_individuals, sample_prefix)

        variants_ref = np.empty(n_mutations, dtype=object)
        variants_alt = np.empty(n_mutations, dtype=object)
        variants_pos = np.empty(n_mutations, dtype=np.int64)
        variants_id = np.empty(n_mutations, dtype=object)

        for mut_id in range(n_mutations):
            mutation = grg.get_mutation_by_id(mut_id)
            position = int(round(float(mutation.position)))
            # Empty allele strings are stored as the missing marker "."
            ref = str(mutation.ref_allele) if str(mutation.ref_allele) else "."
            alt = str(mutation.allele) if str(mutation.allele) else "."
            variants_pos[mut_id] = position
            variants_ref[mut_id] = ref
            variants_alt[mut_id] = alt
            variants_id[mut_id] = f"{chrom}:{position}"

        variants_chrom = np.full(n_mutations, chrom, dtype=object)
        variants_filter_pass = np.full(n_mutations, "PASS", dtype=object)
        variants_qual = np.full(n_mutations, np.nan, dtype=np.float32)

        return SNPObject(
            calldata_gt=calldata_gt,
            samples=sample_ids,
            variants_ref=variants_ref,
            variants_alt=variants_alt,
            variants_chrom=variants_chrom,
            variants_filter_pass=variants_filter_pass,
            variants_id=variants_id,
            variants_pos=variants_pos,
            variants_qual=variants_qual,
        )

    def copy(self) -> GRGObject:
        """
        Create and return a copy of `self`.

        Returns:
            **GRGObject:**
                A new instance of the current object (deep copy).
        """
        return copy.deepcopy(self)

    def keys(self) -> List[str]:
        """
        Retrieve a list of public attribute names for `self`.

        Returns:
            **list of str:**
                A list of attribute names, with internal name-mangling removed,
                for easier reference to public attributes in the instance.
        """
        return [attr.replace('_GRGObject__', '') for attr in vars(self)]

    def to_grg(self, filename: str,
               allow_simplify: bool = True):
        """
        Write the stored GRG to `filename` through `pyg.save_grg`.

        Args:
            filename (str): Destination path.
            allow_simplify (bool, default=True): Forwarded unchanged to `pyg.save_grg`.
        """
        pyg.save_grg(self.__calldata_gt, filename, allow_simplify)
A class for Single Nucleotide Polymorphism (SNP) data stored as a Genotype Representation Graph (GRG).
def __init__(
    self,
    calldata_gt: Optional[GRGType] = None,
    filename: Optional[str] = None,
    mutable: Optional[bool] = None
) -> None:
    """
    Store a Genotype Representation Graph together with its bookkeeping metadata.

    Args:
        calldata_gt (GRG | MutableGRG, optional):
            A Genotype Representation Graph containing genotype data for each sample.
        filename (str, optional):
            File storing the GRG.
        mutable (bool, optional):
            Flag stored as-is on the instance; presumably indicates whether
            `calldata_gt` is a mutable GRG (TODO confirm against the readers).
    """
    self.__calldata_gt = calldata_gt
    self.__filename = filename
    self.__mutable = mutable
    # NOTE(review): always initialised to False; how this flag is consumed is
    # not visible in this part of the file -- confirm its meaning before relying on it.
    self.__latest = False
Arguments:
- calldata_gt (GRG | MutableGRG, optional): A Genotype Representation Graph containing genotype data for each sample.
- filename (str, optional): File storing the GRG.
def allele_freq_from_file(self, filename: Optional[str] = None) -> pd.DataFrame:
    """
    Run `grg process freq` on a GRG file and return its output as a DataFrame.

    Args:
        filename (str, optional):
            Path of the GRG file to process. When None, the filename stored
            on this object is used instead.

    Returns:
        **pd.DataFrame:**
            The tab-separated standard output of `grg process freq`, parsed
            with `pandas.read_csv`.

    Raises:
        ValueError: If neither `filename` nor a stored filename is available.
        subprocess.CalledProcessError: If the `grg` command exits with a
            non-zero status (`check=True`).
    """
    newfile = filename if filename is not None else self.__filename
    if newfile is None:
        raise ValueError("Either pass in a filename, or store an existing GRG filename.")

    with tempfile.NamedTemporaryFile() as fp:
        # The child process writes its stdout straight into the temporary file.
        subprocess.run(["grg", "process", "freq", f"{newfile}"], stdout=fp, check=True)
        # Rewind and parse from the already-open handle. Re-opening the file by
        # name while it is still open is not portable (it fails on Windows) and
        # would make this seek pointless.
        fp.seek(0)
        return pd.read_csv(fp, sep="\t")
def gwas(self, phenotype_file: str, filename: Optional[str] = None) -> pd.DataFrame:
    """
    Run `grapp assoc` for a phenotype file against a GRG and load the result table.

    Args:
        phenotype_file (str):
            Path passed to `grapp assoc` via `-p`.
        filename (str, optional):
            GRG file to analyse. When None, the filename stored on this
            object is used instead.

    Returns:
        **pd.DataFrame:**
            The tab-separated file written by `grapp assoc`, parsed with
            `pandas.read_csv`.

    Raises:
        ValueError: If neither `filename` nor a stored filename is available.
        ImportError: If the `grapp` executable cannot be found.
        subprocess.CalledProcessError: If `grapp` exits with a non-zero status.
    """
    grg_path = self.__filename if filename is None else filename
    if grg_path is None:
        raise ValueError("Either pass in a GRG filename, or store an existing GRG filename.")

    with tempfile.NamedTemporaryFile(suffix=".tsv") as results_file:
        command = [
            "grapp", "assoc",
            "-p", f"{phenotype_file}",
            "-o", results_file.name,
            f"{grg_path}",
        ]
        try:
            subprocess.run(command, check=True)
        except FileNotFoundError as exc:
            # A missing executable is reported as a missing optional dependency.
            raise ImportError(
                "GWAS support requires the optional dependency 'grapp'. "
                "Install it with: pip install grapp"
            ) from exc
        return pd.read_csv(results_file.name, sep="\t")
def merge(self, combine_nodes: bool = False, *args) -> None:
    """
    Merge other GRG files into the graph held by this object.

    Because `combine_nodes` precedes `*args` in the signature, the first
    positional argument is always bound to `combine_nodes`; the file names
    to merge are every positional argument after it.

    Args:
        combine_nodes (bool):
            Forwarded unchanged as the second argument of the underlying
            graph's `merge` method.
        *args (str):
            File names to merge.

    Raises:
        TypeError: If any of the merge inputs is not a string. The check
            runs before the underlying graph is touched.
    """
    if not all(isinstance(grg_file, str) for grg_file in args):
        raise TypeError("All merge inputs must be strings.")

    # The underlying graph receives the file list first, then combine_nodes.
    grg_files = list(args)
    self.__calldata_gt.merge(grg_files, combine_nodes)
def n_samples(self, ploidy = 2) -> int:
    """
    Get number of samples from GRG. Diploid by default.

    Args:
        ploidy (int):
            Divisor applied to the graph's `num_samples` value. Defaults to 2.

    Returns:
        **int:**
            `num_samples` divided by `ploidy`, rounded down.
    """
    # Floor division keeps the arithmetic in integers; going through float
    # true division silently loses precision for very large counts.
    return int(self.__calldata_gt.num_samples // ploidy)
Get number of samples from GRG. Diploid by default.
def to_snpobject(
    self,
    sum_strands: bool = False,
    chrom: str = ".",
    sample_prefix: str = "sample",
):
    """
    Convert the GRG to a dense SNPObject.

    Args:
        sum_strands (bool):
            If True, produce a 2-D matrix of per-individual values; if False,
            produce a 3-D phased matrix (diploid graphs only).
        chrom (str):
            Chromosome label written to every variant and used as the prefix
            of the generated variant IDs (`"{chrom}:{position}"`).
        sample_prefix (str):
            Forwarded to `self._sample_ids` when generating sample names.

    Notes:
        - This materializes the full genotype matrix, so memory usage scales with
          `num_mutations * num_samples`.
        - For diploid GRGs and `sum_strands=False`, output has shape
          `(n_snps, n_samples, 2)`.
        - For `sum_strands=True`, output has shape `(n_snps, n_samples)` with
          per-individual allele counts.

    Raises:
        ValueError: If no graph is stored, the ploidy is not positive, the
            haplotype count is not divisible by the ploidy, or a phased
            output is requested for a non-diploid graph.
    """
    from snputils.snp.genobj.snpobj import SNPObject

    grg = self.__calldata_gt
    if grg is None:
        raise ValueError("Cannot convert to SNPObject: `calldata_gt` is None.")

    n_mutations = int(grg.num_mutations)
    n_haplotypes = int(grg.num_samples)
    # Graphs that do not expose a `ploidy` attribute are treated as diploid.
    ploidy = int(getattr(grg, "ploidy", 2))

    if ploidy <= 0:
        raise ValueError(f"Invalid ploidy in GRG: {ploidy}")
    if n_haplotypes % ploidy != 0:
        raise ValueError(
            f"GRG has {n_haplotypes} haplotypes, not divisible by ploidy {ploidy}."
        )

    n_individuals = n_haplotypes // ploidy
    chrom = str(chrom)

    def _dense_genotypes(**matmul_options):
        # Multiply the graph by an identity matrix over mutations to obtain
        # one dense row per mutation, then round to int8.
        identity = np.eye(n_mutations, dtype=np.float64)
        product = pyg.matmul(
            grg, identity, pyg.TraversalDirection.DOWN, **matmul_options
        )
        return np.rint(product).astype(np.int8, copy=False)

    if sum_strands:
        if n_mutations == 0:
            calldata_gt = np.empty((0, n_individuals), dtype=np.int8)
        elif ploidy == 1:
            # Haploid: one column per haplotype is already one per individual.
            calldata_gt = _dense_genotypes()
        else:
            calldata_gt = _dense_genotypes(by_individual=True)
    else:
        if ploidy != 2:
            raise ValueError(
                "Phased SNPObject output requires diploid GRGs. "
                "Use `sum_strands=True` for non-diploid data."
            )
        if n_mutations == 0:
            calldata_gt = np.empty((0, n_individuals, ploidy), dtype=np.int8)
        else:
            calldata_gt = _dense_genotypes().reshape(
                n_mutations, n_individuals, ploidy
            )

    sample_ids = self._sample_ids(n_individuals, sample_prefix)

    variants_pos = np.empty(n_mutations, dtype=np.int64)
    variants_ref = np.empty(n_mutations, dtype=object)
    variants_alt = np.empty(n_mutations, dtype=object)
    variants_id = np.empty(n_mutations, dtype=object)

    for mut_id in range(n_mutations):
        mutation = grg.get_mutation_by_id(mut_id)
        position = int(round(float(mutation.position)))
        ref_text = str(mutation.ref_allele)
        alt_text = str(mutation.allele)
        variants_pos[mut_id] = position
        # Empty allele strings are replaced by the "." placeholder.
        variants_ref[mut_id] = ref_text if ref_text else "."
        variants_alt[mut_id] = alt_text if alt_text else "."
        variants_id[mut_id] = f"{chrom}:{position}"

    return SNPObject(
        calldata_gt=calldata_gt,
        samples=sample_ids,
        variants_ref=variants_ref,
        variants_alt=variants_alt,
        variants_chrom=np.full(n_mutations, chrom, dtype=object),
        variants_filter_pass=np.full(n_mutations, "PASS", dtype=object),
        variants_id=variants_id,
        variants_pos=variants_pos,
        variants_qual=np.full(n_mutations, np.nan, dtype=np.float32),
    )
Convert the GRG to a dense SNPObject.
Notes:
- This materializes the full genotype matrix, so memory usage scales with num_mutations * num_samples.
- For diploid GRGs and sum_strands=False, output has shape (n_snps, n_samples, 2).
- For sum_strands=True, output has shape (n_snps, n_samples) with per-individual allele counts.
def copy(self) -> GRGObject:
    """
    Create and return a deep copy of `self`.

    Returns:
        **GRGObject:**
            A new instance of the current object, produced by `copy.deepcopy`.
    """
    # Inside the method body the bare name `copy` is looked up in the module
    # globals, so this presumably refers to the standard-library `copy` module
    # imported at file level, not to this method.
    # NOTE(review): deepcopy also tries to copy the wrapped graph object;
    # confirm that the underlying GRG type supports being deep-copied.
    return copy.deepcopy(self)
Create and return a copy of self.
Returns:
GRGObject: A new instance of the current object.
def keys(self) -> List[str]:
    """
    List the attribute names stored on `self`, without the private-name prefix.

    Returns:
        **list of str:**
            The instance attribute names in definition order, with every
            occurrence of the `_GRGObject__` name-mangling prefix removed.
    """
    mangling_prefix = '_GRGObject__'
    public_names = []
    for stored_name in vars(self):
        public_names.append(stored_name.replace(mangling_prefix, ''))
    return public_names
Retrieve a list of public attribute names for self.
Returns:
list of str: A list of attribute names, with internal name-mangling removed, for easier reference to public attributes in the instance.
class SNPReader:
    def __new__(cls,
                filename: Union[str, pathlib.Path],
                vcf_backend: str = 'polars') -> SNPReader:
        """
        Automatically detect the SNP file format from the file extension, and return its corresponding reader.

        A trailing compression suffix (`.gz` or `.zst`) is skipped, so the
        format is taken from the suffix in front of it (e.g. `.vcf.gz`).

        Args:
            filename: Filename of the file to read.
            vcf_backend: Backend to use for reading the VCF file. Options are 'polars' or 'scikit-allel'. Default is 'polars'.

        Raises:
            ValueError: If the filename does not have an extension or the extension is not supported.
        """
        filename = pathlib.Path(filename)
        suffixes = filename.suffixes
        if not suffixes:
            raise ValueError("The filename should have an extension when using SNPReader.")

        if suffixes[-1].lower() in (".zst", ".gz"):
            # A lone compression suffix (e.g. "data.gz") carries no format
            # information; report it as unsupported instead of failing with
            # an IndexError on the missing second-to-last suffix.
            if len(suffixes) < 2:
                raise ValueError(f"File format not supported: {filename}")
            extension = suffixes[-2]
        else:
            extension = suffixes[-1]
        extension = extension.lower()

        # Reader modules are imported lazily so that only the backend that is
        # actually needed gets loaded.
        if extension == ".vcf":
            if vcf_backend == 'polars':
                from snputils.snp.io.read.vcf import VCFReaderPolars

                return VCFReaderPolars(filename)
            elif vcf_backend == 'scikit-allel':
                from snputils.snp.io.read.vcf import VCFReader

                return VCFReader(filename)
            else:
                raise ValueError(f"VCF backend not supported: {vcf_backend}")
        elif extension in (".bed", ".bim", ".fam"):
            from snputils.snp.io.read.bed import BEDReader

            return BEDReader(filename)
        elif extension in (".pgen", ".pvar", ".psam"):
            # ".pvar.zst" needs no separate entry: the compression suffix has
            # already been skipped above.
            from snputils.snp.io.read.pgen import PGENReader

            return PGENReader(filename)
        else:
            raise ValueError(f"File format not supported: {filename}")
9 def __new__(cls, 10 filename: Union[str, pathlib.Path], 11 vcf_backend: str = 'polars') -> SNPReader: 12 """ 13 Automatically detect the SNP file format from the file extension, and return its corresponding reader. 14 15 Args: 16 filename: Filename of the file to read. 17 vcf_backend: Backend to use for reading the VCF file. Options are 'polars' or 'scikit-allel'. Default is 'polars'. 18 19 Raises: 20 ValueError: If the filename does not have an extension or the extension is not supported. 21 """ 22 filename = pathlib.Path(filename) 23 suffixes = filename.suffixes 24 if not suffixes: 25 raise ValueError("The filename should have an extension when using SNPReader.") 26 27 extension = suffixes[-2] if suffixes[-1].lower() in (".zst", ".gz") else suffixes[-1] 28 extension = extension.lower() 29 30 if extension == ".vcf": 31 if vcf_backend == 'polars': 32 from snputils.snp.io.read.vcf import VCFReaderPolars 33 34 return VCFReaderPolars(filename) 35 elif vcf_backend == 'scikit-allel': 36 from snputils.snp.io.read.vcf import VCFReader 37 38 return VCFReader(filename) 39 else: 40 raise ValueError(f"VCF backend not supported: {vcf_backend}") 41 elif extension in (".bed", ".bim", ".fam"): 42 from snputils.snp.io.read.bed import BEDReader 43 44 return BEDReader(filename) 45 elif extension in (".pgen", ".pvar", ".psam", ".pvar.zst"): 46 from snputils.snp.io.read.pgen import PGENReader 47 48 return PGENReader(filename) 49 else: 50 raise ValueError(f"File format not supported: {filename}")
Automatically detect the SNP file format from the file extension, and return its corresponding reader.
Arguments:
- filename: Filename of the file to read.
- vcf_backend: Backend to use for reading the VCF file. Options are 'polars' or 'scikit-allel'. Default is 'polars'.
Raises:
- ValueError: If the filename does not have an extension or the extension is not supported.
@SNPBaseReader.register
class BEDReader(SNPBaseReader):
    """Reader for PLINK bed filesets (.bed, .bim, .fam) that produces SNPObject instances."""

    def read(
        self,
        fields: Optional[List[str]] = None,
        exclude_fields: Optional[List[str]] = None,
        sample_ids: Optional[np.ndarray] = None,
        sample_idxs: Optional[np.ndarray] = None,
        variant_ids: Optional[np.ndarray] = None,
        variant_idxs: Optional[np.ndarray] = None,
        sum_strands: bool = False,
        separator: Optional[str] = None,
    ) -> SNPObject:
        """
        Read a bed fileset (bed, bim, fam) into a SNPObject.

        Args:
            fields (str, None, or list of str, optional): Fields to extract data for that should be included in the returned SNPObject.
                Available fields are 'GT', 'IID', 'REF', 'ALT', '#CHROM', 'ID', 'POS'.
                To extract all fields, set fields to None. Defaults to None.
            exclude_fields (str, None, or list of str, optional): Fields to exclude from the returned SNPObject.
                Available fields are 'GT', 'IID', 'REF', 'ALT', '#CHROM', 'ID', 'POS'.
                To exclude no fields, set exclude_fields to None. Defaults to None.
            sample_ids: List of sample IDs to read. If None and sample_idxs is None, all samples are read.
            sample_idxs: List of sample indices to read. If None and sample_ids is None, all samples are read.
            variant_ids: List of variant IDs to read. Entries are matched against the ID column and
                against `"<chrom>:<pos>"`. If None and variant_idxs is None, all variants are read.
            variant_idxs: List of variant indices to read. If None and variant_ids is None, all variants are read.
            sum_strands: If True, maternal and paternal strands are combined into a single `int8` array with values `{0, 1, 2}`.
                If False, strands are stored separately as an `int8` array with values `{0, 1}` for each strand.
                Note: With the pgenlib backend, `False` uses `~8×` more RAM, though `calldata_gt` is only `2×` larger.
            separator: Separator used in the .bim and .fam files. If None, the separator is automatically detected
                from the first line of the .bim file.
                If the automatic detection fails, please specify the separator manually.

        Returns:
            **SNPObject**:
                A SNPObject instance.
        """
        assert (
            sample_idxs is None or sample_ids is None
        ), "Only one of sample_idxs and sample_ids can be specified"
        assert (
            variant_idxs is None or variant_ids is None
        ), "Only one of variant_idxs and variant_ids can be specified"

        if isinstance(fields, str):
            fields = [fields]
        if isinstance(exclude_fields, str):
            exclude_fields = [exclude_fields]

        fields = fields or ["GT", "IID", "REF", "ALT", "#CHROM", "ID", "POS"]
        exclude_fields = exclude_fields or []
        fields = [field for field in fields if field not in exclude_fields]
        # The fast path skips the .bim/.fam tables entirely, so it is only valid
        # when no selector of any kind was given. ID-based selectors need those
        # tables to be resolved and would otherwise be silently ignored.
        only_read_bed = (
            fields == ["GT"]
            and variant_idxs is None
            and variant_ids is None
            and sample_idxs is None
            and sample_ids is None
        )

        filename_noext = str(self.filename)
        if filename_noext[-4:].lower() in (".bed", ".bim", ".fam"):
            filename_noext = filename_noext[:-4]

        if only_read_bed:
            with open(filename_noext + '.fam', 'r') as f:
                file_num_samples = sum(1 for _ in f)  # Get sample count from fam file
            file_num_variants = None  # Not needed
        else:
            log.info(f"Reading {filename_noext}.bim")

            if separator is None:
                with open(filename_noext + ".bim", "r") as file:
                    separator = csv.Sniffer().sniff(file.readline()).delimiter

            bim = pl.read_csv(
                filename_noext + ".bim",
                separator=separator,
                has_header=False,
                new_columns=["#CHROM", "ID", "CM", "POS", "ALT", "REF"],
                schema_overrides={
                    "#CHROM": pl.String,
                    "ID": pl.String,
                    "CM": pl.Float64,
                    "POS": pl.Int64,
                    "ALT": pl.String,
                    "REF": pl.String
                },
                null_values=["NA"]
            ).with_row_index()
            file_num_variants = bim.height

            if variant_ids is not None:
                variant_id_values = [str(v) for v in np.atleast_1d(variant_ids)]
                variant_id_or_pos = (
                    pl.col("ID").is_in(variant_id_values)
                    | pl.concat_str(
                        [pl.col("#CHROM"), pl.lit(":"), pl.col("POS").cast(pl.String)]
                    ).is_in(variant_id_values)
                )
                variant_idxs = (
                    bim.filter(variant_id_or_pos)
                    .select("index")
                    .to_series()
                    .to_numpy()
                )

            if variant_idxs is None:
                num_variants = file_num_variants
                variant_idxs = np.arange(num_variants, dtype=np.uint32)
            else:
                # Filtering the table keeps file order, so the resolved indices
                # always line up with the rows kept in `bim`.
                requested_variant_idxs = np.asarray(variant_idxs, dtype=np.uint32).ravel()
                bim = bim.filter(pl.col("index").is_in(requested_variant_idxs))
                variant_idxs = bim.select("index").to_series().to_numpy()
                variant_idxs = np.asarray(variant_idxs, dtype=np.uint32)
                num_variants = np.size(variant_idxs)

            log.info(f"Reading {filename_noext}.fam")

            fam = pl.read_csv(
                filename_noext + ".fam",
                separator=separator,
                has_header=False,
                new_columns=["Family ID", "IID", "Father ID",
                             "Mother ID", "Sex code", "Phenotype value"],
                schema_overrides={
                    "Family ID": pl.String,
                    "IID": pl.String,
                    "Father ID": pl.String,
                    "Mother ID": pl.String,
                    "Sex code": pl.String,
                },
                null_values=["NA"]
            ).with_row_index()
            file_num_samples = fam.height

            if sample_ids is not None:
                sample_idxs = fam.filter(pl.col("IID").is_in(sample_ids)).select("index").to_series().to_numpy()

            if sample_idxs is None:
                num_samples = file_num_samples
            else:
                num_samples = np.size(sample_idxs)
                sample_idxs = np.array(sample_idxs, dtype=np.uint32)
                fam = fam.filter(pl.col("index").is_in(sample_idxs))

        if "GT" in fields:
            log.info(f"Reading {filename_noext}.bed")
            pgen_reader = pg.PgenReader(
                str.encode(filename_noext + ".bed"),
                raw_sample_ct=file_num_samples,
                variant_ct=file_num_variants,
                sample_subset=sample_idxs,
            )
            # Make sure the reader is closed even if reading genotypes fails.
            try:
                if only_read_bed:
                    num_samples = pgen_reader.get_raw_sample_ct()
                    num_variants = pgen_reader.get_variant_ct()
                    variant_idxs = np.arange(num_variants, dtype=np.uint32)

                # required arrays: variant_idxs + sample_idxs + genotypes
                if not sum_strands:
                    required_ram = (num_samples + num_variants + num_variants * 2 * num_samples) * 4
                else:
                    required_ram = (num_samples + num_variants) * 4 + num_variants * num_samples
                log.info(f">{required_ram / 1024**3:.2f} GiB of RAM are required to process {num_samples} samples with {num_variants} variants each")

                if not sum_strands:
                    genotypes = np.empty((num_variants, 2 * num_samples), dtype=np.int32)  # cannot use int8 because of pgenlib
                    pgen_reader.read_alleles_list(variant_idxs, genotypes)
                    genotypes = genotypes.astype(np.int8).reshape((num_variants, num_samples, 2))
                else:
                    genotypes = np.empty((num_variants, num_samples), dtype=np.int8)
                    pgen_reader.read_list(variant_idxs, genotypes)
            finally:
                pgen_reader.close()
        else:
            genotypes = None

        log.info("Constructing SNPObject")

        # `fam` and `bim` do not exist on the GT-only fast path; the leading
        # `in fields` checks short-circuit before they would be touched.
        snpobj = SNPObject(
            calldata_gt=genotypes if "GT" in fields else None,
            samples=fam.get_column("IID").to_numpy() if "IID" in fields and "IID" in fam.columns else None,
            **{f'variants_{k.lower()}': bim.get_column(v).to_numpy() if v in fields and v in bim.columns else None
               for k, v in {'ref': 'REF', 'alt': 'ALT', 'chrom': '#CHROM', 'id': 'ID', 'pos': 'POS'}.items()}
        )

        log.info("Finished constructing SNPObject")
        return snpobj

    def _resolve_variant_idxs_for_iter(
        self,
        *,
        variant_ids: Optional[np.ndarray],
        variant_idxs: Optional[np.ndarray],
        separator: Optional[str],
    ) -> np.ndarray:
        """
        Resolve variant selectors to canonical file-order row indices.

        Args:
            variant_ids: Variant IDs (or `"<chrom>:<pos>"` strings) to select, or None.
            variant_idxs: Variant row indices to select, or None. Only used when
                `variant_ids` is None.
            separator: Separator of the .bim file. If None, it is detected from
                the first line of the file.

        Returns:
            **np.ndarray:**
                `uint32` row indices into the .bim file, in file order. When no
                selector is given, every row index is returned.
        """
        filename_noext = str(self.filename)
        if filename_noext[-4:].lower() in (".bed", ".bim", ".fam"):
            filename_noext = filename_noext[:-4]

        local_separator = separator
        if local_separator is None:
            with open(filename_noext + ".bim", "r") as file:
                local_separator = csv.Sniffer().sniff(file.readline()).delimiter

        bim = pl.read_csv(
            filename_noext + ".bim",
            separator=local_separator,
            has_header=False,
            new_columns=["#CHROM", "ID", "CM", "POS", "ALT", "REF"],
            schema_overrides={
                "#CHROM": pl.String,
                "ID": pl.String,
                "CM": pl.Float64,
                "POS": pl.Int64,
                "ALT": pl.String,
                "REF": pl.String,
            },
            null_values=["NA"],
        ).with_row_index()

        if variant_ids is not None:
            variant_id_values = [str(v) for v in np.atleast_1d(variant_ids)]
            variant_id_or_pos = (
                pl.col("ID").is_in(variant_id_values)
                | pl.concat_str([pl.col("#CHROM"), pl.lit(":"), pl.col("POS").cast(pl.String)]).is_in(
                    variant_id_values
                )
            )
            resolved = (
                bim.filter(variant_id_or_pos)
                .select("index")
                .to_series()
                .to_numpy()
            )
            return np.asarray(resolved, dtype=np.uint32)

        if variant_idxs is not None:
            requested = np.asarray(variant_idxs, dtype=np.uint32).ravel()
            resolved = (
                bim.filter(pl.col("index").is_in(requested))
                .select("index")
                .to_series()
                .to_numpy()
            )
            return np.asarray(resolved, dtype=np.uint32)

        return np.arange(bim.height, dtype=np.uint32)

    def iter_read(
        self,
        fields: Optional[List[str]] = None,
        exclude_fields: Optional[List[str]] = None,
        sample_ids: Optional[np.ndarray] = None,
        sample_idxs: Optional[np.ndarray] = None,
        variant_ids: Optional[np.ndarray] = None,
        variant_idxs: Optional[np.ndarray] = None,
        sum_strands: bool = False,
        separator: Optional[str] = None,
        chunk_size: int = 10_000,
    ) -> Iterator[SNPObject]:
        """
        Stream the BED fileset in variant chunks.

        This yields a sequence of SNPObject chunks along the SNP axis. The
        variant selection is resolved once to file-order row indices; each
        chunk is then produced by a separate call to `read` with at most
        `chunk_size` of those indices. All other arguments are forwarded to
        `read` unchanged.

        Raises:
            ValueError: If `chunk_size` is smaller than 1, or if both members of
                a selector pair (`sample_ids`/`sample_idxs`,
                `variant_ids`/`variant_idxs`) are given. As this is a generator,
                the error surfaces when iteration starts.
        """
        if chunk_size < 1:
            raise ValueError("chunk_size must be >= 1.")
        if sample_idxs is not None and sample_ids is not None:
            raise ValueError("Only one of sample_idxs and sample_ids can be specified.")
        if variant_idxs is not None and variant_ids is not None:
            raise ValueError("Only one of variant_idxs and variant_ids can be specified.")

        selectors = self._resolve_variant_idxs_for_iter(
            variant_ids=variant_ids,
            variant_idxs=variant_idxs,
            separator=separator,
        )

        n_selectors = int(selectors.size)
        for start in range(0, n_selectors, int(chunk_size)):
            stop = min(start + int(chunk_size), n_selectors)
            selector_chunk = np.asarray(selectors[start:stop], dtype=np.uint32)
            yield self.read(
                fields=fields,
                exclude_fields=exclude_fields,
                sample_ids=sample_ids,
                sample_idxs=sample_idxs,
                variant_idxs=selector_chunk,
                sum_strands=sum_strands,
                separator=separator,
            )
Abstract class for SNP readers.
Attributes:
- _filename: The path to the file storing SNP data.
def read(
    self,
    fields: Optional[List[str]] = None,
    exclude_fields: Optional[List[str]] = None,
    sample_ids: Optional[np.ndarray] = None,
    sample_idxs: Optional[np.ndarray] = None,
    variant_ids: Optional[np.ndarray] = None,
    variant_idxs: Optional[np.ndarray] = None,
    sum_strands: bool = False,
    separator: Optional[str] = None,
) -> SNPObject:
    """
    Read a bed fileset (bed, bim, fam) into a SNPObject.

    Args:
        fields (str, None, or list of str, optional): Fields to extract data for that should be included in the returned SNPObject.
            Available fields are 'GT', 'IID', 'REF', 'ALT', '#CHROM', 'ID', 'POS'.
            To extract all fields, set fields to None. Defaults to None.
        exclude_fields (str, None, or list of str, optional): Fields to exclude from the returned SNPObject.
            Available fields are 'GT', 'IID', 'REF', 'ALT', '#CHROM', 'ID', 'POS'.
            To exclude no fields, set exclude_fields to None. Defaults to None.
        sample_ids: List of sample IDs to read. If None and sample_idxs is None, all samples are read.
        sample_idxs: List of sample indices to read. If None and sample_ids is None, all samples are read.
        variant_ids: List of variant IDs to read. Entries are matched against the ID column and
            against `"<chrom>:<pos>"`. If None and variant_idxs is None, all variants are read.
        variant_idxs: List of variant indices to read. If None and variant_ids is None, all variants are read.
        sum_strands: If True, maternal and paternal strands are combined into a single `int8` array with values `{0, 1, 2}`.
            If False, strands are stored separately as an `int8` array with values `{0, 1}` for each strand.
            Note: With the pgenlib backend, `False` uses `~8×` more RAM, though `calldata_gt` is only `2×` larger.
        separator: Separator used in the .bim and .fam files. If None, the separator is automatically detected
            from the first line of the .bim file.
            If the automatic detection fails, please specify the separator manually.

    Returns:
        **SNPObject**:
            A SNPObject instance.
    """
    assert (
        sample_idxs is None or sample_ids is None
    ), "Only one of sample_idxs and sample_ids can be specified"
    assert (
        variant_idxs is None or variant_ids is None
    ), "Only one of variant_idxs and variant_ids can be specified"

    if isinstance(fields, str):
        fields = [fields]
    if isinstance(exclude_fields, str):
        exclude_fields = [exclude_fields]

    fields = fields or ["GT", "IID", "REF", "ALT", "#CHROM", "ID", "POS"]
    exclude_fields = exclude_fields or []
    fields = [field for field in fields if field not in exclude_fields]
    # The fast path skips the .bim/.fam tables entirely, so it is only valid
    # when no selector of any kind was given. ID-based selectors need those
    # tables to be resolved and would otherwise be silently ignored.
    only_read_bed = (
        fields == ["GT"]
        and variant_idxs is None
        and variant_ids is None
        and sample_idxs is None
        and sample_ids is None
    )

    filename_noext = str(self.filename)
    if filename_noext[-4:].lower() in (".bed", ".bim", ".fam"):
        filename_noext = filename_noext[:-4]

    if only_read_bed:
        with open(filename_noext + '.fam', 'r') as f:
            file_num_samples = sum(1 for _ in f)  # Get sample count from fam file
        file_num_variants = None  # Not needed
    else:
        log.info(f"Reading {filename_noext}.bim")

        if separator is None:
            with open(filename_noext + ".bim", "r") as file:
                separator = csv.Sniffer().sniff(file.readline()).delimiter

        bim = pl.read_csv(
            filename_noext + ".bim",
            separator=separator,
            has_header=False,
            new_columns=["#CHROM", "ID", "CM", "POS", "ALT", "REF"],
            schema_overrides={
                "#CHROM": pl.String,
                "ID": pl.String,
                "CM": pl.Float64,
                "POS": pl.Int64,
                "ALT": pl.String,
                "REF": pl.String
            },
            null_values=["NA"]
        ).with_row_index()
        file_num_variants = bim.height

        if variant_ids is not None:
            variant_id_values = [str(v) for v in np.atleast_1d(variant_ids)]
            variant_id_or_pos = (
                pl.col("ID").is_in(variant_id_values)
                | pl.concat_str(
                    [pl.col("#CHROM"), pl.lit(":"), pl.col("POS").cast(pl.String)]
                ).is_in(variant_id_values)
            )
            variant_idxs = (
                bim.filter(variant_id_or_pos)
                .select("index")
                .to_series()
                .to_numpy()
            )

        if variant_idxs is None:
            num_variants = file_num_variants
            variant_idxs = np.arange(num_variants, dtype=np.uint32)
        else:
            # Filtering the table keeps file order, so the resolved indices
            # always line up with the rows kept in `bim`.
            requested_variant_idxs = np.asarray(variant_idxs, dtype=np.uint32).ravel()
            bim = bim.filter(pl.col("index").is_in(requested_variant_idxs))
            variant_idxs = bim.select("index").to_series().to_numpy()
            variant_idxs = np.asarray(variant_idxs, dtype=np.uint32)
            num_variants = np.size(variant_idxs)

        log.info(f"Reading {filename_noext}.fam")

        fam = pl.read_csv(
            filename_noext + ".fam",
            separator=separator,
            has_header=False,
            new_columns=["Family ID", "IID", "Father ID",
                         "Mother ID", "Sex code", "Phenotype value"],
            schema_overrides={
                "Family ID": pl.String,
                "IID": pl.String,
                "Father ID": pl.String,
                "Mother ID": pl.String,
                "Sex code": pl.String,
            },
            null_values=["NA"]
        ).with_row_index()
        file_num_samples = fam.height

        if sample_ids is not None:
            sample_idxs = fam.filter(pl.col("IID").is_in(sample_ids)).select("index").to_series().to_numpy()

        if sample_idxs is None:
            num_samples = file_num_samples
        else:
            num_samples = np.size(sample_idxs)
            sample_idxs = np.array(sample_idxs, dtype=np.uint32)
            fam = fam.filter(pl.col("index").is_in(sample_idxs))

    if "GT" in fields:
        log.info(f"Reading {filename_noext}.bed")
        pgen_reader = pg.PgenReader(
            str.encode(filename_noext + ".bed"),
            raw_sample_ct=file_num_samples,
            variant_ct=file_num_variants,
            sample_subset=sample_idxs,
        )
        # Make sure the reader is closed even if reading genotypes fails.
        try:
            if only_read_bed:
                num_samples = pgen_reader.get_raw_sample_ct()
                num_variants = pgen_reader.get_variant_ct()
                variant_idxs = np.arange(num_variants, dtype=np.uint32)

            # required arrays: variant_idxs + sample_idxs + genotypes
            if not sum_strands:
                required_ram = (num_samples + num_variants + num_variants * 2 * num_samples) * 4
            else:
                required_ram = (num_samples + num_variants) * 4 + num_variants * num_samples
            log.info(f">{required_ram / 1024**3:.2f} GiB of RAM are required to process {num_samples} samples with {num_variants} variants each")

            if not sum_strands:
                genotypes = np.empty((num_variants, 2 * num_samples), dtype=np.int32)  # cannot use int8 because of pgenlib
                pgen_reader.read_alleles_list(variant_idxs, genotypes)
                genotypes = genotypes.astype(np.int8).reshape((num_variants, num_samples, 2))
            else:
                genotypes = np.empty((num_variants, num_samples), dtype=np.int8)
                pgen_reader.read_list(variant_idxs, genotypes)
        finally:
            pgen_reader.close()
    else:
        genotypes = None

    log.info("Constructing SNPObject")

    # `fam` and `bim` do not exist on the GT-only fast path; the leading
    # `in fields` checks short-circuit before they would be touched.
    snpobj = SNPObject(
        calldata_gt=genotypes if "GT" in fields else None,
        samples=fam.get_column("IID").to_numpy() if "IID" in fields and "IID" in fam.columns else None,
        **{f'variants_{k.lower()}': bim.get_column(v).to_numpy() if v in fields and v in bim.columns else None
           for k, v in {'ref': 'REF', 'alt': 'ALT', 'chrom': '#CHROM', 'id': 'ID', 'pos': 'POS'}.items()}
    )

    log.info("Finished constructing SNPObject")
    return snpobj
Read a bed fileset (bed, bim, fam) into a SNPObject.
Arguments:
- fields (str, None, or list of str, optional): Fields to extract data for that should be included in the returned SNPObject. Available fields are 'GT', 'IID', 'REF', 'ALT', '#CHROM', 'ID', 'POS'. To extract all fields, set fields to None. Defaults to None.
- exclude_fields (str, None, or list of str, optional): Fields to exclude from the returned SNPObject. Available fields are 'GT', 'IID', 'REF', 'ALT', '#CHROM', 'ID', 'POS'. To exclude no fields, set exclude_fields to None. Defaults to None.
- sample_ids: List of sample IDs to read. If None and sample_idxs is None, all samples are read.
- sample_idxs: List of sample indices to read. If None and sample_ids is None, all samples are read.
- variant_ids: List of variant IDs to read. If None and variant_idxs is None, all variants are read.
- variant_idxs: List of variant indices to read. If None and variant_ids is None, all variants are read.
- sum_strands: If True, maternal and paternal strands are combined into a single int8 array with values {0, 1, 2}. If False, strands are stored separately as an int8 array with values {0, 1} for each strand. Note: With the pgenlib backend, False uses ~8× more RAM, though calldata_gt is only 2× larger.
- separator: Separator used in the .bim and .fam files. If None, the separator is automatically detected. If the automatic detection fails, please specify the separator manually.
Returns:
SNPObject: A SNPObject instance.
def iter_read(
    self,
    fields: Optional[List[str]] = None,
    exclude_fields: Optional[List[str]] = None,
    sample_ids: Optional[np.ndarray] = None,
    sample_idxs: Optional[np.ndarray] = None,
    variant_ids: Optional[np.ndarray] = None,
    variant_idxs: Optional[np.ndarray] = None,
    sum_strands: bool = False,
    separator: Optional[str] = None,
    chunk_size: int = 10_000,
) -> Iterator[SNPObject]:
    """
    Stream the BED fileset in variant chunks.

    The variant selection is resolved once into row indices; the indices are
    then cut into consecutive slices of at most `chunk_size` entries and each
    slice is handed to `read`, whose result is yielded. All other arguments
    are forwarded to `read` unchanged.

    Raises:
        ValueError: If `chunk_size` is smaller than 1, or if both members of a
            selector pair (`sample_ids`/`sample_idxs`,
            `variant_ids`/`variant_idxs`) are given. As this is a generator,
            the error surfaces when iteration starts.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be >= 1.")
    if not (sample_idxs is None or sample_ids is None):
        raise ValueError("Only one of sample_idxs and sample_ids can be specified.")
    if not (variant_idxs is None or variant_ids is None):
        raise ValueError("Only one of variant_idxs and variant_ids can be specified.")

    resolved_idxs = self._resolve_variant_idxs_for_iter(
        variant_ids=variant_ids,
        variant_idxs=variant_idxs,
        separator=separator,
    )

    total = int(resolved_idxs.size)
    step = int(chunk_size)
    # Arguments that are identical for every chunk.
    shared_kwargs = dict(
        fields=fields,
        exclude_fields=exclude_fields,
        sample_ids=sample_ids,
        sample_idxs=sample_idxs,
        sum_strands=sum_strands,
        separator=separator,
    )

    begin = 0
    while begin < total:
        end = min(begin + step, total)
        chunk_idxs = np.asarray(resolved_idxs[begin:end], dtype=np.uint32)
        yield self.read(variant_idxs=chunk_idxs, **shared_kwargs)
        begin = end
Stream the BED fileset in variant chunks.
This yields a sequence of SNPObject chunks along the SNP axis.
@SNPBaseReader.register
class GRGReader(SNPBaseReader):
    def read(self,
             mutable: Optional[bool] = None,
             load_up_edges: Optional[bool] = None,
             binary_mutations: Optional[bool] = None) -> GRGObject:
        """
        Read in a GRG or TSKit File

        The loader is chosen from the resolved file path: a `.trees` suffix is
        converted with `pyg.grg_from_trees`; otherwise a mutable or immutable
        GRG is loaded depending on `mutable`.

        Args:
            mutable (bool, optional): If truthy, load the file as a mutable GRG.
                Ignored for `.trees` input, which is always marked mutable.
            load_up_edges (bool, optional): Passed to the immutable loader.
                Treated as True when None.
            binary_mutations (bool, optional): Passed to the `.trees`
                converter. Treated as False when None.

        Returns:
            **GRGObject:**
                Wrapper holding the loaded graph, the resolved filename and
                the mutability flag.
        """
        resolved_path = str(pathlib.Path(self.filename).resolve())
        suffix = pathlib.Path(resolved_path).suffix.lower()
        up_edges = True if load_up_edges is None else load_up_edges
        binary = False if binary_mutations is None else binary_mutations

        if suffix == ".trees":
            graph, is_mutable = pyg.grg_from_trees(resolved_path, binary), True
        elif mutable:
            graph, is_mutable = pyg.load_mutable_grg(resolved_path), True
        else:
            graph, is_mutable = pyg.load_immutable_grg(resolved_path, up_edges), False

        return GRGObject(calldata_gt=graph, filename=resolved_path, mutable=is_mutable)
Abstract class for SNP readers.
Attributes:
- _filename: The path to the file storing SNP data.
def read(self,
         mutable: Optional[bool] = None,
         load_up_edges: Optional[bool] = None,
         binary_mutations: Optional[bool] = None) -> GRGObject:
    """
    Read in a GRG or TSKit File

    Files whose resolved path ends in `.trees` (case-insensitive) are
    converted with `pyg.grg_from_trees` and always reported as mutable.
    Any other file is loaded with `pyg.load_mutable_grg` when `mutable` is
    truthy, and with `pyg.load_immutable_grg` otherwise.

    Args:
        mutable: Request a mutable GRG. Ignored for `.trees` input.
        load_up_edges: Passed to `pyg.load_immutable_grg`; None means True.
        binary_mutations: Passed to `pyg.grg_from_trees`; None means False.

    Returns:
        **GRGObject**:
            Wraps the loaded graph and the resolved file path.
    """
    target = pathlib.Path(self.filename).resolve()
    file = str(target)
    is_tree_sequence = target.suffix.lower() == ".trees"

    if is_tree_sequence:
        use_binary = binary_mutations if binary_mutations is not None else False
        loaded, is_mutable = pyg.grg_from_trees(file, use_binary), True
    elif mutable:
        loaded, is_mutable = pyg.load_mutable_grg(file), True
    else:
        with_up_edges = load_up_edges if load_up_edges is not None else True
        loaded, is_mutable = pyg.load_immutable_grg(file, with_up_edges), False

    return GRGObject(calldata_gt=loaded, filename=file, mutable=is_mutable)
Read in a GRG or TSKit File
class GRGWriter:
    """Persist a GRG object to `filename` through the `pyg` save functions."""

    def __init__(self, grgobj: Union[pyg.GRG, pyg.MutableGRG], filename: str):
        self.grgobj = grgobj
        # Anything that is not a plain `pyg.GRG` is treated as mutable.
        self.mutability = not isinstance(self.grgobj, pyg.GRG)
        self.filename = filename

    def write(self, allow_simplify : Optional[bool] = None,
              subset : Optional[bool] = None,
              direction : Optional[pyg.TraversalDirection] = None,
              seed_list : Optional[List[int]] = None,
              bp_range : Optional[Tuple[int, int]] = None):
        """
        Save the wrapped GRG to `self.filename`.

        Args:
            allow_simplify: Passed to `pyg.save_grg`; None means True.
                Only used when `subset` is falsy.
            subset: When truthy, save through `pyg.save_subset` instead of
                `pyg.save_grg`.
            direction: Traversal direction for `pyg.save_subset`; required
                when `subset` is truthy.
            seed_list: Seed list for `pyg.save_subset`; required when
                `subset` is truthy.
            bp_range: Range passed to `pyg.save_subset`; `(0, 0)` is passed
                when None (presumably "no restriction" -- confirm against
                the `pyg.save_subset` documentation).

        Raises:
            ValueError: If `subset` is truthy and `direction` or
                `seed_list` is None.
        """
        if not subset:
            simplify = allow_simplify if allow_simplify is not None else True
            pyg.save_grg(self.grgobj, self.filename, simplify)
            return

        if direction is None:
            raise ValueError("If subset is True, 'direction' must be provided.")
        if seed_list is None:
            raise ValueError("If subset is True, 'seed_list' must be provided.")

        window = bp_range if bp_range is not None else (0, 0)
        pyg.save_subset(self.grgobj, self.filename, direction, seed_list, window)
def write(self, allow_simplify : Optional[bool] = None,
          subset : Optional[bool] = None,
          direction : Optional[pyg.TraversalDirection] = None,
          seed_list : Optional[List[int]] = None,
          bp_range : Optional[Tuple[int, int]] = None):
    """
    Save the wrapped GRG to `self.filename`.

    With a falsy `subset` the whole graph is written via `pyg.save_grg`.
    With a truthy `subset` the graph is written via `pyg.save_subset`,
    which needs both `direction` and `seed_list`.

    Args:
        allow_simplify: Passed to `pyg.save_grg`; None means True.
        subset: Select the `pyg.save_subset` path when truthy.
        direction: Traversal direction for `pyg.save_subset`.
        seed_list: Seed list for `pyg.save_subset`.
        bp_range: Range for `pyg.save_subset`; `(0, 0)` is passed when
            None (presumably "no restriction" -- confirm against the
            `pyg.save_subset` documentation).

    Raises:
        ValueError: If `subset` is truthy and `direction` or `seed_list`
            is None.
    """
    if not subset:
        simplify = allow_simplify if allow_simplify is not None else True
        pyg.save_grg(self.grgobj, self.filename, simplify)
        return

    # Validate in the same order as before: direction first, then seeds.
    if direction is None:
        raise ValueError("If subset is True, 'direction' must be provided.")
    if seed_list is None:
        raise ValueError("If subset is True, 'seed_list' must be provided.")

    window = bp_range if bp_range is not None else (0, 0)
    pyg.save_subset(self.grgobj, self.filename, direction, seed_list, window)
@SNPBaseReader.register
class PGENReader(SNPBaseReader):
    def read(
        self,
        fields: Optional[List[str]] = None,
        exclude_fields: Optional[List[str]] = None,
        sample_ids: Optional[np.ndarray] = None,
        sample_idxs: Optional[np.ndarray] = None,
        variant_ids: Optional[np.ndarray] = None,
        variant_idxs: Optional[np.ndarray] = None,
        sum_strands: bool = False,
        separator: str = None,
    ) -> SNPObject:
        """
        Read a pgen fileset (pgen, psam, pvar) into a SNPObject.

        Args:
            fields (str, None, or list of str, optional): Fields to extract data for that should be included in the returned SNPObject.
                Available fields are 'GT', 'IID', 'REF', 'ALT', '#CHROM', 'ID', 'POS', 'FILTER', 'QUAL'.
                To extract all fields, set fields to None. Defaults to None.
            exclude_fields (str, None, or list of str, optional): Fields to exclude from the returned SNPObject.
                Available fields are 'GT', 'IID', 'REF', 'ALT', '#CHROM', 'ID', 'POS', 'FILTER', 'QUAL'.
                To exclude no fields, set exclude_fields to None. Defaults to None.
            sample_ids: List of sample IDs to read. If None and sample_idxs is None, all samples are read.
            sample_idxs: List of sample indices to read. If None and sample_ids is None, all samples are read.
            variant_ids: List of variant IDs to read. If None and variant_idxs is None, all variants are read.
                A value matches a variant when it equals the ID column or the string `<#CHROM>:<POS>`.
            variant_idxs: List of variant indices to read. If None and variant_ids is None, all variants are read.
            sum_strands: If True, maternal and paternal strands are combined into a single `int8` array with values `{0, 1, 2}`.
                If False, strands are stored separately as an `int8` array with values `{0, 1}` for each strand.
                Note: With the pgenlib backend, `False` uses `~8×` more RAM, though `calldata_gt` is only `2×` larger.
            separator: Separator used in the pvar file. If None, the separator is automatically detected.
                If the automatic detection fails, please specify the separator manually.

        Returns:
            **SNPObject**:
                A SNPObject instance.

        Raises:
            AssertionError: If both members of a mutually exclusive selector pair are given.
            FileNotFoundError: If variant/sample metadata is needed and no `.pvar` or `.pvar.zst` file exists.
            ValueError: If a header-less pvar file has neither 5 nor 6 columns.
        """
        # NOTE(review): these stay as asserts so the raised type remains
        # AssertionError for existing callers; they vanish under `python -O`.
        assert (
            sample_idxs is None or sample_ids is None
        ), "Only one of sample_idxs and sample_ids can be specified"
        assert (
            variant_idxs is None or variant_ids is None
        ), "Only one of variant_idxs and variant_ids can be specified"

        if isinstance(fields, str):
            fields = [fields]
        if isinstance(exclude_fields, str):
            exclude_fields = [exclude_fields]

        fields = fields or ["GT", "IID", "REF", "ALT", "#CHROM", "ID", "POS", "FILTER", "QUAL"]
        exclude_fields = exclude_fields or []
        fields = [field for field in fields if field not in exclude_fields]

        # The .pgen-only fast path skips the .pvar/.psam parsing that maps
        # IDs to row indices, so it is only valid when no selector of any
        # kind was given (ID selectors used to be silently ignored here).
        only_read_pgen = (
            fields == ["GT"]
            and variant_idxs is None
            and variant_ids is None
            and sample_idxs is None
            and sample_ids is None
        )

        filename_noext = self._strip_fileset_ext(self.filename)

        if only_read_pgen:
            file_num_samples = None  # Not needed for pgen
            file_num_variants = None  # Not needed
        else:
            pvar_filename = self._find_pvar(filename_noext)

            log.info(f"Reading {pvar_filename}")

            pvar_reading_args = self._pvar_read_options(pvar_filename, separator)
            # The (possibly sniffed) pvar separator is reused for the psam file.
            separator = pvar_reading_args["separator"]

            if pvar_filename.endswith('.zst'):
                pvar = pl.read_csv(pvar_filename, **pvar_reading_args).lazy()
            else:
                pvar = pl.scan_csv(pvar_filename, **pvar_reading_args)

            # We need to map requested IDs to row positions before reading genotypes.
            variant_meta = pvar.select(["ID", "#CHROM", "POS"]).with_row_index().collect()
            file_num_variants = variant_meta.height

            if variant_ids is not None:
                variant_id_values = [str(v) for v in np.atleast_1d(variant_ids)]
                variant_id_or_pos = (
                    pl.col("ID").is_in(variant_id_values)
                    | pl.concat_str(
                        [pl.col("#CHROM"), pl.lit(":"), pl.col("POS").cast(pl.String)]
                    ).is_in(variant_id_values)
                )
                variant_idxs = (
                    variant_meta.filter(variant_id_or_pos)
                    .select("index")
                    .to_series()
                    .to_numpy()
                )

            if variant_idxs is None:
                num_variants = file_num_variants
                variant_idxs = np.arange(num_variants, dtype=np.uint32)
                pvar = pvar.collect()
            else:
                pvar = (
                    pvar.with_row_index()
                    .filter(pl.col("index").is_in(np.asarray(variant_idxs, dtype=np.uint32).ravel()))
                    .collect()
                )
                # Re-derive the indices from the filtered table so they are
                # in file order and only contain rows that exist.
                variant_idxs = pvar.select("index").to_series().to_numpy()
                variant_idxs = np.asarray(variant_idxs, dtype=np.uint32)
                num_variants = np.size(variant_idxs)
                pvar = pvar.drop("index")

            log.info(f"Reading {filename_noext}.psam")

            with open(filename_noext + ".psam") as file:
                first_line = file.readline().strip()
                psam_has_header = first_line.startswith(("#FID", "FID", "#IID", "IID"))

            psam = pl.read_csv(
                filename_noext + ".psam",
                separator=separator,
                has_header=psam_has_header,
                new_columns=None if psam_has_header else ["FID", "IID", "PAT", "MAT", "SEX", "PHENO1"],
                null_values=["NA"],
            ).with_row_index()
            if "#IID" in psam.columns:
                psam = psam.rename({"#IID": "IID"})
            if "#FID" in psam.columns:
                psam = psam.rename({"#FID": "FID"})

            file_num_samples = psam.height

            if sample_ids is not None:
                psam = psam.filter(pl.col("IID").is_in(sample_ids))
                sample_idxs = psam.select("index").to_series().to_numpy()
                num_samples = np.size(sample_idxs)
            elif sample_idxs is not None:
                num_samples = np.size(sample_idxs)
                sample_idxs = np.array(sample_idxs, dtype=np.uint32)
                psam = psam.filter(pl.col("index").is_in(sample_idxs))
            else:
                num_samples = file_num_samples

        if "GT" in fields:
            log.info(f"Reading {filename_noext}.pgen")
            pgen_reader = pg.PgenReader(
                str.encode(filename_noext + ".pgen"),
                raw_sample_ct=file_num_samples,
                variant_ct=file_num_variants,
                sample_subset=sample_idxs,
            )
            # Close the reader even when a genotype read fails.
            try:
                if only_read_pgen:
                    num_samples = pgen_reader.get_raw_sample_ct()
                    num_variants = pgen_reader.get_variant_ct()
                    variant_idxs = np.arange(num_variants, dtype=np.uint32)

                # required arrays: variant_idxs + sample_idxs + genotypes
                if not sum_strands:
                    required_ram = (num_samples + num_variants + num_variants * 2 * num_samples) * 4
                else:
                    required_ram = (num_samples + num_variants) * 4 + num_variants * num_samples
                log.info(f">{required_ram / 1024**3:.2f} GiB of RAM are required to process {num_samples} samples with {num_variants} variants each")

                if not sum_strands:
                    genotypes = np.empty((num_variants, 2 * num_samples), dtype=np.int32)  # cannot use int8 because of pgenlib
                    pgen_reader.read_alleles_list(variant_idxs, genotypes)
                    genotypes = genotypes.astype(np.int8).reshape((num_variants, num_samples, 2))
                else:
                    genotypes = np.empty((num_variants, num_samples), dtype=np.int8)
                    pgen_reader.read_list(variant_idxs, genotypes)
            finally:
                pgen_reader.close()
        else:
            genotypes = None

        log.info("Constructing SNPObject")

        # `psam` / `pvar` are only touched when a metadata field was
        # requested, which never happens on the .pgen-only path.
        snpobj = SNPObject(
            calldata_gt=genotypes if "GT" in fields else None,
            samples=psam.get_column("IID").to_numpy() if "IID" in fields and "IID" in psam.columns else None,
            **{f'variants_{k.lower()}': pvar.get_column(v).to_numpy() if v in fields and v in pvar.columns else None
               for k, v in {'ref': 'REF', 'alt': 'ALT', 'chrom': '#CHROM', 'id': 'ID', 'pos': 'POS', 'filter_pass': 'FILTER', 'qual': 'QUAL'}.items()}
        )

        log.info("Finished constructing SNPObject")
        return snpobj

    @staticmethod
    def _strip_fileset_ext(filename) -> str:
        """Return `filename` as a string without a trailing .pgen/.pvar/.pvar.zst/.psam."""
        prefix = str(filename)
        for ext in (".pgen", ".pvar", ".pvar.zst", ".psam"):
            if prefix.endswith(ext):
                return prefix[:-len(ext)]
        return prefix

    @staticmethod
    def _find_pvar(filename_noext: str) -> str:
        """Return the first existing of `<prefix>.pvar` / `<prefix>.pvar.zst`, else raise FileNotFoundError."""
        for ext in (".pvar", ".pvar.zst"):
            candidate = filename_noext + ext
            if os.path.exists(candidate):
                return candidate
        raise FileNotFoundError(f"No .pvar or .pvar.zst file found for {filename_noext}")

    @staticmethod
    def _pvar_read_options(pvar_filename: str, separator: Optional[str]) -> dict:
        """Scan the top of a pvar file and return the keyword arguments for the polars CSV readers.

        The returned dict's "separator" entry holds the given separator, or
        the sniffed one when `separator` is None.
        """
        pvar_has_header = True
        pvar_header_line_num = 0
        header = None
        with _open_textfile(pvar_filename) as file:
            for line_num, line in enumerate(file):
                if line.startswith("##"):  # Metadata
                    continue
                if separator is None:
                    # Sniff on the following line; if the current line is
                    # the last one, fall back to it instead of failing on "".
                    separator = csv.Sniffer().sniff(file.readline() or line).delimiter
                if line.startswith("#CHROM"):  # Header
                    pvar_header_line_num = line_num
                    header = line.strip().split()
                    break
                if not line.startswith("#"):  # No header: infer columns from the first data line
                    pvar_has_header = False
                    cols_in_pvar = len(line.strip().split(separator))
                    if cols_in_pvar == 5:
                        header = ["#CHROM", "ID", "POS", "ALT", "REF"]
                    elif cols_in_pvar == 6:
                        header = ["#CHROM", "ID", "CM", "POS", "ALT", "REF"]
                    else:
                        raise ValueError(f"{pvar_filename} is not a valid pvar file.")
                    break

        return {
            "separator": separator,
            "skip_rows": pvar_header_line_num,
            "has_header": pvar_has_header,
            "new_columns": None if pvar_has_header else header,
            "schema_overrides": {
                "#CHROM": pl.String,
                "POS": pl.UInt32,
                "ID": pl.String,
                "REF": pl.String,
                "ALT": pl.String,
            },
            "null_values": ["NA"],
        }

    def _resolve_variant_idxs_for_iter(
        self,
        *,
        variant_ids: Optional[np.ndarray],
        variant_idxs: Optional[np.ndarray],
        separator: str = None,
    ) -> np.ndarray:
        """
        Resolve variant selectors to canonical file-order row indices.

        `variant_ids` takes precedence over `variant_idxs`; with neither,
        every row index of the pvar file is returned. The result is always
        a `uint32` array in file order.

        Raises:
            FileNotFoundError: If no `.pvar` or `.pvar.zst` file exists.
            ValueError: If a header-less pvar file has neither 5 nor 6 columns.
        """
        filename_noext = self._strip_fileset_ext(self.filename)
        pvar_filename = self._find_pvar(filename_noext)
        pvar_reading_args = self._pvar_read_options(pvar_filename, separator)

        if pvar_filename.endswith(".zst"):
            pvar = pl.read_csv(pvar_filename, **pvar_reading_args)
        else:
            pvar = pl.scan_csv(pvar_filename, **pvar_reading_args).collect()

        variant_meta = pvar.select(["ID", "#CHROM", "POS"]).with_row_index()

        if variant_ids is not None:
            variant_id_values = [str(v) for v in np.atleast_1d(variant_ids)]
            variant_id_or_pos = (
                pl.col("ID").is_in(variant_id_values)
                | pl.concat_str([pl.col("#CHROM"), pl.lit(":"), pl.col("POS").cast(pl.String)]).is_in(
                    variant_id_values
                )
            )
            resolved = (
                variant_meta.filter(variant_id_or_pos)
                .select("index")
                .to_series()
                .to_numpy()
            )
            return np.asarray(resolved, dtype=np.uint32)

        if variant_idxs is not None:
            requested = np.asarray(variant_idxs, dtype=np.uint32).ravel()
            resolved = (
                variant_meta.filter(pl.col("index").is_in(requested))
                .select("index")
                .to_series()
                .to_numpy()
            )
            return np.asarray(resolved, dtype=np.uint32)

        return np.arange(variant_meta.height, dtype=np.uint32)

    def iter_read(
        self,
        fields: Optional[List[str]] = None,
        exclude_fields: Optional[List[str]] = None,
        sample_ids: Optional[np.ndarray] = None,
        sample_idxs: Optional[np.ndarray] = None,
        variant_ids: Optional[np.ndarray] = None,
        variant_idxs: Optional[np.ndarray] = None,
        sum_strands: bool = False,
        separator: str = None,
        chunk_size: int = 10_000,
    ) -> Iterator[SNPObject]:
        """
        Stream the PGEN fileset in variant chunks.

        This yields a sequence of SNPObject chunks along the SNP axis. The
        variant selection is resolved once; `read` is then called for each
        consecutive slice of at most `chunk_size` resolved indices.

        Raises:
            ValueError: If `chunk_size` is below 1, or if both members of a
                mutually exclusive selector pair are given. As this is a
                generator, the checks run when iteration starts.
        """
        if chunk_size < 1:
            raise ValueError("chunk_size must be >= 1.")
        if sample_idxs is not None and sample_ids is not None:
            raise ValueError("Only one of sample_idxs and sample_ids can be specified.")
        if variant_idxs is not None and variant_ids is not None:
            raise ValueError("Only one of variant_idxs and variant_ids can be specified.")

        selectors = self._resolve_variant_idxs_for_iter(
            variant_ids=variant_ids,
            variant_idxs=variant_idxs,
            separator=separator,
        )

        n_selectors = int(selectors.size)
        for start in range(0, n_selectors, int(chunk_size)):
            stop = min(start + int(chunk_size), n_selectors)
            selector_chunk = np.asarray(selectors[start:stop], dtype=np.uint32)
            yield self.read(
                fields=fields,
                exclude_fields=exclude_fields,
                sample_ids=sample_ids,
                sample_idxs=sample_idxs,
                variant_idxs=selector_chunk,
                sum_strands=sum_strands,
                separator=separator,
            )
Abstract class for SNP readers.
Attributes:
- _filename: The path to the file storing SNP data.
def read(
    self,
    fields: Optional[List[str]] = None,
    exclude_fields: Optional[List[str]] = None,
    sample_ids: Optional[np.ndarray] = None,
    sample_idxs: Optional[np.ndarray] = None,
    variant_ids: Optional[np.ndarray] = None,
    variant_idxs: Optional[np.ndarray] = None,
    sum_strands: bool = False,
    separator: str = None,
) -> SNPObject:
    """
    Read a pgen fileset (pgen, psam, pvar) into a SNPObject.

    Args:
        fields (str, None, or list of str, optional): Fields to extract data for that should be included in the returned SNPObject.
            Available fields are 'GT', 'IID', 'REF', 'ALT', '#CHROM', 'ID', 'POS', 'FILTER', 'QUAL'.
            To extract all fields, set fields to None. Defaults to None.
        exclude_fields (str, None, or list of str, optional): Fields to exclude from the returned SNPObject.
            Available fields are 'GT', 'IID', 'REF', 'ALT', '#CHROM', 'ID', 'POS', 'FILTER', 'QUAL'.
            To exclude no fields, set exclude_fields to None. Defaults to None.
        sample_ids: List of sample IDs to read. If None and sample_idxs is None, all samples are read.
        sample_idxs: List of sample indices to read. If None and sample_ids is None, all samples are read.
        variant_ids: List of variant IDs to read. If None and variant_idxs is None, all variants are read.
            A value matches a variant when it equals the ID column or the string `<#CHROM>:<POS>`.
        variant_idxs: List of variant indices to read. If None and variant_ids is None, all variants are read.
        sum_strands: If True, maternal and paternal strands are combined into a single `int8` array with values `{0, 1, 2}`.
            If False, strands are stored separately as an `int8` array with values `{0, 1}` for each strand.
            Note: With the pgenlib backend, `False` uses `~8×` more RAM, though `calldata_gt` is only `2×` larger.
        separator: Separator used in the pvar file. If None, the separator is automatically detected.
            If the automatic detection fails, please specify the separator manually.

    Returns:
        **SNPObject**:
            A SNPObject instance.

    Raises:
        AssertionError: If both members of a mutually exclusive selector pair are given.
        FileNotFoundError: If variant/sample metadata is needed and no `.pvar` or `.pvar.zst` file exists.
        ValueError: If a header-less pvar file has neither 5 nor 6 columns.
    """
    # NOTE(review): these stay as asserts so the raised type remains
    # AssertionError for existing callers; they vanish under `python -O`.
    assert (
        sample_idxs is None or sample_ids is None
    ), "Only one of sample_idxs and sample_ids can be specified"
    assert (
        variant_idxs is None or variant_ids is None
    ), "Only one of variant_idxs and variant_ids can be specified"

    if isinstance(fields, str):
        fields = [fields]
    if isinstance(exclude_fields, str):
        exclude_fields = [exclude_fields]

    fields = fields or ["GT", "IID", "REF", "ALT", "#CHROM", "ID", "POS", "FILTER", "QUAL"]
    exclude_fields = exclude_fields or []
    fields = [field for field in fields if field not in exclude_fields]

    # The .pgen-only fast path skips the .pvar/.psam parsing that maps IDs
    # to row indices, so it is only valid when no selector of any kind was
    # given (ID selectors used to be silently ignored here).
    only_read_pgen = (
        fields == ["GT"]
        and variant_idxs is None
        and variant_ids is None
        and sample_idxs is None
        and sample_ids is None
    )

    filename_noext = str(self.filename)
    for ext in [".pgen", ".pvar", ".pvar.zst", ".psam"]:
        if filename_noext.endswith(ext):
            filename_noext = filename_noext[:-len(ext)]
            break

    if only_read_pgen:
        file_num_samples = None  # Not needed for pgen
        file_num_variants = None  # Not needed
    else:
        pvar_extensions = [".pvar", ".pvar.zst"]
        pvar_filename = None
        for ext in pvar_extensions:
            possible_pvar = filename_noext + ext
            if os.path.exists(possible_pvar):
                pvar_filename = possible_pvar
                break
        if pvar_filename is None:
            raise FileNotFoundError(f"No .pvar or .pvar.zst file found for {filename_noext}")

        log.info(f"Reading {pvar_filename}")

        pvar_has_header = True
        pvar_header_line_num = 0
        header = None
        with _open_textfile(pvar_filename) as file:
            for line_num, line in enumerate(file):
                if line.startswith("##"):  # Metadata
                    continue
                if separator is None:
                    # Sniff on the following line; if the current line is
                    # the last one, fall back to it instead of failing on "".
                    separator = csv.Sniffer().sniff(file.readline() or line).delimiter
                if line.startswith("#CHROM"):  # Header
                    pvar_header_line_num = line_num
                    header = line.strip().split()
                    break
                elif not line.startswith("#"):  # If no header, look at line 1
                    pvar_has_header = False
                    cols_in_pvar = len(line.strip().split(separator))
                    if cols_in_pvar == 5:
                        header = ["#CHROM", "ID", "POS", "ALT", "REF"]
                    elif cols_in_pvar == 6:
                        header = ["#CHROM", "ID", "CM", "POS", "ALT", "REF"]
                    else:
                        raise ValueError(
                            f"{pvar_filename} is not a valid pvar file."
                        )
                    break

        pvar_reading_args = {
            'separator': separator,
            'skip_rows': pvar_header_line_num,
            'has_header': pvar_has_header,
            'new_columns': None if pvar_has_header else header,
            'schema_overrides': {
                "#CHROM": pl.String,
                "POS": pl.UInt32,
                "ID": pl.String,
                "REF": pl.String,
                "ALT": pl.String,
            },
            'null_values': ["NA"],
        }
        if pvar_filename.endswith('.zst'):
            pvar = pl.read_csv(pvar_filename, **pvar_reading_args).lazy()
        else:
            pvar = pl.scan_csv(pvar_filename, **pvar_reading_args)

        # We need to map requested IDs to row positions before reading genotypes.
        variant_meta = pvar.select(["ID", "#CHROM", "POS"]).with_row_index().collect()
        file_num_variants = variant_meta.height

        if variant_ids is not None:
            variant_id_values = [str(v) for v in np.atleast_1d(variant_ids)]
            variant_id_or_pos = (
                pl.col("ID").is_in(variant_id_values)
                | pl.concat_str(
                    [pl.col("#CHROM"), pl.lit(":"), pl.col("POS").cast(pl.String)]
                ).is_in(variant_id_values)
            )
            variant_idxs = (
                variant_meta.filter(variant_id_or_pos)
                .select("index")
                .to_series()
                .to_numpy()
            )

        if variant_idxs is None:
            num_variants = file_num_variants
            variant_idxs = np.arange(num_variants, dtype=np.uint32)
            pvar = pvar.collect()
        else:
            pvar = (
                pvar.with_row_index()
                .filter(pl.col("index").is_in(np.asarray(variant_idxs, dtype=np.uint32).ravel()))
                .collect()
            )
            # Re-derive the indices from the filtered table so they are in
            # file order and only contain rows that exist.
            variant_idxs = pvar.select("index").to_series().to_numpy()
            variant_idxs = np.asarray(variant_idxs, dtype=np.uint32)
            num_variants = np.size(variant_idxs)
            pvar = pvar.drop("index")

        log.info(f"Reading {filename_noext}.psam")

        with open(filename_noext + ".psam") as file:
            first_line = file.readline().strip()
            psam_has_header = first_line.startswith(("#FID", "FID", "#IID", "IID"))

        # The (possibly sniffed) pvar separator is reused for the psam file.
        psam = pl.read_csv(
            filename_noext + ".psam",
            separator=separator,
            has_header=psam_has_header,
            new_columns=None if psam_has_header else ["FID", "IID", "PAT", "MAT", "SEX", "PHENO1"],
            null_values=["NA"],
        ).with_row_index()
        if "#IID" in psam.columns:
            psam = psam.rename({"#IID": "IID"})
        if "#FID" in psam.columns:
            psam = psam.rename({"#FID": "FID"})

        file_num_samples = psam.height

        if sample_ids is not None:
            psam = psam.filter(pl.col("IID").is_in(sample_ids))
            sample_idxs = psam.select("index").to_series().to_numpy()
            num_samples = np.size(sample_idxs)
        elif sample_idxs is not None:
            num_samples = np.size(sample_idxs)
            sample_idxs = np.array(sample_idxs, dtype=np.uint32)
            psam = psam.filter(pl.col("index").is_in(sample_idxs))
        else:
            num_samples = file_num_samples

    if "GT" in fields:
        log.info(f"Reading {filename_noext}.pgen")
        pgen_reader = pg.PgenReader(
            str.encode(filename_noext + ".pgen"),
            raw_sample_ct=file_num_samples,
            variant_ct=file_num_variants,
            sample_subset=sample_idxs,
        )
        # Close the reader even when a genotype read fails.
        try:
            if only_read_pgen:
                num_samples = pgen_reader.get_raw_sample_ct()
                num_variants = pgen_reader.get_variant_ct()
                variant_idxs = np.arange(num_variants, dtype=np.uint32)

            # required arrays: variant_idxs + sample_idxs + genotypes
            if not sum_strands:
                required_ram = (num_samples + num_variants + num_variants * 2 * num_samples) * 4
            else:
                required_ram = (num_samples + num_variants) * 4 + num_variants * num_samples
            log.info(f">{required_ram / 1024**3:.2f} GiB of RAM are required to process {num_samples} samples with {num_variants} variants each")

            if not sum_strands:
                genotypes = np.empty((num_variants, 2 * num_samples), dtype=np.int32)  # cannot use int8 because of pgenlib
                pgen_reader.read_alleles_list(variant_idxs, genotypes)
                genotypes = genotypes.astype(np.int8).reshape((num_variants, num_samples, 2))
            else:
                genotypes = np.empty((num_variants, num_samples), dtype=np.int8)
                pgen_reader.read_list(variant_idxs, genotypes)
        finally:
            pgen_reader.close()
    else:
        genotypes = None

    log.info("Constructing SNPObject")

    # `psam` / `pvar` are only touched when a metadata field was requested,
    # which never happens on the .pgen-only path.
    snpobj = SNPObject(
        calldata_gt=genotypes if "GT" in fields else None,
        samples=psam.get_column("IID").to_numpy() if "IID" in fields and "IID" in psam.columns else None,
        **{f'variants_{k.lower()}': pvar.get_column(v).to_numpy() if v in fields and v in pvar.columns else None
           for k, v in {'ref': 'REF', 'alt': 'ALT', 'chrom': '#CHROM', 'id': 'ID', 'pos': 'POS', 'filter_pass': 'FILTER', 'qual': 'QUAL'}.items()}
    )

    log.info("Finished constructing SNPObject")
    return snpobj
Read a pgen fileset (pgen, psam, pvar) into a SNPObject.
Arguments:
- fields (str, None, or list of str, optional): Fields to extract data for that should be included in the returned SNPObject. Available fields are 'GT', 'IID', 'REF', 'ALT', '#CHROM', 'ID', 'POS', 'FILTER', 'QUAL'. To extract all fields, set fields to None. Defaults to None.
- exclude_fields (str, None, or list of str, optional): Fields to exclude from the returned SNPObject. Available fields are 'GT', 'IID', 'REF', 'ALT', '#CHROM', 'ID', 'POS', 'FILTER', 'QUAL'. To exclude no fields, set exclude_fields to None. Defaults to None.
- sample_ids: List of sample IDs to read. If None and sample_idxs is None, all samples are read.
- sample_idxs: List of sample indices to read. If None and sample_ids is None, all samples are read.
- variant_ids: List of variant IDs to read. If None and variant_idxs is None, all variants are read.
- variant_idxs: List of variant indices to read. If None and variant_ids is None, all variants are read.
- sum_strands: If True, maternal and paternal strands are combined into a single `int8` array with values `{0, 1, 2}`. If False, strands are stored separately as an `int8` array with values `{0, 1}` for each strand. Note: With the pgenlib backend, `False` uses `~8×` more RAM, though `calldata_gt` is only `2×` larger.
- separator: Separator used in the pvar file. If None, the separator is automatically detected. If the automatic detection fails, please specify the separator manually.
Returns:
SNPObject: A SNPObject instance.
def iter_read(
    self,
    fields: Optional[List[str]] = None,
    exclude_fields: Optional[List[str]] = None,
    sample_ids: Optional[np.ndarray] = None,
    sample_idxs: Optional[np.ndarray] = None,
    variant_ids: Optional[np.ndarray] = None,
    variant_idxs: Optional[np.ndarray] = None,
    sum_strands: bool = False,
    separator: str = None,
    chunk_size: int = 10_000,
) -> Iterator[SNPObject]:
    """
    Stream the PGEN fileset in variant chunks.

    The variant selection is resolved once through
    `_resolve_variant_idxs_for_iter`; `read` is then called once per
    consecutive slice of at most `chunk_size` resolved indices, so the
    chunks follow the order of the resolved index array.

    Because this is a generator, the argument checks below only run when
    iteration starts, not when `iter_read` is called.

    Args:
        fields: Forwarded unchanged to `read` for every chunk.
        exclude_fields: Forwarded unchanged to `read` for every chunk.
        sample_ids: Forwarded unchanged to `read`; mutually exclusive with `sample_idxs`.
        sample_idxs: Forwarded unchanged to `read`; mutually exclusive with `sample_ids`.
        variant_ids: Variant IDs to stream; mutually exclusive with `variant_idxs`.
        variant_idxs: Variant indices to stream; mutually exclusive with `variant_ids`.
        sum_strands: Forwarded unchanged to `read` for every chunk.
        separator: Passed to both the selector resolution and `read`.
        chunk_size: Maximum number of variants per yielded chunk (truncated with `int`).

    Yields:
        **SNPObject**:
            The result of `read` for each chunk of variant indices.

    Raises:
        ValueError: If `chunk_size` is below 1, or if both members of a
            mutually exclusive selector pair are given.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be >= 1.")
    if not (sample_idxs is None or sample_ids is None):
        raise ValueError("Only one of sample_idxs and sample_ids can be specified.")
    if not (variant_idxs is None or variant_ids is None):
        raise ValueError("Only one of variant_idxs and variant_ids can be specified.")

    resolved = self._resolve_variant_idxs_for_iter(
        variant_ids=variant_ids,
        variant_idxs=variant_idxs,
        separator=separator,
    )

    # Everything except the per-chunk variant selection is identical
    # across calls to `read`.
    shared_kwargs = dict(
        fields=fields,
        exclude_fields=exclude_fields,
        sample_ids=sample_ids,
        sample_idxs=sample_idxs,
        sum_strands=sum_strands,
        separator=separator,
    )

    step = int(chunk_size)
    total = int(resolved.size)
    cursor = 0
    while cursor < total:
        upper = min(cursor + step, total)
        window = np.asarray(resolved[cursor:upper], dtype=np.uint32)
        yield self.read(variant_idxs=window, **shared_kwargs)
        cursor = upper
Stream the PGEN fileset in variant chunks.
This yields a sequence of SNPObject chunks along the SNP axis.
@SNPBaseReader.register
class VCFReader(SNPBaseReader):
    def __init__(self, filename: Union[str, pathlib.Path]):
        super().__init__(filename)
        # Paths of the most recent IGD / GRG outputs; set by `to_igd` / `to_grg`.
        self._igd_path: Optional[pathlib.Path] = None
        self._grg_path: Optional[pathlib.Path] = None
        self.debug : bool = False

    def read(
        self,
        fields: Optional[List[str]] = None,
        exclude_fields: Optional[List[str]] = None,
        rename_fields: Optional[dict] = None,
        fills: Optional[dict] = None,
        region: Optional[str] = None,
        samples: Optional[List[str]] = None,
        sum_strands: bool = False,
    ) -> SNPObject:
        """
        Read a vcf file into a SNPObject.

        Args:
            fields: Fields to extract data for. e.g., ['variants/CHROM', 'variants/POS',
                'calldata/GT']. If you are feeling lazy, you can drop the 'variants/'
                and 'calldata/' prefixes, in which case the fields will be matched
                against fields declared in the VCF header, with variants taking priority
                over calldata if a field with the same ID exists both in INFO and FORMAT
                headers. I.e., ['CHROM', 'POS', 'DP', 'GT'] will work, although watch out
                for fields like 'DP' which can be both INFO and FORMAT. To extract all
                fields, provide just the string '*'. To extract all variants fields
                (including all INFO fields) provide 'variants/*'. To extract all
                calldata fields (i.e., defined in FORMAT headers) provide 'calldata/*'.
            exclude_fields: Fields to exclude. E.g., for use in combination with fields='*'.
            rename_fields: Fields to be renamed. Should be a dictionary mapping old to new names.
            fills: Override the fill value used for empty values. Should be a dictionary
                mapping field names to fill values.
            region: Genomic region to extract variants for. If provided, should be a
                tabix-style region string, which can be either just a chromosome name
                (e.g., '2L'), or a chromosome name followed by 1-based beginning and
                end coordinates (e.g., '2L:100000-200000'). Note that only variants
                whose start position (POS) is within the requested range will be included.
                This is slightly different from the default tabix behaviour, where a
                variant (e.g., deletion) may be included if its position (POS) occurs
                before the requested region but its reference allele overlaps the
                region - such a variant will not be included in the data returned
                by this function.
            samples: Selection of samples to extract calldata for. If provided, should be
                a list of strings giving sample identifiers. May also be a list of
                integers giving indices of selected samples.
            sum_strands: If True, maternal and paternal strands are combined into a single `int8` array with values `{0, 1, 2}`.
                If False, strands are stored separately as an `int8` array with values `{0, 1}` for each strand.

        Returns:
            **SNPObject**:
                A SNPObject instance.
        """
        log.info(f"Reading {self.filename}")

        vcf_dict = allel.read_vcf(
            str(self.filename),
            fields=fields,
            exclude_fields=exclude_fields,
            rename_fields=rename_fields,
            fills=fills,
            region=region,
            samples=samples,
            alt_number=1,
        )
        assert vcf_dict is not None  # suppress Flake8 warning

        genotypes = vcf_dict["calldata/GT"].astype(np.int8)
        if sum_strands:
            genotypes = genotypes.sum(axis=2, dtype=np.int8)

        snpobj = SNPObject(
            calldata_gt=genotypes,
            samples=vcf_dict["samples"],
            variants_ref=vcf_dict["variants/REF"],
            variants_alt=vcf_dict["variants/ALT"],
            variants_chrom=vcf_dict["variants/CHROM"],
            variants_filter_pass=vcf_dict["variants/FILTER_PASS"],
            variants_id=vcf_dict["variants/ID"],
            variants_pos=vcf_dict["variants/POS"],
            variants_qual=vcf_dict["variants/QUAL"],
        )

        log.info(f"Finished reading {self.filename}")
        return snpobj

    def to_igd(self,
               igd_file : Optional[str] = None,
               logfile_out : Optional[str] = None,
               logfile_err : Optional[str] = None) -> None:
        """
        Convert the current VCF input file to IGD via `grg convert`.

        Args:
            igd_file: Output IGD file path. Defaults to `<vcf_stem>.igd`, where the
                stem is the input path without its last extension and, if that
                leaves a name ending in `.vcf` (e.g. `x.vcf.gz`), without `.vcf` too.
            logfile_out: The file to log standard output to. If None (default), no output will be logged (i.e., piped to dev null).
            logfile_err: The file to log standard error to. If None (default), no error will be logged (i.e., piped to dev null).

        Raises:
            FileNotFoundError: If the input file does not exist.
            subprocess.CalledProcessError: If `grg convert` exits with a non-zero status.
        """
        if not exists(self.filename):
            raise FileNotFoundError(f"File {self.filename} does not exist")

        if igd_file is None:
            # Same stem rule as `to_grg`: drop the last extension, then a
            # trailing ".vcf" only. Stripping two extensions blindly would
            # map "a.chr1.vcf" to "a.igd".
            stem = splitext(str(self.filename))[0]
            if stem.endswith(".vcf"):
                stem = splitext(stem)[0]
            self._igd_path = pathlib.Path(stem + ".igd")
        else:
            self._igd_path = pathlib.Path(igd_file)

        self._run_logged(
            ["grg", "convert", abspath(str(self.filename)), abspath(str(self._igd_path))],
            logfile_out,
            logfile_err,
        )

    def to_grg(self,
               range: Optional[str] = None,
               parts: Optional[int] = None,
               jobs: Optional[int] = None,
               trees: Optional[int] = None,
               binmuts: Optional[bool] = None,
               no_file_cleanup: Optional[bool] = None,
               maf_flip: Optional[bool] = None,
               population_ids: Optional[str] = None,
               mutation_batch_size: Optional[int] = None,
               igd_file: Optional[str] = None,
               out_file: Optional[str] = None,
               verbose: Optional[bool] = None,
               no_merge: Optional[bool] = None,
               force: Optional[bool] = None,
               logfile_out: Optional[str] = None,
               logfile_err: Optional[str] = None
               ) -> None:
        """
        Convert VCF input to a GRG file via `grg construct`.

        If `igd_file` exists, it is used as construct input. If it does not
        exist, it is first created via `to_igd` and then used for construction.
        Without `igd_file`, the VCF itself is the construct input.

        Args:
            range: Value for `-r`; omitted when None.
            parts: Value for `-p`; 50 when None.
            jobs: Value for `-j`; `multiprocessing.cpu_count()` when None.
            trees: Value for `-t`; 16 when None.
            binmuts: Adds `--binary-muts` when True.
            no_file_cleanup: Adds `--no-file-cleanup` when True.
            maf_flip: Adds `--maf-flip` when True.
            population_ids: Value for `--population-ids`; omitted when None.
            mutation_batch_size: Value for `--mutation-batch-size`; omitted when None.
            igd_file: IGD file to construct from (created first if missing).
            out_file: Output path for `--out-file`. Defaults to the construct
                input's stem (last extension removed, then a trailing `.vcf`)
                plus `.grg`.
            verbose: Adds `--verbose` when True.
            no_merge: Adds `--no-merge` when True.
            force: Adds `--force` when True.
            logfile_out: File to append standard output to; discarded when None.
            logfile_err: File to append standard error to; discarded when None.

        Raises:
            subprocess.CalledProcessError: If `grg construct` exits with a non-zero status.
        """
        input_file = pathlib.Path(self.filename).resolve()
        if igd_file is not None:
            candidate_igd = pathlib.Path(igd_file)
            if candidate_igd.exists():
                self._igd_path = candidate_igd.resolve()
            else:
                self.to_igd(igd_file, logfile_out, logfile_err)
            input_file = pathlib.Path(self._igd_path).resolve()

        if out_file is not None:
            self._grg_path = pathlib.Path(out_file)
        else:
            default_stem = splitext(str(input_file))[0]
            if default_stem.endswith(".vcf"):
                default_stem = splitext(default_stem)[0]
            self._grg_path = pathlib.Path(default_stem + ".grg")

        args = ["grg", "construct"]
        args += self._setarg(range, "-r", None)
        args += self._setarg(parts, "-p", 50)
        args += self._setarg(jobs, "-j", multiprocessing.cpu_count())
        args += self._setarg(trees, "-t", 16)
        args += self._setarg(binmuts, "--binary-muts", None)
        args += self._setarg(no_file_cleanup, "--no-file-cleanup", None)
        args += self._setarg(maf_flip, "--maf-flip", None)
        args += self._setarg(population_ids, "--population-ids", None)
        args += self._setarg(mutation_batch_size, "--mutation-batch-size", None)
        args += self._setarg(str(self._grg_path), "--out-file", None)
        args += self._setarg(verbose, "--verbose", None)
        args += self._setarg(no_merge, "--no-merge", None)
        args += self._setarg(force, "--force", None)
        args += [str(input_file)]
        log.debug("Running grg construct command: %s", args)
        self._run_logged(args, logfile_out, logfile_err)

    def _run_logged(self, args: List[str], logfile_out: Optional[str], logfile_err: Optional[str]) -> None:
        """Run `args` with `check=True`, appending stdout/stderr to the given log files (or discarding them)."""
        out_handle = None
        err_handle = None
        try:
            # Opened inside the `try` so the first handle is still closed
            # if opening the second one fails.
            if logfile_out is not None:
                out_handle = open(logfile_out, "a")
            if logfile_err is not None:
                err_handle = open(logfile_err, "a")
            subprocess.run(
                args,
                stdout=subprocess.DEVNULL if out_handle is None else out_handle,
                stderr=subprocess.DEVNULL if err_handle is None else err_handle,
                check=True,
            )
        finally:
            for handle in (out_handle, err_handle):
                if handle is not None:
                    handle.close()

    def _setarg(self, x: Optional[Any], flag: str, default_arg: Optional[Any] = None) -> List[str]:
        """
        Build the command-line fragment for one option.

        - bool `x`: `[flag]` when True, `[]` when False (`default_arg` is ignored).
        - `x` is None and `default_arg` is not None: `[flag, str(default_arg)]`.
        - any other non-None `x`: `[flag, str(x)]`.
        - `x` is None without a default: `[]`.
        """
        if isinstance(x, bool):
            return [flag] if x else []
        if x is None and default_arg is not None:
            return [flag, f"{default_arg}"]
        elif x is not None:
            return [flag, f"{x}"]
        else:
            return []
Abstract class for SNP readers.
Attributes:
- _filename: The path to the file storing SNP data.
def __init__(self, filename: Union[str, pathlib.Path]):
    """
    Initialize the reader for the given VCF path.

    Args:
        filename: The path to the file storing SNP data; forwarded
            unchanged to the base-class initializer.
    """
    super().__init__(filename)
    self.debug: bool = False
    # Output locations are unknown until `to_grg` / `to_igd` have been called.
    self._grg_path: Optional[pathlib.Path] = None
    self._igd_path: Optional[pathlib.Path] = None
Initialize the SNPBaseReader.
Arguments:
- filename: The path to the file storing SNP data.
def read(
    self,
    fields: Optional[List[str]] = None,
    exclude_fields: Optional[List[str]] = None,
    rename_fields: Optional[dict] = None,
    fills: Optional[dict] = None,
    region: Optional[str] = None,
    samples: Optional[List[str]] = None,
    sum_strands: bool = False,
) -> SNPObject:
    """
    Read a vcf file into a SNPObject.

    Args:
        fields: Fields to extract data for. e.g., ['variants/CHROM', 'variants/POS',
            'calldata/GT']. If you are feeling lazy, you can drop the 'variants/'
            and 'calldata/' prefixes, in which case the fields will be matched
            against fields declared in the VCF header, with variants taking priority
            over calldata if a field with the same ID exists both in INFO and FORMAT
            headers. I.e., ['CHROM', 'POS', 'DP', 'GT'] will work, although watch out
            for fields like 'DP' which can be both INFO and FORMAT. To extract all
            fields, provide just the string '*'. To extract all variants fields
            (including all INFO fields) provide 'variants/*'. To extract all
            calldata fields (i.e., defined in FORMAT headers) provide 'calldata/*'.
        exclude_fields: Fields to exclude. E.g., for use in combination with fields='*'.
        rename_fields: Fields to be renamed. Should be a dictionary mapping old to new names.
        fills: Override the fill value used for empty values. Should be a dictionary
            mapping field names to fill values.
        region: Genomic region to extract variants for. If provided, should be a
            tabix-style region string, which can be either just a chromosome name
            (e.g., '2L'), or a chromosome name followed by 1-based beginning and
            end coordinates (e.g., '2L:100000-200000'). Note that only variants
            whose start position (POS) is within the requested range will be included.
            This is slightly different from the default tabix behaviour, where a
            variant (e.g., deletion) may be included if its position (POS) occurs
            before the requested region but its reference allele overlaps the
            region - such a variant will not be included in the data returned
            by this function.
        samples: Selection of samples to extract calldata for. If provided, should be
            a list of strings giving sample identifiers. May also be a list of
            integers giving indices of selected samples.
        sum_strands: If True, maternal and paternal strands are combined into a single
            `int8` array with values `{0, 1, 2}`. If False, strands are stored
            separately as an `int8` array with values `{0, 1}` for each strand.

    Returns:
        **SNPObject**:
            A SNPObject instance.

    Raises:
        ValueError: If the VCF reader returned no data (e.g. nothing matched
            the requested region).
        KeyError: If a field needed to build the SNPObject (GT, REF, ALT,
            CHROM, FILTER_PASS, ID, POS, QUAL, samples) was not extracted.
    """
    log.info(f"Reading {self.filename}")

    vcf_dict = allel.read_vcf(
        str(self.filename),
        fields=fields,
        exclude_fields=exclude_fields,
        rename_fields=rename_fields,
        fills=fills,
        region=region,
        samples=samples,
        alt_number=1,
    )
    # An explicit check rather than `assert`: assertions are stripped under
    # `python -O`, which would turn an empty result into an opaque TypeError.
    if vcf_dict is None:
        raise ValueError(
            f"No variants could be read from {self.filename}"
            + (f" for region {region!r}" if region is not None else "")
        )

    genotypes = vcf_dict["calldata/GT"].astype(np.int8)
    if sum_strands:
        # Collapse the last (strand) axis into a per-sample allele count.
        genotypes = genotypes.sum(axis=2, dtype=np.int8)

    snpobj = SNPObject(
        calldata_gt=genotypes,
        samples=vcf_dict["samples"],
        variants_ref=vcf_dict["variants/REF"],
        variants_alt=vcf_dict["variants/ALT"],
        variants_chrom=vcf_dict["variants/CHROM"],
        variants_filter_pass=vcf_dict["variants/FILTER_PASS"],
        variants_id=vcf_dict["variants/ID"],
        variants_pos=vcf_dict["variants/POS"],
        variants_qual=vcf_dict["variants/QUAL"],
    )

    log.info(f"Finished reading {self.filename}")
    return snpobj
Read a vcf file into a SNPObject.
Arguments:
- fields: Fields to extract data for. e.g., ['variants/CHROM', 'variants/POS', 'calldata/GT']. If you are feeling lazy, you can drop the 'variants/' and 'calldata/' prefixes, in which case the fields will be matched against fields declared in the VCF header, with variants taking priority over calldata if a field with the same ID exists both in INFO and FORMAT headers. I.e., ['CHROM', 'POS', 'DP', 'GT'] will work, although watch out for fields like 'DP' which can be both INFO and FORMAT. To extract all fields, provide just the string '*'. To extract all variants fields (including all INFO fields) provide 'variants/*'. To extract all calldata fields (i.e., defined in FORMAT headers) provide 'calldata/*'.
- exclude_fields: Fields to exclude. E.g., for use in combination with fields='*'.
- rename_fields: Fields to be renamed. Should be a dictionary mapping old to new names.
- fills: Override the fill value used for empty values. Should be a dictionary mapping field names to fill values.
- region: Genomic region to extract variants for. If provided, should be a tabix-style region string, which can be either just a chromosome name (e.g., '2L'), or a chromosome name followed by 1-based beginning and end coordinates (e.g., '2L:100000-200000'). Note that only variants whose start position (POS) is within the requested range will be included. This is slightly different from the default tabix behaviour, where a variant (e.g., deletion) may be included if its position (POS) occurs before the requested region but its reference allele overlaps the region - such a variant will not be included in the data returned by this function.
- samples: Selection of samples to extract calldata for. If provided, should be a list of strings giving sample identifiers. May also be a list of integers giving indices of selected samples.
- sum_strands: If True, maternal and paternal strands are combined into a single `int8` array with values `{0, 1, 2}`. If False, strands are stored separately as an `int8` array with values `{0, 1}` for each strand.
Returns:
SNPObject: A SNPObject instance.
def to_igd(self,
           igd_file: Optional[str] = None,
           logfile_out: Optional[str] = None,
           logfile_err: Optional[str] = None) -> None:
    """
    Convert the current VCF input file to IGD via `grg convert`.

    The resolved output path is stored in `self._igd_path` before the
    converter is launched.

    Args:
        igd_file: Output IGD file path. Defaults to `<vcf_stem>.igd`, where the
            stem is the input path without its last extension and without a
            further trailing `.vcf`/`.bcf` (so `a.vcf`, `a.vcf.gz` -> `a.igd`
            and `a.v2.vcf` -> `a.v2.igd`).
        logfile_out: The file to log standard output to. If None (default), no output will be logged (i.e., piped to dev null).
        logfile_err: The file to log standard error to. If None (default), no error will be logged (i.e., piped to dev null).

    Raises:
        FileNotFoundError: If `self.filename` does not exist.
        subprocess.CalledProcessError: If `grg convert` exits with a non-zero status.
    """
    if not exists(self.filename):
        raise FileNotFoundError(f"File {self.filename} does not exist")

    if igd_file is None:
        # Drop the last extension, then only a genuine variant-file extension
        # underneath it; blindly stripping two suffixes would eat dotted name
        # parts such as the ".v2" in "cohort.v2.vcf".
        stem, _ext = splitext(str(self.filename))
        if stem.lower().endswith((".vcf", ".bcf")):
            stem = splitext(stem)[0]
        self._igd_path = pathlib.Path(stem + ".igd")
    else:
        self._igd_path = pathlib.Path(igd_file)

    lf_o: Union[int, TextIOWrapper] = subprocess.DEVNULL
    lf_e: Union[int, TextIOWrapper] = subprocess.DEVNULL
    try:
        # Open inside the try so that a failure opening the second log file
        # still closes the first one.
        if logfile_out is not None:
            lf_o = open(logfile_out, "a")
        if logfile_err is not None:
            lf_e = open(logfile_err, "a")
        subprocess.run(
            ["grg", "convert", abspath(str(self.filename)), abspath(str(self._igd_path))],
            stdout=lf_o,
            stderr=lf_e,
            check=True,
        )
    finally:
        # subprocess.DEVNULL is a plain int and must not be closed.
        if not isinstance(lf_o, int):
            lf_o.close()
        if not isinstance(lf_e, int):
            lf_e.close()
Convert the current VCF input file to IGD via grg convert.
Arguments:
- igd_file: Output IGD file path. Defaults to `<vcf_stem>.igd`.
- logfile_out: The file to log standard output to. If None (default), no output will be logged (i.e., piped to dev null).
- logfile_err: The file to log standard error to. If None (default), no error will be logged (i.e., piped to dev null).
def to_grg(self,
           range: Optional[str] = None,
           parts: Optional[int] = None,
           jobs: Optional[int] = None,
           trees: Optional[int] = None,
           binmuts: Optional[bool] = None,
           no_file_cleanup: Optional[bool] = None,
           maf_flip: Optional[bool] = None,
           population_ids: Optional[str] = None,
           mutation_batch_size: Optional[int] = None,
           igd_file: Optional[str] = None,
           out_file: Optional[str] = None,
           verbose: Optional[bool] = None,
           no_merge: Optional[bool] = None,
           force: Optional[bool] = None,
           logfile_out: Optional[str] = None,
           logfile_err: Optional[str] = None
           ) -> None:
    """
    Convert VCF input to a GRG file via `grg construct`.

    If `igd_file` exists, it is used as construct input. If it does not
    exist, it is first created via `to_igd` and then used for construction.
    When `igd_file` is None, the VCF file itself is passed to `grg construct`.

    Args:
        range: Forwarded as `-r` when given.
        parts: Forwarded as `-p`; 50 is used when None.
        jobs: Forwarded as `-j`; `multiprocessing.cpu_count()` is used when None.
        trees: Forwarded as `-t`; 16 is used when None.
        binmuts: Emits `--binary-muts` when True.
        no_file_cleanup: Emits `--no-file-cleanup` when True.
        maf_flip: Emits `--maf-flip` when True.
        population_ids: Forwarded as `--population-ids` when given.
        mutation_batch_size: Forwarded as `--mutation-batch-size` when given.
        igd_file: IGD file to construct from (created first if missing).
        out_file: Output GRG path. Defaults to the input path with its last
            extension (and a further trailing `.vcf`, if any) replaced by `.grg`.
        verbose: Emits `--verbose` when True.
        no_merge: Emits `--no-merge` when True.
        force: Emits `--force` when True.
        logfile_out: File that standard output is appended to; discarded when None.
        logfile_err: File that standard error is appended to; discarded when None.

    Raises:
        subprocess.CalledProcessError: If `grg construct` exits with a non-zero status.
    """
    input_file = pathlib.Path(self.filename).resolve()
    if igd_file is not None:
        candidate_igd = pathlib.Path(igd_file)
        if candidate_igd.exists():
            self._igd_path = candidate_igd.resolve()
        else:
            self.to_igd(igd_file, logfile_out, logfile_err)
        input_file = pathlib.Path(self._igd_path).resolve()

    if out_file is not None:
        self._grg_path = pathlib.Path(out_file)
    else:
        default_stem = splitext(str(input_file))[0]
        # Handles compressed inputs such as "x.vcf.gz".
        if default_stem.endswith(".vcf"):
            default_stem = splitext(default_stem)[0]
        self._grg_path = pathlib.Path(default_stem + ".grg")

    args = ["grg", "construct"]
    args += self._setarg(range, "-r", None)
    args += self._setarg(parts, "-p", 50)
    args += self._setarg(jobs, "-j", multiprocessing.cpu_count())
    args += self._setarg(trees, "-t", 16)
    args += self._setarg(binmuts, "--binary-muts", None)
    args += self._setarg(no_file_cleanup, "--no-file-cleanup", None)
    args += self._setarg(maf_flip, "--maf-flip", None)
    args += self._setarg(population_ids, "--population-ids", None)
    args += self._setarg(mutation_batch_size, "--mutation-batch-size", None)
    args += self._setarg(str(self._grg_path), "--out-file", None)
    args += self._setarg(verbose, "--verbose", None)
    args += self._setarg(no_merge, "--no-merge", None)
    args += self._setarg(force, "--force", None)
    args += [str(input_file)]
    log.debug("Running grg construct command: %s", args)

    lf_o: Union[int, TextIOWrapper] = subprocess.DEVNULL
    lf_e: Union[int, TextIOWrapper] = subprocess.DEVNULL
    try:
        # Open inside the try so that a failure opening the second log file
        # still closes the first one.
        if logfile_out is not None:
            lf_o = open(logfile_out, "a")
        if logfile_err is not None:
            lf_e = open(logfile_err, "a")
        subprocess.run(args, stdout=lf_o, stderr=lf_e, check=True)
    finally:
        # subprocess.DEVNULL is a plain int and must not be closed.
        if not isinstance(lf_o, int):
            lf_o.close()
        if not isinstance(lf_e, int):
            lf_e.close()
Convert VCF input to a GRG file via grg construct.
If igd_file exists, it is used as construct input. If it does not
exist, it is first created via to_igd and then used for construction.
class BEDWriter:
    """Writes an object in bed/bim/fam formats in the specified output path.

    Args:
        snpobj: The SNPObject to be written. A copy is taken, so the caller's
            object is not modified by `write`.
        filename: The output file path.

    """

    def __init__(self, snpobj: SNPObject, filename: str):
        self.__snpobj = snpobj.copy()
        self.__filename = Path(filename)

    def write(
            self,
            rename_missing_values: bool = True,
            before: Union[int, float, str] = -1,
            after: Union[int, float, str] = '.'
        ):
        """
        Writes the SNPObject to bed/bim/fam formats.

        Args:
            rename_missing_values (bool, optional):
                If True, renames potential missing values in `snpobj.calldata_gt` before writing.
                Defaults to True.
            before (int, float, or str, default=-1):
                The current representation of missing values in `calldata_gt`. Common values might be -1, '.', or NaN.
                Default is -1.
            after (int, float, or str, default='.'):
                The value that will replace `before`. Default is '.'.
        """
        # Save .bed file
        if self.__filename.suffix != '.bed':
            self.__filename = self.__filename.with_suffix('.bed')

        log.info(f"Writing .bed file: {self.__filename}")

        # Optionally rename potential missing values in `snpobj.calldata_gt` before writing
        if rename_missing_values:
            self.__snpobj.rename_missings(before=before, after=after, inplace=True)

        # Build a samples x variants matrix in a local variable so that the
        # stored object keeps its layout and `write` can be called again.
        genotypes = self.__snpobj.calldata_gt
        if genotypes.ndim == 3:
            # (variants, samples, strands): sum the two strands.
            genotypes = genotypes.transpose(1, 0, 2).sum(axis=2, dtype=np.int8)
        else:
            # Already strand-summed (variants, samples): only transpose.
            genotypes = genotypes.T

        # Infer the number of samples and variants from the matrix
        samples, variants = genotypes.shape

        # The context manager closes the writer even if a row fails to append.
        with pg.PgenWriter(filename=str(self.__filename).encode('utf-8'),
                           sample_ct=samples,
                           variant_ct=variants,
                           nonref_flags=True,
                           hardcall_phase_present=False,
                           dosage_present=True,
                           dosage_phase_present=False) as data_save:
            # Fill the writer one variant (column) at a time
            for snp_i in range(variants):
                data_save.append_biallelic(np.ascontiguousarray(genotypes[:, snp_i]))

        log.info(f"Finished writing .bed file: {self.__filename}")

        # Remove .bed from the file name
        if self.__filename.suffix == '.bed':
            self.__filename = self.__filename.with_suffix('')

        # Save .fam file
        log.info(f"Writing .fam file: {self.__filename}")

        # Fill .fam file
        fam_file = pd.DataFrame(columns=['fid', 'iid', 'father', 'mother', 'gender', 'trait'])
        fam_file['iid'] = self.__snpobj.samples
        fam_file['fid'] = self.__snpobj.samples
        # Unknown parents / sex / phenotype use the PLINK placeholder codes;
        # leaving them NaN would emit empty fields and break the 6-column layout.
        fam_file['father'] = 0
        fam_file['mother'] = 0
        fam_file['gender'] = 0
        fam_file['trait'] = -9

        # Save .fam file
        fam_file.to_csv(self.__filename.with_suffix('.fam'), sep='\t', index=False, header=False)
        log.info(f"Finished writing .fam file: {self.__filename}")

        # Save .bim file
        log.info(f"Writing .bim file: {self.__filename}")

        # Fill .bim file
        bim_file = pd.DataFrame(columns=['chrom', 'snp', 'cm', 'pos', 'a0', 'a1'])
        bim_file['chrom'] = self.__snpobj.variants_chrom
        bim_file['snp'] = self.__snpobj.variants_id
        bim_file['cm'] = 0  # TODO: read, save and write too if available?
        log.warning("The .bim file is being saved with 0 cM values.")
        bim_file['pos'] = self.__snpobj.variants_pos
        bim_file['a0'] = self.__snpobj.variants_alt
        bim_file['a1'] = self.__snpobj.variants_ref

        # Save .bim file
        bim_file.to_csv(self.__filename.with_suffix('.bim'), sep='\t', index=False, header=False)
        log.info(f"Finished writing .bim file: {self.__filename}")
Writes an object in bed/bim/fam formats in the specified output path.
Arguments:
- snpobj: The SNPObject to be written.
- filename: The output file path.
def write(
        self,
        rename_missing_values: bool = True,
        before: Union[int, float, str] = -1,
        after: Union[int, float, str] = '.'
    ):
    """
    Writes the SNPObject to bed/bim/fam formats.

    Args:
        rename_missing_values (bool, optional):
            If True, renames potential missing values in `snpobj.calldata_gt` before writing.
            Defaults to True.
        before (int, float, or str, default=-1):
            The current representation of missing values in `calldata_gt`. Common values might be -1, '.', or NaN.
            Default is -1.
        after (int, float, or str, default='.'):
            The value that will replace `before`. Default is '.'.
    """
    # Save .bed file
    if self.__filename.suffix != '.bed':
        self.__filename = self.__filename.with_suffix('.bed')

    log.info(f"Writing .bed file: {self.__filename}")

    # Optionally rename potential missing values in `snpobj.calldata_gt` before writing
    if rename_missing_values:
        self.__snpobj.rename_missings(before=before, after=after, inplace=True)

    # Build a samples x variants matrix in a local variable so that the
    # stored object keeps its layout and `write` can be called again.
    genotypes = self.__snpobj.calldata_gt
    if genotypes.ndim == 3:
        # (variants, samples, strands): sum the two strands.
        genotypes = genotypes.transpose(1, 0, 2).sum(axis=2, dtype=np.int8)
    else:
        # Already strand-summed (variants, samples): only transpose.
        genotypes = genotypes.T

    # Infer the number of samples and variants from the matrix
    samples, variants = genotypes.shape

    # The context manager closes the writer even if a row fails to append.
    with pg.PgenWriter(filename=str(self.__filename).encode('utf-8'),
                       sample_ct=samples,
                       variant_ct=variants,
                       nonref_flags=True,
                       hardcall_phase_present=False,
                       dosage_present=True,
                       dosage_phase_present=False) as data_save:
        # Fill the writer one variant (column) at a time
        for snp_i in range(variants):
            data_save.append_biallelic(np.ascontiguousarray(genotypes[:, snp_i]))

    log.info(f"Finished writing .bed file: {self.__filename}")

    # Remove .bed from the file name
    if self.__filename.suffix == '.bed':
        self.__filename = self.__filename.with_suffix('')

    # Save .fam file
    log.info(f"Writing .fam file: {self.__filename}")

    # Fill .fam file
    fam_file = pd.DataFrame(columns=['fid', 'iid', 'father', 'mother', 'gender', 'trait'])
    fam_file['iid'] = self.__snpobj.samples
    fam_file['fid'] = self.__snpobj.samples
    # Unknown parents / sex / phenotype use the PLINK placeholder codes;
    # leaving them NaN would emit empty fields and break the 6-column layout.
    fam_file['father'] = 0
    fam_file['mother'] = 0
    fam_file['gender'] = 0
    fam_file['trait'] = -9

    # Save .fam file
    fam_file.to_csv(self.__filename.with_suffix('.fam'), sep='\t', index=False, header=False)
    log.info(f"Finished writing .fam file: {self.__filename}")

    # Save .bim file
    log.info(f"Writing .bim file: {self.__filename}")

    # Fill .bim file
    bim_file = pd.DataFrame(columns=['chrom', 'snp', 'cm', 'pos', 'a0', 'a1'])
    bim_file['chrom'] = self.__snpobj.variants_chrom
    bim_file['snp'] = self.__snpobj.variants_id
    bim_file['cm'] = 0  # TODO: read, save and write too if available?
    log.warning("The .bim file is being saved with 0 cM values.")
    bim_file['pos'] = self.__snpobj.variants_pos
    bim_file['a0'] = self.__snpobj.variants_alt
    bim_file['a1'] = self.__snpobj.variants_ref

    # Save .bim file
    bim_file.to_csv(self.__filename.with_suffix('.bim'), sep='\t', index=False, header=False)
    log.info(f"Finished writing .bim file: {self.__filename}")
Writes the SNPObject to bed/bim/fam formats.
Arguments:
- rename_missing_values (bool, optional): If True, renames potential missing values in `snpobj.calldata_gt` before writing. Defaults to True.
- before (int, float, or str, default=-1): The current representation of missing values in `calldata_gt`. Common values might be -1, '.', or NaN. Default is -1.
- after (int, float, or str, default='.'): The value that will replace `before`. Default is '.'.
class PGENWriter:
    """
    Writes a genotype object in PGEN format (.pgen, .psam, and .pvar files) in the specified output path.
    """

    def __init__(self, snpobj: SNPObject, filename: str):
        """
        Initializes the PGENWriter instance.

        Args:
            snpobj (SNPObject): The SNPObject containing genotype data to be written.
                It is stored by reference (not copied).
            filename (str): Base path for the output files (excluding extension).
        """
        self.__snpobj = snpobj
        self.__filename = Path(filename)

    def write(
            self,
            vzs: bool = False,
            rename_missing_values: bool = True,
            before: Union[int, float, str] = -1,
            after: Union[int, float, str] = '.'
        ):
        """
        Writes the SNPObject data to .pgen, .psam, and .pvar files.

        A trailing `.pgen`, `.psam`, `.pvar` or `.pvar.zst` on the configured
        filename is removed first, so the remainder is used as the common base path.

        Args:
            vzs (bool, optional):
                If True, compresses the .pvar file using zstd and saves it as .pvar.zst. Defaults to False.
            rename_missing_values (bool, optional):
                If True, renames potential missing values in `snpobj.calldata_gt` before writing.
                The rename is performed in place on the stored SNPObject. Defaults to True.
            before (int, float, or str, default=-1):
                The current representation of missing values in `calldata_gt`. Common values might be -1, '.', or NaN.
                Default is -1.
            after (int, float, or str, default='.'):
                The value that will replace `before`. Default is '.'.
        """
        # Compare against the full file name: `Path.suffix` only yields the
        # last component (".zst"), so ".pvar.zst" could never match it.
        # The compound extension is listed first so it wins over ".pvar".
        file_extensions = (".pvar.zst", ".pgen", ".psam", ".pvar")
        name = self.__filename.name
        for ext in file_extensions:
            if name.endswith(ext) and len(name) > len(ext):
                self.__filename = self.__filename.with_name(name[:-len(ext)])
                break

        # Optionally rename potential missing values in `snpobj.calldata_gt` before writing
        if rename_missing_values:
            self.__snpobj.rename_missings(before=before, after=after, inplace=True)

        self.write_pvar(vzs=vzs)
        self.write_psam()
        self.write_pgen()

    def write_pvar(self, vzs: bool = False):
        """
        Writes variant data to the .pvar file.

        Args:
            vzs (bool, optional): If True, compresses the .pvar file using zstd and saves it as .pvar.zst. Defaults to False.
        """
        output_filename = f"{self.__filename}.pvar"
        if vzs:
            output_filename += ".zst"
            log.info(f"Writing to {output_filename} (compressed)")
        else:
            log.info(f"Writing to {output_filename}")

        df = pl.DataFrame(
            {
                "#CHROM": self.__snpobj.variants_chrom,
                "POS": self.__snpobj.variants_pos,
                "ID": self.__snpobj.variants_id,
                "REF": self.__snpobj.variants_ref,
                "ALT": self.__snpobj.variants_alt,
                "FILTER": self.__snpobj.variants_filter_pass,
                # TODO: add INFO column to SNPObject and write it to the .pvar file? (if not it's lost)
            }
        )
        # TODO: add header to the .pvar file, if not it's lost

        # Write the DataFrame to a CSV string
        csv_data = df.write_csv(None, separator="\t")

        if vzs:
            # Compress the CSV data using zstd
            cctx = zstd.ZstdCompressor()
            compressed_data = cctx.compress(csv_data.encode('utf-8'))
            with open(output_filename, 'wb') as f:
                f.write(compressed_data)
        else:
            with open(output_filename, 'w') as f:
                f.write(csv_data)

    def write_psam(self):
        """
        Writes sample metadata to the .psam file.
        """
        log.info(f"Writing {self.__filename}.psam")
        df = pl.DataFrame(
            {
                "#IID": self.__snpobj.samples,
                "SEX": "NA",  # Add SEX as nan for now
                # TODO: add SEX as Optional column to SNPObject and write it to the .psam file (if not it's lost)
            }
        )
        df.write_csv(f"{self.__filename}.psam", separator="\t")

    def write_pgen(self):
        """
        Writes the genotype data to a .pgen file.

        A 3-D `calldata_gt` (variants, samples, alleles) is written as phased
        allele codes; a 2-D one (variants, samples) as unphased genotypes.
        In both cases a value of -1 is written as -9.
        """
        log.info(f"Writing to {self.__filename}.pgen")
        summed_strands = self.__snpobj.calldata_gt.ndim != 3
        if not summed_strands:
            num_variants, num_samples, num_alleles = self.__snpobj.calldata_gt.shape
            # Flatten the genotype matrix for pgenlib
            flat_genotypes = self.__snpobj.calldata_gt.reshape(
                num_variants, num_samples * num_alleles
            )
            with pg.PgenWriter(
                filename=f"{self.__filename}.pgen".encode('utf-8'),
                sample_ct=num_samples,
                variant_ct=num_variants,
                hardcall_phase_present=True,
            ) as writer:
                for variant_index in range(num_variants):
                    # astype() returns a fresh array, so the in-place
                    # recoding below never touches the SNPObject.
                    allele_codes = flat_genotypes[variant_index].astype(np.int32)
                    # Same missing-value mapping as the unphased branch.
                    allele_codes[allele_codes == -1] = -9
                    writer.append_alleles(allele_codes, all_phased=True)
        else:
            num_variants, num_samples = self.__snpobj.calldata_gt.shape
            # Transpose to (samples, variants)
            genotypes = self.__snpobj.calldata_gt.T  # Shape is (samples, variants)
            with pg.PgenWriter(
                filename=f"{self.__filename}.pgen".encode('utf-8'),
                sample_ct=num_samples,
                variant_ct=num_variants,
                hardcall_phase_present=False,
            ) as writer:
                for variant_index in range(num_variants):
                    variant_genotypes = genotypes[:, variant_index].astype(np.int8)
                    # Map missing genotypes to -9 if necessary
                    variant_genotypes[variant_genotypes == -1] = -9
                    writer.append_biallelic(np.ascontiguousarray(variant_genotypes))
Writes a genotype object in PGEN format (.pgen, .psam, and .pvar files) in the specified output path.
def __init__(self, snpobj: SNPObject, filename: str):
    """
    Set up a PGENWriter for one SNPObject and one output location.

    Args:
        snpobj (SNPObject): Genotype container to be written; kept by
            reference, no copy is made here.
        filename (str): Base path for the output files (excluding extension).
    """
    base_path = Path(filename)
    self.__snpobj, self.__filename = snpobj, base_path
Initializes the PGENWriter instance.
Arguments:
- snpobj (SNPObject): The SNPObject containing genotype data to be written.
- filename (str): Base path for the output files (excluding extension).
def write(
        self,
        vzs: bool = False,
        rename_missing_values: bool = True,
        before: Union[int, float, str] = -1,
        after: Union[int, float, str] = '.'
    ):
    """
    Writes the SNPObject data to .pgen, .psam, and .pvar files.

    A trailing `.pgen`, `.psam`, `.pvar` or `.pvar.zst` on the configured
    filename is removed first, so the remainder is used as the common base path.

    Args:
        vzs (bool, optional):
            If True, compresses the .pvar file using zstd and saves it as .pvar.zst. Defaults to False.
        rename_missing_values (bool, optional):
            If True, renames potential missing values in `snpobj.calldata_gt` before writing.
            The rename is performed in place on the stored SNPObject. Defaults to True.
        before (int, float, or str, default=-1):
            The current representation of missing values in `calldata_gt`. Common values might be -1, '.', or NaN.
            Default is -1.
        after (int, float, or str, default='.'):
            The value that will replace `before`. Default is '.'.
    """
    # Compare against the full file name: `Path.suffix` only yields the
    # last component (".zst"), so ".pvar.zst" could never match it.
    # The compound extension is listed first so it wins over ".pvar".
    file_extensions = (".pvar.zst", ".pgen", ".psam", ".pvar")
    name = self.__filename.name
    for ext in file_extensions:
        if name.endswith(ext) and len(name) > len(ext):
            self.__filename = self.__filename.with_name(name[:-len(ext)])
            break

    # Optionally rename potential missing values in `snpobj.calldata_gt` before writing
    if rename_missing_values:
        self.__snpobj.rename_missings(before=before, after=after, inplace=True)

    self.write_pvar(vzs=vzs)
    self.write_psam()
    self.write_pgen()
Writes the SNPObject data to .pgen, .psam, and .pvar files.
Arguments:
- vzs (bool, optional): If True, compresses the .pvar file using zstd and saves it as .pvar.zst. Defaults to False.
- rename_missing_values (bool, optional): If True, renames potential missing values in `snpobj.calldata_gt` before writing. Defaults to True.
- before (int, float, or str, default=-1): The current representation of missing values in `calldata_gt`. Common values might be -1, '.', or NaN. Default is -1.
- after (int, float, or str, default='.'): The value that will replace `before`. Default is '.'.
def write_pvar(self, vzs: bool = False):
    """
    Dump the per-variant table to `<base>.pvar` (or `<base>.pvar.zst`).

    The table is rendered as tab-separated text with the columns
    #CHROM, POS, ID, REF, ALT and FILTER taken from the stored SNPObject.

    Args:
        vzs (bool, optional): If True, compresses the .pvar file using zstd and saves it as .pvar.zst. Defaults to False.
    """
    output_filename = f"{self.__filename}.pvar"
    if not vzs:
        log.info(f"Writing to {output_filename}")
    else:
        output_filename += ".zst"
        log.info(f"Writing to {output_filename} (compressed)")

    source = self.__snpobj
    variant_columns = {
        "#CHROM": source.variants_chrom,
        "POS": source.variants_pos,
        "ID": source.variants_id,
        "REF": source.variants_ref,
        "ALT": source.variants_alt,
        "FILTER": source.variants_filter_pass,
        # TODO: add INFO column to SNPObject and write it to the .pvar file? (if not it's lost)
    }
    # TODO: add header to the .pvar file, if not it's lost

    # Passing None makes polars return the CSV text instead of writing a file.
    tsv_text = pl.DataFrame(variant_columns).write_csv(None, separator="\t")

    if not vzs:
        with open(output_filename, 'w') as handle:
            handle.write(tsv_text)
        return

    # Compressed output: encode once, compress, write as bytes.
    packed = zstd.ZstdCompressor().compress(tsv_text.encode('utf-8'))
    with open(output_filename, 'wb') as handle:
        handle.write(packed)
Writes variant data to the .pvar file.
Arguments:
- vzs (bool, optional): If True, compresses the .pvar file using zstd and saves it as .pvar.zst. Defaults to False.
def write_psam(self):
    """
    Dump the sample table to `<base>.psam` as tab-separated text.

    Each sample identifier becomes one `#IID` row; the `SEX` column is
    filled with the literal string "NA" for every sample.
    """
    log.info(f"Writing {self.__filename}.psam")
    sample_columns = {
        "#IID": self.__snpobj.samples,
        "SEX": "NA",  # Add SEX as nan for now
        # TODO: add SEX as Optional column to SNPObject and write it to the .psam file (if not it's lost)
    }
    sample_table = pl.DataFrame(sample_columns)
    sample_table.write_csv(f"{self.__filename}.psam", separator="\t")
Writes sample metadata to the .psam file.
def write_pgen(self):
    """
    Writes the genotype data to a .pgen file.

    A 3-D `calldata_gt` (variants, samples, alleles) is written as phased
    allele codes; a 2-D one (variants, samples) as unphased genotypes.
    In both cases a value of -1 is written as -9.
    """
    log.info(f"Writing to {self.__filename}.pgen")
    summed_strands = self.__snpobj.calldata_gt.ndim != 3
    if not summed_strands:
        num_variants, num_samples, num_alleles = self.__snpobj.calldata_gt.shape
        # Flatten the genotype matrix for pgenlib
        flat_genotypes = self.__snpobj.calldata_gt.reshape(
            num_variants, num_samples * num_alleles
        )
        with pg.PgenWriter(
            filename=f"{self.__filename}.pgen".encode('utf-8'),
            sample_ct=num_samples,
            variant_ct=num_variants,
            hardcall_phase_present=True,
        ) as writer:
            for variant_index in range(num_variants):
                # astype() returns a fresh array, so the in-place recoding
                # below never touches the SNPObject.
                allele_codes = flat_genotypes[variant_index].astype(np.int32)
                # Same missing-value mapping as the unphased branch.
                allele_codes[allele_codes == -1] = -9
                writer.append_alleles(allele_codes, all_phased=True)
    else:
        num_variants, num_samples = self.__snpobj.calldata_gt.shape
        # Transpose to (samples, variants)
        genotypes = self.__snpobj.calldata_gt.T  # Shape is (samples, variants)
        with pg.PgenWriter(
            filename=f"{self.__filename}.pgen".encode('utf-8'),
            sample_ct=num_samples,
            variant_ct=num_variants,
            hardcall_phase_present=False,
        ) as writer:
            for variant_index in range(num_variants):
                variant_genotypes = genotypes[:, variant_index].astype(np.int8)
                # Map missing genotypes to -9 if necessary
                variant_genotypes[variant_genotypes == -1] = -9
                writer.append_biallelic(np.ascontiguousarray(variant_genotypes))
Writes the genotype data to a .pgen file.
class VCFWriter:
    """
    A writer class for exporting SNP data from a `snputils.snp.genobj.SNPObject`
    into a `.vcf` file.

    The output is always plain-text VCF. A `.bcf` suffix on the target path is
    kept in the file name, but the content is not binary BCF.
    """
    def __init__(
        self,
        snpobj: "SNPObject",
        filename: Union[str, Path],
        n_jobs: int = -1,
        phased: bool = False,
    ):
        """
        Args:
            snpobj (SNPObject):
                A SNPObject instance.
            filename (str or pathlib.Path):
                Path to the file where the data will be saved. It should end with `.vcf`.
                If the provided path does not have this extension, the `.vcf` extension will be appended.
            n_jobs:
                Number of jobs to run in parallel.
                - `None`: use 1 job unless within a `joblib.parallel_backend` context.
                - `-1`: use all available processors.
                - Any other integer: use the specified number of jobs.
                The value is stored on the instance; the writing code in this class is serial.
            phased:
                If True, genotype data is written in "maternal|paternal" format.
                If False, genotype data is written in "maternal/paternal" format.
        """
        self.__snpobj = snpobj
        self.__filename = Path(filename)
        self.__n_jobs = n_jobs
        self.__phased = phased

    def write(
        self,
        chrom_partition: bool = False,
        rename_missing_values: bool = True,
        before: Union[int, float, str] = -1,
        after: Union[int, float, str] = '.',
        variants_info: Optional[Sequence[str]] = None,
    ):
        """
        Writes the SNP data to VCF file(s).

        Args:
            chrom_partition (bool, optional):
                If True, individual VCF files are generated for each chromosome.
                If False, a single VCF file containing data for all chromosomes is created. Defaults to False.
            rename_missing_values (bool, optional):
                If True, renames potential missing values in `snpobj.calldata_gt` before writing.
                The renaming is applied to the wrapped SNPObject in place. Defaults to True.
            before (int, float, or str, default=-1):
                The current representation of missing values in `calldata_gt`. Common values might be -1, '.', or NaN.
                Default is -1.
            after (int, float, or str, default='.'):
                The value that will replace `before`. Default is '.'.
            variants_info (sequence of str, optional):
                Per-variant INFO column values (e.g. ``["END=2000", "END=3000"]``). Length must match variant count.
                When provided, a ##INFO header line for END is written if any value contains ``END=``.

        Raises:
            ValueError: If `variants_info` is given and its length differs from the number of variants.
        """
        self.__chrom_partition = chrom_partition

        data = self.__snpobj

        # Enforce the documented length contract before anything is mutated or written,
        # so a bad call cannot leave renamed genotypes or partial files behind.
        if variants_info is not None:
            n_variants = data.calldata_gt.shape[0]
            if len(variants_info) != n_variants:
                raise ValueError(
                    f"`variants_info` has {len(variants_info)} entries but the SNPObject "
                    f"has {n_variants} variants."
                )

        # Resolve the output base and extension without touching `self.__filename`,
        # so repeated calls to `write` always start from the path given at construction.
        file_extensions = (".vcf", ".bcf")
        if self.__filename.suffix in file_extensions:
            self.__file_extension = self.__filename.suffix
            self.__output_base = self.__filename.with_suffix('')
        else:
            self.__file_extension = ".vcf"
            self.__output_base = self.__filename

        # Optionally rename potential missing values in `snpobj.calldata_gt` before writing
        if rename_missing_values:
            data.rename_missings(before=before, after=after, inplace=True)

        if not self.__chrom_partition:
            self._write_chromosome_data("All", data, variants_info)
            return

        for chrom in data.unique_chrom:
            data_chrom = data.filter_variants(chrom=chrom, inplace=False)
            if variants_info is not None:
                # Keep only the INFO entries of the variants on this chromosome.
                mask = data.variants_chrom == chrom
                info_chrom = [variants_info[i] for i in np.where(mask)[0]]
            else:
                info_chrom = None
            log.debug(f'Storing chromosome {chrom}')
            self._write_chromosome_data(chrom, data_chrom, info_chrom)

    def _write_chromosome_data(
        self, chrom, data_chrom, variants_info: Optional[Sequence[str]] = None
    ):
        """
        Writes the SNP data for a specific chromosome to a VCF file.

        Args:
            chrom: The chromosome name, or "All" to write a single file without a chromosome tag.
            data_chrom: The SNPObject instance containing the data for the chromosome.
            variants_info: Optional per-variant INFO strings; length must match variant count.
        """
        npy3 = data_chrom.calldata_gt
        n_windows, n_samples, _ = npy3.shape

        # Append to the base *name* instead of using `with_suffix` / `stem`, which would
        # drop everything after the last dot of a dotted stem such as `cohort.v1`.
        base = self.__output_base
        if chrom == "All":
            file = base.parent / f"{base.name}{self.__file_extension}"
        else:
            file = base.parent / f"{base.name}_{chrom}{self.__file_extension}"

        sep = "|" if self.__phased else "/"

        # The context manager closes the file even if formatting a row raises.
        with open(file, "w") as out:
            out.write("##fileformat=VCFv4.1\n")
            out.write('##FORMAT=<ID=GT,Number=1,Type=String,Description="Phased Genotype">\n')
            if variants_info is not None and any("END=" in s for s in variants_info):
                out.write('##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the segment">\n')
            # `dict.fromkeys` de-duplicates while keeping first-appearance order,
            # so the header is reproducible from run to run.
            for c in dict.fromkeys(data_chrom.variants_chrom):
                out.write(f"##contig=<ID={c}>\n")
            cols = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT"] + list(data_chrom.samples)
            out.write("\t".join(cols) + "\n")

            for i in range(n_windows):
                chrom_val = data_chrom.variants_chrom[i]
                pos = data_chrom.variants_pos[i]
                vid = data_chrom.variants_id[i]
                ref = data_chrom.variants_ref[i]
                alt = data_chrom.variants_alt[i]
                info_str = variants_info[i] if variants_info is not None else "."
                row = npy3[i]
                genotypes = [
                    f"{row[s, 0]}{sep}{row[s, 1]}"
                    for s in range(n_samples)
                ]
                line = "\t".join([
                    str(chrom_val), str(pos), vid, ref, alt,
                    ".", "PASS", info_str, "GT", *genotypes
                ])
                out.write(line + "\n")
A writer class for exporting SNP data from a `snputils.snp.genobj.SNPObject`
into a `.vcf` file.
19 def __init__(self, snpobj: SNPObject, filename: str, n_jobs: int = -1, phased: bool = False): 20 """ 21 Args: 22 snpobj (SNPObject): 23 A SNPObject instance. 24 file (str or pathlib.Path): 25 Path to the file where the data will be saved. It should end with `.vcf`. 26 If the provided path does not have this extension, the `.vcf` extension will be appended. 27 n_jobs: 28 Number of jobs to run in parallel. 29 - `None`: use 1 job unless within a `joblib.parallel_backend` context. 30 - `-1`: use all available processors. 31 - Any other integer: use the specified number of jobs. 32 phased: 33 If True, genotype data is written in "maternal|paternal" format. 34 If False, genotype data is written in "maternal/paternal" format. 35 """ 36 self.__snpobj = snpobj 37 self.__filename = Path(filename) 38 self.__n_jobs = n_jobs 39 self.__phased = phased
Arguments:
- snpobj (SNPObject): A SNPObject instance.
- filename (str or pathlib.Path): Path to the file where the data will be saved. It should end with
  `.vcf`. If the provided path does not have this extension, the `.vcf` extension will be appended.
- n_jobs: Number of jobs to run in parallel.
  - `None`: use 1 job unless within a `joblib.parallel_backend` context.
  - `-1`: use all available processors.
  - Any other integer: use the specified number of jobs.
- phased: If True, genotype data is written in "maternal|paternal" format.
If False, genotype data is written in "maternal/paternal" format.
41 def write( 42 self, 43 chrom_partition: bool = False, 44 rename_missing_values: bool = True, 45 before: Union[int, float, str] = -1, 46 after: Union[int, float, str] = '.', 47 variants_info: Optional[Sequence[str]] = None, 48 ): 49 """ 50 Writes the SNP data to VCF file(s). 51 52 Args: 53 chrom_partition (bool, optional): 54 If True, individual VCF files are generated for each chromosome. 55 If False, a single VCF file containing data for all chromosomes is created. Defaults to False. 56 rename_missing_values (bool, optional): 57 If True, renames potential missing values in `snpobj.calldata_gt` before writing. 58 Defaults to True. 59 before (int, float, or str, default=-1): 60 The current representation of missing values in `calldata_gt`. Common values might be -1, '.', or NaN. 61 Default is -1. 62 after (int, float, or str, default='.'): 63 The value that will replace `before`. Default is '.'. 64 variants_info (sequence of str, optional): 65 Per-variant INFO column values (e.g. ``["END=2000", "END=3000"]``). Length must match variant count. 66 When provided, a ##INFO header line for END is written if any value contains ``END=``. 
67 """ 68 self.__chrom_partition = chrom_partition 69 70 file_extensions = (".vcf", ".bcf") 71 if self.__filename.suffix in file_extensions: 72 self.__file_extension = self.__filename.suffix 73 self.__filename = self.__filename.with_suffix('') 74 else: 75 self.__file_extension = ".vcf" 76 77 # Optionally rename potential missing values in `snpobj.calldata_gt` before writing 78 if rename_missing_values: 79 self.__snpobj.rename_missings(before=before, after=after, inplace=True) 80 81 data = self.__snpobj 82 83 if self.__chrom_partition: 84 chroms = data.unique_chrom 85 86 for chrom in chroms: 87 data_chrom = data.filter_variants(chrom=chrom, inplace=False) 88 if variants_info is not None: 89 mask = data.variants_chrom == chrom 90 info_chrom = [variants_info[i] for i in np.where(mask)[0]] 91 else: 92 info_chrom = None 93 log.debug(f'Storing chromosome {chrom}') 94 self._write_chromosome_data(chrom, data_chrom, info_chrom) 95 else: 96 self._write_chromosome_data("All", data, variants_info)
Writes the SNP data to VCF file(s).
Arguments:
- chrom_partition (bool, optional): If True, individual VCF files are generated for each chromosome. If False, a single VCF file containing data for all chromosomes is created. Defaults to False.
- rename_missing_values (bool, optional): If True, renames potential missing values in
  `snpobj.calldata_gt` before writing. Defaults to True.
- before (int, float, or str, default=-1): The current representation of missing values in
  `calldata_gt`. Common values might be -1, '.', or NaN. Default is -1.
- after (int, float, or str, default='.'): The value that will replace
  `before`. Default is '.'.
- variants_info (sequence of str, optional): Per-variant INFO column values (e.g.
  `["END=2000", "END=3000"]`). Length must match variant count. When provided, a ##INFO header line for END is written if any value contains `END=`.
def read_snp(filename: Union[str, pathlib.Path], **kwargs) -> SNPObject:
    """
    Load a genotype file into a SNPObject, detecting the file format automatically.

    Args:
        filename: Path of the file to read.
        **kwargs: Extra keyword arguments forwarded unchanged to the reader's `read` method.

    Returns:
        The object produced by `SNPReader(filename).read(**kwargs)`.

    Raises:
        ValueError: If the filename does not have an extension or the extension is not supported.
    """
    # Imported at call time rather than at module import.
    from snputils.snp.io.read.auto import SNPReader

    reader = SNPReader(filename)
    return reader.read(**kwargs)
Automatically detect the file format and read it into a SNPObject.
Arguments:
- filename: Filename of the file to read.
- **kwargs: Additional arguments passed to the reader method.
Raises:
- ValueError: If the filename does not have an extension or the extension is not supported.
def read_bed(filename: Union[str, pathlib.Path], **kwargs) -> SNPObject:
    """
    Load a BED fileset into a SNPObject.

    Args:
        filename: Path of the BED fileset to read.
        **kwargs: Extra keyword arguments forwarded unchanged to the reader's `read` method.
            See :class:`snputils.snp.io.read.bed.BEDReader` for possible parameters.

    Returns:
        The object produced by `BEDReader(filename).read(**kwargs)`.
    """
    # Imported at call time rather than at module import.
    from snputils.snp.io.read.bed import BEDReader

    reader = BEDReader(filename)
    return reader.read(**kwargs)
Read a BED fileset into a SNPObject.
Arguments:
- filename: Filename of the BED fileset to read.
- **kwargs: Additional arguments passed to the reader method. See
  `snputils.snp.io.read.bed.BEDReader` for possible parameters.
def read_pgen(filename: Union[str, pathlib.Path], **kwargs) -> SNPObject:
    """
    Load a PGEN fileset into a SNPObject.

    Args:
        filename: Path of the PGEN fileset to read.
        **kwargs: Extra keyword arguments forwarded unchanged to the reader's `read` method.
            See :class:`snputils.snp.io.read.pgen.PGENReader` for possible parameters.

    Returns:
        The object produced by `PGENReader(filename).read(**kwargs)`.
    """
    # Imported at call time rather than at module import.
    from snputils.snp.io.read.pgen import PGENReader

    reader = PGENReader(filename)
    return reader.read(**kwargs)
Read a PGEN fileset into a SNPObject.
Arguments:
- filename: Filename of the PGEN fileset to read.
- **kwargs: Additional arguments passed to the reader method. See
  `snputils.snp.io.read.pgen.PGENReader` for possible parameters.
def read_vcf(filename: Union[str, pathlib.Path],
             backend: str = 'polars',
             **kwargs) -> SNPObject:
    """
    Load a VCF file into a SNPObject using the selected reader backend.

    Args:
        filename: Path of the VCF file to read.
        backend: Backend to use for reading the VCF file. Options are 'polars' or 'scikit-allel'.
            Any value other than 'polars' selects the scikit-allel reader.
        **kwargs: Extra keyword arguments forwarded unchanged to the reader's `read` method.
            See :class:`snputils.snp.io.read.vcf.VCFReader` for possible parameters.

    Returns:
        The object produced by the chosen reader's `read(**kwargs)`.
    """
    # Imported at call time rather than at module import.
    from snputils.snp.io.read.vcf import VCFReader, VCFReaderPolars

    # Handle the fallback first; the polars path is the default.
    if backend != 'polars':
        print(f"Reading {filename} with scikit-allel backend")
        allel_reader = VCFReader(filename)
        return allel_reader.read(**kwargs)

    print(f"Reading {filename} with polars backend")
    polars_reader = VCFReaderPolars(filename)
    return polars_reader.read(**kwargs)
Read a VCF fileset into a SNPObject.
Arguments:
- filename: Filename of the VCF fileset to read.
- backend: Backend to use for reading the VCF file. Options are 'polars' or 'scikit-allel'.
- **kwargs: Additional arguments passed to the reader method. See
  `snputils.snp.io.read.vcf.VCFReader` for possible parameters.
def read_grg(filename: Union[str, pathlib.Path], **kwargs) -> "GRGObject":
    """
    Load a GRG file into a GRGObject.

    Args:
        filename: Path of the GRG file to read.
        **kwargs: Extra keyword arguments forwarded unchanged to the reader's `read` method.

    Returns:
        The object produced by `GRGReader(filename).read(**kwargs)`.

    Raises:
        ImportError: If the optional dependency `pygrgl` is not installed.
        ModuleNotFoundError: If any other module is missing while importing the reader.
    """
    try:
        from snputils.snp.io.read.grg import GRGReader
    except ModuleNotFoundError as exc:
        # Only a missing `pygrgl` gets the install hint; anything else propagates untouched.
        if exc.name != "pygrgl":
            raise
        raise ImportError(
            "GRG support requires the optional dependency 'pygrgl'. "
            "Install it with: pip install pygrgl"
        ) from exc

    reader = GRGReader(filename)
    return reader.read(**kwargs)
Read a GRG file into a GRGObject.
Arguments:
- filename: Filename of the GRG file to read.
- **kwargs: Additional arguments passed to the reader method.
17class LocalAncestryObject(AncestryObject): 18 """ 19 A class for window-level Local Ancestry Inference (LAI) data. 20 """ 21 def __init__( 22 self, 23 haplotypes: List[str], 24 lai: np.ndarray, 25 samples: Optional[List[str]] = None, 26 ancestry_map: Optional[Dict[str, str]] = None, 27 window_sizes: Optional[np.ndarray] = None, 28 centimorgan_pos: Optional[np.ndarray] = None, 29 chromosomes: Optional[np.ndarray] = None, 30 physical_pos: Optional[np.ndarray] = None 31 ) -> None: 32 """ 33 Args: 34 haplotypes (list of str of length n_haplotypes): 35 A list of unique haplotype identifiers. 36 lai (array of shape (n_windows, n_haplotypes)): 37 A 2D array containing local ancestry inference values, where each row represents a 38 genomic window, and each column corresponds to a haplotype phase for each sample. 39 samples (list of str of length n_samples, optional): 40 A list of unique sample identifiers. 41 ancestry_map (dict of str to str, optional): 42 A dictionary mapping ancestry codes to region names. 43 window_sizes (array of shape (n_windows,), optional): 44 An array specifying the number of SNPs in each genomic window. 45 centimorgan_pos (array of shape (n_windows, 2), optional): 46 A 2D array containing the start and end centimorgan positions for each window. 47 chromosomes (array of shape (n_windows,), optional): 48 An array with chromosome numbers corresponding to each genomic window. 49 physical_pos (array of shape (n_windows, 2), optional): 50 A 2D array containing the start and end physical positions for each window. 
51 """ 52 if lai.ndim != 2: 53 raise ValueError("`lai` must be a 2D array with shape (n_windows, n_haplotypes).") 54 55 # Determine the number of unique ancestries and samples from the LAI array 56 n_ancestries = len(np.unique(lai)) 57 n_haplotypes = lai.shape[1] 58 n_samples = n_haplotypes // 2 59 60 super(LocalAncestryObject, self).__init__(n_samples, n_ancestries) 61 62 self.__haplotypes = haplotypes 63 self.__lai = lai 64 self.__window_sizes = window_sizes 65 self.__centimorgan_pos = centimorgan_pos 66 self.__samples = samples 67 self.__chromosomes = chromosomes 68 self.__physical_pos = physical_pos 69 self.__ancestry_map = ancestry_map 70 71 # Perform sanity check to ensure all unique ancestries in LAI data are represented in the ancestry map 72 self._sanity_check() 73 74 def __getitem__(self, key): 75 """ 76 To access an attribute of the class using the square bracket notation, 77 similar to a dictionary. 78 """ 79 try: 80 return getattr(self, key) 81 except AttributeError: 82 raise KeyError(f'Invalid key: {key}') 83 84 def __setitem__(self, key, value): 85 """ 86 To set an attribute of the class using the square bracket notation, 87 similar to a dictionary. 88 """ 89 try: 90 setattr(self, key, value) 91 except AttributeError: 92 raise KeyError(f'Invalid key: {key}') 93 94 @property 95 def haplotypes(self) -> List[str]: 96 """ 97 Retrieve `haplotypes`. 98 99 Returns: 100 **list of length n_haplotypes:** A list of unique haplotype identifiers. 101 """ 102 return self.__haplotypes 103 104 @haplotypes.setter 105 def haplotypes(self, x): 106 """ 107 Update `haplotypes`. 108 """ 109 self.__haplotypes = x 110 111 @property 112 def lai(self) -> np.ndarray: 113 """ 114 Retrieve `lai`. 115 116 Returns: 117 **array of shape (n_windows, n_haplotypes):** 118 A 2D array containing local ancestry inference values, where each row represents a 119 genomic window, and each column corresponds to a haplotype phase for each sample. 
120 """ 121 return self.__lai 122 123 @lai.setter 124 def lai(self, x): 125 """ 126 Update `lai`. 127 """ 128 self.__lai = x 129 130 @property 131 def samples(self) -> Optional[List[str]]: 132 """ 133 Retrieve `samples`. 134 135 Returns: 136 **list of str:** A list of unique sample identifiers. 137 """ 138 if self.__samples is not None: 139 return self.__samples 140 elif self.__haplotypes is not None: 141 return [hap.split('.')[0] for hap in self.__haplotypes][::2] 142 else: 143 return None 144 145 @samples.setter 146 def samples(self, x): 147 """ 148 Update `samples`. 149 """ 150 self.__samples = x 151 152 @property 153 def ancestry_map(self) -> Optional[Dict[str, str]]: 154 """ 155 Retrieve `ancestry_map`. 156 157 Returns: 158 **dict of str to str:** A dictionary mapping ancestry codes to region names. 159 """ 160 return self.__ancestry_map 161 162 @ancestry_map.setter 163 def ancestry_map(self, x): 164 """ 165 Update `ancestry_map`. 166 """ 167 self.__ancestry_map = x 168 169 @property 170 def window_sizes(self) -> Optional[np.ndarray]: 171 """ 172 Retrieve `window_sizes`. 173 174 Returns: 175 **array of shape (n_windows,):** 176 An array specifying the number of SNPs in each genomic window. 177 """ 178 return self.__window_sizes 179 180 @window_sizes.setter 181 def window_sizes(self, x): 182 """ 183 Update `window_sizes`. 184 """ 185 self.__window_sizes = x 186 187 @property 188 def centimorgan_pos(self) -> Optional[np.ndarray]: 189 """ 190 Retrieve `centimorgan_pos`. 191 192 Returns: 193 **array of shape (n_windows, 2):** 194 A 2D array containing the start and end centimorgan positions for each window. 195 """ 196 return self.__centimorgan_pos 197 198 @centimorgan_pos.setter 199 def centimorgan_pos(self, x): 200 """ 201 Update `centimorgan_pos`. 202 """ 203 self.__centimorgan_pos = x 204 205 @property 206 def chromosomes(self) -> Optional[np.ndarray]: 207 """ 208 Retrieve `chromosomes`. 
209 210 Returns: 211 **array of shape (n_windows,):** 212 An array with chromosome numbers corresponding to each genomic window. 213 """ 214 return self.__chromosomes 215 216 @chromosomes.setter 217 def chromosomes(self, x): 218 """ 219 Update `chromosomes`. 220 """ 221 self.__chromosomes = x 222 223 @property 224 def physical_pos(self) -> Optional[np.ndarray]: 225 """ 226 Retrieve `physical_pos`. 227 228 Returns: 229 **array of shape (n_windows, 2):** 230 A 2D array containing the start and end physical positions for each window. 231 """ 232 return self.__physical_pos 233 234 @physical_pos.setter 235 def physical_pos(self, x): 236 """ 237 Update `physical_pos`. 238 """ 239 self.__physical_pos = x 240 241 @property 242 def n_samples(self) -> int: 243 """ 244 Retrieve `n_samples`. 245 246 Returns: 247 **int:** 248 The total number of samples. 249 """ 250 if self.__samples is not None: 251 return len(self.__samples) 252 elif self.__haplotypes is not None: 253 # Divide by 2 because each sample has two associated haplotypes 254 return len(self.__haplotypes) // 2 255 else: 256 #Â Divide by 2 because columns represent haplotypes 257 return self.__lai.shape[1] // 2 258 259 @property 260 def n_ancestries(self) -> int: 261 """ 262 Retrieve `n_ancestries`. 263 264 Returns: 265 **int:** The total number of unique ancestries. 266 """ 267 return len(np.unique(self.__lai)) 268 269 @property 270 def n_haplotypes(self) -> int: 271 """ 272 Retrieve `n_haplotypes`. 273 274 Returns: 275 **int:** The total number of haplotypes. 276 """ 277 if self.__haplotypes is not None: 278 return len(self.__haplotypes) 279 else: 280 return self.__lai.shape[1] 281 282 @property 283 def n_windows(self) -> int: 284 """ 285 Retrieve `n_windows`. 286 287 Returns: 288 **int:** The total number of genomic windows. 289 """ 290 return self.__lai.shape[0] 291 292 def copy(self) -> 'LocalAncestryObject': 293 """ 294 Create and return a copy of `self`. 
295 296 Returns: 297 **LocalAncestryObject:** 298 A new instance of the current object. 299 """ 300 return copy.copy(self) 301 302 def keys(self) -> List[str]: 303 """ 304 Retrieve a list of public attribute names for `self`. 305 306 Returns: 307 **list of str:** 308 A list of attribute names, with internal name-mangling removed, 309 for easier reference to public attributes in the instance. 310 """ 311 return [attr.replace('_LocalAncestryObject__', '').replace('_AncestryObject__', '') for attr in vars(self)] 312 313 def filter_windows( 314 self, 315 indexes: Union[int, Sequence[int], np.ndarray], 316 include: bool = True, 317 inplace: bool = False 318 ) -> Optional['LocalAncestryObject']: 319 """ 320 Filter genomic windows based on specified indexes. 321 322 This method updates the `lai` attribute to include or exclude the specified genomic windows. 323 Attributes such as `window_sizes`, `centimorgan_pos`, `chromosomes`, and `physical_pos` will also be 324 updated accordingly if they are not None. The order of genomic windows is preserved. 325 326 Negative indexes are supported and follow 327 [NumPy's indexing conventions](https://numpy.org/doc/stable/user/basics.indexing.html). 328 329 Args: 330 indexes (int or array-like of int): 331 Index(es) of the windows to include or exclude. Can be a single integer or a 332 sequence of integers. Negative indexes are supported. 333 include (bool, default=True): 334 If True, includes only the specified windows. If False, excludes the specified 335 windows. Default is True. 336 inplace (bool, default=False): 337 If True, modifies `self` in place. If False, returns a new `LocalAncestryObject` with 338 the windows filtered. Default is False. 339 340 Returns: 341 **Optional[LocalAncestryObject]:** 342 A new `LocalAncestryObject` with the specified windows filtered if `inplace=False`. 343 If `inplace=True`, modifies `self` in place and returns None. 
344 """ 345 # Convert indexes to a NumPy array 346 indexes = np.atleast_1d(indexes) 347 348 # Get total number of windows 349 n_windows = self.n_windows 350 351 # Validate indexes, allowing negative indexes 352 if np.any((indexes < -n_windows) | (indexes >= n_windows)): 353 raise IndexError("One or more indexes are out of bounds.") 354 355 # Create boolean mask 356 mask = np.zeros(n_windows, dtype=bool) 357 mask[indexes] = True 358 359 # Invert mask if `include=False` 360 if not include: 361 mask = ~mask 362 363 # Filter `lai` 364 filtered_lai = self['lai'][mask, :] 365 366 # Filter `window_sizes`, `chromosomes`, `centimorgan_pos`, and `physical_pos`, checking if they are None before filtering 367 filtered_window_sizes = self['window_sizes'][mask] if self['window_sizes'] is not None else None 368 filtered_chromosomes = self['chromosomes'][mask] if self['chromosomes'] is not None else None 369 filtered_centimorgan_pos = self['centimorgan_pos'][mask, :] if self['centimorgan_pos'] is not None else None 370 filtered_physical_pos = self['physical_pos'][mask, :] if self['physical_pos'] is not None else None 371 372 # Modify the original object if `inplace=True`, otherwise create and return a copy 373 if inplace: 374 self['lai'] = filtered_lai 375 if filtered_window_sizes is not None: 376 self['window_sizes'] = filtered_window_sizes 377 if filtered_chromosomes is not None: 378 self['chromosomes'] = filtered_chromosomes 379 if filtered_centimorgan_pos is not None: 380 self['centimorgan_pos'] = filtered_centimorgan_pos 381 if filtered_physical_pos is not None: 382 self['physical_pos'] = filtered_physical_pos 383 return None 384 else: 385 laiobj = self.copy() 386 laiobj['lai'] = filtered_lai 387 if filtered_window_sizes is not None: 388 laiobj['window_sizes'] = filtered_window_sizes 389 if filtered_chromosomes is not None: 390 laiobj['chromosomes'] = filtered_chromosomes 391 if filtered_centimorgan_pos is not None: 392 laiobj['centimorgan_pos'] = filtered_centimorgan_pos 393 
if filtered_physical_pos is not None: 394 laiobj['physical_pos'] = filtered_physical_pos 395 return laiobj 396 397 def filter_samples( 398 self, 399 samples: Optional[Union[str, Sequence[str], np.ndarray, None]] = None, 400 indexes: Optional[Union[int, Sequence[int], np.ndarray, None]] = None, 401 include: bool = True, 402 reorder: bool = False, 403 inplace: bool = False 404 ) -> Optional['LocalAncestryObject']: 405 """ 406 Filter samples based on specified names or indexes. 407 408 This method updates the `lai`, `haplotypes`, and `samples` attributes to include or exclude the specified 409 samples. Each sample is associated with two haplotypes, which are included or excluded together. 410 The order of the samples is preserved. Set `reorder=True` to match the ordering of the 411 provided `samples` and/or `indexes` lists when including. 412 413 If both samples and indexes are provided, any sample matching either a name in samples or an index in 414 indexes will be included or excluded. 415 416 Negative indexes are supported and follow 417 [NumPy's indexing conventions](https://numpy.org/doc/stable/user/basics.indexing.html). 418 419 Args: 420 samples (str or array_like of str, optional): 421 Name(s) of the samples to include or exclude. Can be a single sample name or a 422 sequence of sample names. Default is None. 423 indexes (int or array_like of int, optional): 424 Index(es) of the samples to include or exclude. Can be a single index or a sequence 425 of indexes. Negative indexes are supported. Default is None. 426 include (bool, default=True): 427 If True, includes only the specified samples. If False, excludes the specified 428 samples. Default is True. 429 inplace (bool, default=False): 430 If True, modifies `self` in place. If False, returns a new `LocalAncestryObject` with the 431 samples filtered. Default is False. 432 433 Returns: 434 **Optional[LocalAncestryObject]:** 435 A new `LocalAncestryObject` with the specified samples filtered if `inplace=False`. 
436 If `inplace=True`, modifies `self` in place and returns None. 437 """ 438 if samples is None and indexes is None: 439 raise UserWarning("At least one of 'samples' or 'indexes' must be provided.") 440 441 n_haplotypes = self.n_haplotypes 442 n_samples = self.n_samples 443 444 # Create mask based on sample names 445 if samples is not None: 446 samples = np.asarray(samples).ravel() 447 # Extract sample names from haplotype identifiers 448 haplotype_ids = np.array(self['haplotypes']) 449 sample_names = np.array([hap.split('.')[0] for hap in haplotype_ids]) 450 # Create mask for haplotypes belonging to specified samples 451 mask_samples = np.isin(sample_names, samples) 452 else: 453 mask_samples = np.zeros(n_haplotypes, dtype=bool) 454 455 # Create mask based on sample indexes 456 if indexes is not None: 457 indexes = np.asarray(indexes).ravel() 458 459 # Validate indexes, allowing negative indexes 460 out_of_bounds_indexes = indexes[(indexes < -n_samples) | (indexes >= n_samples)] 461 if out_of_bounds_indexes.size > 0: 462 raise ValueError(f"One or more sample indexes are out of bounds.") 463 464 # Adjust negative indexes 465 indexes = np.mod(indexes, n_samples) 466 467 # Get haplotype indexes for the specified sample indexes 468 haplotype_indexes = np.concatenate([2*indexes, 2*indexes+1]) 469 # Create mask for haplotypes 470 mask_indexes = np.zeros(n_haplotypes, dtype=bool) 471 mask_indexes[haplotype_indexes] = True 472 else: 473 mask_indexes = np.zeros(n_haplotypes, dtype=bool) 474 475 # Combine masks using logical OR (union of samples) 476 mask_combined = mask_samples | mask_indexes 477 478 if not include: 479 mask_combined = ~mask_combined 480 481 # Optionally compute an ordering of selected samples that follows the provided lists 482 ordered_sample_indices = None 483 sample_mask = mask_combined.reshape(-1, 2).any(axis=1) 484 if include and reorder: 485 sel_sample_indices = np.where(sample_mask)[0] 486 ordered_list: List[int] = [] 487 added = 
np.zeros(self.n_samples, dtype=bool) 488 489 # Source of sample names for ordering logic 490 haplotype_ids = np.array(self['haplotypes']) 491 sample_names_by_sample = np.array([hap.split('.')[0] for hap in haplotype_ids])[::2] 492 493 # Respect the order in `samples` 494 if samples is not None: 495 for s in np.atleast_1d(samples): 496 # Find the sample index by name (first occurrence) 497 matches = np.where(sample_names_by_sample == s)[0] 498 for idx in matches: 499 if sample_mask[idx] and not added[idx]: 500 ordered_list.append(int(idx)) 501 added[idx] = True 502 503 # Then respect the order in `indexes` 504 if indexes is not None: 505 adj_idx = np.mod(np.atleast_1d(indexes), self.n_samples) 506 for idx in adj_idx: 507 if sample_mask[idx] and not added[idx]: 508 ordered_list.append(int(idx)) 509 added[idx] = True 510 511 # Append any remaining selected samples in their original order 512 for idx in sel_sample_indices: 513 if not added[idx]: 514 ordered_list.append(int(idx)) 515 516 ordered_sample_indices = np.asarray(ordered_list, dtype=int) 517 518 # Filter / reorder arrays 519 if ordered_sample_indices is not None: 520 hap_idx = np.concatenate([2*ordered_sample_indices, 2*ordered_sample_indices + 1]) 521 filtered_lai = self['lai'][:, hap_idx] 522 filtered_haplotypes = np.array(self['haplotypes'])[hap_idx].tolist() 523 filtered_samples = ( 524 np.array(self['samples'])[ordered_sample_indices].tolist() 525 if self['samples'] is not None else None 526 ) 527 else: 528 # Filter `lai` 529 filtered_lai = self['lai'][:, mask_combined] 530 # Filter `haplotypes` 531 filtered_haplotypes = np.array(self['haplotypes'])[mask_combined].tolist() 532 # Filter `samples`, checking if they are None before filtering 533 filtered_samples = np.array(self['samples'])[sample_mask].tolist() if self['samples'] is not None else None 534 535 if inplace: 536 self['haplotypes'] = filtered_haplotypes 537 self['samples'] = filtered_samples 538 self['lai'] = filtered_lai 539 return None 540 
else: 541 laiobj = self.copy() 542 laiobj['haplotypes'] = filtered_haplotypes 543 laiobj['samples'] = filtered_samples 544 laiobj['lai'] = filtered_lai 545 return laiobj 546 547 def convert_to_snp_level( 548 self, 549 snpobject: Optional['SNPObject'] = None, 550 variants_chrom: Optional[np.ndarray] = None, 551 variants_pos: Optional[np.ndarray] = None, 552 variants_ref: Optional[np.ndarray] = None, 553 variants_alt: Optional[np.ndarray] = None, 554 variants_filter_pass: Optional[np.ndarray] = None, 555 variants_id: Optional[np.ndarray] = None, 556 variants_qual: Optional[np.ndarray] = None, 557 lai_format: str = "3D" 558 ) -> 'SNPObject': 559 """ 560 Convert `self` into a `snputils.snp.genobj.SNPObject` SNP-level Local Ancestry Information (LAI), 561 with optional support for SNP data. 562 563 If SNP positions (`variants_pos`) and/or chromosomes (`variants_chrom`) are not specified, the method generates 564 SNPs uniformly across the start and end positions of each genomic window. Otherwise, the provided SNP 565 coordinates are used to assign ancestry values based on their respective windows. 566 567 If a `SNPObject` is provided, its attributes are used unless explicitly overridden by the function arguments. 568 In that case, the SNPObject is updated with the (optional) new attributes and the computed `calldata_lai`, then returned. 569 570 Args: 571 snpobject (SNPObject, optional): 572 An existing `SNPObject` to extract SNP attributes from. 573 variants_chrom (array of shape (n_snps,), optional): 574 An array containing the chromosome for each SNP. 575 variants_pos (array of shape (n_snps,), optional): 576 An array containing the chromosomal positions for each SNP. 577 variants_ref (array of shape (n_snps,), optional): 578 An array containing the reference allele for each SNP. 579 variants_alt (array of shape (n_snps,), optional): 580 An array containing the alternate allele for each SNP. 
581 variants_filter_pass (array of shape (n_snps,), optional): 582 An array indicating whether each SNP passed control checks. 583 variants_id (array of shape (n_snps,), optional): 584 An array containing unique identifiers (IDs) for each SNP. 585 variants_qual (array of shape (n_snps,), optional): 586 An array containing the Phred-scaled quality score for each SNP. 587 lai_format (str, optional): 588 Determines the shape of `calldata_lai`: 589 - `"3D"` (default): Shape `(n_snps, n_samples, 2)`. 590 - `"2D"`: Shape `(n_snps, n_samples * 2)`. 591 592 Returns: 593 **SNPObject**: 594 A `SNPObject` containing SNP-level ancestry data and updated SNP attributes. 595 """ 596 from snputils.snp.genobj.snpobj import SNPObject 597 598 assert lai_format in {"2D", "3D"}, "Invalid `lai_format`. Must be '2D' or '3D'." 599 600 # Extract attributes from SNPObject if provided 601 if snpobject is not None: 602 variants_chrom = variants_chrom or snpobject.variants_chrom 603 variants_pos = variants_pos or snpobject.variants_pos 604 variants_ref = variants_ref or snpobject.variants_ref 605 variants_alt = variants_alt or snpobject.variants_alt 606 variants_filter_pass = variants_filter_pass or snpobject.variants_filter_pass 607 variants_id = variants_id or snpobject.variants_id 608 variants_qual = variants_qual or snpobject.variants_qual 609 610 n_samples = self.n_samples 611 lai_reshaped = self.lai.reshape(self.n_windows, n_samples, 2).astype(int) if lai_format == "3D" else None 612 613 if variants_pos is None or variants_chrom is None: 614 # Generate all SNP positions and corresponding chromosome labels between window boundaries 615 variants_pos_list = [] 616 variants_chrom_list = [] 617 ancestry_list = [] 618 619 for i in range(self.n_windows): 620 start = int(self.physical_pos[i, 0]) 621 end = int(self.physical_pos[i, 1]) 622 chrom = self.chromosomes[i] 623 624 # Generate SNP positions at each base pair within the window range 625 positions_in_window = np.arange(start, end + 1) 626 
if positions_in_window.size == 0: 627 continue # Skip windows that contain no valid SNP positions 628 629 n_positions = positions_in_window.size 630 variants_pos_list.append(positions_in_window) 631 variants_chrom_list.append(np.full(n_positions, chrom)) 632 633 ancestry_repeated = ( 634 np.repeat(lai_reshaped[i, np.newaxis, :, :], n_positions, axis=0) 635 if lai_format == "3D" else np.repeat(self.lai[i, np.newaxis, :], n_positions, axis=0) 636 ) 637 ancestry_list.append(ancestry_repeated) 638 639 # Store SNP positions, their corresponding chromosome labels, and their associated ancestry 640 variants_pos = np.concatenate(variants_pos_list) 641 variants_chrom = np.concatenate(variants_chrom_list) 642 calldata_lai = np.concatenate(ancestry_list) 643 else: 644 # Use the provided SNP positions and chromosomes 645 n_snps = len(variants_pos) 646 if len(variants_chrom) != n_snps: 647 raise ValueError("`variants_pos` and `variants_chrom` must have the same length.") 648 649 # Initialize an array to store the corresponding window index for each SNP 650 # Default value is -1, meaning no matching window found 651 snp_to_window_indices = np.full(n_snps, -1, dtype=int) 652 653 # Identify unique chromosome names sorted by order of appearence 654 _, idx = np.unique(variants_chrom, return_index=True) 655 unique_chroms = variants_chrom[np.sort(idx)] 656 657 # Iterate through each unique chromosome to map SNPs to windows 658 for chrom in unique_chroms: 659 # Get indices of SNPs that belong to the current chromosome 660 snp_indices = np.where(variants_chrom == chrom)[0] 661 snp_pos_chr = variants_pos[snp_indices] 662 663 # Get indices of windows that belong to the current chromosome 664 window_indices = np.where(self.chromosomes == chrom)[0] 665 if window_indices.size == 0: 666 continue # Skip if no windows exist for this chromosome 667 668 # Extract start and end positions of the windows on this chromosome 669 window_starts_chr = self.physical_pos[:, 0][window_indices] 670 
window_ends_chr = self.physical_pos[:, 1][window_indices] 671 672 # Find the right-most window that a SNP would fit into (sorted order) 673 inds = np.searchsorted(window_starts_chr, snp_pos_chr, side='right') - 1 674 675 # Mask valid SNPs: ensure they are within a valid range and fall inside window boundaries 676 valid_mask = (inds >= 0) & (inds < len(window_starts_chr)) & (snp_pos_chr <= window_ends_chr[inds]) 677 678 # Assign valid SNPs to their corresponding window indices 679 snp_to_window_indices[snp_indices[valid_mask]] = window_indices[inds[valid_mask]] 680 log.debug(f"Number of SNPs within window ranges for chromosome {chrom}: {valid_mask.sum()}") 681 682 # Initialize SNP-level ancestry array with a missing-value sentinel. 683 # `-1` marks SNPs that do not fall within any LAI window. 684 shape = (n_snps, n_samples, 2) if lai_format == "3D" else (n_snps, n_samples * 2) 685 calldata_lai = np.full(shape, -1, dtype=np.int16) 686 687 # Assign ancestry values to SNPs with valid window assignments 688 valid_mask = (snp_to_window_indices != -1) 689 snp_indices = np.where(valid_mask)[0] 690 snp_to_window_indices = snp_to_window_indices[snp_indices] 691 692 if lai_format == "3D": 693 calldata_lai[snp_indices] = lai_reshaped[snp_to_window_indices] 694 else: # "2D" 695 calldata_lai[snp_indices] = self.lai[snp_to_window_indices] 696 697 if snpobject is not None: 698 # If a SNPObject was provided, update its attributes with any new values and add `calldata_lai`` 699 snpobject.variants_chrom = variants_chrom 700 snpobject.variants_pos = variants_pos 701 snpobject.variants_ref = variants_ref 702 snpobject.variants_alt = variants_alt 703 snpobject.variants_filter_pass = variants_filter_pass 704 snpobject.variants_id = variants_id 705 snpobject.variants_qual = variants_qual 706 snpobject.calldata_lai = calldata_lai 707 snpobject.ancestry_map = self.ancestry_map 708 return snpobject 709 else: 710 # Otherwise, create a new SNPObject 711 return SNPObject( 712 
calldata_lai=calldata_lai.view(), 713 samples=self.samples, 714 variants_ref=variants_ref.view() if isinstance(variants_ref, np.ndarray) else variants_ref, 715 variants_alt=variants_alt.view() if isinstance(variants_alt, np.ndarray) else variants_alt, 716 variants_filter_pass=variants_filter_pass.view() if isinstance(variants_filter_pass, np.ndarray) else variants_filter_pass, 717 variants_chrom=variants_chrom.view(), 718 variants_id=variants_id.view() if isinstance(variants_id, np.ndarray) else variants_id, 719 variants_pos=variants_pos.view(), 720 variants_qual=variants_qual.view() if isinstance(variants_qual, np.ndarray) else variants_qual, 721 ancestry_map=self.ancestry_map 722 ) 723 724 def _sanity_check(self) -> None: 725 """ 726 Perform sanity checks on the parsed data to ensure data integrity. 727 728 This method checks that all unique ancestries in LAI are represented 729 in the ancestry map. 730 731 Args: 732 lai (np.ndarray): The LAI data array. 733 ancestry_map (dict, optional): A dictionary mapping ancestry codes to region names, if available. 734 """ 735 # Get unique ancestries from LAI data 736 unique_ancestries = np.unique(self.lai) 737 738 if self.ancestry_map is not None: 739 # Check if all unique ancestries in the LAI are present in the ancestry map 740 for ancestry in unique_ancestries: 741 if str(ancestry) not in self.ancestry_map: 742 warnings.warn( 743 f"Ancestry '{ancestry}' found in LAI data is not represented in the ancestry map." 744 ) 745 746 def save(self, file: Union[str, Path]) -> None: 747 """ 748 Save the data stored in `self` to a specified file. 749 If the file already exists, it will be overwritten. 750 751 The format of the saved file is determined by the file extension provided in the `file` 752 argument. 753 754 **Supported formats:** 755 756 - `.msp`: Text-based MSP format. 757 - `.msp.tsv`: Text-based MSP format with TSV extension. 758 - `.pkl`: Pickle format for saving `self` in serialized form. 
759 760 Args: 761 file (str or pathlib.Path): 762 Path to the file where the data will be saved. The extension of the file determines the save format. 763 Supported extensions: `.msp`, `.msp.tsv`, `.pkl`. 764 """ 765 path = Path(file) 766 suffixes = [suffix.lower() for suffix in path.suffixes] 767 768 if suffixes[-2:] == ['.msp', '.tsv'] or suffixes[-1] == '.msp': 769 self.save_msp(file) 770 elif suffixes[-1] == '.pkl': 771 self.save_pickle(file) 772 else: 773 raise ValueError( 774 f"Unsupported file extension: {suffixes[-1]}" 775 "Supported extensions are: .msp, .msp.tsv, .pkl." 776 ) 777 778 def save_msp(self, file: Union[str, Path]) -> None: 779 """ 780 Save the data stored in `self` to a `.msp` file. 781 If the file already exists, it will be overwritten. 782 783 Args: 784 file (str or pathlib.Path): 785 Path to the file where the data will be saved. It should end with `.msp` or `.msp.tsv`. 786 If the provided path does not have one of these extensions, the `.msp` extension will be appended. 787 """ 788 from snputils.ancestry.io.local.write import MSPWriter 789 790 MSPWriter(self, file).write() 791 792 def save_pickle(self, file: Union[str, Path]) -> None: 793 """ 794 Save `self` in serialized form to a `.pkl` file. 795 If the file already exists, it will be overwritten. 796 797 Args: 798 file (str or pathlib.Path): 799 Path to the file where the data will be saved. It should end with `.pkl`. 800 If the provided path does not have this extension, it will be appended. 801 """ 802 import pickle 803 with open(file, 'wb') as file: 804 pickle.dump(self, file)
A class for window-level Local Ancestry Inference (LAI) data.
def __init__(
    self,
    haplotypes: List[str],
    lai: np.ndarray,
    samples: Optional[List[str]] = None,
    ancestry_map: Optional[Dict[str, str]] = None,
    window_sizes: Optional[np.ndarray] = None,
    centimorgan_pos: Optional[np.ndarray] = None,
    chromosomes: Optional[np.ndarray] = None,
    physical_pos: Optional[np.ndarray] = None
) -> None:
    """
    Build a window-level local ancestry container.

    Args:
        haplotypes (list of str of length n_haplotypes):
            A list of unique haplotype identifiers.
        lai (array of shape (n_windows, n_haplotypes)):
            A 2D array of local ancestry inference values; rows are genomic windows and
            columns are haplotype phases of the samples.
        samples (list of str of length n_samples, optional):
            A list of unique sample identifiers.
        ancestry_map (dict of str to str, optional):
            A dictionary mapping ancestry codes to region names.
        window_sizes (array of shape (n_windows,), optional):
            An array specifying the number of SNPs in each genomic window.
        centimorgan_pos (array of shape (n_windows, 2), optional):
            A 2D array with the start and end centimorgan positions of each window.
        chromosomes (array of shape (n_windows,), optional):
            An array with the chromosome corresponding to each genomic window.
        physical_pos (array of shape (n_windows, 2), optional):
            A 2D array with the start and end physical positions of each window.

    Raises:
        ValueError: If `lai` is not two-dimensional.
    """
    if lai.ndim != 2:
        raise ValueError("`lai` must be a 2D array with shape (n_windows, n_haplotypes).")

    # The parent class is initialised with counts inferred from `lai` alone:
    # distinct values give the number of ancestries, and every two columns form one sample.
    ancestry_count = len(np.unique(lai))
    haplotype_count = lai.shape[1]
    super(LocalAncestryObject, self).__init__(haplotype_count // 2, ancestry_count)

    self.__haplotypes = haplotypes
    self.__lai = lai
    self.__window_sizes = window_sizes
    self.__centimorgan_pos = centimorgan_pos
    self.__samples = samples
    self.__chromosomes = chromosomes
    self.__physical_pos = physical_pos
    self.__ancestry_map = ancestry_map

    # Warn about ancestry codes present in `lai` but absent from `ancestry_map`
    self._sanity_check()
Arguments:
- haplotypes (list of str of length n_haplotypes): A list of unique haplotype identifiers.
- lai (array of shape (n_windows, n_haplotypes)): A 2D array containing local ancestry inference values, where each row represents a genomic window, and each column corresponds to a haplotype phase for each sample.
- samples (list of str of length n_samples, optional): A list of unique sample identifiers.
- ancestry_map (dict of str to str, optional): A dictionary mapping ancestry codes to region names.
- window_sizes (array of shape (n_windows,), optional): An array specifying the number of SNPs in each genomic window.
- centimorgan_pos (array of shape (n_windows, 2), optional): A 2D array containing the start and end centimorgan positions for each window.
- chromosomes (array of shape (n_windows,), optional): An array with chromosome numbers corresponding to each genomic window.
- physical_pos (array of shape (n_windows, 2), optional): A 2D array containing the start and end physical positions for each window.
@property
def lai(self) -> np.ndarray:
    """
    Retrieve `lai`.

    The stored array itself is returned, not a copy.

    Returns:
        **array of shape (n_windows, n_haplotypes):**
            Local ancestry inference values; each row is a genomic window and each
            column is a haplotype phase of a sample.
    """
    return self.__lai
Retrieve lai.
Returns:
array of shape (n_windows, n_haplotypes): A 2D array containing local ancestry inference values, where each row represents a genomic window, and each column corresponds to a haplotype phase for each sample.
@property
def samples(self) -> Optional[List[str]]:
    """
    Retrieve `samples`.

    Explicitly stored sample identifiers take precedence. Otherwise they are derived
    from the haplotype identifiers; if neither is available, None is returned.

    Returns:
        **list of str:** A list of unique sample identifiers.
    """
    explicit_samples = self.__samples
    if explicit_samples is not None:
        return explicit_samples

    haplotype_ids = self.__haplotypes
    if haplotype_ids is None:
        return None

    # The sample name is the text before the first '.' of each haplotype ID.
    # Every other entry is kept because each sample contributes two haplotypes.
    name_per_haplotype = [hap_id.split('.')[0] for hap_id in haplotype_ids]
    return name_per_haplotype[::2]
@property
def ancestry_map(self) -> Optional[Dict[str, str]]:
    """
    Retrieve `ancestry_map`.

    Returns:
        **dict of str to str:**
            A dictionary mapping ancestry codes to region names, or None when no
            mapping was stored.
    """
    return self.__ancestry_map
Retrieve ancestry_map.
Returns:
dict of str to str: A dictionary mapping ancestry codes to region names.
@property
def window_sizes(self) -> Optional[np.ndarray]:
    """
    Retrieve `window_sizes`.

    Returns:
        **array of shape (n_windows,):**
            The number of SNPs in each genomic window, or None when not stored.
    """
    return self.__window_sizes
Retrieve window_sizes.
Returns:
array of shape (n_windows,): An array specifying the number of SNPs in each genomic window.
@property
def centimorgan_pos(self) -> Optional[np.ndarray]:
    """
    Retrieve `centimorgan_pos`.

    Returns:
        **array of shape (n_windows, 2):**
            The start and end centimorgan positions of each window, or None when
            not stored.
    """
    return self.__centimorgan_pos
Retrieve centimorgan_pos.
Returns:
array of shape (n_windows, 2): A 2D array containing the start and end centimorgan positions for each window.
@property
def chromosomes(self) -> Optional[np.ndarray]:
    """
    Retrieve `chromosomes`.

    Returns:
        **array of shape (n_windows,):**
            The chromosome corresponding to each genomic window, or None when
            not stored.
    """
    return self.__chromosomes
Retrieve chromosomes.
Returns:
array of shape (n_windows,): An array with chromosome numbers corresponding to each genomic window.
@property
def physical_pos(self) -> Optional[np.ndarray]:
    """
    Retrieve `physical_pos`.

    Returns:
        **array of shape (n_windows, 2):**
            The start and end physical positions of each window, or None when
            not stored.
    """
    return self.__physical_pos
Retrieve physical_pos.
Returns:
array of shape (n_windows, 2): A 2D array containing the start and end physical positions for each window.
@property
def n_samples(self) -> int:
    """
    Retrieve `n_samples`.

    The count comes from the first available source, in this order: the stored
    sample identifiers, the haplotype identifiers, then the columns of `lai`.

    Returns:
        **int:**
            The total number of samples.
    """
    explicit_samples = self.__samples
    if explicit_samples is not None:
        return len(explicit_samples)

    haplotype_ids = self.__haplotypes
    if haplotype_ids is not None:
        # Two haplotypes are associated with every sample
        return len(haplotype_ids) // 2

    # Columns of `lai` are haplotypes, so halve the column count
    return self.__lai.shape[1] // 2
def copy(self) -> 'LocalAncestryObject':
    """
    Create and return a copy of `self`.

    The copy is shallow: the new instance refers to the same underlying attribute
    objects (arrays, lists, dicts) as the original.

    Returns:
        **LocalAncestryObject:**
            A new instance of the current object.
    """
    # Imported locally under a distinct name because this method is itself called `copy`
    from copy import copy as shallow_copy

    return shallow_copy(self)
Create and return a copy of self.
Returns:
LocalAncestryObject: A new instance of the current object.
def keys(self) -> List[str]:
    """
    Retrieve a list of public attribute names for `self`.

    Returns:
        **list of str:**
            The instance attribute names with the name-mangling prefixes of
            `LocalAncestryObject` and `AncestryObject` stripped, in the order
            reported by `vars(self)`.
    """
    mangling_prefixes = ('_LocalAncestryObject__', '_AncestryObject__')

    public_names = []
    for attribute_name in vars(self):
        for prefix in mangling_prefixes:
            attribute_name = attribute_name.replace(prefix, '')
        public_names.append(attribute_name)
    return public_names
Retrieve a list of public attribute names for self.
Returns:
list of str: A list of attribute names, with internal name-mangling removed, for easier reference to public attributes in the instance.
def filter_windows(
    self,
    indexes: Union[int, Sequence[int], np.ndarray],
    include: bool = True,
    inplace: bool = False
) -> Optional['LocalAncestryObject']:
    """
    Filter genomic windows based on specified indexes.

    This method updates the `lai` attribute to include or exclude the specified genomic windows.
    Attributes such as `window_sizes`, `centimorgan_pos`, `chromosomes`, and `physical_pos` will also be
    updated accordingly if they are not None. The order of genomic windows is preserved.

    Negative indexes are supported and follow
    [NumPy's indexing conventions](https://numpy.org/doc/stable/user/basics.indexing.html).

    Args:
        indexes (int or array-like of int):
            Index(es) of the windows to include or exclude. Can be a single integer or a
            sequence of integers. Negative indexes are supported. An empty sequence selects
            no window (so `include=False` keeps every window).
        include (bool, default=True):
            If True, includes only the specified windows. If False, excludes the specified
            windows. Default is True.
        inplace (bool, default=False):
            If True, modifies `self` in place. If False, returns a new `LocalAncestryObject` with
            the windows filtered. Default is False.

    Returns:
        **Optional[LocalAncestryObject]:**
            A new `LocalAncestryObject` with the specified windows filtered if `inplace=False`.
            If `inplace=True`, modifies `self` in place and returns None.

    Raises:
        IndexError: If any index lies outside `[-n_windows, n_windows)`.
    """
    indexes = np.atleast_1d(indexes)
    if indexes.size == 0:
        # NumPy gives an empty sequence a float dtype, which cannot be used to index the mask
        indexes = indexes.astype(int)

    n_windows = self.n_windows

    # Validate indexes, allowing negative indexes
    if np.any((indexes < -n_windows) | (indexes >= n_windows)):
        raise IndexError("One or more indexes are out of bounds.")

    # Boolean mask over windows; a mask (rather than fancy indexing) preserves window order
    mask = np.zeros(n_windows, dtype=bool)
    mask[indexes] = True
    if not include:
        mask = ~mask

    # Collect the filtered attributes, skipping optional ones that are not set
    filtered = {'lai': self['lai'][mask, :]}

    window_sizes = self['window_sizes']
    if window_sizes is not None:
        filtered['window_sizes'] = window_sizes[mask]

    chromosomes = self['chromosomes']
    if chromosomes is not None:
        filtered['chromosomes'] = chromosomes[mask]

    centimorgan_pos = self['centimorgan_pos']
    if centimorgan_pos is not None:
        filtered['centimorgan_pos'] = centimorgan_pos[mask, :]

    physical_pos = self['physical_pos']
    if physical_pos is not None:
        filtered['physical_pos'] = physical_pos[mask, :]

    # Write the results either onto `self` or onto a copy
    target = self if inplace else self.copy()
    for attribute_name, values in filtered.items():
        target[attribute_name] = values

    return None if inplace else target
Filter genomic windows based on specified indexes.
This method updates the lai attribute to include or exclude the specified genomic windows.
Attributes such as window_sizes, centimorgan_pos, chromosomes, and physical_pos will also be
updated accordingly if they are not None. The order of genomic windows is preserved.
Negative indexes are supported and follow NumPy's indexing conventions.
Arguments:
- indexes (int or array-like of int): Index(es) of the windows to include or exclude. Can be a single integer or a sequence of integers. Negative indexes are supported.
- include (bool, default=True): If True, includes only the specified windows. If False, excludes the specified windows. Default is True.
- inplace (bool, default=False): If True, modifies self in place. If False, returns a new LocalAncestryObject with the windows filtered. Default is False.
Returns:
Optional[LocalAncestryObject]: A new LocalAncestryObject with the specified windows filtered if inplace=False. If inplace=True, modifies self in place and returns None.
def filter_samples(
    self,
    samples: Optional[Union[str, Sequence[str], np.ndarray, None]] = None,
    indexes: Optional[Union[int, Sequence[int], np.ndarray, None]] = None,
    include: bool = True,
    reorder: bool = False,
    inplace: bool = False
) -> Optional['LocalAncestryObject']:
    """
    Filter samples based on specified names or indexes.

    This method updates the `lai`, `haplotypes`, and `samples` attributes to include or exclude the specified
    samples. Each sample is associated with two haplotypes, which are included or excluded together.
    The order of the samples is preserved. Set `reorder=True` to match the ordering of the
    provided `samples` and/or `indexes` lists when including.

    If both samples and indexes are provided, any sample matching either a name in samples or an index in
    indexes will be included or excluded.

    Negative indexes are supported and follow
    [NumPy's indexing conventions](https://numpy.org/doc/stable/user/basics.indexing.html).

    Args:
        samples (str or array_like of str, optional):
            Name(s) of the samples to include or exclude. Can be a single sample name or a
            sequence of sample names. Default is None.
        indexes (int or array_like of int, optional):
            Index(es) of the samples to include or exclude. Can be a single index or a sequence
            of indexes. Negative indexes are supported. Default is None.
        include (bool, default=True):
            If True, includes only the specified samples. If False, excludes the specified
            samples. Default is True.
        reorder (bool, default=False):
            Only used when `include=True`. If True, the selected samples are arranged in the order
            given by `samples` first and `indexes` second (duplicates keep their first position).
            The two haplotypes of each sample stay adjacent. Default is False.
        inplace (bool, default=False):
            If True, modifies `self` in place. If False, returns a new `LocalAncestryObject` with the
            samples filtered. Default is False.

    Returns:
        **Optional[LocalAncestryObject]:**
            A new `LocalAncestryObject` with the specified samples filtered if `inplace=False`.
            If `inplace=True`, modifies `self` in place and returns None.

    Raises:
        UserWarning: If neither `samples` nor `indexes` is provided.
        ValueError: If any sample index lies outside `[-n_samples, n_samples)`.
    """
    if samples is None and indexes is None:
        raise UserWarning("At least one of 'samples' or 'indexes' must be provided.")

    n_haplotypes = self.n_haplotypes
    n_samples = self.n_samples
    haplotype_ids = np.array(self['haplotypes'])

    # Mask of haplotypes whose sample name (text before the first '.') was requested
    if samples is not None:
        samples = np.asarray(samples).ravel()
        names_per_haplotype = np.array([hap.split('.')[0] for hap in haplotype_ids])
        mask_samples = np.isin(names_per_haplotype, samples)
    else:
        mask_samples = np.zeros(n_haplotypes, dtype=bool)

    # Mask of haplotypes belonging to the requested sample indexes
    mask_indexes = np.zeros(n_haplotypes, dtype=bool)
    if indexes is not None:
        indexes = np.asarray(indexes).ravel()
        if indexes.size == 0:
            # NumPy gives an empty sequence a float dtype, which cannot be used as an index
            indexes = indexes.astype(int)

        # Validate indexes, allowing negative indexes
        if np.any((indexes < -n_samples) | (indexes >= n_samples)):
            raise ValueError("One or more sample indexes are out of bounds.")

        # Map negative indexes onto their positive equivalents
        indexes = np.mod(indexes, n_samples)

        # Sample i owns haplotype columns 2*i and 2*i + 1
        mask_indexes[2 * indexes] = True
        mask_indexes[2 * indexes + 1] = True

    # Union of both selections
    mask_combined = mask_samples | mask_indexes
    if not include:
        mask_combined = ~mask_combined

    # A sample counts as selected when either of its haplotypes is selected
    sample_mask = mask_combined.reshape(-1, 2).any(axis=1)

    # Optionally compute an ordering of selected samples that follows the provided lists
    ordered_sample_indices = None
    if include and reorder:
        ordered_list = []
        added = np.zeros(n_samples, dtype=bool)
        sample_names_by_sample = np.array([hap.split('.')[0] for hap in haplotype_ids])[::2]

        # Respect the order in `samples`
        if samples is not None:
            for requested_name in np.atleast_1d(samples):
                for idx in np.where(sample_names_by_sample == requested_name)[0]:
                    if sample_mask[idx] and not added[idx]:
                        ordered_list.append(int(idx))
                        added[idx] = True

        # Then respect the order in `indexes`
        if indexes is not None:
            for idx in np.mod(np.atleast_1d(indexes), n_samples):
                if sample_mask[idx] and not added[idx]:
                    ordered_list.append(int(idx))
                    added[idx] = True

        # Append any remaining selected samples in their original order
        for idx in np.where(sample_mask)[0]:
            if not added[idx]:
                ordered_list.append(int(idx))

        ordered_sample_indices = np.asarray(ordered_list, dtype=int)

    all_samples = self['samples']
    if ordered_sample_indices is not None:
        # Interleave the columns as [2*i, 2*i + 1] per sample so both haplotypes of a sample
        # stay adjacent, which is the layout the rest of the class assumes.
        hap_idx = np.stack(
            [2 * ordered_sample_indices, 2 * ordered_sample_indices + 1], axis=1
        ).ravel()
        filtered_lai = self['lai'][:, hap_idx]
        filtered_haplotypes = haplotype_ids[hap_idx].tolist()
        filtered_samples = (
            np.array(all_samples)[ordered_sample_indices].tolist()
            if all_samples is not None else None
        )
    else:
        filtered_lai = self['lai'][:, mask_combined]
        filtered_haplotypes = haplotype_ids[mask_combined].tolist()
        filtered_samples = (
            np.array(all_samples)[sample_mask].tolist()
            if all_samples is not None else None
        )

    # Write the results either onto `self` or onto a copy
    target = self if inplace else self.copy()
    target['haplotypes'] = filtered_haplotypes
    target['samples'] = filtered_samples
    target['lai'] = filtered_lai
    return None if inplace else target
Filter samples based on specified names or indexes.
This method updates the lai, haplotypes, and samples attributes to include or exclude the specified
samples. Each sample is associated with two haplotypes, which are included or excluded together.
The order of the samples is preserved. Set reorder=True to match the ordering of the
provided samples and/or indexes lists when including.
If both samples and indexes are provided, any sample matching either a name in samples or an index in indexes will be included or excluded.
Negative indexes are supported and follow NumPy's indexing conventions.
Arguments:
- samples (str or array_like of str, optional): Name(s) of the samples to include or exclude. Can be a single sample name or a sequence of sample names. Default is None.
- indexes (int or array_like of int, optional): Index(es) of the samples to include or exclude. Can be a single index or a sequence of indexes. Negative indexes are supported. Default is None.
- include (bool, default=True): If True, includes only the specified samples. If False, excludes the specified samples. Default is True.
- inplace (bool, default=False): If True, modifies self in place. If False, returns a new LocalAncestryObject with the samples filtered. Default is False.
Returns:
Optional[LocalAncestryObject]: A new LocalAncestryObject with the specified samples filtered if inplace=False. If inplace=True, modifies self in place and returns None.
def convert_to_snp_level(
    self,
    snpobject: Optional['SNPObject'] = None,
    variants_chrom: Optional[np.ndarray] = None,
    variants_pos: Optional[np.ndarray] = None,
    variants_ref: Optional[np.ndarray] = None,
    variants_alt: Optional[np.ndarray] = None,
    variants_filter_pass: Optional[np.ndarray] = None,
    variants_id: Optional[np.ndarray] = None,
    variants_qual: Optional[np.ndarray] = None,
    lai_format: str = "3D"
) -> 'SNPObject':
    """
    Convert `self` into a `snputils.snp.genobj.SNPObject` SNP-level Local Ancestry Information (LAI),
    with optional support for SNP data.

    If SNP positions (`variants_pos`) and/or chromosomes (`variants_chrom`) are not specified, the method
    generates one SNP at every integer position between the start and end (inclusive) of each genomic
    window, which can be very large for wide windows. Otherwise, the provided SNP coordinates are used to
    assign ancestry values based on their respective windows; SNPs that fall in no window get `-1`.

    If a `SNPObject` is provided, its attributes are used unless explicitly overridden by the function
    arguments. In that case, the SNPObject is updated in place with the (optional) new attributes and the
    computed `calldata_lai`, then returned.

    Args:
        snpobject (SNPObject, optional):
            An existing `SNPObject` to extract SNP attributes from.
        variants_chrom (array of shape (n_snps,), optional):
            An array containing the chromosome for each SNP.
        variants_pos (array of shape (n_snps,), optional):
            An array containing the chromosomal positions for each SNP.
        variants_ref (array of shape (n_snps,), optional):
            An array containing the reference allele for each SNP.
        variants_alt (array of shape (n_snps,), optional):
            An array containing the alternate allele for each SNP.
        variants_filter_pass (array of shape (n_snps,), optional):
            An array indicating whether each SNP passed control checks.
        variants_id (array of shape (n_snps,), optional):
            An array containing unique identifiers (IDs) for each SNP.
        variants_qual (array of shape (n_snps,), optional):
            An array containing the Phred-scaled quality score for each SNP.
        lai_format (str, optional):
            Determines the shape of `calldata_lai`:
            - `"3D"` (default): Shape `(n_snps, n_samples, 2)`.
            - `"2D"`: Shape `(n_snps, n_samples * 2)`.

    Returns:
        **SNPObject**:
            A `SNPObject` containing SNP-level ancestry data and updated SNP attributes.

    Raises:
        ValueError: If explicit `variants_pos` and `variants_chrom` differ in length.
    """
    from snputils.snp.genobj.snpobj import SNPObject

    assert lai_format in {"2D", "3D"}, "Invalid `lai_format`. Must be '2D' or '3D'."

    # Fall back to the SNPObject's attributes for anything not passed explicitly.
    # `is None` is required here: `array or fallback` raises ValueError for arrays
    # with more than one element because their truth value is ambiguous.
    if snpobject is not None:
        if variants_chrom is None:
            variants_chrom = snpobject.variants_chrom
        if variants_pos is None:
            variants_pos = snpobject.variants_pos
        if variants_ref is None:
            variants_ref = snpobject.variants_ref
        if variants_alt is None:
            variants_alt = snpobject.variants_alt
        if variants_filter_pass is None:
            variants_filter_pass = snpobject.variants_filter_pass
        if variants_id is None:
            variants_id = snpobject.variants_id
        if variants_qual is None:
            variants_qual = snpobject.variants_qual

    n_samples = self.n_samples
    # Haplotype columns 2*i and 2*i + 1 become the last axis of sample i in the 3D layout
    lai_reshaped = self.lai.reshape(self.n_windows, n_samples, 2).astype(int) if lai_format == "3D" else None

    if variants_pos is None or variants_chrom is None:
        # Generate all SNP positions and corresponding chromosome labels between window boundaries
        variants_pos_list = []
        variants_chrom_list = []
        ancestry_list = []

        for i in range(self.n_windows):
            start = int(self.physical_pos[i, 0])
            end = int(self.physical_pos[i, 1])
            chrom = self.chromosomes[i]

            # One SNP per base pair, both window ends included
            positions_in_window = np.arange(start, end + 1)
            if positions_in_window.size == 0:
                continue  # Skip windows whose end precedes their start

            n_positions = positions_in_window.size
            variants_pos_list.append(positions_in_window)
            variants_chrom_list.append(np.full(n_positions, chrom))

            # Every generated SNP inherits the ancestry row of its window
            if lai_format == "3D":
                ancestry_repeated = np.repeat(lai_reshaped[i, np.newaxis, :, :], n_positions, axis=0)
            else:
                ancestry_repeated = np.repeat(self.lai[i, np.newaxis, :], n_positions, axis=0)
            ancestry_list.append(ancestry_repeated)

        # Store SNP positions, their corresponding chromosome labels, and their associated ancestry
        variants_pos = np.concatenate(variants_pos_list)
        variants_chrom = np.concatenate(variants_chrom_list)
        calldata_lai = np.concatenate(ancestry_list)
    else:
        # Use the provided SNP positions and chromosomes; accept any array-like input
        variants_pos = np.asarray(variants_pos)
        variants_chrom = np.asarray(variants_chrom)

        n_snps = len(variants_pos)
        if len(variants_chrom) != n_snps:
            raise ValueError("`variants_pos` and `variants_chrom` must have the same length.")

        # Window index for each SNP; -1 means no matching window was found
        snp_to_window_indices = np.full(n_snps, -1, dtype=int)

        # Unique chromosome names in order of first appearance
        _, first_occurrence = np.unique(variants_chrom, return_index=True)
        unique_chroms = variants_chrom[np.sort(first_occurrence)]

        for chrom in unique_chroms:
            # SNPs that belong to the current chromosome
            snp_indices = np.where(variants_chrom == chrom)[0]
            snp_pos_chr = variants_pos[snp_indices]

            # Windows that belong to the current chromosome
            window_indices = np.where(self.chromosomes == chrom)[0]
            if window_indices.size == 0:
                continue  # Skip if no windows exist for this chromosome

            window_starts_chr = self.physical_pos[:, 0][window_indices]
            window_ends_chr = self.physical_pos[:, 1][window_indices]

            # Right-most window starting at or before each SNP.
            # NOTE(review): relies on window starts being sorted within a chromosome.
            inds = np.searchsorted(window_starts_chr, snp_pos_chr, side='right') - 1

            # Keep SNPs that have a candidate window and do not lie past its end
            valid_mask = (inds >= 0) & (inds < len(window_starts_chr)) & (snp_pos_chr <= window_ends_chr[inds])

            snp_to_window_indices[snp_indices[valid_mask]] = window_indices[inds[valid_mask]]
            log.debug(f"Number of SNPs within window ranges for chromosome {chrom}: {valid_mask.sum()}")

        # Initialize SNP-level ancestry array with a missing-value sentinel.
        # `-1` marks SNPs that do not fall within any LAI window.
        shape = (n_snps, n_samples, 2) if lai_format == "3D" else (n_snps, n_samples * 2)
        calldata_lai = np.full(shape, -1, dtype=np.int16)

        # Assign ancestry values to SNPs with valid window assignments
        assigned_snps = np.where(snp_to_window_indices != -1)[0]
        assigned_windows = snp_to_window_indices[assigned_snps]

        if lai_format == "3D":
            calldata_lai[assigned_snps] = lai_reshaped[assigned_windows]
        else:  # "2D"
            calldata_lai[assigned_snps] = self.lai[assigned_windows]

    if snpobject is not None:
        # Update the provided SNPObject in place and hand it back
        snpobject.variants_chrom = variants_chrom
        snpobject.variants_pos = variants_pos
        snpobject.variants_ref = variants_ref
        snpobject.variants_alt = variants_alt
        snpobject.variants_filter_pass = variants_filter_pass
        snpobject.variants_id = variants_id
        snpobject.variants_qual = variants_qual
        snpobject.calldata_lai = calldata_lai
        snpobject.ancestry_map = self.ancestry_map
        return snpobject

    # Otherwise, create a new SNPObject; optional attributes may be None and are passed through
    return SNPObject(
        calldata_lai=calldata_lai.view(),
        samples=self.samples,
        variants_ref=variants_ref.view() if isinstance(variants_ref, np.ndarray) else variants_ref,
        variants_alt=variants_alt.view() if isinstance(variants_alt, np.ndarray) else variants_alt,
        variants_filter_pass=variants_filter_pass.view() if isinstance(variants_filter_pass, np.ndarray) else variants_filter_pass,
        variants_chrom=variants_chrom.view(),
        variants_id=variants_id.view() if isinstance(variants_id, np.ndarray) else variants_id,
        variants_pos=variants_pos.view(),
        variants_qual=variants_qual.view() if isinstance(variants_qual, np.ndarray) else variants_qual,
        ancestry_map=self.ancestry_map
    )
Convert `self` into a `snputils.snp.genobj.SNPObject` containing SNP-level Local Ancestry Information (LAI),
with optional support for SNP data.
If SNP positions (variants_pos) and/or chromosomes (variants_chrom) are not specified, the method generates
SNPs uniformly across the start and end positions of each genomic window. Otherwise, the provided SNP
coordinates are used to assign ancestry values based on their respective windows.
If a SNPObject is provided, its attributes are used unless explicitly overridden by the function arguments.
In that case, the SNPObject is updated with the (optional) new attributes and the computed calldata_lai, then returned.
Arguments:
- snpobject (SNPObject, optional): An existing `SNPObject` to extract SNP attributes from.
- variants_chrom (array of shape (n_snps,), optional): An array containing the chromosome for each SNP.
- variants_pos (array of shape (n_snps,), optional): An array containing the chromosomal positions for each SNP.
- variants_ref (array of shape (n_snps,), optional): An array containing the reference allele for each SNP.
- variants_alt (array of shape (n_snps,), optional): An array containing the alternate allele for each SNP.
- variants_filter_pass (array of shape (n_snps,), optional): An array indicating whether each SNP passed control checks.
- variants_id (array of shape (n_snps,), optional): An array containing unique identifiers (IDs) for each SNP.
- variants_qual (array of shape (n_snps,), optional): An array containing the Phred-scaled quality score for each SNP.
- lai_format (str, optional): Determines the shape of `calldata_lai`:
  - `"3D"` (default): Shape `(n_snps, n_samples, 2)`.
  - `"2D"`: Shape `(n_snps, n_samples * 2)`.
Returns:
SNPObject: A `SNPObject` containing SNP-level ancestry data and updated SNP attributes.
def save(self, file: Union[str, Path]) -> None:
    """
    Save the data stored in `self` to a specified file.
    If the file already exists, it will be overwritten.

    The format of the saved file is determined by the file extension provided in the `file`
    argument (compared case-insensitively).

    **Supported formats:**

    - `.msp`: Text-based MSP format.
    - `.msp.tsv`: Text-based MSP format with TSV extension.
    - `.pkl`: Pickle format for saving `self` in serialized form.

    Args:
        file (str or pathlib.Path):
            Path to the file where the data will be saved. The extension of the file determines the save format.
            Supported extensions: `.msp`, `.msp.tsv`, `.pkl`.

    Raises:
        ValueError: If the path has no extension or an unsupported one.
    """
    suffixes = [suffix.lower() for suffix in Path(file).suffixes]
    # A path without any extension yields an empty list; fall back to an empty
    # string so the lookup below cannot raise IndexError.
    last_suffix = suffixes[-1] if suffixes else ''

    if last_suffix == '.msp' or suffixes[-2:] == ['.msp', '.tsv']:
        self.save_msp(file)
    elif last_suffix == '.pkl':
        self.save_pickle(file)
    else:
        shown = last_suffix if last_suffix else '(none)'
        raise ValueError(
            f"Unsupported file extension: {shown}. "
            "Supported extensions are: .msp, .msp.tsv, .pkl."
        )
Save the data stored in self to a specified file.
If the file already exists, it will be overwritten.
The format of the saved file is determined by the file extension provided in the file
argument.
Supported formats:
- `.msp`: Text-based MSP format.
- `.msp.tsv`: Text-based MSP format with TSV extension.
- `.pkl`: Pickle format for saving `self` in serialized form.
Arguments:
- file (str or pathlib.Path): Path to the file where the data will be saved. The extension of the file determines the save format.
Supported extensions: `.msp`, `.msp.tsv`, `.pkl`.
def save_msp(self, file: Union[str, Path]) -> None:
    """
    Write the data stored in `self` to an `.msp` file via `MSPWriter`.

    Args:
        file (str or pathlib.Path):
            Destination path, expected to end with `.msp` or `.msp.tsv`.
            NOTE(review): overwrite behaviour and any extension handling are
            implemented by `MSPWriter`, which is not visible here -- confirm there.
    """
    # Imported lazily, inside the method, exactly as the original does.
    from snputils.ancestry.io.local.write import MSPWriter

    writer = MSPWriter(self, file)
    writer.write()
Save the data stored in self to a .msp file.
If the file already exists, it will be overwritten.
Arguments:
- file (str or pathlib.Path): Path to the file where the data will be saved. It should end with `.msp` or `.msp.tsv`. If the provided path does not have one of these extensions, the `.msp` extension will be appended.
def save_pickle(self, file: Union[str, Path]) -> None:
    """
    Serialize `self` with `pickle` and write it to `file`.
    If the file already exists, it will be overwritten.

    Args:
        file (str or pathlib.Path):
            Destination path, conventionally ending with `.pkl`. The path is
            opened exactly as given; this method does not modify the extension.
    """
    import pickle

    # Use a distinct name for the handle so the `file` argument stays readable.
    with open(file, 'wb') as handle:
        pickle.dump(self, handle)
Save self in serialized form to a .pkl file.
If the file already exists, it will be overwritten.
Arguments:
- file (str or pathlib.Path): Path to the file where the data will be saved. It should end with
.pkl. If the provided path does not have this extension, it will be appended.
class GlobalAncestryObject(AncestryObject):
    """
    A class for Global Ancestry Inference (GAI) data.
    """
    def __init__(
        self,
        Q: np.ndarray,
        P: Optional[np.ndarray] = None,
        samples: Optional[Sequence] = None,
        snps: Optional[Sequence] = None,
        ancestries: Optional[Sequence] = None
    ) -> None:
        """
        Args:
            Q (array of shape (n_samples, n_ancestries)):
                A 2D array containing per-sample ancestry proportions. Each row corresponds to a sample,
                and each column corresponds to an ancestry.
            P (array of shape (n_snps, n_ancestries), optional):
                A 2D array containing per-ancestry SNP frequencies. Each row corresponds to a SNP,
                and each column corresponds to an ancestry.
            samples (sequence of length n_samples, optional):
                A sequence containing unique identifiers for each sample. If None, sample identifiers
                are assigned as integers from `0` to `n_samples - 1`.
            snps (sequence of length n_snps, optional):
                A sequence containing identifiers for each SNP. If None (and `P` is given), SNPs are
                assigned as integers from `0` to `n_snps - 1`. Ignored when `P` is None.
            ancestries (sequence of length n_samples, optional):
                A sequence containing ancestry labels for each sample.

        Raises:
            ValueError: If the dimensions of the inputs are inconsistent with each other.
        """
        # Determine dimensions
        n_samples, n_ancestries_Q = Q.shape
        if P is not None:
            n_snps, n_ancestries_P = P.shape
            if n_ancestries_Q != n_ancestries_P:
                raise ValueError(
                    f"The number of ancestries in Q ({n_ancestries_Q}) and P ({n_ancestries_P}) must be the same."
                )

        n_ancestries = n_ancestries_Q

        # Assign default sample identifiers if none provided
        if samples is None:
            samples = list(range(n_samples))
        else:
            samples = list(samples)
            if len(samples) != n_samples:
                raise ValueError(
                    f"Length of samples ({len(samples)}) does not match number of samples ({n_samples})."
                )

        # Assign default SNP identifiers if none provided; without P there are no SNPs to label
        if P is None:
            snps = None
        else:
            if snps is None:
                snps = list(range(n_snps))
            else:
                snps = list(snps)
                if len(snps) != n_snps:
                    raise ValueError(
                        f"Length of snps ({len(snps)}) does not match number of SNPs ({n_snps})."
                    )

        if ancestries is not None:
            if len(ancestries) != n_samples:
                raise ValueError(
                    f"Length of ancestries ({len(ancestries)}) does not match number of samples ({n_samples})."
                )

        super().__init__(n_samples, n_ancestries)

        # Store attributes
        self.__Q = Q
        self.__P = P
        self.__samples = np.asarray(samples)
        self.__snps = np.asarray(snps) if snps is not None else None
        self.__ancestries = np.asarray(ancestries) if ancestries is not None else None

        # Perform sanity checks
        self._sanity_check()

    def __getitem__(self, key):
        """
        To access an attribute of the class using the square bracket notation,
        similar to a dictionary.
        """
        try:
            return getattr(self, key)
        except AttributeError:
            raise KeyError(f'Invalid key: {key}')

    def __setitem__(self, key, value):
        """
        To set an attribute of the class using the square bracket notation,
        similar to a dictionary.
        """
        try:
            setattr(self, key, value)
        except AttributeError:
            raise KeyError(f'Invalid key: {key}')

    @property
    def Q(self) -> np.ndarray:
        """
        Retrieve `Q`.

        Returns:
            **array of shape (n_samples, n_ancestries):**
                A 2D array containing per-sample ancestry proportions. Each row corresponds to a sample,
                and each column corresponds to an ancestry.
        """
        return self.__Q

    @Q.setter
    def Q(self, x: np.ndarray):
        """
        Update `Q`. The new array must keep the current `(n_samples, n_ancestries)` shape.
        """
        if x.shape != (self.n_samples, self.n_ancestries):
            raise ValueError(
                f"Q must have shape ({self.n_samples}, {self.n_ancestries}); got {x.shape}."
            )
        self.__Q = x

    def _replace_P(self, x: np.ndarray, label: str) -> None:
        """
        Validate and store a new P matrix; `label` names the property in error messages.

        The previous matrix is restored if the sanity check rejects the new one, so a
        failed assignment never leaves the object in an inconsistent state.
        """
        if x.shape[1] != self.n_ancestries:
            raise ValueError(
                f"{label} must have {self.n_ancestries} columns (one per ancestry); got shape {x.shape}."
            )
        previous = self.__P
        self.__P = x
        try:
            self._sanity_check()
        except ValueError:
            self.__P = previous
            raise

    @property
    def P(self) -> Optional[np.ndarray]:
        """
        Retrieve `P`.

        Returns:
            **array of shape (n_snps, n_ancestries):**
                A 2D array containing per-ancestry SNP frequencies. Each row corresponds to a SNP,
                and each column corresponds to an ancestry. None if no P matrix was provided.
        """
        return self.__P

    @P.setter
    def P(self, x: np.ndarray):
        """
        Update `P`.

        Raises:
            ValueError: If `x` does not have one column per ancestry, or if the object
                fails its sanity checks with the new matrix (the old one is then kept).
        """
        self._replace_P(x, "P")

    @property
    def F(self) -> Optional[np.ndarray]:
        """
        Alias for `P`.

        Returns:
            **array of shape (n_snps, n_ancestries):**
                A 2D array containing per-ancestry SNP frequencies. Each row corresponds to a SNP,
                and each column corresponds to an ancestry. None if no P matrix was provided.
        """
        return self.P

    @F.setter
    def F(self, x: np.ndarray):
        """
        Update `F` (alias for `P`); applies the same validation as the `P` setter.
        """
        self._replace_P(x, "F")

    @property
    def samples(self) -> Optional[np.ndarray]:
        """
        Retrieve `samples`.

        Returns:
            **array of shape (n_samples,):**
                An array containing unique identifiers for each sample. If None, sample
                identifiers are assigned as integers from `0` to `n_samples - 1`.
        """
        return self.__samples

    @samples.setter
    def samples(self, x: Sequence):
        """
        Update `samples`.
        """
        x = list(x)
        if len(x) != self.n_samples:
            raise ValueError(
                f"samples must have length {self.n_samples}; got length {len(x)}."
            )
        # Store as an array, matching `__init__` and the other identifier setters.
        self.__samples = np.asarray(x)

    @property
    def snps(self) -> Optional[np.ndarray]:
        """
        Retrieve `snps`.

        Returns:
            **array of shape (n_snps,):**
                An array containing identifiers for each SNP. If None, SNPs are assigned as integers
                from `0` to `n_snps - 1`.
        """
        return self.__snps

    @snps.setter
    def snps(self, x: Sequence):
        """
        Update `snps`.
        """
        x = list(x)
        if len(x) != self.n_snps:
            raise ValueError(
                f"snps must have length {self.n_snps}; got length {len(x)}."
            )
        self.__snps = np.asarray(x)

    @property
    def ancestries(self) -> Optional[np.ndarray]:
        """
        Retrieve `ancestries`.

        Returns:
            **array of shape (n_samples,):**
                An array containing ancestry labels for each sample.
        """
        return self.__ancestries

    @ancestries.setter
    def ancestries(self, x: Sequence):
        """
        Update `ancestries`.
        """
        x = list(x)
        num_x = len(x)
        num_unique_x = len(np.unique(x))

        if num_x != self.n_samples:
            raise ValueError(
                f"ancestries must have length {self.n_samples}; got length {num_x}."
            )
        if num_unique_x > self.n_ancestries:
            raise ValueError(
                f"Number of unique ancestry labels must be less than or equal to {self.n_ancestries}; got {num_unique_x} unique labels."
            )
        self.__ancestries = np.asarray(x)

    @property
    def n_samples(self) -> int:
        """
        Retrieve `n_samples`.

        Returns:
            **int:** The total number of samples (rows of `Q`).
        """
        return self.__Q.shape[0]

    @property
    def n_snps(self) -> int:
        """
        Retrieve `n_snps`.

        Returns:
            **int:** The total number of SNPs (rows of `P`), or 0 when `P` is None.
        """
        return 0 if self.__P is None else self.__P.shape[0]

    @property
    def n_ancestries(self) -> int:
        """
        Retrieve `n_ancestries`.

        Returns:
            **int:** The total number of unique ancestries (columns of `Q`).
        """
        return self.__Q.shape[1]

    def copy(self) -> 'GlobalAncestryObject':
        """
        Create and return a copy of `self`.

        Returns:
            **GlobalAncestryObject:** A new instance produced by `copy.copy` (a shallow copy).
        """
        return copy.copy(self)

    def keys(self) -> List[str]:
        """
        Retrieve a list of public attribute names for `self`.

        Returns:
            **list of str:**
                A list of attribute names, with internal name-mangling removed,
                for easier reference to public attributes in the instance.
        """
        return [attr.replace('_GlobalAncestryObject__', '').replace('_AncestryObject__', '') for attr in vars(self)]

    def _sanity_check(self) -> None:
        """
        Perform sanity checks to ensure that matrix dimensions are consistent with expected sizes.

        Raises:
            **ValueError:** If any of the matrix dimensions do not match the expected sizes.
        """
        # Check that the Q matrix has the correct shape
        if self.__Q.shape != (self.n_samples, self.n_ancestries):
            raise ValueError(
                f"Q must have shape ({self.n_samples}, {self.n_ancestries}); got {self.__Q.shape}."
            )

        # Check that the P matrix has the correct shape (if provided)
        if self.__P is not None:
            if self.__P.shape != (self.n_snps, self.n_ancestries):
                raise ValueError(
                    f"P must have shape ({self.n_snps}, {self.n_ancestries}); got {self.__P.shape}."
                )

        # Check that samples length matches n_samples
        if self.samples is not None:
            if len(self.__samples) != self.n_samples:
                raise ValueError(
                    f"samples must have length {self.n_samples}; got length {len(self.__samples)}."
                )

        # Check that snps length matches n_snps
        if self.snps is not None:
            if len(self.__snps) != self.n_snps:
                raise ValueError(
                    f"snps must have length {self.n_snps}; got length {len(self.__snps)}."
                )

        # Check that ancestries length matches n_samples
        if self.ancestries is not None:
            if len(self.__ancestries) != self.n_samples:
                raise ValueError(
                    f"ancestries must have length {self.n_samples}; got length {len(self.__ancestries)}."
                )

            # Check number of unique ancestry labels
            num_unique_ancestries = len(np.unique(self.__ancestries))
            if num_unique_ancestries > self.n_ancestries:
                raise ValueError(
                    f"Number of unique ancestry labels must be less than or equal to {self.n_ancestries}; got {num_unique_ancestries} unique labels."
                )

    def save(self, file: Union[str, Path]) -> None:
        """
        Save the data stored in `self` to a specified file or set of files.

        The format of the saved file(s) is determined by the file extension provided in the `file`
        argument. If the extension is `.pkl`, the object is serialized as a pickle file. Otherwise,
        the file is treated as a prefix for saving ADMIXTURE files.

        **Supported formats:**

        - `.pkl`: Pickle format for saving `self` in serialized form.
        - Any other extension or no extension: Treated as a prefix for ADMIXTURE files.

        Args:
            file (str or pathlib.Path):
                Path to the file where the data will be saved. If the extension is `.pkl`, the object
                is serialized. Otherwise, it is treated as a prefix for ADMIXTURE files.
        """
        path = Path(file)
        suffix = path.suffix.lower()

        if suffix == '.pkl':
            self.save_pickle(path)
        else:
            self.save_admixture(path)

    def save_admixture(self, file_prefix: Union[str, Path]) -> None:
        """
        Save the data stored in `self` into multiple ADMIXTURE files.
        If the file already exists, it will be overwritten.

        **Output files:**

        - `<file_prefix>.K.Q`: Q matrix file. The file uses space (' ') as the delimiter.
        - `<file_prefix>.K.P`: P matrix file. The file uses space (' ') as the delimiter.
        - `<file_prefix>.sample_ids.txt`: Sample IDs file (if sample IDs are available).
        - `<file_prefix>.snp_ids.txt`: SNP IDs file (if SNP IDs are available).
        - `<file_prefix>.map`: Ancestry file (if ancestries information is available).

        Args:
            file_prefix (str or pathlib.Path):
                The base prefix for output file names, including directory path but excluding file extensions.
                The prefix is used to generate specific file names for each output, with file-specific
                suffixes appended as described above (e.g., `file_prefix.n_ancestries.Q` for the Q matrix file).
        """
        from snputils.ancestry.io.wide.write.admixture import AdmixtureWriter

        AdmixtureWriter(self, file_prefix).write()

    def save_pickle(self, file: Union[str, Path]) -> None:
        """
        Save `self` in serialized form to a pickle file.
        If the file already exists, it will be overwritten.

        Args:
            file (str or pathlib.Path):
                Path to the file where the data will be saved, conventionally ending with `.pkl`.
                The path is opened exactly as given; this method does not modify the extension.
        """
        import pickle

        # Distinct handle name so the `file` argument is not shadowed.
        with open(file, 'wb') as handle:
            pickle.dump(self, handle)
A class for Global Ancestry Inference (GAI) data.
def __init__(
    self,
    Q: np.ndarray,
    P: Optional[np.ndarray] = None,
    samples: Optional[Sequence] = None,
    snps: Optional[Sequence] = None,
    ancestries: Optional[Sequence] = None
) -> None:
    """
    Build the object from a Q matrix and optional P matrix and labels.

    Args:
        Q (array of shape (n_samples, n_ancestries)):
            Per-sample ancestry proportions; one row per sample, one column per ancestry.
        P (array of shape (n_snps, n_ancestries), optional):
            Per-ancestry SNP frequencies; one row per SNP, one column per ancestry.
        samples (sequence of length n_samples, optional):
            Sample identifiers. Defaults to the integers `0` to `n_samples - 1`.
        snps (sequence of length n_snps, optional):
            SNP identifiers. Defaults to the integers `0` to `n_snps - 1` when `P`
            is given; discarded when `P` is None.
        ancestries (sequence of length n_samples, optional):
            Ancestry labels for each sample.

    Raises:
        ValueError: If Q and P disagree on the number of ancestries, or if a label
            sequence has the wrong length.
    """
    # Dimensions come from the matrices themselves
    sample_count, q_ancestry_count = Q.shape
    if P is not None:
        snp_count, p_ancestry_count = P.shape
        if q_ancestry_count != p_ancestry_count:
            raise ValueError(
                f"The number of ancestries in Q ({q_ancestry_count}) and P ({p_ancestry_count}) must be the same."
            )

    # Sample identifiers: default to a running index, otherwise validate the length
    samples = list(range(sample_count)) if samples is None else list(samples)
    if len(samples) != sample_count:
        raise ValueError(
            f"Length of samples ({len(samples)}) does not match number of samples ({sample_count})."
        )

    # SNP identifiers only make sense when a P matrix exists
    if P is None:
        snps = None
    elif snps is None:
        snps = list(range(snp_count))
    else:
        snps = list(snps)
        if len(snps) != snp_count:
            raise ValueError(
                f"Length of snps ({len(snps)}) does not match number of SNPs ({snp_count})."
            )

    if ancestries is not None and len(ancestries) != sample_count:
        raise ValueError(
            f"Length of ancestries ({len(ancestries)}) does not match number of samples ({sample_count})."
        )

    super().__init__(sample_count, q_ancestry_count)

    # Store attributes
    self.__Q = Q
    self.__P = P
    self.__samples = np.asarray(samples)
    self.__snps = None if snps is None else np.asarray(snps)
    self.__ancestries = None if ancestries is None else np.asarray(ancestries)

    # Final consistency check across all stored attributes
    self._sanity_check()
Arguments:
- Q (array of shape (n_samples, n_ancestries)): A 2D array containing per-sample ancestry proportions. Each row corresponds to a sample, and each column corresponds to an ancestry.
- P (array of shape (n_snps, n_ancestries)): A 2D array containing per-ancestry SNP frequencies. Each row corresponds to a SNP, and each column corresponds to an ancestry.
- samples (sequence of length n_samples, optional): A sequence containing unique identifiers for each sample. If None, sample identifiers are assigned as integers from `0` to `n_samples - 1`.
- snps (sequence of length n_snps, optional): A sequence containing identifiers for each SNP. If None, SNPs are assigned as integers from `0` to `n_snps - 1`.
- ancestries (sequence of length n_samples, optional): A sequence containing ancestry labels for each sample.
@property
def Q(self) -> np.ndarray:
    """
    Return the stored `Q` matrix.

    Returns:
        **array of shape (n_samples, n_ancestries):**
            Per-sample ancestry proportions; one row per sample and one column
            per ancestry.
    """
    return self.__Q
Retrieve Q.
Returns:
array of shape (n_samples, n_ancestries): A 2D array containing per-sample ancestry proportions. Each row corresponds to a sample, and each column corresponds to an ancestry.
@property
def P(self) -> np.ndarray:
    """
    Return the stored `P` matrix.

    Returns:
        **array of shape (n_snps, n_ancestries):**
            Per-ancestry SNP frequencies; one row per SNP and one column
            per ancestry.
    """
    return self.__P
Retrieve P.
Returns:
array of shape (n_snps, n_ancestries): A 2D array containing per-ancestry SNP frequencies. Each row corresponds to a SNP, and each column corresponds to an ancestry.
@property
def F(self) -> np.ndarray:
    """
    Alias for `P`; reads through the `P` property.

    Returns:
        **array of shape (n_snps, n_ancestries):**
            Per-ancestry SNP frequencies; one row per SNP and one column
            per ancestry.
    """
    return self.P
Alias for P.
Returns:
array of shape (n_snps, n_ancestries): A 2D array containing per-ancestry SNP frequencies. Each row corresponds to a SNP, and each column corresponds to an ancestry.
@property
def samples(self) -> Optional[np.ndarray]:
    """
    Return the stored sample identifiers.

    Returns:
        **array of shape (n_samples,):**
            Unique identifiers for each sample.
    """
    return self.__samples
Retrieve samples.
Returns:
array of shape (n_samples,): An array containing unique identifiers for each sample. If None, sample identifiers are assigned as integers from `0` to `n_samples - 1`.
@property
def snps(self) -> Optional[np.ndarray]:
    """
    Return the stored SNP identifiers.

    Returns:
        **array of shape (n_snps,):**
            Identifiers for each SNP, or None when no SNP identifiers are stored.
    """
    return self.__snps
Retrieve snps.
Returns:
array of shape (n_snps,): An array containing identifiers for each SNP. If None, SNPs are assigned as integers from `0` to `n_snps - 1`.
@property
def ancestries(self) -> Optional[np.ndarray]:
    """
    Return the stored ancestry labels.

    Returns:
        **array of shape (n_samples,):**
            Ancestry labels for each sample, or None when no labels are stored.
    """
    return self.__ancestries
Retrieve ancestries.
Returns:
array of shape (n_samples,): An array containing ancestry labels for each sample.
def copy(self) -> 'GlobalAncestryObject':
    """
    Return a copy of `self` produced by `copy.copy`.

    Returns:
        **GlobalAncestryObject:** A new instance of the current object (shallow copy).
    """
    duplicate = copy.copy(self)
    return duplicate
Create and return a copy of self.
Returns:
GlobalAncestryObject: A new instance of the current object.
def keys(self) -> List[str]:
    """
    List the instance attribute names of `self` without name-mangling prefixes.

    Returns:
        **list of str:**
            The names found in `vars(self)`, with the `_GlobalAncestryObject__`
            and `_AncestryObject__` substrings removed.
    """
    public_names = []
    for mangled in vars(self):
        stripped = mangled.replace('_GlobalAncestryObject__', '')
        public_names.append(stripped.replace('_AncestryObject__', ''))
    return public_names
Retrieve a list of public attribute names for self.
Returns:
list of str: A list of attribute names, with internal name-mangling removed, for easier reference to public attributes in the instance.
def save(self, file: Union[str, Path]) -> None:
    """
    Save `self` either as a pickle file or as a set of ADMIXTURE files.

    The choice depends on the extension of `file` (compared case-insensitively):

    - `.pkl`: the object is serialized with `save_pickle`.
    - Any other extension or no extension: the path is passed to `save_admixture`
      as a file prefix.

    Args:
        file (str or pathlib.Path):
            Destination path or prefix. It is converted to a `pathlib.Path` before
            being handed to the selected writer method.
    """
    target = Path(file)
    writer = self.save_pickle if target.suffix.lower() == '.pkl' else self.save_admixture
    writer(target)
Save the data stored in self to a specified file or set of files.
The format of the saved file(s) is determined by the file extension provided in the file
argument. If the extension is .pkl, the object is serialized as a pickle file. Otherwise,
the file is treated as a prefix for saving ADMIXTURE files.
Supported formats:
- `.pkl`: Pickle format for saving `self` in serialized form.
- Any other extension or no extension: Treated as a prefix for ADMIXTURE files.
Arguments:
- file (str or pathlib.Path): Path to the file where the data will be saved. If the extension is
.pkl, the object is serialized. Otherwise, it is treated as a prefix for ADMIXTURE files.
def save_admixture(self, file_prefix: Union[str, Path]) -> None:
    """
    Write the data stored in `self` as ADMIXTURE files via `AdmixtureWriter`.

    Args:
        file_prefix (str or pathlib.Path):
            The base prefix for output file names, including directory path but
            excluding file extensions.
            NOTE(review): the exact set of files produced and their suffixes are
            determined by `AdmixtureWriter`, which is not visible here -- confirm there.
    """
    # Imported lazily, inside the method, exactly as the original does.
    from snputils.ancestry.io.wide.write.admixture import AdmixtureWriter

    writer = AdmixtureWriter(self, file_prefix)
    writer.write()
Save the data stored in self into multiple ADMIXTURE files.
If the file already exists, it will be overwritten.
Output files:
- `<file_prefix>.K.Q`: Q matrix file. The file uses space (' ') as the delimiter.
- `<file_prefix>.K.P`: P matrix file. The file uses space (' ') as the delimiter.
- `<file_prefix>.sample_ids.txt`: Sample IDs file (if sample IDs are available).
- `<file_prefix>.snp_ids.txt`: SNP IDs file (if SNP IDs are available).
- `<file_prefix>.map`: Ancestry file (if ancestries information is available).
Arguments:
- file_prefix (str or pathlib.Path): The base prefix for output file names, including directory path but excluding file extensions.
The prefix is used to generate specific file names for each output, with file-specific
suffixes appended as described above (e.g., `file_prefix.n_ancestries.Q` for the Q matrix file).
def save_pickle(self, file: Union[str, Path]) -> None:
    """
    Serialize `self` with `pickle` and write it to `file`.
    If the file already exists, it will be overwritten.

    Args:
        file (str or pathlib.Path):
            Destination path, conventionally ending with `.pkl`. The path is
            opened exactly as given; this method does not modify the extension.
    """
    import pickle

    # Use a distinct name for the handle so the `file` argument stays readable.
    with open(file, 'wb') as handle:
        pickle.dump(self, handle)
Save self in serialized form to a .pkl file.
If the file already exists, it will be overwritten.
Arguments:
- file (str or pathlib.Path): Path to the file where the data will be saved. It should end with
.pkl. If the provided path does not have this extension, it will be appended.
class MSPReader(LAIBaseReader):
    """
    A reader class for parsing Local Ancestry Inference (LAI) data from an `.msp` or `.msp.tsv` file
    and constructing a `snputils.ancestry.genobj.LocalAncestryObject`.
    """
    def __init__(self, file: Union[str, Path]) -> None:
        """
        Args:
            file (str or pathlib.Path):
                Path to the file to be read. It should end with `.msp` or `.msp.tsv`.
        """
        self.__file = Path(file)

    @property
    def file(self) -> Path:
        """
        Retrieve `file`.

        Returns:
            **pathlib.Path:**
                Path to the file to be read. It should end with `.msp` or `.msp.tsv`.
        """
        return self.__file

    def _get_samples(self, msp_df: pd.DataFrame, first_lai_col_indx: int) -> List[str]:
        """
        Extract sample identifiers from the columns of the pandas DataFrame.

        Args:
            msp_df (pd.DataFrame):
                The DataFrame representing the `.msp` data, including LAI columns.
            first_lai_col_indx (int):
                Index of the first column containing LAI data.

        Returns:
            **list:** One identifier per pair of haplotype columns.
        """
        # Haplotype columns come in pairs, so keep every second column name and
        # drop its two trailing characters (the haplotype suffix).
        return [name[:-2] for name in msp_df.columns[first_lai_col_indx::2]]

    def _get_samples_from_haplotypes(self, haplotypes: List[str]) -> List[str]:
        """
        Derive sample identifiers from a list of haplotype column names.

        Args:
            haplotypes (list of str):
                Haplotype column names, two consecutive entries per sample.

        Returns:
            **list:** Every second haplotype name with its last two characters removed.
        """
        return [str(name)[:-2] for name in haplotypes[::2]]

    def _parse_header_and_comment(self) -> tuple[Optional[str], List[str]]:
        """
        Locate the header row (the one containing `#chm`) within the first two lines of the file.

        Returns:
            **tuple:** `(comment, header)` where `comment` is the raw first line when the header
            is on the second line (otherwise `None`) and `header` is the list of stripped,
            tab-separated column names.

        Raises:
            ValueError: If neither of the first two lines contains a `#chm` column.
        """
        # Same encoding as `iter_windows`, so both code paths decode the file identically.
        with open(self.file, encoding="utf-8") as f:
            first_line = f.readline()
            second_line = f.readline()

        first_line_ = [h.strip() for h in first_line.split("\t")]
        second_line_ = [h.strip() for h in second_line.split("\t")]

        if "#chm" in first_line_:
            return None, first_line_
        if "#chm" in second_line_:
            return first_line, second_line_

        raise ValueError(
            f"Header not found. Expected '#chm' in the first two lines. "
            f"First line: {first_line.strip()} | Second line: {second_line.strip()}"
        )

    def _get_first_lai_col_indx(self, header: List[str]) -> int:
        """
        Compute the index of the first LAI (haplotype) column.

        The count starts at 1 for `#chm` and grows by 2 when both `spos`/`epos` are present,
        by 2 when both `sgpos`/`egpos` are present, and by 1 when `n snps` is present.

        Args:
            header (list of str): Column names of the `.msp` file.

        Returns:
            **int:** Number of metadata columns, i.e. the index of the first LAI column.
        """
        column_counter = 1
        if "spos" in header and "epos" in header:
            column_counter += 2
        if "sgpos" in header and "egpos" in header:
            column_counter += 2
        if "n snps" in header:
            column_counter += 1
        return column_counter

    def read_metadata(self) -> MSPMetadata:
        """
        Read only the header (and optional comment line) of the file.

        Returns:
            **MSPMetadata:** Header, raw comment, index of the first LAI column, haplotype and
            sample identifiers, the parsed ancestry map (or `None` without a comment line) and
            flags telling which optional column groups are present.

        Raises:
            ValueError: If the header is missing or contains duplicate column names.
        """
        comment, header = self._parse_header_and_comment()

        if len(header) != len(set(header)):
            raise ValueError("Duplicate columns detected in the header.")

        first_lai_col_indx = self._get_first_lai_col_indx(header)
        haplotypes = header[first_lai_col_indx:]
        samples = self._get_samples_from_haplotypes(haplotypes)
        ancestry_map = self._get_ancestry_map_from_comment(comment) if comment is not None else None

        return MSPMetadata(
            header=header,
            comment=comment,
            first_lai_col_indx=first_lai_col_indx,
            haplotypes=haplotypes,
            samples=samples,
            ancestry_map=ancestry_map,
            has_physical_pos=("spos" in header and "epos" in header),
            has_centimorgan_pos=("sgpos" in header and "egpos" in header),
            has_window_sizes=("n snps" in header),
        )

    def iter_windows(
        self,
        chunk_size: int = 1024,
        sample_indices: Optional[np.ndarray] = None,
    ) -> Iterator[Dict[str, np.ndarray]]:
        """
        Stream the file in chunks of genomic windows without loading it entirely.

        Args:
            chunk_size (int):
                Maximum number of windows (rows) per yielded chunk. Must be >= 1.
            sample_indices (np.ndarray, optional):
                Indexes of the samples to keep. Both haplotypes of each listed sample are
                returned, in the order (and with the repetitions) given. `None` keeps all
                haplotype columns.

        Yields:
            **dict:** A dictionary with keys `"window_indexes"` (int64 row numbers counted from 0),
            `"chromosomes"` (object array of the `#chm` strings), `"physical_pos"` (int64 array
            of `spos`/`epos`, or `None` when those columns are absent) and `"lai"` (uint8 array
            with one column per selected haplotype).

        Raises:
            ValueError: If `chunk_size` < 1, `sample_indices` is empty or out of bounds, or a
                data row does not have the expected number of fields. Because this is a
                generator, these are raised once iteration starts.
        """
        metadata = self.read_metadata()

        if chunk_size < 1:
            raise ValueError("chunk_size must be >= 1.")
        chunk_size = int(chunk_size)

        header = metadata.header
        first_lai_col_indx = metadata.first_lai_col_indx
        column_index = {name: i for i, name in enumerate(header)}
        chrom_col_idx = column_index["#chm"]

        spos_col_idx: Optional[int] = None
        epos_col_idx: Optional[int] = None
        if metadata.has_physical_pos:
            spos_col_idx = column_index["spos"]
            epos_col_idx = column_index["epos"]

        # Haplotype indexes relative to the first LAI column, in output order.
        n_total_haps = len(metadata.haplotypes)
        if sample_indices is None:
            hap_indices = np.arange(n_total_haps, dtype=np.int64)
        else:
            sample_indices = np.asarray(sample_indices, dtype=np.int64)
            if sample_indices.size == 0:
                raise ValueError("sample_indices cannot be empty.")
            if np.any(sample_indices < 0) or np.any(sample_indices >= len(metadata.samples)):
                raise ValueError("sample_indices contain out-of-bounds sample indexes.")

            hap_indices = np.empty(sample_indices.size * 2, dtype=np.int64)
            hap_indices[0::2] = 2 * sample_indices
            hap_indices[1::2] = 2 * sample_indices + 1

        n_selected_haps = int(hap_indices.size)

        # The copy-free fast path is only valid when the selection is exactly the
        # identity. Comparing just the count and the end points would also accept
        # permuted or repeated selections such as [0, 2, 1, 3] and silently return
        # the columns in file order.
        all_haps_selected = n_selected_haps > 0 and np.array_equal(
            hap_indices, np.arange(n_total_haps)
        )
        relative_hap_idx = None if all_haps_selected else hap_indices.astype(np.intp)

        row_in_chunk = 0
        window_start = 0
        chromosomes_chunk = np.empty(chunk_size, dtype=object)
        lai_chunk = np.empty((chunk_size, n_selected_haps), dtype=np.uint8)
        physical_pos_chunk = (
            np.empty((chunk_size, 2), dtype=np.int64)
            if metadata.has_physical_pos
            else None
        )

        with open(self.file, "r", encoding="utf-8") as handle:
            for line_no, raw_line in enumerate(handle, start=1):
                # Comment and header lines both start with '#'.
                if raw_line.startswith("#"):
                    continue

                # Strip CR as well so files with Windows line endings parse cleanly.
                line = raw_line.rstrip("\r\n")
                if not line:
                    continue

                # Split only at the metadata/haplotype boundary, then let numpy's
                # text parser handle the (potentially very wide) haplotype tail.
                fields = line.split("\t", first_lai_col_indx)
                if len(fields) != (first_lai_col_indx + 1):
                    raise ValueError(
                        f"Malformed MSP row at line {line_no}: expected {first_lai_col_indx + 1} "
                        f"prefix segments when parsing haplotypes."
                    )

                chromosomes_chunk[row_in_chunk] = fields[chrom_col_idx]
                if physical_pos_chunk is not None:
                    physical_pos_chunk[row_in_chunk, 0] = int(fields[spos_col_idx])
                    physical_pos_chunk[row_in_chunk, 1] = int(fields[epos_col_idx])

                lai_row = np.fromstring(fields[first_lai_col_indx], sep="\t", dtype=np.uint8)

                if relative_hap_idx is None:
                    if lai_row.size != n_selected_haps:
                        raise ValueError(
                            f"Malformed MSP haplotype row at line {line_no}: expected "
                            f"{n_selected_haps} haplotype values, got {lai_row.size}."
                        )
                    lai_chunk[row_in_chunk, :] = lai_row
                else:
                    if lai_row.size < n_total_haps:
                        raise ValueError(
                            f"Malformed MSP haplotype row at line {line_no}: expected at least "
                            f"{n_total_haps} haplotype values, got {lai_row.size}."
                        )
                    lai_chunk[row_in_chunk, :] = lai_row[relative_hap_idx]

                row_in_chunk += 1
                if row_in_chunk == chunk_size:
                    yield {
                        "window_indexes": np.arange(
                            window_start, window_start + row_in_chunk, dtype=np.int64
                        ),
                        "chromosomes": chromosomes_chunk,
                        "physical_pos": physical_pos_chunk,
                        "lai": lai_chunk,
                    }

                    # Fresh buffers: the arrays just yielded now belong to the consumer.
                    window_start += row_in_chunk
                    row_in_chunk = 0
                    chromosomes_chunk = np.empty(chunk_size, dtype=object)
                    lai_chunk = np.empty((chunk_size, n_selected_haps), dtype=np.uint8)
                    if metadata.has_physical_pos:
                        physical_pos_chunk = np.empty((chunk_size, 2), dtype=np.int64)
                    else:
                        physical_pos_chunk = None

        # Flush the trailing, partially filled chunk.
        if row_in_chunk > 0:
            yield {
                "window_indexes": np.arange(
                    window_start, window_start + row_in_chunk, dtype=np.int64
                ),
                "chromosomes": chromosomes_chunk[:row_in_chunk],
                "physical_pos": (
                    physical_pos_chunk[:row_in_chunk]
                    if physical_pos_chunk is not None
                    else None
                ),
                "lai": lai_chunk[:row_in_chunk],
            }

    def _get_ancestry_map_from_comment(self, comment: str) -> Dict[str, str]:
        """
        Construct an ancestry map from the comment line of the `.msp` file.

        This method parses the comment string to create a mapping of ancestry numerical identifiers
        to their corresponding ancestry names (e.g., '0': 'African').

        Args:
            comment (str):
                The comment line containing ancestry mapping information.

        Returns:
            dict: A dictionary mapping ancestry codes (as strings) to ancestry names.
        """
        comment = comment.strip()

        # Remove everything before the colon, if present
        if ':' in comment:
            comment = comment.split(':', 1)[1].strip()

        ancestry_map: Dict[str, str] = {}

        # Split on tabs, spaces, commas, semicolons or any combination of them
        tokens = [tok.strip() for tok in re.split(r'[,\t; ]+', comment) if tok]

        for tok in tokens:
            if '=' not in tok:
                continue  # Skip pieces that are not `key=value` pairs

            left, right = (p.strip() for p in tok.split('=', 1))

            # Detect whether format is "Pop=0" or "0=Pop"
            if left.isdigit() and not right.isdigit():
                ancestry_map[left] = right  # 0=Africa
            elif right.isdigit() and not left.isdigit():
                ancestry_map[right] = left  # Africa=0
            else:
                # Fallback (both sides digits or both names): keep left as the code
                ancestry_map[left] = right

        return ancestry_map

    def _replace_nan_with_none(self, array: Optional[np.ndarray]) -> Optional[np.ndarray]:
        """
        Replace arrays that carry no information with `None`.

        Args:
            array (np.ndarray): Array to check.

        Returns:
            Optional[np.ndarray]: `None` if the array is `None`, empty, fully NaN (numeric dtypes)
            or made only of empty strings / `None` (string or object dtypes); otherwise the
            original array.
        """
        if array is not None:
            if array.size == 0:  # Empty array
                return None
            if np.issubdtype(array.dtype, np.number):  # Numeric types
                if np.isnan(array).all():  # Fully NaN numeric array
                    return None
            elif array.dtype == np.object_ or np.issubdtype(array.dtype, np.str_):  # String or object types
                # `== None` is intentional: it is an element-wise comparison on the array.
                if np.all((array == '') | (array == None)):  # noqa: E711
                    return None
        return array

    def read(self) -> 'LocalAncestryObject':
        """
        Read data from the provided `.msp` or `.msp.tsv` `file` and construct a
        `snputils.ancestry.genobj.LocalAncestryObject`.

        **Expected MSP content:**

        The `.msp` file should contain local ancestry assignments for each haplotype across genomic windows.
        Each row should correspond to a genomic window and include the following columns:

        - `#chm`: Chromosome numbers corresponding to each genomic window.
        - `spos`: Start physical position for each window.
        - `epos`: End physical position for each window.
        - `sgpos`: Start centimorgan position for each window.
        - `egpos`: End centimorgan position for each window.
        - `n snps`: Number of SNPs in each genomic window.
        - `SampleID.0`: Local ancestry for the first haplotype of the sample for each window.
        - `SampleID.1`: Local ancestry for the second haplotype of the sample for each window.

        Returns:
            **LocalAncestryObject:**
                A LocalAncestryObject instance.

        Raises:
            ValueError: If the header is invalid or the number of sample identifiers does not
                match half the number of LAI columns.
        """
        log.info(f"Reading '{self.file}'...")
        metadata = self.read_metadata()
        comment = metadata.comment
        header = metadata.header

        # Read the main data into a DataFrame, skipping comment lines
        # (the header row starts with '#chm', so it is skipped as well).
        msp_df = pd.read_csv(self.file, sep="\t", comment="#", names=header)

        # Extract chromosomes data
        chromosomes = msp_df['#chm'].astype(str).to_numpy()

        # Extract physical positions (if available)
        if metadata.has_physical_pos:
            physical_pos = msp_df[['spos', 'epos']].to_numpy()
        else:
            physical_pos = None
            log.warning("Physical positions ('spos' and 'epos') not found.")

        # Extract centimorgan positions (if available)
        if metadata.has_centimorgan_pos:
            centimorgan_pos = msp_df[['sgpos', 'egpos']].to_numpy()
        else:
            centimorgan_pos = None
            log.warning("Genetic (centimorgan) positions ('sgpos' and 'egpos') not found.")

        # Extract window sizes (if available)
        if metadata.has_window_sizes:
            window_sizes = msp_df['n snps'].to_numpy()
        else:
            window_sizes = None
            log.warning("Window sizes ('n snps') not found.")

        # Extract LAI data (haplotype-level)
        lai = msp_df.iloc[:, metadata.first_lai_col_indx:].to_numpy(dtype=np.uint8, copy=False)

        # Haplotype and sample identifiers come from the header
        haplotypes = metadata.haplotypes
        samples = metadata.samples

        # Release the DataFrame before building the result to lower peak memory
        del msp_df
        gc.collect()

        # Validate the number of samples matches the LAI data dimensions
        n_samples = len(samples)
        expected_samples = lai.shape[1] // 2
        if n_samples != expected_samples:
            raise ValueError(
                "Mismatch between the number of sample identifiers and the expected number of samples in the LAI array. "
                f"Expected {expected_samples} samples (derived from LAI data); found {n_samples}."
            )

        # Count number of unique ancestries in the LAI data
        n_ancestries = len(np.unique(lai))

        # Use the ancestry map parsed from the comment (if available)
        ancestry_map = None
        if comment is not None:
            ancestry_map = metadata.ancestry_map
            if len(ancestry_map) != n_ancestries:
                warnings.warn(
                    "Mismatch between the number of unique ancestries in the LAI data "
                    f"({n_ancestries}) and the number of classes in the ancestry map "
                    f"({len(ancestry_map)})."
                )
        else:
            warnings.warn(
                "Ancestry map not found. It is recommended to provide an .msp file that contains the ancestry "
                "map as a comment in the first line."
            )

        # Replace uninformative (empty / fully NaN) attributes with None
        window_sizes = self._replace_nan_with_none(window_sizes)
        centimorgan_pos = self._replace_nan_with_none(centimorgan_pos)
        chromosomes = self._replace_nan_with_none(chromosomes)
        physical_pos = self._replace_nan_with_none(physical_pos)

        return LocalAncestryObject(
            haplotypes=haplotypes,
            lai=lai,
            samples=samples,
            ancestry_map=ancestry_map,
            window_sizes=window_sizes,
            centimorgan_pos=centimorgan_pos,
            chromosomes=chromosomes,
            physical_pos=physical_pos
        )
A reader class for parsing Local Ancestry Inference (LAI) data from an .msp or .msp.tsv file
and constructing a snputils.ancestry.genobj.LocalAncestryObject.
def __init__(self, file: Union[str, Path]) -> None:
    """
    Store the location of the MSP file; nothing is opened or read here.

    Args:
        file (str or pathlib.Path):
            Path to the file to be read. It should end with `.msp` or `.msp.tsv`.
    """
    # Normalised to a `Path` and exposed read-only through the `file` property.
    self.__file = Path(file)
Arguments:
- file (str or pathlib.Path): Path to the file to be read. It should end with
`.msp` or `.msp.tsv`.
@property
def file(self) -> Path:
    """
    Retrieve `file`.

    Returns:
        **pathlib.Path:**
            Path to the file to be read, as stored by `__init__`. It should end with
            `.msp` or `.msp.tsv`.
    """
    return self.__file
Retrieve file.
Returns:
pathlib.Path: Path to the file to be read. It should end with
`.msp` or `.msp.tsv`.
def read_metadata(self) -> MSPMetadata:
    """
    Parse the header (and optional comment line) of the file without reading any data rows.

    Returns:
        **MSPMetadata:** Header, raw comment, index of the first LAI column, haplotype and
        sample identifiers, the ancestry map parsed from the comment (`None` when there is
        no comment line) and flags for the optional column groups.

    Raises:
        ValueError: If the header contains duplicate column names.
    """
    comment, header = self._parse_header_and_comment()

    # A set collapses repeated names, so a size difference means duplicates.
    if len(set(header)) != len(header):
        raise ValueError("Duplicate columns detected in the header.")

    lai_start = self._get_first_lai_col_indx(header)
    hap_names = header[lai_start:]
    sample_names = self._get_samples_from_haplotypes(hap_names)

    parsed_map = None
    if comment is not None:
        parsed_map = self._get_ancestry_map_from_comment(comment)

    physical_present = "spos" in header and "epos" in header
    centimorgan_present = "sgpos" in header and "egpos" in header
    sizes_present = "n snps" in header

    return MSPMetadata(
        header=header,
        comment=comment,
        first_lai_col_indx=lai_start,
        haplotypes=hap_names,
        samples=sample_names,
        ancestry_map=parsed_map,
        has_physical_pos=physical_present,
        has_centimorgan_pos=centimorgan_present,
        has_window_sizes=sizes_present,
    )
def iter_windows(
    self,
    chunk_size: int = 1024,
    sample_indices: Optional[np.ndarray] = None,
) -> Iterator[Dict[str, np.ndarray]]:
    """
    Stream the file in chunks of genomic windows without loading it entirely.

    Args:
        chunk_size (int):
            Maximum number of windows (rows) per yielded chunk. Must be >= 1.
        sample_indices (np.ndarray, optional):
            Indexes of the samples to keep. Both haplotypes of each listed sample are
            returned, in the order (and with the repetitions) given. `None` keeps all
            haplotype columns.

    Yields:
        **dict:** A dictionary with keys `"window_indexes"` (int64 row numbers counted from 0),
        `"chromosomes"` (object array of the `#chm` strings), `"physical_pos"` (int64 array
        of `spos`/`epos`, or `None` when those columns are absent) and `"lai"` (uint8 array
        with one column per selected haplotype).

    Raises:
        ValueError: If `chunk_size` < 1, `sample_indices` is empty or out of bounds, or a
            data row does not have the expected number of fields. Because this is a
            generator, these are raised once iteration starts.
    """
    metadata = self.read_metadata()

    if chunk_size < 1:
        raise ValueError("chunk_size must be >= 1.")
    chunk_size = int(chunk_size)

    header = metadata.header
    first_lai_col_indx = metadata.first_lai_col_indx
    column_index = {name: i for i, name in enumerate(header)}
    chrom_col_idx = column_index["#chm"]

    spos_col_idx: Optional[int] = None
    epos_col_idx: Optional[int] = None
    if metadata.has_physical_pos:
        spos_col_idx = column_index["spos"]
        epos_col_idx = column_index["epos"]

    # Haplotype indexes relative to the first LAI column, in output order.
    n_total_haps = len(metadata.haplotypes)
    if sample_indices is None:
        hap_indices = np.arange(n_total_haps, dtype=np.int64)
    else:
        sample_indices = np.asarray(sample_indices, dtype=np.int64)
        if sample_indices.size == 0:
            raise ValueError("sample_indices cannot be empty.")
        if np.any(sample_indices < 0) or np.any(sample_indices >= len(metadata.samples)):
            raise ValueError("sample_indices contain out-of-bounds sample indexes.")

        hap_indices = np.empty(sample_indices.size * 2, dtype=np.int64)
        hap_indices[0::2] = 2 * sample_indices
        hap_indices[1::2] = 2 * sample_indices + 1

    n_selected_haps = int(hap_indices.size)

    # The copy-free fast path is only valid when the selection is exactly the
    # identity. Comparing just the count and the end points would also accept
    # permuted or repeated selections such as [0, 2, 1, 3] and silently return
    # the columns in file order.
    all_haps_selected = n_selected_haps > 0 and np.array_equal(
        hap_indices, np.arange(n_total_haps)
    )
    relative_hap_idx = None if all_haps_selected else hap_indices.astype(np.intp)

    row_in_chunk = 0
    window_start = 0
    chromosomes_chunk = np.empty(chunk_size, dtype=object)
    lai_chunk = np.empty((chunk_size, n_selected_haps), dtype=np.uint8)
    physical_pos_chunk = (
        np.empty((chunk_size, 2), dtype=np.int64)
        if metadata.has_physical_pos
        else None
    )

    with open(self.file, "r", encoding="utf-8") as handle:
        for line_no, raw_line in enumerate(handle, start=1):
            # Comment and header lines both start with '#'.
            if raw_line.startswith("#"):
                continue

            # Strip CR as well so files with Windows line endings parse cleanly.
            line = raw_line.rstrip("\r\n")
            if not line:
                continue

            # Split only at the metadata/haplotype boundary, then let numpy's
            # text parser handle the (potentially very wide) haplotype tail.
            fields = line.split("\t", first_lai_col_indx)
            if len(fields) != (first_lai_col_indx + 1):
                raise ValueError(
                    f"Malformed MSP row at line {line_no}: expected {first_lai_col_indx + 1} "
                    f"prefix segments when parsing haplotypes."
                )

            chromosomes_chunk[row_in_chunk] = fields[chrom_col_idx]
            if physical_pos_chunk is not None:
                physical_pos_chunk[row_in_chunk, 0] = int(fields[spos_col_idx])
                physical_pos_chunk[row_in_chunk, 1] = int(fields[epos_col_idx])

            lai_row = np.fromstring(fields[first_lai_col_indx], sep="\t", dtype=np.uint8)

            if relative_hap_idx is None:
                if lai_row.size != n_selected_haps:
                    raise ValueError(
                        f"Malformed MSP haplotype row at line {line_no}: expected "
                        f"{n_selected_haps} haplotype values, got {lai_row.size}."
                    )
                lai_chunk[row_in_chunk, :] = lai_row
            else:
                if lai_row.size < n_total_haps:
                    raise ValueError(
                        f"Malformed MSP haplotype row at line {line_no}: expected at least "
                        f"{n_total_haps} haplotype values, got {lai_row.size}."
                    )
                lai_chunk[row_in_chunk, :] = lai_row[relative_hap_idx]

            row_in_chunk += 1
            if row_in_chunk == chunk_size:
                yield {
                    "window_indexes": np.arange(
                        window_start, window_start + row_in_chunk, dtype=np.int64
                    ),
                    "chromosomes": chromosomes_chunk,
                    "physical_pos": physical_pos_chunk,
                    "lai": lai_chunk,
                }

                # Fresh buffers: the arrays just yielded now belong to the consumer.
                window_start += row_in_chunk
                row_in_chunk = 0
                chromosomes_chunk = np.empty(chunk_size, dtype=object)
                lai_chunk = np.empty((chunk_size, n_selected_haps), dtype=np.uint8)
                if metadata.has_physical_pos:
                    physical_pos_chunk = np.empty((chunk_size, 2), dtype=np.int64)
                else:
                    physical_pos_chunk = None

    # Flush the trailing, partially filled chunk.
    if row_in_chunk > 0:
        yield {
            "window_indexes": np.arange(
                window_start, window_start + row_in_chunk, dtype=np.int64
            ),
            "chromosomes": chromosomes_chunk[:row_in_chunk],
            "physical_pos": (
                physical_pos_chunk[:row_in_chunk]
                if physical_pos_chunk is not None
                else None
            ),
            "lai": lai_chunk[:row_in_chunk],
        }
def read(self) -> 'LocalAncestryObject':
    """
    Load the whole `.msp` or `.msp.tsv` `file` and build a
    `snputils.ancestry.genobj.LocalAncestryObject` from it.

    **Expected MSP content:**

    One row per genomic window, holding the local ancestry assignment of every haplotype,
    with the following columns:

    - `#chm`: Chromosome numbers corresponding to each genomic window.
    - `spos`: Start physical position for each window.
    - `epos`: End physical position for each window.
    - `sgpos`: Start centimorgan position for each window.
    - `egpos`: End centimorgan position for each window.
    - `n snps`: Number of SNPs in each genomic window.
    - `SampleID.0`: Local ancestry for the first haplotype of the sample for each window.
    - `SampleID.1`: Local ancestry for the second haplotype of the sample for each window.

    Returns:
        **LocalAncestryObject:**
            A LocalAncestryObject instance.
    """
    log.info(f"Reading '{self.file}'...")
    meta = self.read_metadata()

    # Data rows only: comment lines (and the '#chm' header row) are skipped.
    table = pd.read_csv(self.file, sep="\t", comment="#", names=meta.header)

    chromosomes = table['#chm'].astype(str).to_numpy()

    # Optional column groups: warn and fall back to None when absent.
    physical_pos = None
    if meta.has_physical_pos:
        physical_pos = table[['spos', 'epos']].to_numpy()
    else:
        log.warning("Physical positions ('spos' and 'epos') not found.")

    centimorgan_pos = None
    if meta.has_centimorgan_pos:
        centimorgan_pos = table[['sgpos', 'egpos']].to_numpy()
    else:
        log.warning("Genetic (centimorgan) positions ('sgpos' and 'egpos') not found.")

    window_sizes = None
    if meta.has_window_sizes:
        window_sizes = table['n snps'].to_numpy()
    else:
        log.warning("Window sizes ('n snps') not found.")

    # Everything from the first LAI column onwards is haplotype-level ancestry.
    lai = table.iloc[:, meta.first_lai_col_indx:].to_numpy(dtype=np.uint8, copy=False)

    # The DataFrame is no longer needed; drop it before building the result.
    del table
    gc.collect()

    hap_ids = meta.haplotypes
    sample_ids = meta.samples

    # Two haplotype columns are expected per sample.
    expected_samples = int(lai.shape[1] / 2)
    found_samples = len(sample_ids)
    if found_samples != expected_samples:
        raise ValueError(
            "Mismatch between the number of sample identifiers and the expected number of samples in the LAI array. "
            f"Expected {expected_samples} samples (derived from LAI data); found {found_samples}."
        )

    n_ancestries = len(np.unique(lai))

    if meta.comment is None:
        ancestry_map = None
        warnings.warn(
            "Ancestry map not found. It is recommended to provide an .msp file that contains the ancestry "
            "map as a comment in the first line."
        )
    else:
        ancestry_map = meta.ancestry_map
        if len(ancestry_map) != n_ancestries:
            warnings.warn(
                "Mismatch between the number of unique ancestries in the LAI data "
                f"({n_ancestries}) and the number of classes in the ancestry map "
                f"({len(ancestry_map)})."
            )

    # Attributes that are empty or fully NaN are reported as None.
    window_sizes = self._replace_nan_with_none(window_sizes)
    centimorgan_pos = self._replace_nan_with_none(centimorgan_pos)
    chromosomes = self._replace_nan_with_none(chromosomes)
    physical_pos = self._replace_nan_with_none(physical_pos)

    return LocalAncestryObject(
        haplotypes=hap_ids,
        lai=lai,
        samples=sample_ids,
        ancestry_map=ancestry_map,
        window_sizes=window_sizes,
        centimorgan_pos=centimorgan_pos,
        chromosomes=chromosomes,
        physical_pos=physical_pos
    )
Read data from the provided .msp or .msp.tsv file and construct a
snputils.ancestry.genobj.LocalAncestryObject.
Expected MSP content:
The .msp file should contain local ancestry assignments for each haplotype across genomic windows.
Each row should correspond to a genomic window and include the following columns:
- `#chm`: Chromosome numbers corresponding to each genomic window.
- `spos`: Start physical position for each window.
- `epos`: End physical position for each window.
- `sgpos`: Start centimorgan position for each window.
- `egpos`: End centimorgan position for each window.
- `n snps`: Number of SNPs in each genomic window.
- `SampleID.0`: Local ancestry for the first haplotype of the sample for each window.
- `SampleID.1`: Local ancestry for the second haplotype of the sample for each window.
Returns:
LocalAncestryObject: A LocalAncestryObject instance.
class MSPWriter(LAIBaseWriter):
    """
    A writer class for exporting local ancestry data from a `snputils.ancestry.genobj.LocalAncestryObject`
    into an `.msp` or `.msp.tsv` file.
    """
    def __init__(self, laiobj: LocalAncestryObject, file: Union[str, Path]) -> None:
        """
        Args:
            laiobj (LocalAncestryObject):
                A LocalAncestryObject instance.
            file (str or pathlib.Path):
                Path to the file where the data will be saved. It should end with `.msp` or `.msp.tsv`.
                If it does not, `write` appends the `.msp` extension to the file name.
        """
        self.__laiobj = laiobj
        self.__file = Path(file)

    @property
    def laiobj(self) -> LocalAncestryObject:
        """
        Retrieve `laiobj`.

        Returns:
            **LocalAncestryObject:**
                A LocalAncestryObject instance.
        """
        return self.__laiobj

    @property
    def file(self) -> Path:
        """
        Retrieve `file`.

        Returns:
            **pathlib.Path:**
                Path to the file where the data will be saved. It should end with `.msp` or `.msp.tsv`.
                If it does not, `write` appends the `.msp` extension to the file name.
        """
        return self.__file

    @file.setter
    def file(self, x: Union[str, Path]):
        """
        Update `file`. The value is converted to a `pathlib.Path`.
        """
        self.__file = Path(x)

    def write(self) -> None:
        """
        Write the data contained in the `laiobj` instance to the specified output `file`.
        If the file already exists, a warning is issued and it is overwritten.

        **Output MSP content:**

        When `laiobj.ancestry_map` is set, the first line is a
        `#Subpopulation order/codes: ` comment listing `<ancestry>=<code>` pairs separated by tabs.
        The column header line is always written next, followed by one row per genomic window
        with the following tab-separated columns:

        - `#chm`: Chromosome numbers corresponding to each genomic window.
        - `spos`: Start physical position for each window.
        - `epos`: End physical position for each window.
        - `sgpos`: Start centimorgan position for each window.
        - `egpos`: End centimorgan position for each window.
        - `n snps`: Number of SNPs in each genomic window.
        - `SampleID.0`: Local ancestry for the first haplotype of the sample for each window.
        - `SampleID.1`: Local ancestry for the second haplotype of the sample for each window.

        Attributes of `laiobj` that are None are filled with NaN before writing.

        Raises:
            ValueError: If there are no windows to write.
        """
        import os  # local import: only the platform line separator is needed

        log.info(f"LAI object contains: {self.laiobj.n_samples} samples, {self.laiobj.n_ancestries} ancestries.")

        # Append '.msp' to the file name unless it already carries a valid extension
        valid_extensions = ('.msp', '.msp.tsv')
        if not self.file.name.endswith(valid_extensions):
            self.file = self.file.with_name(self.file.name + '.msp')

        if self.file.exists():
            warnings.warn(f"File '{self.file}' already exists and will be overwritten.")

        n_windows = self.laiobj.n_windows
        n_haplotypes = self.laiobj.n_haplotypes

        # Substitute NaN-filled placeholders for attributes that are None
        chromosomes = self.laiobj.chromosomes if self.laiobj.chromosomes is not None else np.full(n_windows, np.nan)
        physical_pos = self.laiobj.physical_pos if self.laiobj.physical_pos is not None else np.full((n_windows, 2), np.nan)
        centimorgan_pos = self.laiobj.centimorgan_pos if self.laiobj.centimorgan_pos is not None else np.full((n_windows, 2), np.nan)
        window_sizes = self.laiobj.window_sizes if self.laiobj.window_sizes is not None else np.full(n_windows, np.nan)

        haplotypes = self.laiobj.haplotypes
        if haplotypes is None:
            # Generate haplotypes from samples or default identifiers
            if self.laiobj.samples is not None:
                haplotypes = [f"{sample}.{i}" for sample in self.laiobj.samples for i in range(2)]
                warnings.warn(
                    "Haplotype data is missing. Haplotypes have been automatically generated "
                    "from the provided sample identifiers."
                )
            else:
                haplotypes = [f"sample_{i//2}.{i%2}" for i in range(n_haplotypes)]
                warnings.warn(
                    "Haplotype data and sample identifiers are missing. Default haplotype identifiers have been generated "
                    "as `sample_<index>.0` and `sample_<index>.1`."
                )

        # Column names following '#chm' in the header line
        columns = ["spos", "epos", "sgpos", "egpos", "n snps"]
        lai_dic = {
            "#chm": chromosomes,
            "spos": physical_pos[:, 0],
            "epos": physical_pos[:, 1],
            "sgpos": centimorgan_pos[:, 0],
            "egpos": centimorgan_pos[:, 1],
            "n snps": window_sizes,
        }

        # One column per haplotype, in the order of the LAI array columns
        for ilai, haplotype in enumerate(haplotypes):
            lai_dic[haplotype] = self.laiobj.lai[:, ilai]
            columns.append(haplotype)

        if len(lai_dic["#chm"]) == 0:
            raise ValueError("No data to write: all columns are empty or missing.")

        lai_df = pd.DataFrame(lai_dic)

        # Header lines: optional ancestry-map comment, then the column names.
        # The column header is written unconditionally so that files without an
        # ancestry map still carry their column names.
        header_lines = []
        if self.laiobj.ancestry_map is not None:
            first_line = "#Subpopulation order/codes: " + "\t".join(
                f"{name}={code}" for code, name in self.laiobj.ancestry_map.items()
            )
            header_lines.append(first_line.rstrip('\r\n'))
        header_lines.append("#chm" + "\t" + "\t".join(columns))

        log.info(f"Writing MSP file to '{self.file}'...")

        # Single pass: header lines first, then the table. newline="" disables
        # newline translation so the handle can be passed to pandas, whose default
        # line terminator is os.linesep; the header lines use the same terminator.
        with open(self.file, "w", newline="", encoding="utf-8") as f:
            for line in header_lines:
                f.write(line + os.linesep)
            lai_df.to_csv(f, sep="\t", index=False, header=False)

        log.info(f"Finished writing MSP file to '{self.file}'.")

        return None
A writer class for exporting local ancestry data from a snputils.ancestry.genobj.LocalAncestryObject
into an .msp or .msp.tsv file.
def __init__(self, laiobj: LocalAncestryObject, file: Union[str, Path]) -> None:
    """
    Store the object to export and the destination path.

    Args:
        laiobj (LocalAncestryObject):
            The local ancestry data to be written.
        file (str or pathlib.Path):
            Destination path; it is converted to a `pathlib.Path`. It should end with
            `.msp` or `.msp.tsv`; otherwise `write` appends `.msp` to the file name.
    """
    # The two assignments are independent; the path is normalised up front
    self.__file = Path(file)
    self.__laiobj = laiobj
Arguments:
- laiobj (LocalAncestryObject): A LocalAncestryObject instance.
- file (str or pathlib.Path): Path to the file where the data will be saved. It should end with
`.msp` or `.msp.tsv`. If the provided path does not have one of these extensions, the `.msp` extension will be appended.
@property
def file(self) -> Path:
    """
    Retrieve `file`.

    Returns:
        **pathlib.Path:**
            The destination path of the MSP output. It is expected to end with `.msp` or
            `.msp.tsv`; when it does not, `write` appends `.msp` to the file name.
    """
    return self.__file
Retrieve file.
Returns:
pathlib.Path: Path to the file where the data will be saved. It should end with
`.msp` or `.msp.tsv`. If the provided path does not have one of these extensions, the `.msp` extension will be appended.
def write(self) -> None:
    """
    Write the data contained in the `laiobj` instance to the specified output `file`.
    If the file already exists, a warning is issued and it is overwritten.

    **Output MSP content:**

    When `laiobj.ancestry_map` is set, the first line is a
    `#Subpopulation order/codes: ` comment listing `<ancestry>=<code>` pairs separated by tabs.
    The column header line is always written next, followed by one row per genomic window
    with the following tab-separated columns:

    - `#chm`: Chromosome numbers corresponding to each genomic window.
    - `spos`: Start physical position for each window.
    - `epos`: End physical position for each window.
    - `sgpos`: Start centimorgan position for each window.
    - `egpos`: End centimorgan position for each window.
    - `n snps`: Number of SNPs in each genomic window.
    - `SampleID.0`: Local ancestry for the first haplotype of the sample for each window.
    - `SampleID.1`: Local ancestry for the second haplotype of the sample for each window.

    Attributes of `laiobj` that are None are filled with NaN before writing.

    Raises:
        ValueError: If there are no windows to write.
    """
    import os  # local import: only the platform line separator is needed

    log.info(f"LAI object contains: {self.laiobj.n_samples} samples, {self.laiobj.n_ancestries} ancestries.")

    # Append '.msp' to the file name unless it already carries a valid extension
    valid_extensions = ('.msp', '.msp.tsv')
    if not self.file.name.endswith(valid_extensions):
        self.file = self.file.with_name(self.file.name + '.msp')

    if self.file.exists():
        warnings.warn(f"File '{self.file}' already exists and will be overwritten.")

    n_windows = self.laiobj.n_windows
    n_haplotypes = self.laiobj.n_haplotypes

    # Substitute NaN-filled placeholders for attributes that are None
    chromosomes = self.laiobj.chromosomes if self.laiobj.chromosomes is not None else np.full(n_windows, np.nan)
    physical_pos = self.laiobj.physical_pos if self.laiobj.physical_pos is not None else np.full((n_windows, 2), np.nan)
    centimorgan_pos = self.laiobj.centimorgan_pos if self.laiobj.centimorgan_pos is not None else np.full((n_windows, 2), np.nan)
    window_sizes = self.laiobj.window_sizes if self.laiobj.window_sizes is not None else np.full(n_windows, np.nan)

    haplotypes = self.laiobj.haplotypes
    if haplotypes is None:
        # Generate haplotypes from samples or default identifiers
        if self.laiobj.samples is not None:
            haplotypes = [f"{sample}.{i}" for sample in self.laiobj.samples for i in range(2)]
            warnings.warn(
                "Haplotype data is missing. Haplotypes have been automatically generated "
                "from the provided sample identifiers."
            )
        else:
            haplotypes = [f"sample_{i//2}.{i%2}" for i in range(n_haplotypes)]
            warnings.warn(
                "Haplotype data and sample identifiers are missing. Default haplotype identifiers have been generated "
                "as `sample_<index>.0` and `sample_<index>.1`."
            )

    # Column names following '#chm' in the header line
    columns = ["spos", "epos", "sgpos", "egpos", "n snps"]
    lai_dic = {
        "#chm": chromosomes,
        "spos": physical_pos[:, 0],
        "epos": physical_pos[:, 1],
        "sgpos": centimorgan_pos[:, 0],
        "egpos": centimorgan_pos[:, 1],
        "n snps": window_sizes,
    }

    # One column per haplotype, in the order of the LAI array columns
    for ilai, haplotype in enumerate(haplotypes):
        lai_dic[haplotype] = self.laiobj.lai[:, ilai]
        columns.append(haplotype)

    if len(lai_dic["#chm"]) == 0:
        raise ValueError("No data to write: all columns are empty or missing.")

    lai_df = pd.DataFrame(lai_dic)

    # Header lines: optional ancestry-map comment, then the column names.
    # The column header is written unconditionally so that files without an
    # ancestry map still carry their column names.
    header_lines = []
    if self.laiobj.ancestry_map is not None:
        first_line = "#Subpopulation order/codes: " + "\t".join(
            f"{name}={code}" for code, name in self.laiobj.ancestry_map.items()
        )
        header_lines.append(first_line.rstrip('\r\n'))
    header_lines.append("#chm" + "\t" + "\t".join(columns))

    log.info(f"Writing MSP file to '{self.file}'...")

    # Single pass: header lines first, then the table. newline="" disables
    # newline translation so the handle can be passed to pandas, whose default
    # line terminator is os.linesep; the header lines use the same terminator.
    with open(self.file, "w", newline="", encoding="utf-8") as f:
        for line in header_lines:
            f.write(line + os.linesep)
        lai_df.to_csv(f, sep="\t", index=False, header=False)

    log.info(f"Finished writing MSP file to '{self.file}'.")

    return None
Write the data contained in the laiobj instance to the specified output file.
If the file already exists, it will be overwritten.
Output MSP content:
The output .msp file will contain local ancestry assignments for each haplotype across genomic windows.
Each row corresponds to a genomic window and includes the following columns:
- `#chm`: Chromosome numbers corresponding to each genomic window.
- `spos`: Start physical position for each window.
- `epos`: End physical position for each window.
- `sgpos`: Start centimorgan position for each window.
- `egpos`: End centimorgan position for each window.
- `n snps`: Number of SNPs in each genomic window.
- `SampleID.0`: Local ancestry for the first haplotype of the sample for each window.
- `SampleID.1`: Local ancestry for the second haplotype of the sample for each window.
class AdmixtureMappingVCFWriter:
    """
    A writer class for converting and writing local ancestry data into ancestry-specific
    VCF/BCF files for ADMIXTURE mapping.
    """
    def __init__(
        self,
        laiobj: LocalAncestryObject,
        file: Union[str, Path],
        ancestry_map: Optional[Dict[str, str]] = None
    ):
        """
        Args:
            laiobj (LocalAncestryObject):
                A LocalAncestryObject instance.
            file (str or pathlib.Path):
                Base path for the output files. If its suffix is `.vcf` or `.bcf`, that suffix is kept;
                otherwise `.vcf` is used. See `write` for how per-ancestry file names are derived.
            ancestry_map (dict of str to str, optional):
                A dictionary mapping ancestry codes to region names. If not explicitly
                provided, it will default to the `ancestry_map` from `laiobj`.
        """
        self.__laiobj = laiobj
        self.__file = Path(file)
        self.__ancestry_map = ancestry_map

    @property
    def laiobj(self) -> LocalAncestryObject:
        """
        Retrieve `laiobj`.

        Returns:
            **LocalAncestryObject:**
                A LocalAncestryObject instance.
        """
        return self.__laiobj

    @property
    def file(self) -> Path:
        """
        Retrieve `file`.

        Returns:
            **pathlib.Path:**
                Base path for the output files, as given at initialization.
        """
        return self.__file

    @property
    def ancestry_map(self) -> Dict[str, str]:
        """
        Retrieve `ancestry_map`.

        Returns:
            **dict of str to str:**
                A dictionary mapping ancestry codes to region names. If not explicitly
                provided, it will default to the `ancestry_map` from `laiobj`.

        Raises:
            ValueError: If neither an explicit map nor `laiobj.ancestry_map` is available.
        """
        if self.__ancestry_map is not None:
            return self.__ancestry_map
        elif self.laiobj.ancestry_map is not None:
            return self.laiobj.ancestry_map
        else:
            raise ValueError(
                "Ancestry mapping is required but missing. Provide `ancestry_map` "
                "during initialization or ensure `laiobj.ancestry_map` is set."
            )

    def write(self) -> None:
        """
        Write one VCF or BCF file per ancestry defined in the ancestry map.
        If an output file already exists, a warning is issued before it is written.

        **Output content:**

        For each ancestry, the LAI array is converted to a genotype array in which

        - `1` marks haplotype windows whose value equals the ancestry code, and
        - `0` marks all other haplotype windows.

        Consecutive pairs of LAI columns are grouped as the two alleles of one sample.
        Each window becomes one variant with ID `1..n_windows`, REF `'A'` and ALT `'T'`.
        When physical positions are available, the window start is used as the variant
        position and `END=<end_pos>` is passed to the VCF writer as per-variant info.

        **Output files:**

        Files are named `<stem>_<ancestry><ext>` next to `file`, where `<ext>` is the
        suffix of `file` if it is `.vcf` or `.bcf`, and `.vcf` otherwise
        (e.g., `output_African.vcf`).

        Raises:
            ValueError: If no ancestry map is available.
        """
        # Resolve the map once; the property raises if none is available
        ancestry_map = self.ancestry_map

        # Keep a valid suffix, otherwise default to ".vcf"
        valid_extensions = (".vcf", ".bcf")
        out_suffix = self.file.suffix if self.file.suffix in valid_extensions else ".vcf"

        lai = self.laiobj.lai
        n_windows = len(lai)
        # Two haplotype columns per sample; shape is used so an empty array does not fail on lai[0]
        n_samples = lai.shape[1] // 2

        # Everything below is identical for all ancestries, so it is built once
        physical_pos = self.laiobj.physical_pos
        if physical_pos is not None:
            pos_list = np.array([start for start, _ in physical_pos], dtype=np.int64)
            variants_info = [f"END={end}" for _, end in physical_pos]
        else:
            pos_list = None
            variants_info = None

        samples = np.array(self.laiobj.samples)
        variants_chrom = self.laiobj.chromosomes
        variants_id = np.array([str(i + 1) for i in range(n_windows)])
        variants_ref = np.full(n_windows, 'A', dtype='U5')
        variants_alt = np.full(n_windows, 'T', dtype='U1')

        for key, anc_string in ancestry_map.items():
            ancestry = int(key)

            output_file = self.file.with_name(f"{self.file.stem}_{anc_string}{out_suffix}")
            if output_file.exists():
                warnings.warn(f"File '{output_file}' already exists and will be overwritten.")

            # Boolean match mask reinterpreted as int8 (no copy), then split into allele pairs
            calldata_gt = (lai == ancestry).view(np.int8).reshape(n_windows, n_samples, 2)

            variant_data_obj = SNPObject(
                calldata_gt=calldata_gt,
                samples=samples,
                variants_chrom=variants_chrom,
                variants_id=variants_id,
                variants_ref=variants_ref,
                variants_alt=variants_alt,
                variants_pos=pos_list,
            )

            log.info(f"Writing VCF file for ancestry '{anc_string}' to '{output_file}'...")

            vcf_writer = VCFWriter(variant_data_obj, output_file)
            vcf_writer.write(variants_info=variants_info)

            log.info(f"Finished writing VCF file for ancestry '{anc_string}' to '{output_file}'.")

        return
A writer class for converting and writing local ancestry data into ancestry-specific VCF/BCF files for ADMIXTURE mapping.
def __init__(
    self,
    laiobj: LocalAncestryObject,
    file: Union[str, Path],
    ancestry_map: Optional[Dict[str, str]] = None
):
    """
    Store the local ancestry data, the base output path and an optional ancestry map.

    Args:
        laiobj (LocalAncestryObject):
            The local ancestry data to convert.
        file (str or pathlib.Path):
            Base path for the output files; it is converted to a `pathlib.Path`.
        ancestry_map (dict of str to str, optional):
            Mapping from ancestry codes to region names. It is stored as given (possibly None).
    """
    # Independent assignments; the path is normalised to pathlib.Path
    self.__ancestry_map = ancestry_map
    self.__file = Path(file)
    self.__laiobj = laiobj
Arguments:
- laiobj (LocalAncestryObject): A LocalAncestryObject instance.
- file (str or pathlib.Path): Path to the file where the data will be saved. It should end with `.vcf` or `.bcf`. If the provided path does not have one of these extensions, the `.vcf` extension will be appended.
- ancestry_map (dict of str to str, optional): A dictionary mapping ancestry codes to region names. If not explicitly provided, it will default to the `ancestry_map` from `laiobj`.
@property
def file(self) -> Path:
    """
    Retrieve `file`.

    Returns:
        **pathlib.Path:**
            The base output path supplied at initialization, as a `pathlib.Path`.
    """
    return self.__file
Retrieve file.
Returns:
pathlib.Path: Path to the file where the data will be saved. It should end with
`.vcf` or `.bcf`. If the provided path does not have one of these extensions, the `.vcf` extension will be appended.
@property
def ancestry_map(self) -> Dict[str, str]:
    """
    Retrieve `ancestry_map`.

    The map given at initialization takes precedence; otherwise the map stored
    on `laiobj` is used.

    Returns:
        **dict of str to str:**
            A dictionary mapping ancestry codes to region names.

    Raises:
        ValueError: If neither source provides an ancestry map.
    """
    explicit_map = self.__ancestry_map
    if explicit_map is not None:
        return explicit_map

    fallback_map = self.laiobj.ancestry_map
    if fallback_map is not None:
        return fallback_map

    raise ValueError(
        "Ancestry mapping is required but missing. Provide `ancestry_map` "
        "during initialization or ensure `laiobj.ancestry_map` is set."
    )
Retrieve ancestry_map.
Returns:
dict of str to str: A dictionary mapping ancestry codes to region names. If not explicitly provided, it will default to the
`ancestry_map` from `laiobj`.
def write(self) -> None:
    """
    Write one VCF or BCF file per ancestry defined in the ancestry map.
    If an output file already exists, a warning is issued before it is written.

    **Output content:**

    For each ancestry, the LAI array is converted to a genotype array in which

    - `1` marks haplotype windows whose value equals the ancestry code, and
    - `0` marks all other haplotype windows.

    Consecutive pairs of LAI columns are grouped as the two alleles of one sample.
    Each window becomes one variant with ID `1..n_windows`, REF `'A'` and ALT `'T'`.
    When physical positions are available, the window start is used as the variant
    position and `END=<end_pos>` is passed to the VCF writer as per-variant info.

    **Output files:**

    Files are named `<stem>_<ancestry><ext>` next to `file`, where `<ext>` is the
    suffix of `file` if it is `.vcf` or `.bcf`, and `.vcf` otherwise
    (e.g., `output_African.vcf`).

    Raises:
        ValueError: If no ancestry map is available.
    """
    # Resolve the map once; the property raises if none is available
    ancestry_map = self.ancestry_map

    # Keep a valid suffix, otherwise default to ".vcf"
    valid_extensions = (".vcf", ".bcf")
    out_suffix = self.file.suffix if self.file.suffix in valid_extensions else ".vcf"

    lai = self.laiobj.lai
    n_windows = len(lai)
    # Two haplotype columns per sample; shape is used so an empty array does not fail on lai[0]
    n_samples = lai.shape[1] // 2

    # Everything below is identical for all ancestries, so it is built once
    physical_pos = self.laiobj.physical_pos
    if physical_pos is not None:
        pos_list = np.array([start for start, _ in physical_pos], dtype=np.int64)
        variants_info = [f"END={end}" for _, end in physical_pos]
    else:
        pos_list = None
        variants_info = None

    samples = np.array(self.laiobj.samples)
    variants_chrom = self.laiobj.chromosomes
    variants_id = np.array([str(i + 1) for i in range(n_windows)])
    variants_ref = np.full(n_windows, 'A', dtype='U5')
    variants_alt = np.full(n_windows, 'T', dtype='U1')

    for key, anc_string in ancestry_map.items():
        ancestry = int(key)

        output_file = self.file.with_name(f"{self.file.stem}_{anc_string}{out_suffix}")
        if output_file.exists():
            warnings.warn(f"File '{output_file}' already exists and will be overwritten.")

        # Boolean match mask reinterpreted as int8 (no copy), then split into allele pairs
        calldata_gt = (lai == ancestry).view(np.int8).reshape(n_windows, n_samples, 2)

        variant_data_obj = SNPObject(
            calldata_gt=calldata_gt,
            samples=samples,
            variants_chrom=variants_chrom,
            variants_id=variants_id,
            variants_ref=variants_ref,
            variants_alt=variants_alt,
            variants_pos=pos_list,
        )

        log.info(f"Writing VCF file for ancestry '{anc_string}' to '{output_file}'...")

        vcf_writer = VCFWriter(variant_data_obj, output_file)
        vcf_writer.write(variants_info=variants_info)

        log.info(f"Finished writing VCF file for ancestry '{anc_string}' to '{output_file}'.")

    return
Write VCF or BCF files for each ancestry type defined in the ancestry map. If the file already exists, it will be overwritten.
Output VCF/BCF content:
For each ancestry, this method converts LAI data to SNP alleles and writes it in a VCF-compatible format. SNPs are encoded as follows:
- `1`: Indicates positions that match the specified ancestry.
- `0`: Indicates positions that do not match the specified ancestry.
The VCF/BCF files will contain the following fields:
- `CHROM`: Chromosome for each variant.
- `POS`: Chromosomal positions for each variant.
- `ID`: Unique identifier for each variant.
- `REF`: Reference allele for each variant.
- `ALT`: Alternate allele for each variant.
- `QUAL`: Phred-scaled quality score for each variant.
- `FILTER`: Status indicating whether each SNP passed control checks.
- `INFO`: When physical positions are available, contains `END=<end_pos>` for the segment end; otherwise `'.'`.
- `FORMAT`: Genotype format. Set to `'GT'`, representing the genotype as phased alleles.
- `<SampleID>`: One column per sample, containing the genotype data (`1|0`, `0|1`, etc.).
Output files:
- A separate VCF file is written for each ancestry type, with filenames formatted as:
`<filename>_<ancestry>.vcf` (e.g., `output_African.vcf`).
class AdmixtureReader(WideBaseReader):
    """
    A reader class for parsing ADMIXTURE files and constructing a `snputils.ancestry.genobj.GlobalAncestryObject`.
    """
    def __init__(
        self,
        Q_file: Union[str, Path],
        P_file: Optional[Union[str, Path]] = None,
        sample_file: Optional[Union[str, Path]] = None,
        snp_file: Optional[Union[str, Path]] = None,
        ancestry_file: Optional[Union[str, Path]] = None,
    ) -> None:
        """
        Args:
            Q_file (str or pathlib.Path):
                Path to the file containing the Q matrix (per-sample ancestry proportions).
                It should end with .Q or .txt.
                The file should use space (' ') as the delimiter.
            P_file (str or pathlib.Path, optional):
                Path to the file containing the P/F matrix (per-ancestry SNP frequencies).
                It should end with .P or .txt.
                The file should use space (' ') as the delimiter. If None, P is not loaded.
            sample_file (str or pathlib.Path, optional):
                Path to the single-column file containing sample identifiers.
                It should end with .fam or .txt.
                If None, sample identifiers are not loaded.
            snp_file (str or pathlib.Path, optional):
                Path to the single-column file containing SNP identifiers.
                It should end with .bim or .txt.
                If None, SNP identifiers are not loaded.
            ancestry_file (str or pathlib.Path, optional):
                Path to the single-column file containing ancestry labels for each sample.
                It should end with .map or .txt.
                If None, ancestries are not loaded.
        """
        def _as_optional_path(value):
            # None stays None; anything else is wrapped in a Path
            return None if value is None else Path(value)

        self.__Q_file = Path(Q_file)
        self.__P_file = _as_optional_path(P_file)
        self.__sample_file = _as_optional_path(sample_file)
        self.__snp_file = _as_optional_path(snp_file)
        self.__ancestry_file = _as_optional_path(ancestry_file)

    @property
    def Q_file(self) -> Path:
        """
        Retrieve Q_file.

        Returns:
            **pathlib.Path:**
                Path to the space-delimited file holding the Q matrix
                (per-sample ancestry proportions). It should end with .Q or .txt.
        """
        return self.__Q_file

    @property
    def P_file(self) -> Optional[Path]:
        """
        Retrieve P_file.

        Returns:
            **pathlib.Path or None:**
                Path to the space-delimited file holding the P/F matrix
                (per-ancestry SNP frequencies). It should end with .P or .txt.
                If None, P is not loaded.
        """
        return self.__P_file

    @property
    def sample_file(self) -> Optional[Path]:
        """
        Retrieve sample_file.

        Returns:
            **pathlib.Path or None:**
                Path to the single-column file containing sample identifiers.
                It should end with .fam or .txt.
                If None, sample identifiers are not loaded.
        """
        return self.__sample_file

    @property
    def snp_file(self) -> Optional[Path]:
        """
        Retrieve snp_file.

        Returns:
            **pathlib.Path or None:**
                Path to the single-column file containing SNP identifiers.
                It should end with .bim or .txt.
                If None, SNP identifiers are not loaded.
        """
        return self.__snp_file

    @property
    def ancestry_file(self) -> Optional[Path]:
        """
        Retrieve ancestry_file.

        Returns:
            **pathlib.Path or None:**
                Path to the single-column file containing ancestry labels for each sample.
                It should end with .map or .txt.
                If None, ancestries are not loaded.
        """
        return self.__ancestry_file

    def read(self) -> 'GlobalAncestryObject':
        """
        Read data from the provided ADMIXTURE files and construct a
        snputils.ancestry.genobj.GlobalAncestryObject instance.

        **Expected ADMIXTURE files content:**

        - **Q_file**:
            A text file containing the Q matrix with per-sample ancestry proportions.
            Each row corresponds to a sample, and each column corresponds to an ancestry.
        - **P_file**:
            A text file containing the P matrix with per-ancestry SNP frequencies.
            Each row corresponds to a SNP, and each column corresponds to an ancestry.

        Optional files (if provided):
        - **sample_file**: A single-column text file containing sample identifiers in order.
        - **snp_file**: A single-column text file containing SNP identifiers in order.
        - **ancestry_file**: A single-column text file containing ancestry labels for each sample.

        Returns:
            **GlobalAncestryObject:**
                A GlobalAncestryObject instance.
        """
        log.info(f"Reading Q matrix from '{self.Q_file}'...")
        Q_mat = np.genfromtxt(self.Q_file, delimiter=' ')

        # P is optional; it stays None unless a P file was supplied
        P_mat = None
        if self.P_file is not None:
            log.info(f"Reading P matrix from '{self.P_file}'...")
            P_mat = np.genfromtxt(self.P_file, delimiter=' ')

        # Optional identifiers and labels, loaded through the reader's helper methods
        sample_ids = self._read_sample_ids()
        snp_ids = self._read_snps()
        ancestry_labels = self._read_ancestries()

        return GlobalAncestryObject(
            Q_mat,
            P_mat,
            samples=sample_ids,
            snps=snp_ids,
            ancestries=ancestry_labels
        )
A reader class for parsing ADMIXTURE files and constructing a snputils.ancestry.genobj.GlobalAncestryObject.
def __init__(
    self,
    Q_file: Union[str, Path],
    P_file: Optional[Union[str, Path]] = None,
    sample_file: Optional[Union[str, Path]] = None,
    snp_file: Optional[Union[str, Path]] = None,
    ancestry_file: Optional[Union[str, Path]] = None,
) -> None:
    """
    Record the locations of the ADMIXTURE files; nothing is read here.

    Args:
        Q_file (str or pathlib.Path):
            Path to the file containing the Q matrix (per-sample ancestry proportions).
            It should end with .Q or .txt.
            The file should use space (' ') as the delimiter.
        P_file (str or pathlib.Path, optional):
            Path to the file containing the P/F matrix (per-ancestry SNP frequencies).
            It should end with .P or .txt.
            The file should use space (' ') as the delimiter. If None, P is not loaded.
        sample_file (str or pathlib.Path, optional):
            Path to the single-column file containing sample identifiers.
            It should end with .fam or .txt.
            If None, sample identifiers are not loaded.
        snp_file (str or pathlib.Path, optional):
            Path to the single-column file containing SNP identifiers.
            It should end with .bim or .txt.
            If None, SNP identifiers are not loaded.
        ancestry_file (str or pathlib.Path, optional):
            Path to the single-column file containing ancestry labels for each sample.
            It should end with .map or .txt.
            If None, ancestries are not loaded.
    """
    def _as_path(location):
        # Optional inputs stay None; anything else is normalised to a Path.
        return None if location is None else Path(location)

    # Q is mandatory, so it is converted unconditionally.
    self.__Q_file = Path(Q_file)
    self.__P_file = _as_path(P_file)
    self.__sample_file = _as_path(sample_file)
    self.__snp_file = _as_path(snp_file)
    self.__ancestry_file = _as_path(ancestry_file)
Arguments:
- Q_file (str or pathlib.Path): Path to the file containing the Q matrix (per-sample ancestry proportions). It should end with .Q or .txt. The file should use space (' ') as the delimiter.
- P_file (str or pathlib.Path, optional): Path to the file containing the P/F matrix (per-ancestry SNP frequencies). It should end with .P or .txt. The file should use space (' ') as the delimiter. If None, P is not loaded.
- sample_file (str or pathlib.Path, optional): Path to the single-column file containing sample identifiers. It should end with .fam or .txt. If None, sample identifiers are not loaded.
- snp_file (str or pathlib.Path, optional): Path to the single-column file containing SNP identifiers. It should end with .bim or .txt. If None, SNP identifiers are not loaded.
- ancestry_file (str or pathlib.Path, optional): Path to the single-column file containing ancestry labels for each sample. It should end with .map or .txt. If None, ancestries are not loaded.
@property
def Q_file(self) -> Path:
    """
    Retrieve Q_file.

    Returns:
        **pathlib.Path:**
            Path to the file containing the Q matrix (per-sample ancestry proportions),
            as stored by the constructor.
            It should end with .Q or .txt.
            The file should use space (' ') as the delimiter.
    """
    return self.__Q_file
Retrieve Q_file.
Returns:
pathlib.Path: Path to the file containing the Q matrix (per-sample ancestry proportions). It should end with .Q or .txt. The file should use space (' ') as the delimiter.
@property
def P_file(self) -> Optional[Path]:
    """
    Retrieve P_file.

    Returns:
        **pathlib.Path or None:**
            Path to the file containing the P/F matrix (per-ancestry SNP frequencies).
            It should end with .P or .txt.
            The file should use space (' ') as the delimiter.
            None means no P file was given, in which case P is not loaded.
    """
    return self.__P_file
Retrieve P_file.
Returns:
pathlib.Path or None: Path to the file containing the P/F matrix (per-ancestry SNP frequencies). It should end with .P or .txt. The file should use space (' ') as the delimiter. If None, P is not loaded.
@property
def sample_file(self) -> Optional[Path]:
    """
    Retrieve sample_file.

    Returns:
        **pathlib.Path or None:**
            Path to the single-column file containing sample identifiers.
            It should end with .fam or .txt.
            If None, sample identifiers are not loaded.
    """
    return self.__sample_file
Retrieve sample_file.
Returns:
pathlib.Path or None: Path to the single-column file containing sample identifiers. It should end with .fam or .txt. If None, sample identifiers are not loaded.
@property
def snp_file(self) -> Optional[Path]:
    """
    Retrieve snp_file.

    Returns:
        **pathlib.Path or None:**
            Path to the single-column file containing SNP identifiers.
            It should end with .bim or .txt.
            If None, SNP identifiers are not loaded.
    """
    return self.__snp_file
Retrieve snp_file.
Returns:
pathlib.Path or None: Path to the single-column file containing SNP identifiers. It should end with .bim or .txt. If None, SNP identifiers are not loaded.
@property
def ancestry_file(self) -> Optional[Path]:
    """
    Retrieve ancestry_file.

    Returns:
        **pathlib.Path or None:**
            Path to the single-column file containing ancestry labels for each sample.
            It should end with .map or .txt.
            If None, ancestries are not loaded.
    """
    return self.__ancestry_file
Retrieve ancestry_file.
Returns:
pathlib.Path or None: Path to the single-column file containing ancestry labels for each sample. It should end with .map or .txt. If None, ancestries are not loaded.
def read(self) -> 'GlobalAncestryObject':
    """
    Load the configured ADMIXTURE files and assemble a
    snputils.ancestry.genobj.GlobalAncestryObject from them.

    **Expected ADMIXTURE files content:**

    - **Q_file**:
        A text file containing the Q matrix with per-sample ancestry proportions.
        Each row corresponds to a sample, and each column corresponds to an ancestry.
    - **P_file**:
        A text file containing the P matrix with per-ancestry SNP frequencies.
        Each row corresponds to a SNP, and each column corresponds to an ancestry.
        When no P file was configured, None is passed on in place of the matrix.

    Optional files (if provided):
    - **sample_file**: A single-column text file containing sample identifiers in order.
    - **snp_file**: A single-column text file containing SNP identifiers in order.
    - **ancestry_file**: A single-column text file containing ancestry labels for each sample.

    Returns:
        **GlobalAncestryObject:**
            A GlobalAncestryObject instance.
    """
    log.info(f"Reading Q matrix from '{self.Q_file}'...")
    q_matrix = np.genfromtxt(self.Q_file, delimiter=' ')

    # P is optional: start from "absent" and only parse when a path was given.
    p_matrix = None
    if self.P_file is not None:
        log.info(f"Reading P matrix from '{self.P_file}'...")
        p_matrix = np.genfromtxt(self.P_file, delimiter=' ')

    # The three label readers run in the same order as before
    # (samples, SNPs, ancestries) because keyword arguments evaluate left to right.
    return GlobalAncestryObject(
        q_matrix,
        p_matrix,
        samples=self._read_sample_ids(),
        snps=self._read_snps(),
        ancestries=self._read_ancestries(),
    )
Read data from the provided ADMIXTURE files and construct a snputils.ancestry.genobj.GlobalAncestryObject instance.
Expected ADMIXTURE files content:
- Q_file: A text file containing the Q matrix with per-sample ancestry proportions. Each row corresponds to a sample, and each column corresponds to an ancestry.
- P_file: A text file containing the P matrix with per-ancestry SNP frequencies. Each row corresponds to a SNP, and each column corresponds to an ancestry.
Optional files (if provided):
- sample_file: A single-column text file containing sample identifiers in order.
- snp_file: A single-column text file containing SNP identifiers in order.
- ancestry_file: A single-column text file containing ancestry labels for each sample.
Returns:
GlobalAncestryObject: A GlobalAncestryObject instance.
class AdmixtureWriter(WideBaseWriter):
    """
    A writer class for exporting global ancestry data from a
    `snputils.ancestry.genobj.GlobalAncestryObject` into multiple ADMIXTURE files.
    """
    def __init__(
        self,
        wideobj: GlobalAncestryObject,
        file_prefix: Union[str, Path]
    ) -> None:
        """
        Args:
            wideobj (GlobalAncestryObject):
                A GlobalAncestryObject instance.
            file_prefix (str or pathlib.Path):
                Prefix for output file names, including directory path but excluding file extensions.
                Each output name is the prefix with a file-specific suffix appended
                (e.g., `file_prefix.n_ancestries.Q` for the Q matrix file). Dots already
                present in the prefix are preserved.
        """
        super().__init__(wideobj, file_prefix)

        n_ancestries = self.wideobj.n_ancestries
        self.__Q_file = self._with_extension(f".{n_ancestries}.Q")
        self.__P_file = self._with_extension(f".{n_ancestries}.P")

        # Label files are only planned when the corresponding labels exist.
        self.__sample_file = self._with_extension(".sample_ids.txt") if self.wideobj.samples is not None else None
        self.__snp_file = self._with_extension(".snp_ids.txt") if self.wideobj.snps is not None else None
        self.__ancestry_file = self._with_extension(".map") if self.wideobj.ancestries is not None else None

    def _with_extension(self, extension: str) -> Path:
        """
        Append `extension` to the final component of `file_prefix`.

        `Path.with_suffix` is deliberately avoided: it replaces whatever follows the
        last dot, so a prefix such as `run.v2` would lose `.v2` and different
        prefixes could collide on the same output file.
        """
        prefix = self.file_prefix
        return prefix.with_name(prefix.name + extension)

    @property
    def wideobj(self) -> GlobalAncestryObject:
        """
        Retrieve `wideobj`.

        Returns:
            **GlobalAncestryObject:** A GlobalAncestryObject instance.
        """
        # NOTE(review): inside this class `self.__wideobj` is name-mangled to
        # `_AdmixtureWriter__wideobj`, which this class never assigns; confirm that
        # WideBaseWriter (not visible here) makes that attribute available.
        return self.__wideobj

    @property
    def file_prefix(self) -> Path:
        """
        Retrieve `file_prefix`.

        Returns:
            **pathlib.Path:**
                Prefix for output file names, including directory path but excluding file extensions.
                Each output name is the prefix with a file-specific suffix appended
                (e.g., `file_prefix.n_ancestries.Q` for the Q matrix file).
        """
        # NOTE(review): same name-mangling caveat as `wideobj`; confirm against WideBaseWriter.
        return self.__file_prefix

    @property
    def Q_file(self) -> Path:
        """
        Retrieve `Q_file`.

        Returns:
            **pathlib.Path:**
                Path to the `.Q` file that will store the Q matrix (per-sample ancestry proportions).
        """
        return self.__Q_file

    @property
    def P_file(self) -> Path:
        """
        Retrieve `P_file`.

        Returns:
            **pathlib.Path:**
                Path to the `.P` file that will store the P/F matrix (per-ancestry SNP frequencies).
                The file is only written when the object actually carries a P matrix.
        """
        return self.__P_file

    @property
    def sample_file(self) -> Optional[Path]:
        """
        Retrieve `sample_file`.

        Returns:
            **pathlib.Path or None:**
                Path to the `.txt` file that will store sample identifiers.
                If None, sample identifiers are not saved.
        """
        return self.__sample_file

    @property
    def snp_file(self) -> Optional[Path]:
        """
        Retrieve `snp_file`.

        Returns:
            **pathlib.Path or None:**
                Path to the `.txt` file that will store SNP identifiers.
                If None, SNP identifiers are not saved.
        """
        return self.__snp_file

    @property
    def ancestry_file(self) -> Optional[Path]:
        """
        Retrieve `ancestry_file`.

        Returns:
            **pathlib.Path or None:**
                Path to the `.map` file that will store ancestry labels for each sample.
                If None, ancestries are not saved.
        """
        return self.__ancestry_file

    def _write_Q(self) -> None:
        """Write the Q matrix as space-delimited text to `Q_file`."""
        log.info(f"Writing Q matrix to '{self.Q_file}'...")
        np.savetxt(self.Q_file, self.wideobj.Q, delimiter=" ")
        log.info(f"Finished writing Q matrix to '{self.Q_file}'.")

    def _write_P(self) -> None:
        """Write the P matrix as space-delimited text to `P_file`, if there is one."""
        if self.wideobj.P is None:
            # The reader yields P=None when no P file is supplied; np.savetxt would
            # raise on None, so the file is skipped instead.
            log.info(f"No P matrix available; skipping '{self.P_file}'.")
            return
        log.info(f"Writing P matrix to '{self.P_file}'...")
        np.savetxt(self.P_file, self.wideobj.P, delimiter=" ")
        log.info(f"Finished writing P matrix to '{self.P_file}'.")

    def _write_sample_ids(self) -> None:
        """Write one sample identifier per line to `sample_file`, if samples exist."""
        if self.wideobj.samples is not None:
            log.info(f"Writing sample IDs to '{self.sample_file}'...")
            np.savetxt(self.sample_file, self.wideobj.samples, fmt="%s")
            log.info(f"Finished writing sample IDs to '{self.sample_file}'.")

    def _write_snps(self) -> None:
        """Write one SNP identifier per line to `snp_file`, if SNP ids exist."""
        if self.wideobj.snps is not None:
            log.info(f"Writing SNP IDs to '{self.snp_file}'...")
            np.savetxt(self.snp_file, self.wideobj.snps, fmt="%s")
            log.info(f"Finished writing SNP IDs to '{self.snp_file}'.")

    def _write_ancestries(self) -> None:
        """Write one ancestry label per line to `ancestry_file`, if labels exist."""
        if self.wideobj.ancestries is not None:
            log.info(f"Writing ancestry information to '{self.ancestry_file}'...")
            np.savetxt(self.ancestry_file, self.wideobj.ancestries, fmt="%s")
            log.info(f"Finished writing ancestry information to '{self.ancestry_file}'.")

    def write(self) -> None:
        """
        Write the data contained in the `wideobj` instance into the multiple ADMIXTURE files
        with the specified `file_prefix`. If the files already exist, they will be overwritten.
        The parent directory of the prefix is created when missing.

        **Output files:**

        - `<file_prefix>.K.Q`: Q matrix file. The file uses space (' ') as the delimiter.
        - `<file_prefix>.K.P`: P matrix file (if a P matrix is available). The file uses space (' ') as the delimiter.
        - `<file_prefix>.sample_ids.txt`: Sample IDs file (if sample IDs are available).
        - `<file_prefix>.snp_ids.txt`: SNP IDs file (if SNP IDs are available).
        - `<file_prefix>.map`: Ancestry file (if ancestries information is available).

        where `K` is the total number of ancestries.
        """
        log.info(f"Preparing to write ADMIXTURE files with prefix '{self.file_prefix}'...")

        self.file_prefix.parent.mkdir(parents=True, exist_ok=True)

        self._write_Q()
        self._write_P()
        self._write_sample_ids()
        self._write_snps()
        self._write_ancestries()

        log.info(f"Finished writing all ADMIXTURE files with prefix '{self.file_prefix}'.")
A writer class for exporting global ancestry data from a
snputils.ancestry.genobj.GlobalAncestryObject into multiple ADMIXTURE files.
def __init__(
    self,
    wideobj: GlobalAncestryObject,
    file_prefix: Union[str, Path]
) -> None:
    """
    Args:
        wideobj (GlobalAncestryObject):
            A GlobalAncestryObject instance.
        file_prefix (str or pathlib.Path):
            Prefix for output file names, including directory path but excluding file extensions.
            Each output name is the prefix with a file-specific suffix appended
            (e.g., `file_prefix.n_ancestries.Q` for the Q matrix file). Dots already
            present in the prefix are preserved.
    """
    super().__init__(wideobj, file_prefix)

    def _with_extension(extension):
        # Append to the whole final path component. Path.with_suffix would replace
        # everything after the last dot, so a prefix like `run.v2` would lose `.v2`.
        prefix = self.file_prefix
        return prefix.with_name(prefix.name + extension)

    n_ancestries = self.wideobj.n_ancestries
    self.__Q_file = _with_extension(f".{n_ancestries}.Q")
    self.__P_file = _with_extension(f".{n_ancestries}.P")

    # Label files are only planned when the corresponding labels exist.
    self.__sample_file = _with_extension(".sample_ids.txt") if self.wideobj.samples is not None else None
    self.__snp_file = _with_extension(".snp_ids.txt") if self.wideobj.snps is not None else None
    self.__ancestry_file = _with_extension(".map") if self.wideobj.ancestries is not None else None
Arguments:
- wideobj (GlobalAncestryObject): A GlobalAncestryObject instance.
- file_prefix (str or pathlib.Path): Prefix for output file names, including directory path but excluding file extensions.
The prefix is used to generate specific file names for each output, with file-specific
suffixes appended as described above (e.g.,
`file_prefix.n_ancestries.Q` for the Q matrix file).
@property
def file_prefix(self) -> Path:
    """
    Retrieve `file_prefix`.

    Returns:
        **pathlib.Path:**
            Prefix for output file names, including directory path but excluding file extensions.
            The prefix is used to generate specific file names for each output, with file-specific
            suffixes appended (e.g., `file_prefix.n_ancestries.Q` for the Q matrix file).
    """
    # NOTE(review): `self.__file_prefix` is name-mangled with the enclosing class name;
    # confirm the base writer makes that attribute available.
    return self.__file_prefix
Retrieve file_prefix.
Returns:
pathlib.Path: Prefix for output file names, including directory path but excluding file extensions. The prefix is used to generate specific file names for each output, with file-specific suffixes appended as described above (e.g.,
`file_prefix.n_ancestries.Q` for the Q matrix file).
@property
def Q_file(self) -> Path:
    """
    Retrieve `Q_file`.

    Returns:
        **pathlib.Path:**
            Path to the `.Q` file that will store the Q matrix (per-sample ancestry proportions).
    """
    return self.__Q_file
Retrieve Q_file.
Returns:
pathlib.Path: Path to the
`.Q` file that will store the Q matrix (per-sample ancestry proportions).
@property
def P_file(self) -> Path:
    """
    Retrieve `P_file`.

    Returns:
        **pathlib.Path:**
            Path to the `.P` file that will store the P/F matrix (per-ancestry SNP frequencies).
    """
    return self.__P_file
Retrieve P_file.
Returns:
pathlib.Path: Path to the
`.P` file that will store the P/F matrix (per-ancestry SNP frequencies).
@property
def sample_file(self) -> Optional[Path]:
    """
    Retrieve `sample_file`.

    Returns:
        **pathlib.Path or None:**
            Path to the `.txt` file that will store sample identifiers.
            If None, sample identifiers are not saved.
    """
    return self.__sample_file
Retrieve sample_file.
Returns:
pathlib.Path or None: Path to the
`.txt` file that will store sample identifiers. If None, sample identifiers are not saved.
@property
def snp_file(self) -> Optional[Path]:
    """
    Retrieve `snp_file`.

    Returns:
        **pathlib.Path or None:**
            Path to the `.txt` file that will store SNP identifiers.
            If None, SNP identifiers are not saved.
    """
    return self.__snp_file
Retrieve snp_file.
Returns:
pathlib.Path or None: Path to the
`.txt` file that will store SNP identifiers. If None, SNP identifiers are not saved.
@property
def ancestry_file(self) -> Optional[Path]:
    """
    Retrieve `ancestry_file`.

    Returns:
        **pathlib.Path or None:**
            Path to the `.map` file that will store ancestry labels for each sample.
            If None, ancestries are not saved.
    """
    return self.__ancestry_file
Retrieve ancestry_file.
Returns:
pathlib.Path or None: Path to the
`.map` file that will store ancestry labels for each sample. If None, ancestries are not saved.
def write(self) -> None:
    """
    Write the data contained in the `wideobj` instance into the multiple ADMIXTURE files
    with the specified `file_prefix`. If the files already exist, they will be overwritten.
    The parent directory of the prefix is created when missing.

    **Output files:**

    - `<file_prefix>.K.Q`: Q matrix file. The file uses space (' ') as the delimiter.
    - `<file_prefix>.K.P`: P matrix file (if a P matrix is available). The file uses space (' ') as the delimiter.
    - `<file_prefix>.sample_ids.txt`: Sample IDs file (if sample IDs are available).
    - `<file_prefix>.snp_ids.txt`: SNP IDs file (if SNP IDs are available).
    - `<file_prefix>.map`: Ancestry file (if ancestries information is available).

    where `K` is the total number of ancestries.
    """
    log.info(f"Preparing to write ADMIXTURE files with prefix '{self.file_prefix}'...")

    self.file_prefix.parent.mkdir(parents=True, exist_ok=True)

    self._write_Q()
    # P is optional on the object (the reader yields None when no P file is given);
    # writing None would make np.savetxt raise, so the P file is skipped instead.
    if self.wideobj.P is not None:
        self._write_P()
    else:
        log.info("No P matrix available; skipping the P file.")
    self._write_sample_ids()
    self._write_snps()
    self._write_ancestries()

    log.info(f"Finished writing all ADMIXTURE files with prefix '{self.file_prefix}'.")
Write the data contained in the wideobj instance into the multiple ADMIXTURE files
with the specified file_prefix. If the files already exist, they will be overwritten.
Output files:
- `<file_prefix>.K.Q`: Q matrix file. The file uses space (' ') as the delimiter.
- `<file_prefix>.K.P`: P matrix file. The file uses space (' ') as the delimiter.
- `<file_prefix>.sample_ids.txt`: Sample IDs file (if sample IDs are available).
- `<file_prefix>.snp_ids.txt`: SNP IDs file (if SNP IDs are available).
- `<file_prefix>.map`: Ancestry file (if ancestries information is available).
where K is the total number of ancestries.
def read_lai(file: Union[str, Path], **kwargs) -> LocalAncestryObject:
    """
    Read a local ancestry file into a `snputils.ancestry.genobj.LocalAncestryObject`,
    letting `LAIReader` pick the format from the file's extension.

    **Supported formats:**

    - `.msp`: Text-based MSP format.
    - `.msp.tsv`: Text-based MSP format with TSV extension.

    Args:
        file (str or pathlib.Path):
            Path to the file to be read. It should end with `.msp` or `.msp.tsv`.
        **kwargs: Additional arguments passed to the reader method.

    Returns:
        **LocalAncestryObject:**
            Whatever `LAIReader(file).read(**kwargs)` returns.
    """
    # Imported lazily so that importing this module does not pull in the reader.
    from snputils.ancestry.io.local.read.auto import LAIReader

    reader = LAIReader(file)
    return reader.read(**kwargs)
Automatically detect the local ancestry data file format from the file's extension and
read it into a snputils.ancestry.genobj.LocalAncestryObject.
Supported formats:
- `.msp`: Text-based MSP format.
- `.msp.tsv`: Text-based MSP format with TSV extension.
Arguments:
- file (str or pathlib.Path): Path to the file to be read. It should end with `.msp` or `.msp.tsv`.
- **kwargs: Additional arguments passed to the reader method.
def read_msp(file: Union[str, Path]) -> 'LocalAncestryObject':
    """
    Build a `snputils.ancestry.genobj.LocalAncestryObject` from an `.msp` or `.msp.tsv` file.

    Args:
        file (str or pathlib.Path):
            Path to the file to be read. It should end with `.msp` or `.msp.tsv`.

    Returns:
        **LocalAncestryObject:**
            A LocalAncestryObject instance.
    """
    # Imported lazily so that importing this module does not pull in the reader.
    from snputils.ancestry.io.local.read.msp import MSPReader

    reader = MSPReader(file)
    return reader.read()
Read data from an .msp or .msp.tsv file and construct a snputils.ancestry.genobj.LocalAncestryObject.
Arguments:
- file (str or pathlib.Path): Path to the file to be read. It should end with `.msp` or `.msp.tsv`.
Returns:
LocalAncestryObject: A LocalAncestryObject instance.
def read_admixture(
    Q_file: Union[str, Path],
    P_file: Optional[Union[str, Path]] = None,
    sample_file: Optional[Union[str, Path]] = None,
    snp_file: Optional[Union[str, Path]] = None,
    ancestry_file: Optional[Union[str, Path]] = None,
) -> 'GlobalAncestryObject':
    """
    Load ADMIXTURE output into a `snputils.ancestry.genobj.GlobalAncestryObject`.

    All arguments are forwarded unchanged to `AdmixtureReader`.

    Args:
        Q_file (str or pathlib.Path):
            Path to the file containing the Q matrix (per-sample ancestry proportions).
            It should end with .Q or .txt.
            The file should use space (' ') as the delimiter.
        P_file (str or pathlib.Path, optional):
            Path to the file containing the P/F matrix (per-ancestry SNP frequencies).
            It should end with .P or .txt.
            The file should use space (' ') as the delimiter. If None, P is not loaded.
        sample_file (str or pathlib.Path, optional):
            Path to the single-column file containing sample identifiers.
            It should end with .fam or .txt.
            If None, sample identifiers are not loaded.
        snp_file (str or pathlib.Path, optional):
            Path to the single-column file containing SNP identifiers.
            It should end with .bim or .txt.
            If None, SNP identifiers are not loaded.
        ancestry_file (str or pathlib.Path, optional):
            Path to the single-column file containing ancestry labels for each sample.
            It should end with .map or .txt.
            If None, ancestries are not loaded.

    Returns:
        **GlobalAncestryObject:**
            A GlobalAncestryObject instance.
    """
    # Imported lazily so that importing this module does not pull in the reader.
    from snputils.ancestry.io.wide.read.admixture import AdmixtureReader

    reader = AdmixtureReader(
        Q_file=Q_file,
        P_file=P_file,
        sample_file=sample_file,
        snp_file=snp_file,
        ancestry_file=ancestry_file,
    )
    return reader.read()
Read ADMIXTURE files into a snputils.ancestry.genobj.GlobalAncestryObject.
Arguments:
- Q_file (str or pathlib.Path): Path to the file containing the Q matrix (per-sample ancestry proportions). It should end with .Q or .txt. The file should use space (' ') as the delimiter.
- P_file (str or pathlib.Path, optional): Path to the file containing the P/F matrix (per-ancestry SNP frequencies). It should end with .P or .txt. The file should use space (' ') as the delimiter. If None, P is not loaded.
- sample_file (str or pathlib.Path, optional): Path to the single-column file containing sample identifiers. It should end with .fam or .txt. If None, sample identifiers are not loaded.
- snp_file (str or pathlib.Path, optional): Path to the single-column file containing SNP identifiers. It should end with .bim or .txt. If None, SNP identifiers are not loaded.
- ancestry_file (str or pathlib.Path, optional): Path to the single-column file containing ancestry labels for each sample. It should end with .map or .txt. If None, ancestries are not loaded.
Returns:
GlobalAncestryObject: A GlobalAncestryObject instance.
def read_admixture(
    Q_file: Union[str, Path],
    P_file: Optional[Union[str, Path]] = None,
    sample_file: Optional[Union[str, Path]] = None,
    snp_file: Optional[Union[str, Path]] = None,
    ancestry_file: Optional[Union[str, Path]] = None,
) -> 'GlobalAncestryObject':
    """
    Load ADMIXTURE output into a `snputils.ancestry.genobj.GlobalAncestryObject`.

    All arguments are forwarded unchanged to `AdmixtureReader`.

    Args:
        Q_file (str or pathlib.Path):
            Path to the file containing the Q matrix (per-sample ancestry proportions).
            It should end with .Q or .txt.
            The file should use space (' ') as the delimiter.
        P_file (str or pathlib.Path, optional):
            Path to the file containing the P/F matrix (per-ancestry SNP frequencies).
            It should end with .P or .txt.
            The file should use space (' ') as the delimiter. If None, P is not loaded.
        sample_file (str or pathlib.Path, optional):
            Path to the single-column file containing sample identifiers.
            It should end with .fam or .txt.
            If None, sample identifiers are not loaded.
        snp_file (str or pathlib.Path, optional):
            Path to the single-column file containing SNP identifiers.
            It should end with .bim or .txt.
            If None, SNP identifiers are not loaded.
        ancestry_file (str or pathlib.Path, optional):
            Path to the single-column file containing ancestry labels for each sample.
            It should end with .map or .txt.
            If None, ancestries are not loaded.

    Returns:
        **GlobalAncestryObject:**
            A GlobalAncestryObject instance.
    """
    # Imported lazily so that importing this module does not pull in the reader.
    from snputils.ancestry.io.wide.read.admixture import AdmixtureReader

    reader = AdmixtureReader(
        Q_file=Q_file,
        P_file=P_file,
        sample_file=sample_file,
        snp_file=snp_file,
        ancestry_file=ancestry_file,
    )
    return reader.read()
Read ADMIXTURE files into a snputils.ancestry.genobj.GlobalAncestryObject.
Arguments:
- Q_file (str or pathlib.Path): Path to the file containing the Q matrix (per-sample ancestry proportions). It should end with .Q or .txt. The file should use space (' ') as the delimiter.
- P_file (str or pathlib.Path, optional): Path to the file containing the P/F matrix (per-ancestry SNP frequencies). It should end with .P or .txt. The file should use space (' ') as the delimiter. If None, P is not loaded.
- sample_file (str or pathlib.Path, optional): Path to the single-column file containing sample identifiers. It should end with .fam or .txt. If None, sample identifiers are not loaded.
- snp_file (str or pathlib.Path, optional): Path to the single-column file containing SNP identifiers. It should end with .bim or .txt. If None, SNP identifiers are not loaded.
- ancestry_file (str or pathlib.Path, optional): Path to the single-column file containing ancestry labels for each sample. It should end with .map or .txt. If None, ancestries are not loaded.
Returns:
GlobalAncestryObject: A GlobalAncestryObject instance.
12class IBDObject: 13 """ 14 A class for Identity-By-Descent (IBD) segment data. 15 """ 16 17 def __init__( 18 self, 19 sample_id_1: np.ndarray, 20 haplotype_id_1: np.ndarray, 21 sample_id_2: np.ndarray, 22 haplotype_id_2: np.ndarray, 23 chrom: np.ndarray, 24 start: np.ndarray, 25 end: np.ndarray, 26 length_cm: Optional[np.ndarray] = None, 27 segment_type: Optional[np.ndarray] = None, 28 ) -> None: 29 """ 30 Args: 31 sample_id_1 (array of shape (n_segments,)): Sample identifiers for the first individual. 32 haplotype_id_1 (array of shape (n_segments,)): Haplotype identifiers for the first individual (values in {1, 2}, or -1 if unknown). 33 sample_id_2 (array of shape (n_segments,)): Sample identifiers for the second individual. 34 haplotype_id_2 (array of shape (n_segments,)): Haplotype identifiers for the second individual (values in {1, 2}, or -1 if unknown). 35 chrom (array of shape (n_segments,)): Chromosome identifier for each IBD segment. 36 start (array of shape (n_segments,)): Start physical position (1-based, bp) for each IBD segment. 37 end (array of shape (n_segments,)): End physical position (1-based, bp) for each IBD segment. 38 length_cm (array of shape (n_segments,), optional): Genetic length (cM) for each segment, if available. 39 """ 40 # Store attributes 41 self.__sample_id_1 = np.asarray(sample_id_1) 42 self.__haplotype_id_1 = np.asarray(haplotype_id_1) 43 self.__sample_id_2 = np.asarray(sample_id_2) 44 self.__haplotype_id_2 = np.asarray(haplotype_id_2) 45 self.__chrom = np.asarray(chrom) 46 self.__start = np.asarray(start) 47 self.__end = np.asarray(end) 48 self.__length_cm = None if length_cm is None else np.asarray(length_cm) 49 self.__segment_type = None if segment_type is None else np.asarray(segment_type) 50 51 self._sanity_check() 52 53 def __getitem__(self, key: str) -> Any: 54 """ 55 To access an attribute of the class using the square bracket notation, 56 similar to a dictionary. 
57 """ 58 try: 59 return getattr(self, key) 60 except Exception: 61 raise KeyError(f"Invalid key: {key}.") 62 63 def __setitem__(self, key: str, value: Any) -> None: 64 """ 65 To set an attribute of the class using the square bracket notation, 66 similar to a dictionary. 67 """ 68 try: 69 setattr(self, key, value) 70 except Exception: 71 raise KeyError(f"Invalid key: {key}.") 72 73 @property 74 def sample_id_1(self) -> np.ndarray: 75 """ 76 Retrieve `sample_id_1`. 77 78 Returns: 79 **array of shape (n_segments,):** Sample identifiers for the first individual. 80 """ 81 return self.__sample_id_1 82 83 @sample_id_1.setter 84 def sample_id_1(self, x: Sequence) -> None: 85 """ 86 Update `sample_id_1`. 87 """ 88 self.__sample_id_1 = np.asarray(x) 89 90 @property 91 def haplotype_id_1(self) -> np.ndarray: 92 """ 93 Retrieve `haplotype_id_1`. 94 95 Returns: 96 **array of shape (n_segments,):** Haplotype identifiers for the first individual (values in {1, 2}). 97 """ 98 return self.__haplotype_id_1 99 100 @haplotype_id_1.setter 101 def haplotype_id_1(self, x: Sequence) -> None: 102 """ 103 Update `haplotype_id_1`. 104 """ 105 self.__haplotype_id_1 = np.asarray(x) 106 107 @property 108 def sample_id_2(self) -> np.ndarray: 109 """ 110 Retrieve `sample_id_2`. 111 112 Returns: 113 **array of shape (n_segments,):** Sample identifiers for the second individual. 114 """ 115 return self.__sample_id_2 116 117 @sample_id_2.setter 118 def sample_id_2(self, x: Sequence) -> None: 119 """ 120 Update `sample_id_2`. 121 """ 122 self.__sample_id_2 = np.asarray(x) 123 124 @property 125 def haplotype_id_2(self) -> np.ndarray: 126 """ 127 Retrieve `haplotype_id_2`. 128 129 Returns: 130 **array of shape (n_segments,):** Haplotype identifiers for the second individual (values in {1, 2}). 131 """ 132 return self.__haplotype_id_2 133 134 @haplotype_id_2.setter 135 def haplotype_id_2(self, x: Sequence) -> None: 136 """ 137 Update `haplotype_id_2`. 
138 """ 139 self.__haplotype_id_2 = np.asarray(x) 140 141 @property 142 def chrom(self) -> np.ndarray: 143 """ 144 Retrieve `chrom`. 145 146 Returns: 147 **array of shape (n_segments,):** Chromosome identifier for each IBD segment. 148 """ 149 return self.__chrom 150 151 @chrom.setter 152 def chrom(self, x: Sequence) -> None: 153 """ 154 Update `chrom`. 155 """ 156 self.__chrom = np.asarray(x) 157 158 @property 159 def start(self) -> np.ndarray: 160 """ 161 Retrieve `start`. 162 163 Returns: 164 **array of shape (n_segments,):** Start physical position (1-based, bp) for each IBD segment. 165 """ 166 return self.__start 167 168 @start.setter 169 def start(self, x: Sequence) -> None: 170 """ 171 Update `start`. 172 """ 173 self.__start = np.asarray(x) 174 175 @property 176 def end(self) -> np.ndarray: 177 """ 178 Retrieve `end`. 179 180 Returns: 181 **array of shape (n_segments,):** End physical position (1-based, bp) for each IBD segment. 182 """ 183 return self.__end 184 185 @end.setter 186 def end(self, x: Sequence) -> None: 187 """ 188 Update `end`. 189 """ 190 self.__end = np.asarray(x) 191 192 @property 193 def length_cm(self) -> Optional[np.ndarray]: 194 """ 195 Retrieve `length_cm`. 196 197 Returns: 198 **array of shape (n_segments,):** Genetic length (cM) for each segment if available; otherwise None. 199 """ 200 return self.__length_cm 201 202 @length_cm.setter 203 def length_cm(self, x: Optional[Sequence]) -> None: 204 """ 205 Update `length_cm`. 206 """ 207 self.__length_cm = None if x is None else np.asarray(x) 208 209 @property 210 def segment_type(self) -> Optional[np.ndarray]: 211 """ 212 Retrieve `segment_type`. 213 214 Returns: 215 **array of shape (n_segments,):** Segment type labels (e.g., 'IBD1', 'IBD2'), or None if unavailable. 216 """ 217 return self.__segment_type 218 219 @segment_type.setter 220 def segment_type(self, x: Optional[Sequence]) -> None: 221 """ 222 Update `segment_type`. 
223 """ 224 self.__segment_type = None if x is None else np.asarray(x) 225 226 @property 227 def n_segments(self) -> int: 228 """ 229 Retrieve `n_segments`. 230 231 Returns: 232 **int:** The total number of IBD segments. 233 """ 234 return self.__chrom.shape[0] 235 236 @property 237 def pairs(self) -> np.ndarray: 238 """ 239 Retrieve `pairs`. 240 241 Returns: 242 **array of shape (n_segments, 2):** Per-segment sample identifier pairs. 243 """ 244 return np.column_stack([self.__sample_id_1, self.__sample_id_2]) 245 246 @property 247 def haplotype_pairs(self) -> np.ndarray: 248 """ 249 Retrieve `haplotype_pairs`. 250 251 Returns: 252 **array of shape (n_segments, 2):** Per-segment haplotype identifier pairs. 253 """ 254 return np.column_stack([self.__haplotype_id_1, self.__haplotype_id_2]) 255 256 def copy(self) -> 'IBDObject': 257 """ 258 Create and return a copy of `self`. 259 260 Returns: 261 **IBDObject:** A new instance of the current object. 262 """ 263 return copy.deepcopy(self) 264 265 def keys(self) -> List[str]: 266 """ 267 Retrieve a list of public attribute names for `self`. 268 269 Returns: 270 **list of str:** A list of attribute names, with internal name-mangling removed. 271 """ 272 return [attr.replace('_IBDObject__', '') for attr in vars(self)] 273 274 def filter_segments( 275 self, 276 chrom: Optional[Sequence[str]] = None, 277 samples: Optional[Sequence[str]] = None, 278 min_length_cm: Optional[float] = None, 279 segment_types: Optional[Sequence[str]] = None, 280 inplace: bool = False, 281 ) -> Optional['IBDObject']: 282 """ 283 Filter IBD segments by chromosome, sample names, and/or minimum genetic length. 284 285 Args: 286 chrom (sequence of str, optional): Chromosome(s) to include. 287 samples (sequence of str, optional): Sample names to include if present in either column. 288 min_length_cm (float, optional): Minimum cM length threshold. 289 inplace (bool, default=False): If True, modifies `self` in place. If False, returns a new `IBDObject`. 
290 291 Returns: 292 **Optional[IBDObject]:** A filtered IBDObject if `inplace=False`. If `inplace=True`, returns None. 293 """ 294 mask = np.ones(self.n_segments, dtype=bool) 295 296 if chrom is not None: 297 chrom = np.atleast_1d(chrom) 298 mask &= np.isin(self.__chrom, chrom) 299 300 if samples is not None: 301 samples = np.atleast_1d(samples) 302 mask &= np.isin(self.__sample_id_1, samples) | np.isin(self.__sample_id_2, samples) 303 304 if min_length_cm is not None and self.__length_cm is not None: 305 mask &= self.__length_cm >= float(min_length_cm) 306 307 if segment_types is not None and self.__segment_type is not None: 308 segment_types = np.atleast_1d(segment_types) 309 mask &= np.isin(self.__segment_type, segment_types) 310 311 def _apply_mask(x: Optional[np.ndarray]) -> Optional[np.ndarray]: 312 return None if x is None else np.asarray(x)[mask] 313 314 if inplace: 315 self.__sample_id_1 = _apply_mask(self.__sample_id_1) 316 self.__haplotype_id_1 = _apply_mask(self.__haplotype_id_1) 317 self.__sample_id_2 = _apply_mask(self.__sample_id_2) 318 self.__haplotype_id_2 = _apply_mask(self.__haplotype_id_2) 319 self.__chrom = _apply_mask(self.__chrom) 320 self.__start = _apply_mask(self.__start) 321 self.__end = _apply_mask(self.__end) 322 self.__length_cm = _apply_mask(self.__length_cm) 323 self.__segment_type = _apply_mask(self.__segment_type) 324 return None 325 else: 326 return IBDObject( 327 sample_id_1=_apply_mask(self.__sample_id_1), 328 haplotype_id_1=_apply_mask(self.__haplotype_id_1), 329 sample_id_2=_apply_mask(self.__sample_id_2), 330 haplotype_id_2=_apply_mask(self.__haplotype_id_2), 331 chrom=_apply_mask(self.__chrom), 332 start=_apply_mask(self.__start), 333 end=_apply_mask(self.__end), 334 length_cm=_apply_mask(self.__length_cm), 335 segment_type=_apply_mask(self.__segment_type), 336 ) 337 338 def restrict_to_ancestry( 339 self, 340 *, 341 laiobj: Any, 342 ancestry: Any, 343 require_both_haplotypes: bool = False, 344 min_bp: Optional[int] = None, 
345 min_cm: Optional[float] = None, 346 inplace: bool = False, 347 method: str = 'clip', 348 ) -> Optional['IBDObject']: 349 """ 350 Filter and/or trim IBD segments to intervals where both individuals carry the specified ancestry 351 according to a `LocalAncestryObject`. 352 353 This performs an interval intersection per segment against ancestry tracts: 354 - If haplotype IDs are known (e.g., Hap-IBD), ancestry is checked on the specific 355 haplotype of each individual. 356 - If haplotype IDs are unknown (e.g., ancIBD; haplotype_id_* == -1), ancestry is 357 considered present for an individual if at least one of their haplotypes matches 358 the requested ancestry (unless `require_both_haplotypes=True`). 359 360 Method 'strict': 361 Drop entire IBD segments if ANY overlapping LAI window contains non-target ancestry 362 for either individual. No trimming occurs - segments are kept whole or dropped completely. 363 364 Method 'clip': 365 Trim IBD segments to contiguous regions where both individuals have the target ancestry. 366 Resulting subsegments are clipped to LAI window boundaries and original IBD start/end, 367 with optional length filtering by bp or cM. 368 369 Args: 370 laiobj: LocalAncestryObject containing 2D `lai` of shape (n_windows, n_haplotypes), 371 `physical_pos` (n_windows, 2), and `chromosomes` (n_windows,). 372 ancestry: Target ancestry code or label. Compared as string, so both int and str work. 373 require_both_haplotypes: If True, require both haplotypes of each individual to have 374 the target ancestry within a window. When haplotypes are known per segment, this 375 only affects cases with unknown haplotypes (== -1) or IBD2 segments. 376 min_bp: Minimum base-pair length to retain a segment (strict) or subsegment (clip). 377 min_cm: Minimum centiMorgan length to retain a segment (strict) or subsegment (clip). 378 inplace: If True, replace `self` with the restricted object; else return a new object. 379 method: Method to use for filtering. 
'strict' drops entire segments that overlap with 380 non-target ancestry. 'clip' trims segments to target ancestry regions. 381 382 Returns: 383 Optional[IBDObject]: A restricted IBDObject if `inplace=False`. If `inplace=True`, 384 returns None. 385 """ 386 if method not in ['strict', 'clip']: 387 raise ValueError(f"Method must be 'strict' or 'clip', got '{method}'") 388 389 # Basic LAI shape/metadata checks 390 lai = getattr(laiobj, 'lai', None) 391 physical_pos = getattr(laiobj, 'physical_pos', None) 392 chromosomes = getattr(laiobj, 'chromosomes', None) 393 centimorgan_pos = getattr(laiobj, 'centimorgan_pos', None) 394 haplotypes = getattr(laiobj, 'haplotypes', None) 395 396 if lai is None or physical_pos is None or chromosomes is None or haplotypes is None: 397 raise ValueError( 398 "`laiobj` must provide `lai`, `physical_pos`, `chromosomes`, and `haplotypes`." 399 ) 400 401 if lai.ndim != 2: 402 raise ValueError("`laiobj.lai` must be 2D with shape (n_windows, n_haplotypes).") 403 404 # Build haplotype label -> column index map (labels like 'Sample.0', 'Sample.1') 405 hap_to_col = {str(h): i for i, h in enumerate(haplotypes)} 406 407 # Coerce ancestry to str for robust comparisons 408 anc_str = str(ancestry) 409 410 # Coerce LAI values to str once for comparisons 411 lai_str = lai.astype(str) 412 413 # Prepare arrays for the restricted segments 414 out_sample_id_1: List[str] = [] 415 out_haplotype_id_1: List[int] = [] 416 out_sample_id_2: List[str] = [] 417 out_haplotype_id_2: List[int] = [] 418 out_chrom: List[str] = [] 419 out_start: List[int] = [] 420 out_end: List[int] = [] 421 out_length_cm: List[float] = [] 422 out_segment_type: List[str] = [] if self.__segment_type is not None else None # type: ignore 423 424 # Vectorize chrom compare by making LAI chromosome strings 425 chr_lai = np.asarray(chromosomes).astype(str) 426 427 # Helper to compute cM length for a trimmed interval using LAI windows 428 def _approx_cm_len(chr_mask: np.ndarray, start_bp: int, 
end_bp: int) -> Optional[float]: 429 if centimorgan_pos is None: 430 return None 431 win_st = physical_pos[chr_mask, 0] 432 win_en = physical_pos[chr_mask, 1] 433 win_cm_st = centimorgan_pos[chr_mask, 0] 434 win_cm_en = centimorgan_pos[chr_mask, 1] 435 cm_total = 0.0 436 for ws, we, cs, ce in zip(win_st, win_en, win_cm_st, win_cm_en): 437 # Overlap with [start_bp, end_bp] 438 overlap_start = max(int(ws), int(start_bp)) 439 overlap_end = min(int(we), int(end_bp)) 440 if overlap_start > overlap_end: 441 continue 442 wlen_bp = max(1, int(we) - int(ws) + 1) 443 olen_bp = int(overlap_end) - int(overlap_start) + 1 444 frac = float(olen_bp) / float(wlen_bp) 445 cm_total += frac * float(ce - cs) 446 return cm_total 447 448 # Iterate over segments 449 for i in range(self.n_segments): 450 chrom = str(self.__chrom[i]) 451 seg_start = int(self.__start[i]) 452 seg_end = int(self.__end[i]) 453 if seg_end < seg_start: 454 continue 455 456 # Subset LAI windows on this chromosome that overlap the segment 457 idx_chr = (chr_lai == chrom) 458 if not np.any(idx_chr): 459 continue 460 lai_st = physical_pos[idx_chr, 0] 461 lai_en = physical_pos[idx_chr, 1] 462 overlaps = (lai_en >= seg_start) & (lai_st <= seg_end) 463 if not np.any(overlaps): 464 continue 465 466 # Build per-window ancestry mask for both individuals 467 s1 = str(self.__sample_id_1[i]) 468 s2 = str(self.__sample_id_2[i]) 469 h1 = int(self.__haplotype_id_1[i]) if self.__haplotype_id_1 is not None else -1 470 h2 = int(self.__haplotype_id_2[i]) if self.__haplotype_id_2 is not None else -1 471 472 # Resolve haplotype column indices for each sample 473 # Known haplotypes are 1-based in inputs; convert to {0,1} 474 def _get_cols(sample: str) -> Tuple[int, int]: 475 a = hap_to_col.get(f"{sample}.0") 476 b = hap_to_col.get(f"{sample}.1") 477 if a is None or b is None: 478 raise ValueError(f"Sample '{sample}' not found in LAI haplotypes.") 479 return a, b 480 481 s1_a, s1_b = _get_cols(s1) 482 s2_a, s2_b = _get_cols(s2) 483 484 # 
LAI rows for this chromosome 485 lai_rows = lai_str[idx_chr, :] 486 487 # Determine ancestry presence per window for each individual 488 if h1 in (1, 2) and h2 in (1, 2): 489 # Use specific haplotypes 490 s1_col = s1_a if (h1 - 1) == 0 else s1_b 491 s2_col = s2_a if (h2 - 1) == 0 else s2_b 492 s1_mask = (lai_rows[:, s1_col] == anc_str) 493 s2_mask = (lai_rows[:, s2_col] == anc_str) 494 if require_both_haplotypes: 495 # Additionally require the other hap of each sample to match 496 s1_other = s1_b if s1_col == s1_a else s1_a 497 s2_other = s2_b if s2_col == s2_a else s2_a 498 s1_mask = s1_mask & (lai_rows[:, s1_other] == anc_str) 499 s2_mask = s2_mask & (lai_rows[:, s2_other] == anc_str) 500 else: 501 # Unknown hap IDs: require at least one hap to match (or both if requested) 502 if require_both_haplotypes: 503 s1_mask = (lai_rows[:, s1_a] == anc_str) & (lai_rows[:, s1_b] == anc_str) 504 s2_mask = (lai_rows[:, s2_a] == anc_str) & (lai_rows[:, s2_b] == anc_str) 505 else: 506 s1_mask = (lai_rows[:, s1_a] == anc_str) | (lai_rows[:, s1_b] == anc_str) 507 s2_mask = (lai_rows[:, s2_a] == anc_str) | (lai_rows[:, s2_b] == anc_str) 508 509 keep = overlaps & s1_mask & s2_mask 510 511 if method == 'strict': 512 # In strict mode, ALL overlapping windows must have target ancestry 513 if not np.array_equal(overlaps, keep): 514 continue # Drop entire segment 515 516 # Apply length filters to original segment 517 if min_bp is not None and (seg_end - seg_start + 1) < int(min_bp): 518 continue 519 520 # In strict mode, preserve original length_cm 521 cm_len = float(self.__length_cm[i]) if self.__length_cm is not None else None 522 523 if min_cm is not None: 524 if cm_len is None or cm_len < float(min_cm): 525 continue 526 527 # Keep entire original segment 528 out_sample_id_1.append(s1) 529 out_sample_id_2.append(s2) 530 out_haplotype_id_1.append(h1) 531 out_haplotype_id_2.append(h2) 532 out_chrom.append(chrom) 533 out_start.append(seg_start) 534 out_end.append(seg_end) 535 
out_length_cm.append(float(cm_len) if cm_len is not None else float('nan')) 536 if out_segment_type is not None: 537 out_segment_type.append(str(self.__segment_type[i])) # type: ignore 538 539 else: # method == 'clip' 540 if not np.any(keep): 541 continue 542 543 # Identify contiguous windows where keep=True 544 idx_keep = np.where(keep)[0] 545 # Split into runs of consecutive indices 546 breaks = np.where(np.diff(idx_keep) > 1)[0] 547 run_starts = np.r_[0, breaks + 1] 548 run_ends = np.r_[breaks, idx_keep.size - 1] 549 550 # Create subsegments for each contiguous run 551 for rs, re in zip(run_starts, run_ends): 552 i0 = idx_keep[rs] 553 i1 = idx_keep[re] 554 sub_start = int(max(seg_start, int(lai_st[i0]))) 555 sub_end = int(min(seg_end, int(lai_en[i1]))) 556 if sub_end < sub_start: 557 continue 558 559 # Length filters: bp first 560 if min_bp is not None and (sub_end - sub_start + 1) < int(min_bp): 561 continue 562 563 # Compute cM length if possible, else approximate or None 564 cm_len = _approx_cm_len(idx_chr, sub_start, sub_end) 565 if cm_len is None and self.__length_cm is not None: 566 # Scale the original segment length by bp fraction 567 total_bp = max(1, int(seg_end - seg_start + 1)) 568 frac_bp = float(sub_end - sub_start + 1) / float(total_bp) 569 try: 570 cm_len = float(self.__length_cm[i]) * frac_bp 571 except Exception: 572 cm_len = None 573 574 # Apply cM filter if requested (treat None as 0) 575 if min_cm is not None: 576 if cm_len is None or cm_len < float(min_cm): 577 continue 578 579 # Append trimmed segment 580 out_sample_id_1.append(s1) 581 out_sample_id_2.append(s2) 582 out_haplotype_id_1.append(h1) 583 out_haplotype_id_2.append(h2) 584 out_chrom.append(chrom) 585 out_start.append(sub_start) 586 out_end.append(sub_end) 587 out_length_cm.append(float(cm_len) if cm_len is not None else float('nan')) 588 if out_segment_type is not None: 589 out_segment_type.append(str(self.__segment_type[i])) # type: ignore 590 591 # If nothing remains, return 
empty object with zero segments 592 if len(out_start) == 0: 593 # Build minimal arrays 594 empty = IBDObject( 595 sample_id_1=np.array([], dtype=object), 596 haplotype_id_1=np.array([], dtype=int), 597 sample_id_2=np.array([], dtype=object), 598 haplotype_id_2=np.array([], dtype=int), 599 chrom=np.array([], dtype=object), 600 start=np.array([], dtype=int), 601 end=np.array([], dtype=int), 602 length_cm=None, 603 segment_type=None if out_segment_type is None else np.array([], dtype=object), 604 ) 605 if inplace: 606 self.__sample_id_1 = empty.sample_id_1 607 self.__haplotype_id_1 = empty.haplotype_id_1 608 self.__sample_id_2 = empty.sample_id_2 609 self.__haplotype_id_2 = empty.haplotype_id_2 610 self.__chrom = empty.chrom 611 self.__start = empty.start 612 self.__end = empty.end 613 self.__length_cm = empty.length_cm 614 self.__segment_type = empty.segment_type 615 return None 616 return empty 617 618 # Assemble outputs 619 out_length_array: Optional[np.ndarray] 620 if len(out_length_cm) > 0: 621 # Convert NaNs to None-equivalent by using np.array with dtype float 622 out_length_array = np.asarray(out_length_cm, dtype=float) 623 else: 624 out_length_array = None 625 626 new_obj = IBDObject( 627 sample_id_1=np.asarray(out_sample_id_1, dtype=object), 628 haplotype_id_1=np.asarray(out_haplotype_id_1, dtype=int), 629 sample_id_2=np.asarray(out_sample_id_2, dtype=object), 630 haplotype_id_2=np.asarray(out_haplotype_id_2, dtype=int), 631 chrom=np.asarray(out_chrom, dtype=object), 632 start=np.asarray(out_start, dtype=int), 633 end=np.asarray(out_end, dtype=int), 634 length_cm=out_length_array, 635 segment_type=None if out_segment_type is None else np.asarray(out_segment_type, dtype=object), 636 ) 637 638 if inplace: 639 self.__sample_id_1 = new_obj.sample_id_1 640 self.__haplotype_id_1 = new_obj.haplotype_id_1 641 self.__sample_id_2 = new_obj.sample_id_2 642 self.__haplotype_id_2 = new_obj.haplotype_id_2 643 self.__chrom = new_obj.chrom 644 self.__start = new_obj.start 
645 self.__end = new_obj.end 646 self.__length_cm = new_obj.length_cm 647 self.__segment_type = new_obj.segment_type 648 return None 649 return new_obj 650 651 def _sanity_check(self) -> None: 652 """ 653 Perform sanity checks on the parsed data to ensure data integrity. 654 """ 655 n = self.__chrom.shape[0] 656 arrays = [ 657 self.__sample_id_1, 658 self.__haplotype_id_1, 659 self.__sample_id_2, 660 self.__haplotype_id_2, 661 self.__start, 662 self.__end, 663 ] 664 if any(arr.shape[0] != n for arr in arrays): 665 raise ValueError("All input arrays must have the same length.") 666 667 if self.__length_cm is not None and self.__length_cm.shape[0] != n: 668 raise ValueError("`length_cm` must have the same length as other arrays.") 669 670 if self.__segment_type is not None and self.__segment_type.shape[0] != n: 671 raise ValueError("`segment_type` must have the same length as other arrays.") 672 673 # Validate haplotype identifiers are 1 or 2, or -1 when unknown 674 valid_values = np.array([1, 2, -1]) 675 if not np.isin(self.__haplotype_id_1, valid_values).all() or not np.isin(self.__haplotype_id_2, valid_values).all(): 676 raise ValueError("Haplotype identifiers must be in {1, 2} or -1 if unknown.")
A class for Identity-By-Descent (IBD) segment data.
def __init__(
    self,
    sample_id_1: np.ndarray,
    haplotype_id_1: np.ndarray,
    sample_id_2: np.ndarray,
    haplotype_id_2: np.ndarray,
    chrom: np.ndarray,
    start: np.ndarray,
    end: np.ndarray,
    length_cm: Optional[np.ndarray] = None,
    segment_type: Optional[np.ndarray] = None,
) -> None:
    """
    Args:
        sample_id_1 (array of shape (n_segments,)): Sample identifiers for the first individual.
        haplotype_id_1 (array of shape (n_segments,)): Haplotype identifiers for the first individual (values in {1, 2}, or -1 if unknown).
        sample_id_2 (array of shape (n_segments,)): Sample identifiers for the second individual.
        haplotype_id_2 (array of shape (n_segments,)): Haplotype identifiers for the second individual (values in {1, 2}, or -1 if unknown).
        chrom (array of shape (n_segments,)): Chromosome identifier for each IBD segment.
        start (array of shape (n_segments,)): Start physical position (1-based, bp) for each IBD segment.
        end (array of shape (n_segments,)): End physical position (1-based, bp) for each IBD segment.
        length_cm (array of shape (n_segments,), optional): Genetic length (cM) for each segment, if available.
        segment_type (array of shape (n_segments,), optional): Segment type labels (e.g., 'IBD1', 'IBD2'), if available.
    """
    def _as_optional_array(values):
        # Optional fields stay `None`; anything else is coerced to an ndarray.
        if values is None:
            return None
        return np.asarray(values)

    # Required per-segment fields are always coerced to ndarrays.
    self.__sample_id_1 = np.asarray(sample_id_1)
    self.__haplotype_id_1 = np.asarray(haplotype_id_1)
    self.__sample_id_2 = np.asarray(sample_id_2)
    self.__haplotype_id_2 = np.asarray(haplotype_id_2)
    self.__chrom = np.asarray(chrom)
    self.__start = np.asarray(start)
    self.__end = np.asarray(end)

    # Optional per-segment fields.
    self.__length_cm = _as_optional_array(length_cm)
    self.__segment_type = _as_optional_array(segment_type)

    # Validate the stored arrays before the object is handed to the caller.
    self._sanity_check()
Arguments:
- sample_id_1 (array of shape (n_segments,)): Sample identifiers for the first individual.
- haplotype_id_1 (array of shape (n_segments,)): Haplotype identifiers for the first individual (values in {1, 2}, or -1 if unknown).
- sample_id_2 (array of shape (n_segments,)): Sample identifiers for the second individual.
- haplotype_id_2 (array of shape (n_segments,)): Haplotype identifiers for the second individual (values in {1, 2}, or -1 if unknown).
- chrom (array of shape (n_segments,)): Chromosome identifier for each IBD segment.
- start (array of shape (n_segments,)): Start physical position (1-based, bp) for each IBD segment.
- end (array of shape (n_segments,)): End physical position (1-based, bp) for each IBD segment.
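- length_cm (array of shape (n_segments,), optional): Genetic length (cM) for each segment, if available.
- segment_type (array of shape (n_segments,), optional): Segment type labels (e.g., 'IBD1', 'IBD2'), if available.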
@property
def sample_id_1(self) -> np.ndarray:
    """
    Retrieve `sample_id_1`.

    Returns:
        **array of shape (n_segments,):** Sample identifiers for the first individual.
    """
    first_sample_ids = self.__sample_id_1
    return first_sample_ids
Retrieve sample_id_1.
Returns:
array of shape (n_segments,): Sample identifiers for the first individual.
@property
def haplotype_id_1(self) -> np.ndarray:
    """
    Retrieve `haplotype_id_1`.

    Returns:
        **array of shape (n_segments,):** Haplotype identifiers for the first individual
        (values in {1, 2}, or -1 if unknown).
    """
    return self.__haplotype_id_1
Retrieve haplotype_id_1.
Returns:
array of shape (n_segments,): Haplotype identifiers for the first individual (values in {1, 2}, or -1 if unknown).
@property
def sample_id_2(self) -> np.ndarray:
    """
    Retrieve `sample_id_2`.

    Returns:
        **array of shape (n_segments,):** Sample identifiers for the second individual.
    """
    second_sample_ids = self.__sample_id_2
    return second_sample_ids
Retrieve sample_id_2.
Returns:
array of shape (n_segments,): Sample identifiers for the second individual.
@property
def haplotype_id_2(self) -> np.ndarray:
    """
    Retrieve `haplotype_id_2`.

    Returns:
        **array of shape (n_segments,):** Haplotype identifiers for the second individual
        (values in {1, 2}, or -1 if unknown).
    """
    return self.__haplotype_id_2
Retrieve haplotype_id_2.
Returns:
array of shape (n_segments,): Haplotype identifiers for the second individual (values in {1, 2}, or -1 if unknown).
@property
def start(self) -> np.ndarray:
    """
    Retrieve `start`.

    Returns:
        **array of shape (n_segments,):** Start physical position (1-based, bp) for each IBD segment.
    """
    start_positions = self.__start
    return start_positions
Retrieve start.
Returns:
array of shape (n_segments,): Start physical position (1-based, bp) for each IBD segment.
@property
def end(self) -> np.ndarray:
    """
    Retrieve `end`.

    Returns:
        **array of shape (n_segments,):** End physical position (1-based, bp) for each IBD segment.
    """
    end_positions = self.__end
    return end_positions
Retrieve end.
Returns:
array of shape (n_segments,): End physical position (1-based, bp) for each IBD segment.
@property
def length_cm(self) -> Optional[np.ndarray]:
    """
    Retrieve `length_cm`.

    Returns:
        **array of shape (n_segments,):** Genetic length (cM) for each segment if available; otherwise None.
    """
    genetic_lengths = self.__length_cm
    return genetic_lengths
Retrieve length_cm.
Returns:
array of shape (n_segments,): Genetic length (cM) for each segment if available; otherwise None.
@property
def segment_type(self) -> Optional[np.ndarray]:
    """
    Retrieve `segment_type`.

    Returns:
        **array of shape (n_segments,):** Segment type labels (e.g., 'IBD1', 'IBD2'), or None if unavailable.
    """
    type_labels = self.__segment_type
    return type_labels
Retrieve segment_type.
Returns:
array of shape (n_segments,): Segment type labels (e.g., 'IBD1', 'IBD2'), or None if unavailable.
@property
def haplotype_pairs(self) -> np.ndarray:
    """
    Retrieve `haplotype_pairs`.

    Returns:
        **array of shape (n_segments, 2):** Per-segment haplotype identifier pairs.
    """
    # Each 1-D identifier array becomes one column of the result.
    first_haplotypes = self.__haplotype_id_1
    second_haplotypes = self.__haplotype_id_2
    return np.column_stack((first_haplotypes, second_haplotypes))
Retrieve haplotype_pairs.
Returns:
array of shape (n_segments, 2): Per-segment haplotype identifier pairs.
def copy(self) -> 'IBDObject':
    """
    Create and return a deep copy of `self`.

    Returns:
        **IBDObject:** A new instance duplicating the current object.
    """
    duplicate = copy.deepcopy(self)
    return duplicate
Create and return a copy of self.
Returns:
IBDObject: A new instance of the current object.
def keys(self) -> List[str]:
    """
    List the attribute names stored on `self`.

    Returns:
        **list of str:** Attribute names with the `_IBDObject__` name-mangling marker removed.
    """
    public_names: List[str] = []
    for stored_name in vars(self):
        # Private attributes are stored under their mangled name; strip the marker.
        public_names.append(stored_name.replace('_IBDObject__', ''))
    return public_names
Retrieve a list of public attribute names for self.
Returns:
list of str: A list of attribute names, with internal name-mangling removed.
def filter_segments(
    self,
    chrom: Optional[Sequence[str]] = None,
    samples: Optional[Sequence[str]] = None,
    min_length_cm: Optional[float] = None,
    segment_types: Optional[Sequence[str]] = None,
    inplace: bool = False,
) -> Optional['IBDObject']:
    """
    Filter IBD segments by chromosome, sample names, minimum genetic length, and/or segment type.

    Every criterion that is supplied narrows the selection further (logical AND).

    Args:
        chrom (sequence of str, optional): Chromosome(s) to include.
        samples (sequence of str, optional): Sample names to include if present in either column.
        min_length_cm (float, optional): Minimum cM length threshold. Ignored when the object
            has no `length_cm` data.
        segment_types (sequence of str, optional): Segment type label(s) to include. Ignored when
            the object has no `segment_type` data.
        inplace (bool, default=False): If True, modifies `self` in place. If False, returns a new `IBDObject`.

    Returns:
        **Optional[IBDObject]:** A filtered IBDObject if `inplace=False`. If `inplace=True`, returns None.
    """
    mask = np.ones(self.n_segments, dtype=bool)

    if chrom is not None:
        mask &= np.isin(self.__chrom, np.atleast_1d(chrom))

    if samples is not None:
        wanted_samples = np.atleast_1d(samples)
        # A segment is kept when either individual of the pair is requested.
        mask &= np.isin(self.__sample_id_1, wanted_samples) | np.isin(self.__sample_id_2, wanted_samples)

    # The length and type filters are skipped when the object lacks that data.
    if min_length_cm is not None and self.__length_cm is not None:
        mask &= self.__length_cm >= float(min_length_cm)

    if segment_types is not None and self.__segment_type is not None:
        mask &= np.isin(self.__segment_type, np.atleast_1d(segment_types))

    def _apply_mask(x: Optional[np.ndarray]) -> Optional[np.ndarray]:
        return None if x is None else np.asarray(x)[mask]

    # Subset every field once; both return paths reuse the same results.
    sample_id_1 = _apply_mask(self.__sample_id_1)
    haplotype_id_1 = _apply_mask(self.__haplotype_id_1)
    sample_id_2 = _apply_mask(self.__sample_id_2)
    haplotype_id_2 = _apply_mask(self.__haplotype_id_2)
    chrom_kept = _apply_mask(self.__chrom)
    start = _apply_mask(self.__start)
    end = _apply_mask(self.__end)
    length_cm = _apply_mask(self.__length_cm)
    segment_type = _apply_mask(self.__segment_type)

    if inplace:
        self.__sample_id_1 = sample_id_1
        self.__haplotype_id_1 = haplotype_id_1
        self.__sample_id_2 = sample_id_2
        self.__haplotype_id_2 = haplotype_id_2
        self.__chrom = chrom_kept
        self.__start = start
        self.__end = end
        self.__length_cm = length_cm
        self.__segment_type = segment_type
        return None

    return IBDObject(
        sample_id_1=sample_id_1,
        haplotype_id_1=haplotype_id_1,
        sample_id_2=sample_id_2,
        haplotype_id_2=haplotype_id_2,
        chrom=chrom_kept,
        start=start,
        end=end,
        length_cm=length_cm,
        segment_type=segment_type,
    )
Filter IBD segments by chromosome, sample names, minimum genetic length, and/or segment type.
Arguments:
- chrom (sequence of str, optional): Chromosome(s) to include.
- samples (sequence of str, optional): Sample names to include if present in either column.
- min_length_cm (float, optional): Minimum cM length threshold.
- segment_types (sequence of str, optional): Segment type label(s) to include.
- inplace (bool, default=False): If True, modifies self in place. If False, returns a new IBDObject.
Returns:
Optional[IBDObject]: A filtered IBDObject if inplace=False. If inplace=True, returns None.
338 def restrict_to_ancestry( 339 self, 340 *, 341 laiobj: Any, 342 ancestry: Any, 343 require_both_haplotypes: bool = False, 344 min_bp: Optional[int] = None, 345 min_cm: Optional[float] = None, 346 inplace: bool = False, 347 method: str = 'clip', 348 ) -> Optional['IBDObject']: 349 """ 350 Filter and/or trim IBD segments to intervals where both individuals carry the specified ancestry 351 according to a `LocalAncestryObject`. 352 353 This performs an interval intersection per segment against ancestry tracts: 354 - If haplotype IDs are known (e.g., Hap-IBD), ancestry is checked on the specific 355 haplotype of each individual. 356 - If haplotype IDs are unknown (e.g., ancIBD; haplotype_id_* == -1), ancestry is 357 considered present for an individual if at least one of their haplotypes matches 358 the requested ancestry (unless `require_both_haplotypes=True`). 359 360 Method 'strict': 361 Drop entire IBD segments if ANY overlapping LAI window contains non-target ancestry 362 for either individual. No trimming occurs - segments are kept whole or dropped completely. 363 364 Method 'clip': 365 Trim IBD segments to contiguous regions where both individuals have the target ancestry. 366 Resulting subsegments are clipped to LAI window boundaries and original IBD start/end, 367 with optional length filtering by bp or cM. 368 369 Args: 370 laiobj: LocalAncestryObject containing 2D `lai` of shape (n_windows, n_haplotypes), 371 `physical_pos` (n_windows, 2), and `chromosomes` (n_windows,). 372 ancestry: Target ancestry code or label. Compared as string, so both int and str work. 373 require_both_haplotypes: If True, require both haplotypes of each individual to have 374 the target ancestry within a window. When haplotypes are known per segment, this 375 only affects cases with unknown haplotypes (== -1) or IBD2 segments. 376 min_bp: Minimum base-pair length to retain a segment (strict) or subsegment (clip). 
377 min_cm: Minimum centiMorgan length to retain a segment (strict) or subsegment (clip). 378 inplace: If True, replace `self` with the restricted object; else return a new object. 379 method: Method to use for filtering. 'strict' drops entire segments that overlap with 380 non-target ancestry. 'clip' trims segments to target ancestry regions. 381 382 Returns: 383 Optional[IBDObject]: A restricted IBDObject if `inplace=False`. If `inplace=True`, 384 returns None. 385 """ 386 if method not in ['strict', 'clip']: 387 raise ValueError(f"Method must be 'strict' or 'clip', got '{method}'") 388 389 # Basic LAI shape/metadata checks 390 lai = getattr(laiobj, 'lai', None) 391 physical_pos = getattr(laiobj, 'physical_pos', None) 392 chromosomes = getattr(laiobj, 'chromosomes', None) 393 centimorgan_pos = getattr(laiobj, 'centimorgan_pos', None) 394 haplotypes = getattr(laiobj, 'haplotypes', None) 395 396 if lai is None or physical_pos is None or chromosomes is None or haplotypes is None: 397 raise ValueError( 398 "`laiobj` must provide `lai`, `physical_pos`, `chromosomes`, and `haplotypes`." 
399 ) 400 401 if lai.ndim != 2: 402 raise ValueError("`laiobj.lai` must be 2D with shape (n_windows, n_haplotypes).") 403 404 # Build haplotype label -> column index map (labels like 'Sample.0', 'Sample.1') 405 hap_to_col = {str(h): i for i, h in enumerate(haplotypes)} 406 407 # Coerce ancestry to str for robust comparisons 408 anc_str = str(ancestry) 409 410 # Coerce LAI values to str once for comparisons 411 lai_str = lai.astype(str) 412 413 # Prepare arrays for the restricted segments 414 out_sample_id_1: List[str] = [] 415 out_haplotype_id_1: List[int] = [] 416 out_sample_id_2: List[str] = [] 417 out_haplotype_id_2: List[int] = [] 418 out_chrom: List[str] = [] 419 out_start: List[int] = [] 420 out_end: List[int] = [] 421 out_length_cm: List[float] = [] 422 out_segment_type: List[str] = [] if self.__segment_type is not None else None # type: ignore 423 424 # Vectorize chrom compare by making LAI chromosome strings 425 chr_lai = np.asarray(chromosomes).astype(str) 426 427 # Helper to compute cM length for a trimmed interval using LAI windows 428 def _approx_cm_len(chr_mask: np.ndarray, start_bp: int, end_bp: int) -> Optional[float]: 429 if centimorgan_pos is None: 430 return None 431 win_st = physical_pos[chr_mask, 0] 432 win_en = physical_pos[chr_mask, 1] 433 win_cm_st = centimorgan_pos[chr_mask, 0] 434 win_cm_en = centimorgan_pos[chr_mask, 1] 435 cm_total = 0.0 436 for ws, we, cs, ce in zip(win_st, win_en, win_cm_st, win_cm_en): 437 # Overlap with [start_bp, end_bp] 438 overlap_start = max(int(ws), int(start_bp)) 439 overlap_end = min(int(we), int(end_bp)) 440 if overlap_start > overlap_end: 441 continue 442 wlen_bp = max(1, int(we) - int(ws) + 1) 443 olen_bp = int(overlap_end) - int(overlap_start) + 1 444 frac = float(olen_bp) / float(wlen_bp) 445 cm_total += frac * float(ce - cs) 446 return cm_total 447 448 # Iterate over segments 449 for i in range(self.n_segments): 450 chrom = str(self.__chrom[i]) 451 seg_start = int(self.__start[i]) 452 seg_end = 
int(self.__end[i]) 453 if seg_end < seg_start: 454 continue 455 456 # Subset LAI windows on this chromosome that overlap the segment 457 idx_chr = (chr_lai == chrom) 458 if not np.any(idx_chr): 459 continue 460 lai_st = physical_pos[idx_chr, 0] 461 lai_en = physical_pos[idx_chr, 1] 462 overlaps = (lai_en >= seg_start) & (lai_st <= seg_end) 463 if not np.any(overlaps): 464 continue 465 466 # Build per-window ancestry mask for both individuals 467 s1 = str(self.__sample_id_1[i]) 468 s2 = str(self.__sample_id_2[i]) 469 h1 = int(self.__haplotype_id_1[i]) if self.__haplotype_id_1 is not None else -1 470 h2 = int(self.__haplotype_id_2[i]) if self.__haplotype_id_2 is not None else -1 471 472 # Resolve haplotype column indices for each sample 473 # Known haplotypes are 1-based in inputs; convert to {0,1} 474 def _get_cols(sample: str) -> Tuple[int, int]: 475 a = hap_to_col.get(f"{sample}.0") 476 b = hap_to_col.get(f"{sample}.1") 477 if a is None or b is None: 478 raise ValueError(f"Sample '{sample}' not found in LAI haplotypes.") 479 return a, b 480 481 s1_a, s1_b = _get_cols(s1) 482 s2_a, s2_b = _get_cols(s2) 483 484 # LAI rows for this chromosome 485 lai_rows = lai_str[idx_chr, :] 486 487 # Determine ancestry presence per window for each individual 488 if h1 in (1, 2) and h2 in (1, 2): 489 # Use specific haplotypes 490 s1_col = s1_a if (h1 - 1) == 0 else s1_b 491 s2_col = s2_a if (h2 - 1) == 0 else s2_b 492 s1_mask = (lai_rows[:, s1_col] == anc_str) 493 s2_mask = (lai_rows[:, s2_col] == anc_str) 494 if require_both_haplotypes: 495 # Additionally require the other hap of each sample to match 496 s1_other = s1_b if s1_col == s1_a else s1_a 497 s2_other = s2_b if s2_col == s2_a else s2_a 498 s1_mask = s1_mask & (lai_rows[:, s1_other] == anc_str) 499 s2_mask = s2_mask & (lai_rows[:, s2_other] == anc_str) 500 else: 501 # Unknown hap IDs: require at least one hap to match (or both if requested) 502 if require_both_haplotypes: 503 s1_mask = (lai_rows[:, s1_a] == anc_str) & 
(lai_rows[:, s1_b] == anc_str) 504 s2_mask = (lai_rows[:, s2_a] == anc_str) & (lai_rows[:, s2_b] == anc_str) 505 else: 506 s1_mask = (lai_rows[:, s1_a] == anc_str) | (lai_rows[:, s1_b] == anc_str) 507 s2_mask = (lai_rows[:, s2_a] == anc_str) | (lai_rows[:, s2_b] == anc_str) 508 509 keep = overlaps & s1_mask & s2_mask 510 511 if method == 'strict': 512 # In strict mode, ALL overlapping windows must have target ancestry 513 if not np.array_equal(overlaps, keep): 514 continue # Drop entire segment 515 516 # Apply length filters to original segment 517 if min_bp is not None and (seg_end - seg_start + 1) < int(min_bp): 518 continue 519 520 # In strict mode, preserve original length_cm 521 cm_len = float(self.__length_cm[i]) if self.__length_cm is not None else None 522 523 if min_cm is not None: 524 if cm_len is None or cm_len < float(min_cm): 525 continue 526 527 # Keep entire original segment 528 out_sample_id_1.append(s1) 529 out_sample_id_2.append(s2) 530 out_haplotype_id_1.append(h1) 531 out_haplotype_id_2.append(h2) 532 out_chrom.append(chrom) 533 out_start.append(seg_start) 534 out_end.append(seg_end) 535 out_length_cm.append(float(cm_len) if cm_len is not None else float('nan')) 536 if out_segment_type is not None: 537 out_segment_type.append(str(self.__segment_type[i])) # type: ignore 538 539 else: # method == 'clip' 540 if not np.any(keep): 541 continue 542 543 # Identify contiguous windows where keep=True 544 idx_keep = np.where(keep)[0] 545 # Split into runs of consecutive indices 546 breaks = np.where(np.diff(idx_keep) > 1)[0] 547 run_starts = np.r_[0, breaks + 1] 548 run_ends = np.r_[breaks, idx_keep.size - 1] 549 550 # Create subsegments for each contiguous run 551 for rs, re in zip(run_starts, run_ends): 552 i0 = idx_keep[rs] 553 i1 = idx_keep[re] 554 sub_start = int(max(seg_start, int(lai_st[i0]))) 555 sub_end = int(min(seg_end, int(lai_en[i1]))) 556 if sub_end < sub_start: 557 continue 558 559 # Length filters: bp first 560 if min_bp is not None and 
(sub_end - sub_start + 1) < int(min_bp): 561 continue 562 563 # Compute cM length if possible, else approximate or None 564 cm_len = _approx_cm_len(idx_chr, sub_start, sub_end) 565 if cm_len is None and self.__length_cm is not None: 566 # Scale the original segment length by bp fraction 567 total_bp = max(1, int(seg_end - seg_start + 1)) 568 frac_bp = float(sub_end - sub_start + 1) / float(total_bp) 569 try: 570 cm_len = float(self.__length_cm[i]) * frac_bp 571 except Exception: 572 cm_len = None 573 574 # Apply cM filter if requested (treat None as 0) 575 if min_cm is not None: 576 if cm_len is None or cm_len < float(min_cm): 577 continue 578 579 # Append trimmed segment 580 out_sample_id_1.append(s1) 581 out_sample_id_2.append(s2) 582 out_haplotype_id_1.append(h1) 583 out_haplotype_id_2.append(h2) 584 out_chrom.append(chrom) 585 out_start.append(sub_start) 586 out_end.append(sub_end) 587 out_length_cm.append(float(cm_len) if cm_len is not None else float('nan')) 588 if out_segment_type is not None: 589 out_segment_type.append(str(self.__segment_type[i])) # type: ignore 590 591 # If nothing remains, return empty object with zero segments 592 if len(out_start) == 0: 593 # Build minimal arrays 594 empty = IBDObject( 595 sample_id_1=np.array([], dtype=object), 596 haplotype_id_1=np.array([], dtype=int), 597 sample_id_2=np.array([], dtype=object), 598 haplotype_id_2=np.array([], dtype=int), 599 chrom=np.array([], dtype=object), 600 start=np.array([], dtype=int), 601 end=np.array([], dtype=int), 602 length_cm=None, 603 segment_type=None if out_segment_type is None else np.array([], dtype=object), 604 ) 605 if inplace: 606 self.__sample_id_1 = empty.sample_id_1 607 self.__haplotype_id_1 = empty.haplotype_id_1 608 self.__sample_id_2 = empty.sample_id_2 609 self.__haplotype_id_2 = empty.haplotype_id_2 610 self.__chrom = empty.chrom 611 self.__start = empty.start 612 self.__end = empty.end 613 self.__length_cm = empty.length_cm 614 self.__segment_type = empty.segment_type 
615 return None 616 return empty 617 618 # Assemble outputs 619 out_length_array: Optional[np.ndarray] 620 if len(out_length_cm) > 0: 621 # Convert NaNs to None-equivalent by using np.array with dtype float 622 out_length_array = np.asarray(out_length_cm, dtype=float) 623 else: 624 out_length_array = None 625 626 new_obj = IBDObject( 627 sample_id_1=np.asarray(out_sample_id_1, dtype=object), 628 haplotype_id_1=np.asarray(out_haplotype_id_1, dtype=int), 629 sample_id_2=np.asarray(out_sample_id_2, dtype=object), 630 haplotype_id_2=np.asarray(out_haplotype_id_2, dtype=int), 631 chrom=np.asarray(out_chrom, dtype=object), 632 start=np.asarray(out_start, dtype=int), 633 end=np.asarray(out_end, dtype=int), 634 length_cm=out_length_array, 635 segment_type=None if out_segment_type is None else np.asarray(out_segment_type, dtype=object), 636 ) 637 638 if inplace: 639 self.__sample_id_1 = new_obj.sample_id_1 640 self.__haplotype_id_1 = new_obj.haplotype_id_1 641 self.__sample_id_2 = new_obj.sample_id_2 642 self.__haplotype_id_2 = new_obj.haplotype_id_2 643 self.__chrom = new_obj.chrom 644 self.__start = new_obj.start 645 self.__end = new_obj.end 646 self.__length_cm = new_obj.length_cm 647 self.__segment_type = new_obj.segment_type 648 return None 649 return new_obj
Filter and/or trim IBD segments to intervals where both individuals carry the specified ancestry
according to a LocalAncestryObject.
This performs an interval intersection per segment against ancestry tracts:
- If haplotype IDs are known (e.g., Hap-IBD), ancestry is checked on the specific haplotype of each individual.
- If haplotype IDs are unknown (e.g., ancIBD; haplotype_id_* == -1), ancestry is
considered present for an individual if at least one of their haplotypes matches
the requested ancestry (unless
require_both_haplotypes=True).
Method 'strict': Drop entire IBD segments if ANY overlapping LAI window contains non-target ancestry for either individual. No trimming occurs - segments are kept whole or dropped completely.
Method 'clip': Trim IBD segments to contiguous regions where both individuals have the target ancestry. Resulting subsegments are clipped to LAI window boundaries and original IBD start/end, with optional length filtering by bp or cM.
Arguments:
- laiobj: LocalAncestryObject containing 2D `lai` of shape (n_windows, n_haplotypes), `physical_pos` (n_windows, 2), and `chromosomes` (n_windows,).
- ancestry: Target ancestry code or label. Compared as string, so both int and str work.
- require_both_haplotypes: If True, require both haplotypes of each individual to have the target ancestry within a window. When haplotype IDs are known for a segment, the segment's own haplotype must match and the individual's other haplotype must match as well; when they are unknown (== -1), both haplotypes must match instead of at least one.
- min_bp: Minimum base-pair length to retain a segment (strict) or subsegment (clip).
- min_cm: Minimum centiMorgan length to retain a segment (strict) or subsegment (clip).
- inplace: If True, replace `self` with the restricted object; else return a new object.
- method: Method to use for filtering. 'strict' drops entire segments that overlap with non-target ancestry. 'clip' trims segments to target ancestry regions.
Returns:
Optional[IBDObject]: A restricted IBDObject if `inplace=False`. If `inplace=True`, returns None.
def read_ibd(file: Union[str, Path], **kwargs) -> IBDObject:
    """
    Read an IBD file into an `IBDObject`, letting `IBDReader` pick the reader.

    The path is handed to `IBDReader`, which returns a format-specific reader
    object; that reader's `read` method is then called with `**kwargs`.

    Args:
        file (str or pathlib.Path): Path to the file to be read.
        **kwargs: Additional arguments passed to the reader method.

    Returns:
        IBDObject: Whatever the selected reader's `read` method returns.
    """
    # Local import: the reader module is only loaded when this function is called.
    from snputils.ibd.io.read.auto import IBDReader

    reader = IBDReader(file)
    return reader.read(**kwargs)
Automatically detect the IBD data file format from the file's extension and read it into an IBDObject.
Supported formats:
- Hap-IBD (no standard extension; defaults to tab-delimited columns without header).
- ancIBD (`*.tsv` / `*.tsv.gz` files, or a directory of ancIBD outputs).
Arguments:
- file (str or pathlib.Path): Path to the file to be read.
- **kwargs: Additional arguments passed to the reader method.
class HapIBDReader(IBDBaseReader):
    """
    Reads an IBD file in Hap-IBD format and processes it into an `IBDObject`.
    """

    def read(self, separator: Optional[str] = None) -> IBDObject:
        """
        Read a Hap-IBD file into an `IBDObject`.

        The Hap-IBD format is a delimited text without a header with columns:
        sample_id_1, haplotype_id_1, sample_id_2, haplotype_id_2, chromosome, start, end, length_cm

        Notes:
            - Haplotype identifiers are 1-based and take values in {1, 2}.
            - Every segment is labelled "IBD1" in `segment_type`.
            - With `separator=None`, a path ending in `.gz` (any letter case) is opened with
              `gzip`, the text is decoded as UTF-8, and blank lines are skipped.

        Args:
            separator (str, optional): Field delimiter. If None, whitespace (any number of spaces or tabs) is assumed.

        Returns:
            **IBDObject**: An IBDObject instance.
        """
        log.info(f"Reading {self.file}")

        # Single source of truth for column names and dtypes (no header present in input).
        schema = {
            'sample_id_1': pl.Utf8,
            'haplotype_id_1': pl.Int8,
            'sample_id_2': pl.Utf8,
            'haplotype_id_2': pl.Int8,
            'chrom': pl.Utf8,
            'start': pl.Int64,
            'end': pl.Int64,
            'length_cm': pl.Float64,
        }
        read_options = dict(
            has_header=False,
            new_columns=list(schema),
            schema_overrides=schema,
        )

        if separator is None:
            # Polars doesn't support regex separators; normalize every whitespace run
            # to a single tab before parsing.
            opener = gzip.open if str(self.file).lower().endswith('.gz') else open
            with opener(self.file, 'rt', encoding='utf-8') as handle:
                rows = ["\t".join(line.split()) for line in handle if line.strip()]
            df = pl.read_csv(source=StringIO("\n".join(rows)), separator='\t', **read_options)
        else:
            df = pl.read_csv(source=str(self.file), separator=separator, **read_options)

        ibdobj = IBDObject(
            sample_id_1=df['sample_id_1'].to_numpy(),
            haplotype_id_1=df['haplotype_id_1'].to_numpy(),
            sample_id_2=df['sample_id_2'].to_numpy(),
            haplotype_id_2=df['haplotype_id_2'].to_numpy(),
            chrom=df['chrom'].to_numpy(),
            start=df['start'].to_numpy(),
            end=df['end'].to_numpy(),
            length_cm=df['length_cm'].to_numpy(),
            segment_type=np.full(df.height, "IBD1"),  # hap-IBD does not distinguish; treat as IBD1
        )

        log.info(f"Finished reading {self.file}")

        return ibdobj
Reads an IBD file in Hap-IBD format and processes it into an IBDObject.
def read(self, separator: Optional[str] = None) -> IBDObject:
    """
    Read a Hap-IBD file into an `IBDObject`.

    The Hap-IBD format is a delimited text without a header with columns:
    sample_id_1, haplotype_id_1, sample_id_2, haplotype_id_2, chromosome, start, end, length_cm

    Notes:
        - Haplotype identifiers are 1-based and take values in {1, 2}.
        - Every segment is labelled "IBD1" in `segment_type`.
        - With `separator=None`, a path ending in `.gz` (any letter case) is opened with
          `gzip`, the text is decoded as UTF-8, and blank lines are skipped.

    Args:
        separator (str, optional): Field delimiter. If None, whitespace (any number of spaces or tabs) is assumed.

    Returns:
        **IBDObject**: An IBDObject instance.
    """
    log.info(f"Reading {self.file}")

    # Single source of truth for column names and dtypes (no header present in input).
    schema = {
        'sample_id_1': pl.Utf8,
        'haplotype_id_1': pl.Int8,
        'sample_id_2': pl.Utf8,
        'haplotype_id_2': pl.Int8,
        'chrom': pl.Utf8,
        'start': pl.Int64,
        'end': pl.Int64,
        'length_cm': pl.Float64,
    }
    read_options = dict(
        has_header=False,
        new_columns=list(schema),
        schema_overrides=schema,
    )

    if separator is None:
        # Polars doesn't support regex separators; normalize every whitespace run
        # to a single tab before parsing.
        opener = gzip.open if str(self.file).lower().endswith('.gz') else open
        with opener(self.file, 'rt', encoding='utf-8') as handle:
            rows = ["\t".join(line.split()) for line in handle if line.strip()]
        df = pl.read_csv(source=StringIO("\n".join(rows)), separator='\t', **read_options)
    else:
        df = pl.read_csv(source=str(self.file), separator=separator, **read_options)

    ibdobj = IBDObject(
        sample_id_1=df['sample_id_1'].to_numpy(),
        haplotype_id_1=df['haplotype_id_1'].to_numpy(),
        sample_id_2=df['sample_id_2'].to_numpy(),
        haplotype_id_2=df['haplotype_id_2'].to_numpy(),
        chrom=df['chrom'].to_numpy(),
        start=df['start'].to_numpy(),
        end=df['end'].to_numpy(),
        length_cm=df['length_cm'].to_numpy(),
        segment_type=np.full(df.height, "IBD1"),  # hap-IBD does not distinguish; treat as IBD1
    )

    log.info(f"Finished reading {self.file}")

    return ibdobj
Read a Hap-IBD file into an IBDObject.
The Hap-IBD format is a delimited text without a header with columns: sample_id_1, haplotype_id_1, sample_id_2, haplotype_id_2, chromosome, start, end, length_cm
Notes:
- Haplotype identifiers are 1-based and take values in {1, 2}.
Arguments:
- separator (str, optional): Field delimiter. If None, whitespace (any number of spaces or tabs) is assumed.
Returns:
IBDObject: An IBDObject instance.
class AncIBDReader(IBDBaseReader):
    """
    Reads IBD data from ancIBD outputs (TSV), accepting a file (`ch_all.tsv` or `ch*.tsv`) or a directory.
    """

    def read(
        self,
        path: Optional[Union[str, Path]] = None,
        include_segment_types: Optional[Sequence[str]] = ("IBD1", "IBD2"),
    ) -> IBDObject:
        """
        Read ancIBD outputs and convert to `IBDObject`.

        Inputs accepted:
            - A single TSV (optionally gzipped), e.g. `ch_all.tsv[.gz]` or `ch{CHR}.tsv[.gz]`.
            - A directory containing per-chromosome TSVs or `ch_all.tsv`.

        Column schema (tab-separated with header):
            iid1, iid2, ch, Start, End, length, StartM, EndM, lengthM, StartBP, EndBP, segment_type

        Notes:
            - Haplotype indices are not provided by ancIBD; set to -1.
            - Positions in IBDObject use base-pair StartBP/EndBP.
            - Length uses centiMorgan as `lengthM * 100`.

        Args:
            path (str or Path, optional): Override input path. Defaults to `self.file`.
            include_segment_types (str or sequence of str, optional): Filter by `segment_type`
                (e.g., IBD1, IBD2). A single string is treated as one segment type. None to disable.

        Returns:
            **IBDObject**: An IBDObject instance.

        Raises:
            FileNotFoundError: If the input is a directory that contains neither `ch_all.tsv[.gz]`
                nor any `ch*.tsv[.gz]` file.
        """
        p = Path(path) if path is not None else Path(self.file)
        log.info(f"Reading ancIBD from {p}")

        files: list[Path]
        if p.is_dir():
            # Prefer combined file if present, else gather per-chromosome files
            combined = p / "ch_all.tsv"
            combined_gz = p / "ch_all.tsv.gz"
            if combined.exists():
                files = [combined]
            elif combined_gz.exists():
                files = [combined_gz]
            else:
                files = sorted([*p.glob("ch*.tsv"), *p.glob("ch*.tsv.gz")])
                if not files:
                    raise FileNotFoundError("No ancIBD output files found in directory.")
        else:
            files = [p]

        schema_overrides = {
            "iid1": pl.Utf8,
            "iid2": pl.Utf8,
            "ch": pl.Utf8,
            "Start": pl.Int64,
            "End": pl.Int64,
            "length": pl.Int64,  # marker span; not used
            "StartM": pl.Float64,
            "EndM": pl.Float64,
            "lengthM": pl.Float64,
            "StartBP": pl.Int64,
            "EndBP": pl.Int64,
            "segment_type": pl.Utf8,
        }

        frames = [
            pl.read_csv(str(f), separator="\t", has_header=True, schema_overrides=schema_overrides)
            for f in files
        ]
        df = pl.concat(frames, how="vertical") if len(frames) > 1 else frames[0]

        if include_segment_types is not None:
            # A bare string is itself a sequence of characters; list("IBD1") would yield
            # ['I', 'B', 'D', '1'] and silently drop every row, so wrap it instead.
            if isinstance(include_segment_types, str):
                wanted = [include_segment_types]
            else:
                wanted = list(include_segment_types)
            df = df.filter(pl.col("segment_type").is_in(wanted))

        # Map columns to IBDObject schema
        sample_id_1 = df["iid1"].to_numpy()
        sample_id_2 = df["iid2"].to_numpy()
        chrom = df["ch"].to_numpy()
        start_bp = df["StartBP"].to_numpy()
        end_bp = df["EndBP"].to_numpy()
        length_cm = (df["lengthM"] * 100.0).to_numpy()

        # ancIBD doesn't include haplotype indices; set to -1
        hap1 = np.full(sample_id_1.shape[0], -1, dtype=np.int8)
        hap2 = np.full(sample_id_2.shape[0], -1, dtype=np.int8)

        ibdobj = IBDObject(
            sample_id_1=sample_id_1,
            haplotype_id_1=hap1,
            sample_id_2=sample_id_2,
            haplotype_id_2=hap2,
            chrom=chrom,
            start=start_bp,
            end=end_bp,
            length_cm=length_cm,
            segment_type=df["segment_type"].to_numpy(),
        )

        log.info(f"Finished reading ancIBD from {p}")
        return ibdobj
Reads IBD data from ancIBD outputs (TSV), accepting a file (ch_all.tsv or ch*.tsv) or a directory.
def read(
    self,
    path: Optional[Union[str, Path]] = None,
    include_segment_types: Optional[Sequence[str]] = ("IBD1", "IBD2"),
) -> IBDObject:
    """
    Read ancIBD outputs and convert to `IBDObject`.

    Inputs accepted:
        - A single TSV (optionally gzipped), e.g. `ch_all.tsv[.gz]` or `ch{CHR}.tsv[.gz]`.
        - A directory containing per-chromosome TSVs or `ch_all.tsv`.

    Column schema (tab-separated with header):
        iid1, iid2, ch, Start, End, length, StartM, EndM, lengthM, StartBP, EndBP, segment_type

    Notes:
        - Haplotype indices are not provided by ancIBD; set to -1.
        - Positions in IBDObject use base-pair StartBP/EndBP.
        - Length uses centiMorgan as `lengthM * 100`.

    Args:
        path (str or Path, optional): Override input path. Defaults to `self.file`.
        include_segment_types (str or sequence of str, optional): Filter by `segment_type`
            (e.g., IBD1, IBD2). A single string is treated as one segment type. None to disable.

    Returns:
        **IBDObject**: An IBDObject instance.

    Raises:
        FileNotFoundError: If the input is a directory that contains neither `ch_all.tsv[.gz]`
            nor any `ch*.tsv[.gz]` file.
    """
    p = Path(path) if path is not None else Path(self.file)
    log.info(f"Reading ancIBD from {p}")

    files: list[Path]
    if p.is_dir():
        # Prefer combined file if present, else gather per-chromosome files
        combined = p / "ch_all.tsv"
        combined_gz = p / "ch_all.tsv.gz"
        if combined.exists():
            files = [combined]
        elif combined_gz.exists():
            files = [combined_gz]
        else:
            files = sorted([*p.glob("ch*.tsv"), *p.glob("ch*.tsv.gz")])
            if not files:
                raise FileNotFoundError("No ancIBD output files found in directory.")
    else:
        files = [p]

    schema_overrides = {
        "iid1": pl.Utf8,
        "iid2": pl.Utf8,
        "ch": pl.Utf8,
        "Start": pl.Int64,
        "End": pl.Int64,
        "length": pl.Int64,  # marker span; not used
        "StartM": pl.Float64,
        "EndM": pl.Float64,
        "lengthM": pl.Float64,
        "StartBP": pl.Int64,
        "EndBP": pl.Int64,
        "segment_type": pl.Utf8,
    }

    frames = [
        pl.read_csv(str(f), separator="\t", has_header=True, schema_overrides=schema_overrides)
        for f in files
    ]
    df = pl.concat(frames, how="vertical") if len(frames) > 1 else frames[0]

    if include_segment_types is not None:
        # A bare string is itself a sequence of characters; list("IBD1") would yield
        # ['I', 'B', 'D', '1'] and silently drop every row, so wrap it instead.
        if isinstance(include_segment_types, str):
            wanted = [include_segment_types]
        else:
            wanted = list(include_segment_types)
        df = df.filter(pl.col("segment_type").is_in(wanted))

    # Map columns to IBDObject schema
    sample_id_1 = df["iid1"].to_numpy()
    sample_id_2 = df["iid2"].to_numpy()
    chrom = df["ch"].to_numpy()
    start_bp = df["StartBP"].to_numpy()
    end_bp = df["EndBP"].to_numpy()
    length_cm = (df["lengthM"] * 100.0).to_numpy()

    # ancIBD doesn't include haplotype indices; set to -1
    hap1 = np.full(sample_id_1.shape[0], -1, dtype=np.int8)
    hap2 = np.full(sample_id_2.shape[0], -1, dtype=np.int8)

    ibdobj = IBDObject(
        sample_id_1=sample_id_1,
        haplotype_id_1=hap1,
        sample_id_2=sample_id_2,
        haplotype_id_2=hap2,
        chrom=chrom,
        start=start_bp,
        end=end_bp,
        length_cm=length_cm,
        segment_type=df["segment_type"].to_numpy(),
    )

    log.info(f"Finished reading ancIBD from {p}")
    return ibdobj
Read ancIBD outputs and convert to IBDObject.
Inputs accepted:
- A single TSV (optionally gzipped), e.g. `ch_all.tsv[.gz]` or `ch{CHR}.tsv[.gz]`.
- A directory containing per-chromosome TSVs or `ch_all.tsv`.
Column schema (tab-separated with header): iid1, iid2, ch, Start, End, length, StartM, EndM, lengthM, StartBP, EndBP, segment_type
Notes:
- Haplotype indices are not provided by ancIBD; set to -1.
- Positions in IBDObject use base-pair StartBP/EndBP.
- Length uses centiMorgan as `lengthM * 100`.
Arguments:
- path (str or Path, optional): Override input path. Defaults to `self.file`.
- include_segment_types (sequence of str, optional): Filter by `segment_type` (e.g., IBD1, IBD2). None to disable.
Returns:
IBDObject: An IBDObject instance.
class IBDReader:
    def __new__(cls, file: Union[str, Path]) -> object:
        """
        Factory: inspect `file` and return an instance of the matching reader class.

        Detection rules, in order:
            - Directory: `AncIBDReader` when it holds `ch_all.tsv[.gz]` or any
              `ch*.tsv[.gz]`; otherwise `HapIBDReader`.
            - File ending in `.ibd` / `.ibd.gz` (case-insensitive): `HapIBDReader`.
            - File ending in `.tsv` / `.tsv.gz` (case-insensitive): `AncIBDReader`.
            - Anything else: `HapIBDReader`.
        """
        file = Path(file)
        lowered = [ext.lower() for ext in file.suffixes]

        if file.is_dir():
            # ancIBD writes either one combined table or one table per chromosome.
            is_anc_ibd = (
                (file / 'ch_all.tsv').exists()
                or (file / 'ch_all.tsv.gz').exists()
                or any(file.glob('ch*.tsv'))
                or any(file.glob('ch*.tsv.gz'))
            )
        elif lowered[-2:] == ['.ibd', '.gz'] or lowered[-1:] == ['.ibd']:
            is_anc_ibd = False
        else:
            is_anc_ibd = lowered[-2:] == ['.tsv', '.gz'] or lowered[-1:] == ['.tsv']

        if is_anc_ibd:
            from snputils.ibd.io.read.anc_ibd import AncIBDReader
            return AncIBDReader(file)

        # Hap-IBD is the fallback for everything that was not recognised as ancIBD.
        from snputils.ibd.io.read.hap_ibd import HapIBDReader
        return HapIBDReader(file)
def __new__(cls, file: Union[str, Path]) -> object:
    """
    Factory: inspect `file` and return an instance of the matching reader class.

    Detection rules, in order:
        - Directory: `AncIBDReader` when it holds `ch_all.tsv[.gz]` or any
          `ch*.tsv[.gz]`; otherwise `HapIBDReader`.
        - File ending in `.ibd` / `.ibd.gz` (case-insensitive): `HapIBDReader`.
        - File ending in `.tsv` / `.tsv.gz` (case-insensitive): `AncIBDReader`.
        - Anything else: `HapIBDReader`.
    """
    file = Path(file)
    lowered = [ext.lower() for ext in file.suffixes]

    if file.is_dir():
        # ancIBD writes either one combined table or one table per chromosome.
        is_anc_ibd = (
            (file / 'ch_all.tsv').exists()
            or (file / 'ch_all.tsv.gz').exists()
            or any(file.glob('ch*.tsv'))
            or any(file.glob('ch*.tsv.gz'))
        )
    elif lowered[-2:] == ['.ibd', '.gz'] or lowered[-1:] == ['.ibd']:
        is_anc_ibd = False
    else:
        is_anc_ibd = lowered[-2:] == ['.tsv', '.gz'] or lowered[-1:] == ['.tsv']

    if is_anc_ibd:
        from snputils.ibd.io.read.anc_ibd import AncIBDReader
        return AncIBDReader(file)

    # Hap-IBD is the fallback for everything that was not recognised as ancIBD.
    from snputils.ibd.io.read.hap_ibd import HapIBDReader
    return HapIBDReader(file)
A factory class that attempts to detect the IBD file format and returns the corresponding reader.
Supported detections:
- Hap-IBD: *.ibd or *.ibd.gz (headerless, 8 columns)
- ancIBD: directories with `ch_all.tsv`/`ch*.tsv` or files *.tsv / *.tsv.gz with ancIBD schema
class MultiPhenotypeObject:
    """
    A class for multi-phenotype data.

    This class serves as a container for phenotype data, allowing for
    operations such as filtering samples and accessing phenotype information.
    It uses a DataFrame to store the data, with the first column reserved for the sample identifiers.
    """
    def __init__(
        self,
        phen_df: pd.DataFrame
    ) -> None:
        """
        Args:
            phen_df (pd.DataFrame):
                A Pandas DataFrame containing phenotype data, with the first column
                representing sample identifiers.
        """
        self.__phen_df = phen_df

    def __getitem__(self, key):
        """
        To access an attribute of the class using the square bracket notation,
        similar to a dictionary.

        Raises:
            KeyError: If `key` does not name an attribute of the object.
        """
        try:
            return getattr(self, key)
        except (AttributeError, TypeError) as err:
            # TypeError covers non-string keys. Anything else (e.g. KeyboardInterrupt)
            # must propagate instead of being masked as a KeyError.
            raise KeyError(f'Invalid key: {key}') from err

    def __setitem__(self, key, value):
        """
        To set an attribute of the class using the square bracket notation,
        similar to a dictionary.

        Raises:
            KeyError: If the attribute cannot be set.
        """
        try:
            setattr(self, key, value)
        except AttributeError as err:
            raise KeyError(f'Invalid key: {key}') from err

    @property
    def phen_df(self) -> pd.DataFrame:
        """
        Retrieve `phen_df`.

        Returns:
            pd.DataFrame:
                A Pandas DataFrame containing phenotype data, with the first column
                representing sample identifiers.
        """
        return self.__phen_df

    @phen_df.setter
    def phen_df(self, x: pd.DataFrame):
        """
        Update `phen_df`.
        """
        self.__phen_df = x

    @property
    def n_samples(self) -> int:
        """
        Retrieve `n_samples`.

        Returns:
            int: The total number of samples.
        """
        return len(self.phen_df)

    def copy(self):
        """
        Create and return a copy of the current `MultiPhenotypeObject` instance.

        The copy is shallow (`copy.copy`): the new object references the same DataFrame.

        Returns:
            MultiPhenotypeObject: A new instance of the current object.
        """
        return copy.copy(self)

    def filter_samples(
        self,
        samples: Optional[Union[str, Sequence[str], np.ndarray]] = None,
        indexes: Optional[Union[int, Sequence[int], np.ndarray]] = None,
        include: bool = True,
        reorder: bool = False,
        inplace: bool = False
    ) -> Optional['MultiPhenotypeObject']:
        """
        Filter samples in the `MultiPhenotypeObject` based on sample names or indexes.

        This method allows you to include or exclude specific samples by their names,
        indexes, or both. When both samples and indexes are provided, the union of
        the specified samples is used. Negative indexes are supported and follow NumPy's indexing
        conventions. Set `reorder=True` to match the ordering of the provided `samples` and/or
        `indexes` lists when including.

        Args:
            samples (str or array_like of str, optional):
                Names of the samples to include or exclude. Can be a single sample name or a
                sequence of sample names. Default is None.
            indexes (int or array_like of int, optional):
                Indexes of the samples to include or exclude. Can be a single index or a sequence
                of indexes. Negative indexes are supported. Default is None.
            include (bool, default=True):
                If True, includes only the specified samples. If False, excludes the specified
                samples. Default is True.
            reorder (bool, default=False):
                Only used when `include=True`. If True, the kept rows follow the order of
                `samples` first, then `indexes`, then any remaining selected rows in their
                original order. Default is False.
            inplace (bool, default=False):
                If True, modifies the object in place. If False, returns a new
                `MultiPhenotypeObject` with the samples filtered. Default is False.

        Returns:
            Optional[MultiPhenotypeObject]: Returns a new MultiPhenotypeObject with the specified samples
            filtered if `inplace=False`. If `inplace=True`, modifies the object in place and returns None.

        Raises:
            ValueError: If neither `samples` nor `indexes` is provided.
            IndexError: If any index lies outside `[-n_samples, n_samples)`.
        """
        # Ensure at least one of samples or indexes is provided
        if samples is None and indexes is None:
            raise ValueError("At least one of 'samples' or 'indexes' must be provided.")

        n_samples = self.n_samples
        sample_names = self.__phen_df.iloc[:, 0].values

        # Create mask based on sample names
        if samples is not None:
            samples = np.asarray(samples).ravel()
            mask_samples = np.isin(sample_names, samples)
        else:
            mask_samples = np.zeros(n_samples, dtype=bool)

        # Create mask based on sample indexes
        mask_indexes = np.zeros(n_samples, dtype=bool)
        if indexes is not None:
            indexes = np.asarray(indexes).ravel()
            # Validate BEFORE wrapping: after np.mod every value is in range, so checking
            # afterwards can never fail and out-of-range indexes would silently alias
            # another sample.
            if np.any((indexes < -n_samples) | (indexes >= n_samples)):
                raise IndexError("One or more sample indexes are out of bounds.")
            # Map negative indexes onto their non-negative equivalents
            indexes = np.mod(indexes, n_samples)
            mask_indexes[indexes] = True

        # Combine masks using logical OR (union of samples)
        mask_combined = mask_samples | mask_indexes

        if not include:
            # Invert mask if excluding samples
            mask_combined = ~mask_combined

        # If requested, compute an ordering of selected rows that follows the provided lists
        ordered_indices = None
        if include and reorder:
            ordered_list = []
            added = np.zeros(n_samples, dtype=bool)

            def _take(row: int) -> None:
                # Append a selected row once, preserving first-seen order.
                if mask_combined[row] and not added[row]:
                    ordered_list.append(int(row))
                    added[row] = True

            # Respect the order provided in `samples` (supports duplicate sample names)
            if samples is not None:
                for s in samples:
                    for row in np.where(sample_names == s)[0]:
                        _take(row)

            # Then respect the order in `indexes` (already wrapped to non-negative values)
            if indexes is not None:
                for row in indexes:
                    _take(row)

            # Finally, append any remaining selected rows in their original order
            for row in np.where(mask_combined)[0]:
                _take(row)

            ordered_indices = np.asarray(ordered_list, dtype=int)

        # Filter the phenotype DataFrame once; both return paths share the result
        if ordered_indices is not None:
            filtered = self.__phen_df.iloc[ordered_indices].reset_index(drop=True)
        else:
            filtered = self.__phen_df[mask_combined].reset_index(drop=True)

        if inplace:
            self['phen_df'] = filtered
            return None

        phen_obj = self.copy()
        phen_obj['phen_df'] = filtered
        return phen_obj
A class for multi-phenotype data.
This class serves as a container for phenotype data, allowing for operations such as filtering samples and accessing phenotype information. It uses a DataFrame to store the data, with the first column reserved for the sample identifiers.
17 def __init__( 18 self, 19 phen_df: pd.DataFrame 20 ) -> None: 21 """ 22 Args: 23 phen_df (pd.DataFrame): 24 A Pandas DataFrame containing phenotype data, with the first column 25 representing sample identifiers. 26 """ 27 self.__phen_df = phen_df
Arguments:
- phen_df (pd.DataFrame): A Pandas DataFrame containing phenotype data, with the first column representing sample identifiers.
49 @property 50 def phen_df(self) -> pd.DataFrame: 51 """ 52 Retrieve `phen_df`. 53 54 Returns: 55 pd.DataFrame: 56 A Pandas DataFrame containing phenotype data, with the first column 57 representing sample identifiers. 58 """ 59 return self.__phen_df
Retrieve phen_df.
Returns:
pd.DataFrame: A Pandas DataFrame containing phenotype data, with the first column representing sample identifiers.
78 def copy(self): 79 """ 80 Create and return a copy of the current `MultiPhenotypeObject` instance. 81 82 Returns: 83 MultiPhenotypeObject: A new instance of the current object. 84 """ 85 return copy.copy(self)
Create and return a copy of the current MultiPhenotypeObject instance.
Returns:
MultiPhenotypeObject: A new instance of the current object.
def filter_samples(
    self,
    samples: Optional[Union[str, Sequence[str], np.ndarray]] = None,
    indexes: Optional[Union[int, Sequence[int], np.ndarray]] = None,
    include: bool = True,
    reorder: bool = False,
    inplace: bool = False
) -> Optional['MultiPhenotypeObject']:
    """
    Filter samples in the `MultiPhenotypeObject` based on sample names or indexes.

    This method allows you to include or exclude specific samples by their names,
    indexes, or both. When both samples and indexes are provided, the union of
    the specified samples is used. Negative indexes are supported and follow NumPy's
    indexing conventions (they count from the end; anything outside
    `[-n_samples, n_samples)` is an error).

    Args:
        samples (str or array_like of str, optional):
            Names of the samples to include or exclude. Can be a single sample name or a
            sequence of sample names. Default is None.
        indexes (int or array_like of int, optional):
            Indexes of the samples to include or exclude. Can be a single index or a sequence
            of indexes. Negative indexes are supported. Default is None.
        include (bool, default=True):
            If True, includes only the specified samples. If False, excludes the specified
            samples. Default is True.
        reorder (bool, default=False):
            Only used when `include=True`. If True, the retained rows are ordered to follow
            `samples` first, then `indexes`, then any remaining selected rows in their
            original order. If False, the original row order is kept.
        inplace (bool, default=False):
            If True, modifies the object in place. If False, returns a new
            `MultiPhenotypeObject` with the samples filtered. Default is False.

    Returns:
        Optional[MultiPhenotypeObject]: Returns a new MultiPhenotypeObject with the specified samples
        filtered if `inplace=False`. If `inplace=True`, modifies the object in place and returns None.

    Raises:
        ValueError: If neither `samples` nor `indexes` is provided.
        IndexError: If any index lies outside `[-n_samples, n_samples)`.
    """
    # Ensure at least one of samples or indexes is provided
    if samples is None and indexes is None:
        raise ValueError("At least one of 'samples' or 'indexes' must be provided.")

    n_samples = self.n_samples
    # Sample names live in the first column of the phenotype table
    sample_names = self.__phen_df.iloc[:, 0].values

    # Mask of rows whose name was requested
    if samples is not None:
        samples = np.asarray(samples).ravel()
        mask_samples = np.isin(sample_names, samples)
    else:
        mask_samples = np.zeros(n_samples, dtype=bool)

    # Mask of rows whose position was requested
    mask_indexes = np.zeros(n_samples, dtype=bool)
    if indexes is not None:
        indexes = np.asarray(indexes).ravel()
        if indexes.size == 0:
            # An empty list becomes a float array, which cannot be used for indexing
            indexes = indexes.astype(np.intp)
        # Validate BEFORE normalising negatives: wrapping with a modulo first
        # would map every out-of-range index onto a valid row and hide the error.
        if np.any((indexes < -n_samples) | (indexes >= n_samples)):
            raise IndexError("One or more sample indexes are out of bounds.")
        indexes = np.where(indexes < 0, indexes + n_samples, indexes)
        mask_indexes[indexes] = True

    # Combine masks using logical OR (union of samples)
    mask_combined = mask_samples | mask_indexes

    if not include:
        # Invert mask if excluding samples
        mask_combined = ~mask_combined

    # If requested, compute an ordering of selected rows that follows the provided lists
    ordered_indices = None
    if include and reorder:
        ordered_list = []
        added = np.zeros(n_samples, dtype=bool)

        # Respect the order provided in `samples` (supports duplicate sample names)
        if samples is not None:
            for name in samples:
                for idx in np.where(sample_names == name)[0]:
                    if mask_combined[idx] and not added[idx]:
                        ordered_list.append(int(idx))
                        added[idx] = True

        # Then respect the order in `indexes` (already normalised to non-negative)
        if indexes is not None:
            for idx in indexes:
                if mask_combined[idx] and not added[idx]:
                    ordered_list.append(int(idx))
                    added[idx] = True

        # Finally, append any remaining selected rows in their original order
        for idx in np.where(mask_combined)[0]:
            if not added[idx]:
                ordered_list.append(int(idx))

        ordered_indices = np.asarray(ordered_list, dtype=int)

    # Filter the phenotype DataFrame on either this object or a copy of it
    target = self if inplace else self.copy()
    if ordered_indices is not None:
        filtered = target['phen_df'].iloc[ordered_indices]
    else:
        filtered = target['phen_df'][mask_combined]
    target['phen_df'] = filtered.reset_index(drop=True)

    return None if inplace else target
Filter samples in the MultiPhenotypeObject based on sample names or indexes.
This method allows you to include or exclude specific samples by their names,
indexes, or both. When both samples and indexes are provided, the union of
the specified samples is used. Negative indexes are supported and follow NumPy's indexing
conventions. Set reorder=True to match the ordering of the provided samples and/or
indexes lists when including.
Arguments:
- samples (str or array_like of str, optional): Names of the samples to include or exclude. Can be a single sample name or a sequence of sample names. Default is None.
- indexes (int or array_like of int, optional): Indexes of the samples to include or exclude. Can be a single index or a sequence of indexes. Negative indexes are supported. Default is None.
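- include (bool, default=True): If True, includes only the specified samples. If False, excludes the specified samples. Default is True.
- reorder (bool, default=False): Only used when include=True. If True, the retained rows follow the order of samples first, then indexes, then any remaining selected rows in their original order. Default is False.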
- inplace (bool, default=False): If True, modifies the object in place. If False, returns a new MultiPhenotypeObject with the samples filtered. Default is False.
Returns:
Optional[MultiPhenotypeObject]: Returns a new MultiPhenotypeObject with the specified samples filtered if inplace=False. If inplace=True, modifies the object in place and returns None.
class PhenotypeObject:
    """
    Generic phenotype container for single-trait analyses.

    The object stores sample IDs, normalized phenotype values, inferred/declared
    trait type, and binary case/control convenience attributes.
    """

    def __init__(
        self,
        samples: Sequence[str],
        values: Sequence[float],
        phenotype_name: str = "PHENO",
        quantitative: Optional[bool] = None,
    ) -> None:
        """
        Validate the inputs and derive the case/control and haplotype lists.

        Args:
            samples (sequence of str):
                Sample identifiers; each is converted with `str()` and must be unique.
            values (sequence of float):
                One finite numeric phenotype value per sample.
            phenotype_name (str, default="PHENO"):
                Name of the trait.
            quantitative (bool, optional):
                Trait type. If None, the trait is treated as binary when its values
                use exactly two levels encoded as {0,1} or {1,2}, and as
                quantitative otherwise.

        Raises:
            ValueError: If there are no samples, sample IDs are duplicated, values
                are non-numeric, not 1-dimensional, of the wrong length or
                non-finite, a quantitative trait has zero variance, or a binary
                trait is not encoded as {0,1}/{1,2} or lacks cases or controls.
        """
        sample_ids = [str(sample) for sample in samples]
        if len(sample_ids) == 0:
            raise ValueError("Phenotype file contains no samples.")
        if len(set(sample_ids)) != len(sample_ids):
            raise ValueError("Phenotype sample IDs must be unique.")

        try:
            values_f64 = np.asarray(values, dtype=np.float64)
        except (TypeError, ValueError) as exc:
            raise ValueError("Phenotype values must be numeric.") from exc

        if values_f64.ndim != 1:
            raise ValueError("Phenotype values must be a 1-dimensional array.")
        if values_f64.size != len(sample_ids):
            raise ValueError(
                "Phenotype sample/value length mismatch: "
                f"{len(sample_ids)} samples but {values_f64.size} values."
            )
        if not np.all(np.isfinite(values_f64)):
            raise ValueError("Phenotype contains non-finite values (NaN/Inf).")

        trait_is_quantitative = (
            self._infer_quantitative(values_f64)
            if quantitative is None
            else bool(quantitative)
        )

        if trait_is_quantitative:
            if float(np.var(values_f64)) <= 0.0:
                raise ValueError("Quantitative phenotype has zero variance.")
            normalized_values = values_f64
            # Quantitative traits have no cases; every sample is listed as a control.
            cases: List[str] = []
            controls: List[str] = sample_ids.copy()
        else:
            normalized_values = self._normalize_binary(values_f64)
            case_mask = normalized_values == 1
            control_mask = normalized_values == 0
            cases = [sample_ids[idx] for idx in np.where(case_mask)[0].tolist()]
            controls = [sample_ids[idx] for idx in np.where(control_mask)[0].tolist()]
            if len(cases) == 0:
                raise ValueError("No case data available.")
            if len(controls) == 0:
                raise ValueError("No control data available.")

        self._samples = sample_ids
        self._values = normalized_values
        self._phenotype_name = str(phenotype_name)
        self._is_quantitative = trait_is_quantitative

        self._cases = cases
        self._controls = controls
        # Haplotype IDs: all "<sample>.0" entries first, then all "<sample>.1".
        self._all_haplotypes = [f"{sample}.0" for sample in sample_ids] + [
            f"{sample}.1" for sample in sample_ids
        ]
        self._cases_haplotypes = [f"{sample}.0" for sample in cases] + [
            f"{sample}.1" for sample in cases
        ]
        self._controls_haplotypes = [f"{sample}.0" for sample in controls] + [
            f"{sample}.1" for sample in controls
        ]

    @staticmethod
    def _matches_binary_encoding(values_f64: np.ndarray, encoding: Sequence[float]) -> bool:
        """Return True if the values have exactly two unique levels matching `encoding` (atol 1e-8)."""
        unique_vals = np.unique(values_f64)
        if unique_vals.size != 2:
            return False
        target = np.asarray(sorted(float(v) for v in encoding), dtype=np.float64)
        observed = np.asarray(sorted(unique_vals.tolist()), dtype=np.float64)
        return bool(np.allclose(observed, target, rtol=0.0, atol=1e-8))

    @staticmethod
    def _infer_quantitative(values_f64: np.ndarray) -> bool:
        """Return True unless the values use a {0,1} or {1,2} binary encoding."""
        return not (
            PhenotypeObject._matches_binary_encoding(values_f64, (0.0, 1.0))
            or PhenotypeObject._matches_binary_encoding(values_f64, (1.0, 2.0))
        )

    @staticmethod
    def _normalize_binary(values_f64: np.ndarray) -> np.ndarray:
        """
        Map a {1,2}- or {0,1}-encoded binary trait to an int8 array of 0 (control) / 1 (case).

        Raises:
            ValueError: If the values do not use one of the two supported encodings.
        """
        if PhenotypeObject._matches_binary_encoding(values_f64, (1.0, 2.0)):
            return np.isclose(values_f64, 2.0, rtol=0.0, atol=1e-8).astype(np.int8)
        if PhenotypeObject._matches_binary_encoding(values_f64, (0.0, 1.0)):
            # Compare with the same tolerance used to accept the encoding. A plain
            # integer cast truncates, so e.g. 0.999999999 would become 0 (control).
            return np.isclose(values_f64, 1.0, rtol=0.0, atol=1e-8).astype(np.int8)
        unique_vals = np.unique(values_f64)
        raise ValueError(
            "Binary phenotype must use exactly two levels encoded as {1,2} or {0,1}. "
            f"Observed unique values: {sorted(unique_vals.tolist())}"
        )

    def __getitem__(self, key):
        """Dictionary-style read access to attributes; unknown names raise KeyError."""
        try:
            return getattr(self, key)
        except AttributeError as exc:
            raise KeyError(f"Invalid key: {key}") from exc

    def __setitem__(self, key, value):
        """Dictionary-style attribute assignment; an AttributeError is re-raised as KeyError."""
        try:
            setattr(self, key, value)
        except AttributeError as exc:
            raise KeyError(f"Invalid key: {key}") from exc

    @property
    def samples(self) -> List[str]:
        """Sample identifiers, in input order."""
        return self._samples

    @property
    def n_samples(self) -> int:
        """Number of samples."""
        return len(self._samples)

    @property
    def values(self) -> np.ndarray:
        """Phenotype values (0/1 int8 for binary traits, float64 for quantitative traits)."""
        return self._values

    @property
    def y(self) -> np.ndarray:
        """Alias of `values`."""
        return self._values

    @property
    def phenotype_name(self) -> str:
        """Name of the trait."""
        return self._phenotype_name

    @property
    def is_quantitative(self) -> bool:
        """True if the trait is quantitative, False if binary."""
        return self._is_quantitative

    @property
    def quantitative(self) -> bool:
        """Alias of `is_quantitative`."""
        return self._is_quantitative

    @property
    def cases(self) -> List[str]:
        """Sample IDs of cases (empty for quantitative traits)."""
        return self._cases

    @property
    def n_cases(self) -> int:
        """Number of cases."""
        return len(self._cases)

    @property
    def controls(self) -> List[str]:
        """Sample IDs of controls (all samples for quantitative traits)."""
        return self._controls

    @property
    def n_controls(self) -> int:
        """Number of controls."""
        return len(self._controls)

    @property
    def all_haplotypes(self) -> List[str]:
        """Haplotype IDs ("<sample>.0" then "<sample>.1") for all samples."""
        return self._all_haplotypes

    @property
    def cases_haplotypes(self) -> List[str]:
        """Haplotype IDs ("<sample>.0" then "<sample>.1") for cases."""
        return self._cases_haplotypes

    @property
    def controls_haplotypes(self) -> List[str]:
        """Haplotype IDs ("<sample>.0" then "<sample>.1") for controls."""
        return self._controls_haplotypes

    def copy(self):
        """Return a shallow copy (`copy.copy`) of this object."""
        return copy.copy(self)

    def keys(self) -> List[str]:
        """Return the attribute names available through item access."""
        return [
            "samples",
            "n_samples",
            "values",
            "y",
            "phenotype_name",
            "is_quantitative",
            "quantitative",
            "cases",
            "n_cases",
            "controls",
            "n_controls",
            "all_haplotypes",
            "cases_haplotypes",
            "controls_haplotypes",
        ]
Generic phenotype container for single-trait analyses.
The object stores sample IDs, normalized phenotype values, inferred/declared trait type, and binary case/control convenience attributes.
def __init__(
    self,
    samples: Sequence[str],
    values: Sequence[float],
    phenotype_name: str = "PHENO",
    quantitative: Optional[bool] = None,
) -> None:
    """
    Validate the inputs and derive the case/control and haplotype lists.

    Args:
        samples (sequence of str):
            Sample identifiers; each is converted with `str()` and must be unique.
        values (sequence of float):
            One finite numeric phenotype value per sample.
        phenotype_name (str, default="PHENO"):
            Name of the trait.
        quantitative (bool, optional):
            Trait type. If None, it is inferred from the values.

    Raises:
        ValueError: On empty or duplicated sample IDs, non-numeric, non-1D,
            wrong-length or non-finite values, a zero-variance quantitative
            trait, or a binary trait without cases or without controls.
    """
    ids = list(map(str, samples))
    if not ids:
        raise ValueError("Phenotype file contains no samples.")
    if len(ids) != len(set(ids)):
        raise ValueError("Phenotype sample IDs must be unique.")

    try:
        arr = np.asarray(values, dtype=np.float64)
    except (TypeError, ValueError) as exc:
        raise ValueError("Phenotype values must be numeric.") from exc

    if arr.ndim != 1:
        raise ValueError("Phenotype values must be a 1-dimensional array.")
    n_values = arr.size
    if n_values != len(ids):
        raise ValueError(
            "Phenotype sample/value length mismatch: "
            f"{len(ids)} samples but {n_values} values."
        )
    if not np.isfinite(arr).all():
        raise ValueError("Phenotype contains non-finite values (NaN/Inf).")

    if quantitative is None:
        is_quant = self._infer_quantitative(arr)
    else:
        is_quant = bool(quantitative)

    if not is_quant:
        encoded = self._normalize_binary(arr)
        cases = [ids[pos] for pos in np.flatnonzero(encoded == 1).tolist()]
        controls = [ids[pos] for pos in np.flatnonzero(encoded == 0).tolist()]
        if not cases:
            raise ValueError("No case data available.")
        if not controls:
            raise ValueError("No control data available.")
    else:
        if float(np.var(arr)) <= 0.0:
            raise ValueError("Quantitative phenotype has zero variance.")
        encoded = arr
        # Quantitative traits have no cases; every sample is listed as a control.
        cases = []
        controls = list(ids)

    def _haplotype_ids(names):
        # All "<sample>.0" entries first, followed by all "<sample>.1" entries.
        return [f"{name}.0" for name in names] + [f"{name}.1" for name in names]

    self._samples = ids
    self._values = encoded
    self._phenotype_name = str(phenotype_name)
    self._is_quantitative = is_quant

    self._cases = cases
    self._controls = controls
    self._all_haplotypes = _haplotype_ids(ids)
    self._cases_haplotypes = _haplotype_ids(cases)
    self._controls_haplotypes = _haplotype_ids(controls)
def keys(self) -> List[str]:
    """Return the attribute names available through item access, in a fixed order."""
    core = ("samples", "n_samples", "values", "y")
    trait = ("phenotype_name", "is_quantitative", "quantitative")
    groups = ("cases", "n_cases", "controls", "n_controls")
    haplotypes = ("all_haplotypes", "cases_haplotypes", "controls_haplotypes")
    return [*core, *trait, *groups, *haplotypes]
class MultiPhenReader(PhenotypeBaseReader):
    """
    Reader for multi-phenotype data from file (.xlsx, .csv, .tsv, .txt, .phe, .pheno, .map, .smap, .phen),
    constructing a `MultiPhenotypeObject`.
    """
    def __init__(self, file: Union[str, Path]) -> None:
        """
        Args:
            file (str or pathlib.Path):
                Path to the file containing phenotype data. Accepted formats: .xlsx, .csv, .tsv, .txt, .phe, .pheno, .map, .smap, .phen.
        """
        self.__file = file

    @property
    def file(self) -> Path:
        """
        Retrieve `file`.

        Returns:
            pathlib.Path:
                Path to the file containing phenotype data, exactly as passed to the
                constructor. Accepted formats: .xlsx, .csv, .tsv, .txt, .phe, .pheno, .map, .smap, .phen.
        """
        return self.__file

    def read(
        self,
        samples_idx: int = 0,
        phen_names: Optional[List] = None,
        sep: str = ',',
        header: int = 0,
        drop: bool = False
    ) -> 'MultiPhenotypeObject':
        """
        Read data from `file` and construct a `MultiPhenotypeObject`.

        Args:
            samples_idx (int, default=0): Index of the column containing sample identifiers.
                Default is 0, assuming the first column contains sample identifiers.
            phen_names (list of str, optional): List of phenotype column names. If provided,
                the phenotype columns are renamed to the specified names. With `drop=True`
                the names must be existing column names; they select and order the columns.
            sep (str, default=','): The delimiter used for `.csv`, `.map` and `.smap` files.
                `.tsv` files are always read as tab-delimited and `.txt`, `.phe` and `.pheno`
                files as whitespace-delimited, regardless of `sep`.
            header (int, default=0): Row index to use as the column names. By default,
                uses the first row (`header=0`). Set to `None` if the file has no header row.
                Not used for `.phen` files, whose first line is always skipped.
            drop (bool, default=False): If True, removes columns not listed in `phen_names`
                (except the samples column).

        Returns:
            MultiPhenotypeObject:
                A multi-phenotype object instance.

        Raises:
            ValueError: If the file extension is unsupported, or if the number of
                phenotype columns does not match the length of `phen_names`.
        """
        file_extension = os.path.splitext(self.file)[1]
        # Dispatch case-insensitively so e.g. '.CSV' is handled like '.csv'
        ext = file_extension.lower()

        log.info(f"Reading '{file_extension}' file from '{self.file}'...")

        if ext == '.xlsx':
            phen_df = pd.read_excel(self.file, header=header, index_col=None)
        elif ext in ('.csv', '.map', '.smap'):
            phen_df = pd.read_csv(self.file, sep=sep, header=header)
        elif ext == '.tsv':
            phen_df = pd.read_csv(self.file, sep='\t', header=header)
        elif ext in ('.txt', '.phe', '.pheno'):
            phen_df = pd.read_csv(self.file, sep=r'\s+', header=header)
        elif ext == '.phen':
            phen_dict = {}
            with open(self.file, 'r') as f:
                next(f, None)  # the first line is a header and is skipped
                for line in f:
                    fields = line.split()
                    if not fields:
                        continue  # tolerate blank lines (e.g. a trailing newline)
                    phen_dict[fields[0]] = fields[1]
            phen_df = pd.DataFrame({'samples': list(phen_dict.keys()), 'phenotype': list(phen_dict.values())})
        else:
            raise ValueError(
                f"Unsupported file extension {file_extension}. Supported extensions: {SUPPORTED_EXTENSIONS}."
            )

        phen_df.rename(columns={phen_df.columns[samples_idx]: 'samples'}, inplace=True)

        if samples_idx != 0:
            # Move the samples column to the front
            cols = ['samples'] + [col for col in phen_df.columns if col != 'samples']
            phen_df = phen_df[cols]

        if phen_names is not None:
            if drop:
                # Select by name, in the order given by `phen_names`, so that the
                # positional renaming below cannot attach a label to the wrong column.
                wanted = dict.fromkeys(name for name in phen_names if name != 'samples')
                keep = [name for name in wanted if name in phen_df.columns]
                phen_df = phen_df[['samples'] + keep]

            phenotype_col_count = phen_df.shape[1] - 1
            if phenotype_col_count != len(phen_names):
                raise ValueError(f"Mismatch between number of phenotype columns ({phenotype_col_count}) "
                                 f"and length of `phen_names` ({len(phen_names)}).")
            # Assign a new column index; writing into `columns.values` in place
            # mutates an immutable Index and is not supported by pandas.
            phen_df.columns = ['samples'] + list(phen_names)

        return MultiPhenotypeObject(phen_df=phen_df)
Reader for multi-phenotype data from file (.xlsx, .csv, .tsv, .txt, .phe, .pheno, .map, .smap, .phen),
constructing a MultiPhenotypeObject.
def __init__(self, file: Union[str, Path]) -> None:
    """
    Remember the location of the phenotype file; nothing is read yet.

    Args:
        file (str or pathlib.Path):
            Path to the file containing phenotype data. Accepted formats: .xlsx, .csv, .tsv, .txt, .phe, .pheno, .map, .smap, .phen.
    """
    # Stored as given (no conversion); exposed through the `file` property.
    self.__file = file
Arguments:
- file (str or pathlib.Path): Path to the file containing phenotype data. Accepted formats: .xlsx, .csv, .tsv, .txt, .phe, .pheno, .map, .smap, .phen.
@property
def file(self) -> Path:
    """
    Location of the phenotype file.

    Returns:
        pathlib.Path:
            Path to the file containing phenotype data, as stored by the
            constructor. Accepted formats: .xlsx, .csv, .tsv, .txt, .phe, .pheno, .map, .smap, .phen.
    """
    location = self.__file
    return location
Retrieve file.
Returns:
pathlib.Path: Path to the file containing phenotype data. Accepted formats: .xlsx, .csv, .tsv, .txt, .phe, .pheno, .map, .smap, .phen.
def read(
    self,
    samples_idx: int = 0,
    phen_names: Optional[List] = None,
    sep: str = ',',
    header: int = 0,
    drop: bool = False
) -> 'MultiPhenotypeObject':
    """
    Read data from `file` and construct a `MultiPhenotypeObject`.

    Args:
        samples_idx (int, default=0): Index of the column containing sample identifiers.
            Default is 0, assuming the first column contains sample identifiers.
        phen_names (list of str, optional): List of phenotype column names. If provided,
            the phenotype columns are renamed to the specified names. With `drop=True`
            the names must be existing column names; they select and order the columns.
        sep (str, default=','): The delimiter used for `.csv`, `.map` and `.smap` files.
            `.tsv` files are always read as tab-delimited and `.txt`, `.phe` and `.pheno`
            files as whitespace-delimited, regardless of `sep`.
        header (int, default=0): Row index to use as the column names. By default,
            uses the first row (`header=0`). Set to `None` if the file has no header row.
            Not used for `.phen` files, whose first line is always skipped.
        drop (bool, default=False): If True, removes columns not listed in `phen_names`
            (except the samples column).

    Returns:
        MultiPhenotypeObject:
            A multi-phenotype object instance.

    Raises:
        ValueError: If the file extension is unsupported, or if the number of
            phenotype columns does not match the length of `phen_names`.
    """
    file_extension = os.path.splitext(self.file)[1]
    # Dispatch case-insensitively so e.g. '.CSV' is handled like '.csv'
    ext = file_extension.lower()

    log.info(f"Reading '{file_extension}' file from '{self.file}'...")

    if ext == '.xlsx':
        phen_df = pd.read_excel(self.file, header=header, index_col=None)
    elif ext in ('.csv', '.map', '.smap'):
        phen_df = pd.read_csv(self.file, sep=sep, header=header)
    elif ext == '.tsv':
        phen_df = pd.read_csv(self.file, sep='\t', header=header)
    elif ext in ('.txt', '.phe', '.pheno'):
        phen_df = pd.read_csv(self.file, sep=r'\s+', header=header)
    elif ext == '.phen':
        phen_dict = {}
        with open(self.file, 'r') as f:
            next(f, None)  # the first line is a header and is skipped
            for line in f:
                fields = line.split()
                if not fields:
                    continue  # tolerate blank lines (e.g. a trailing newline)
                phen_dict[fields[0]] = fields[1]
        phen_df = pd.DataFrame({'samples': list(phen_dict.keys()), 'phenotype': list(phen_dict.values())})
    else:
        raise ValueError(
            f"Unsupported file extension {file_extension}. Supported extensions: {SUPPORTED_EXTENSIONS}."
        )

    phen_df.rename(columns={phen_df.columns[samples_idx]: 'samples'}, inplace=True)

    if samples_idx != 0:
        # Move the samples column to the front
        cols = ['samples'] + [col for col in phen_df.columns if col != 'samples']
        phen_df = phen_df[cols]

    if phen_names is not None:
        if drop:
            # Select by name, in the order given by `phen_names`, so that the
            # positional renaming below cannot attach a label to the wrong column.
            wanted = dict.fromkeys(name for name in phen_names if name != 'samples')
            keep = [name for name in wanted if name in phen_df.columns]
            phen_df = phen_df[['samples'] + keep]

        phenotype_col_count = phen_df.shape[1] - 1
        if phenotype_col_count != len(phen_names):
            raise ValueError(f"Mismatch between number of phenotype columns ({phenotype_col_count}) "
                             f"and length of `phen_names` ({len(phen_names)}).")
        # Assign a new column index; writing into `columns.values` in place
        # mutates an immutable Index and is not supported by pandas.
        phen_df.columns = ['samples'] + list(phen_names)

    return MultiPhenotypeObject(phen_df=phen_df)
Read data from file and construct a MultiPhenotypeObject.
Arguments:
- samples_idx (int, default=0): Index of the column containing sample identifiers. Default is 0, assuming the first column contains sample identifiers.
- phen_names (list of str, optional): List of phenotype column names. If provided, these columns will be renamed to the specified names.
- sep (str, default=','): The delimiter for separating values in .csv, .tsv, .txt, .phe, .pheno, or .map files. Default is ','; use sep=r'\s+' for whitespace-delimited.
- header (int, default=0): Row index to use as the column names. By default, uses the first row (header=0). Set to None if column names are provided explicitly.
- drop (bool, default=False): If True, removes columns not listed in phen_names (except the samples column).
Returns:
MultiPhenotypeObject: A multi-phenotype object instance.
class PhenotypeReader(PhenotypeBaseReader):
    """
    Reader for single-trait phenotype files (any extension; common: .txt, .phe, .pheno).

    Expected format (headered, whitespace-delimited):
    - Must include `IID` (optionally preceded by `FID`)
    - First phenotype column after `IID` is used by default
    """

    def __init__(self, file: Union[str, Path]) -> None:
        """Delegate storage of `file` to the base reader."""
        super().__init__(file)

    @property
    def file(self) -> Path:
        """Path to the phenotype file, as a `pathlib.Path`."""
        return Path(self._file)

    @staticmethod
    def _has_header_with_iid(file_path: Path) -> bool:
        """
        Report whether the first non-blank line contains an `IID` token.

        Tokens are compared case-insensitively after stripping leading `#`.

        Raises:
            ValueError: If the file has no non-blank line.
        """
        with open(file_path, "r", encoding="utf-8") as handle:
            stripped = (raw_line.strip() for raw_line in handle)
            first_line = next((line for line in stripped if line), None)
        if first_line is None:
            raise ValueError("Empty phenotype file.")
        return any(token.lstrip("#").upper() == "IID" for token in first_line.split())

    @staticmethod
    def _resolve_column(columns, normalized_columns, requested: str) -> Optional[str]:
        """
        Find the column matching `requested`, either exactly or after
        normalization (leading `#` stripped, upper-cased). Returns None if absent.
        """
        wanted = str(requested)
        wanted_norm = wanted.lstrip("#").upper()
        for name, name_norm in zip(columns, normalized_columns):
            if str(name) == wanted or name_norm == wanted_norm:
                return str(name)
        return None

    def read(
        self,
        phenotype_col: Optional[str] = None,
        quantitative: Optional[bool] = None,
    ) -> PhenotypeObject:
        """
        Parse the phenotype file into a `PhenotypeObject`.

        Args:
            phenotype_col (str, optional):
                Column to use as the phenotype. If None, the column immediately
                after `IID` is used.
            quantitative (bool, optional):
                Passed through to `PhenotypeObject`.

        Returns:
            PhenotypeObject: Object built from the IID column and the selected phenotype column.

        Raises:
            FileNotFoundError: If the file does not exist.
            ValueError: If the file is empty or malformed, IIDs are empty or
                duplicated, the requested column is missing, or the phenotype
                column holds non-numeric or missing values.
        """
        path = self.file
        if not path.exists():
            raise FileNotFoundError(f"Phenotype file not found: '{path}'")

        if self._has_header_with_iid(path):
            table = pd.read_csv(path, sep=r"\s+", dtype=str)
        else:
            # Header-less files fall back to the legacy FID/IID/PHENO layout.
            warnings.warn(
                (
                    "Phenotype file has no header/IID column. Legacy 3-column parsing "
                    "(FID IID PHENO) is deprecated; please switch to a headered format."
                ),
                UserWarning,
                stacklevel=2,
            )
            raw = pd.read_csv(path, header=None, sep=r"\s+", dtype=str)
            if raw.shape[1] < 3:
                raise ValueError(
                    "Legacy phenotype parsing expects at least 3 columns: FID IID PHENO."
                )
            table = raw.iloc[:, :3].copy()
            table.columns = ["FID", "IID", "PHENO"]

        if table.empty:
            raise ValueError("Empty phenotype file.")

        names = [str(col) for col in table.columns]
        names_norm = [name.lstrip("#").upper() for name in names]
        if "IID" not in names_norm:
            raise ValueError("Phenotype file must include an IID column in the header.")
        iid_pos = names_norm.index("IID")

        ids = table[names[iid_pos]].astype(str).str.strip()
        if (ids == "").any():
            raise ValueError("Phenotype IID column contains empty values.")
        if ids.duplicated().any():
            raise ValueError("Phenotype IID values must be unique.")

        if phenotype_col is None:
            # Default: the column right after IID
            if iid_pos + 1 >= len(names):
                raise ValueError(
                    "Phenotype file must include at least one phenotype column after IID."
                )
            target = names[iid_pos + 1]
        else:
            target = self._resolve_column(names, names_norm, phenotype_col)
            if target is None:
                raise ValueError(
                    f"Phenotype column '{phenotype_col}' not found in header: {names}"
                )

        numeric = pd.to_numeric(table[target], errors="coerce")
        unparsable = numeric.isna()
        if unparsable.any():
            bad_examples = table.loc[unparsable, target].astype(str).head(5).tolist()
            raise ValueError(
                f"Phenotype column '{target}' contains non-numeric or missing values: "
                f"{bad_examples}"
            )

        return PhenotypeObject(
            samples=ids.tolist(),
            values=numeric.to_numpy(),
            phenotype_name=str(target).lstrip("#"),
            quantitative=quantitative,
        )
Reader for single-trait phenotype files (any extension; common: .txt, .phe, .pheno).
Expected format (headered, whitespace-delimited):
- Must include IID (optionally preceded by FID)
- First phenotype column after IID is used by default
def read(
    self,
    phenotype_col: Optional[str] = None,
    quantitative: Optional[bool] = None,
) -> PhenotypeObject:
    """
    Parse the phenotype file into a `PhenotypeObject`.

    Args:
        phenotype_col (str, optional):
            Column to use as the phenotype. If None, the column immediately
            after `IID` is used.
        quantitative (bool, optional):
            Passed through to `PhenotypeObject`.

    Returns:
        PhenotypeObject: Object built from the IID column and the selected phenotype column.

    Raises:
        FileNotFoundError: If the file does not exist.
        ValueError: If the file is empty or malformed, IIDs are empty or
            duplicated, the requested column is missing, or the phenotype
            column holds non-numeric or missing values.
    """
    path = self.file
    if not path.exists():
        raise FileNotFoundError(f"Phenotype file not found: '{path}'")

    if self._has_header_with_iid(path):
        table = pd.read_csv(path, sep=r"\s+", dtype=str)
    else:
        # Header-less files fall back to the legacy FID/IID/PHENO layout.
        warnings.warn(
            (
                "Phenotype file has no header/IID column. Legacy 3-column parsing "
                "(FID IID PHENO) is deprecated; please switch to a headered format."
            ),
            UserWarning,
            stacklevel=2,
        )
        raw = pd.read_csv(path, header=None, sep=r"\s+", dtype=str)
        if raw.shape[1] < 3:
            raise ValueError(
                "Legacy phenotype parsing expects at least 3 columns: FID IID PHENO."
            )
        table = raw.iloc[:, :3].copy()
        table.columns = ["FID", "IID", "PHENO"]

    if table.empty:
        raise ValueError("Empty phenotype file.")

    names = [str(col) for col in table.columns]
    names_norm = [name.lstrip("#").upper() for name in names]
    if "IID" not in names_norm:
        raise ValueError("Phenotype file must include an IID column in the header.")
    iid_pos = names_norm.index("IID")

    ids = table[names[iid_pos]].astype(str).str.strip()
    if (ids == "").any():
        raise ValueError("Phenotype IID column contains empty values.")
    if ids.duplicated().any():
        raise ValueError("Phenotype IID values must be unique.")

    if phenotype_col is None:
        # Default: the column right after IID
        if iid_pos + 1 >= len(names):
            raise ValueError(
                "Phenotype file must include at least one phenotype column after IID."
            )
        target = names[iid_pos + 1]
    else:
        target = self._resolve_column(names, names_norm, phenotype_col)
        if target is None:
            raise ValueError(
                f"Phenotype column '{phenotype_col}' not found in header: {names}"
            )

    numeric = pd.to_numeric(table[target], errors="coerce")
    unparsable = numeric.isna()
    if unparsable.any():
        bad_examples = table.loc[unparsable, target].astype(str).head(5).tolist()
        raise ValueError(
            f"Phenotype column '{target}' contains non-numeric or missing values: "
            f"{bad_examples}"
        )

    return PhenotypeObject(
        samples=ids.tolist(),
        values=numeric.to_numpy(),
        phenotype_name=str(target).lstrip("#"),
        quantitative=quantitative,
    )
Abstract method to read data from the provided file.
Subclasses must implement this method to read and parse the data.
The implementation should construct an instance of
snputils.phenotype.genobj.MultiPhenotypeObject or
snputils.phenotype.genobj.PhenotypeObject based on the read data.
def load_dataset(
    name: str,
    chromosomes: Union[List[str], List[int], str, int],
    variants_ids: Optional[List[str]] = None,
    sample_ids: Optional[List[str]] = None,
    verbose: bool = True,
    **read_kwargs
) -> SNPObject:
    """
    Load a genome dataset.

    The requested chromosomes are downloaded if they are not already present in the
    data home, converted to PGEN with PLINK (optionally restricted to the given
    samples and variants), merged when more than one chromosome is requested, and
    read into a `SNPObject`.

    Args:
        name (str): Name of the dataset to load. Call `available_datasets_list()` to get the list of available datasets.
        chromosomes (List[str] | List[int] | str | int): Chromosomes to load. A leading
            "chr" prefix is ignored.
        variants_ids (List[str]): List of variant IDs to load.
        sample_ids (List[str]): List of sample IDs to load.
        verbose (bool): Whether to show progress.
        **read_kwargs: Keyword arguments to pass to `PGENReader.read()`.

    Returns:
        SNPObject: SNPObject containing the loaded dataset.

    Raises:
        NotImplementedError: If `name` is not an implemented dataset.
        ValueError: If no chromosome is given.
    """
    from contextlib import ExitStack

    # Validate the request before creating temp files or touching the data directory
    if name != "1kgp":
        raise NotImplementedError(f"Dataset {name} not implemented.")

    if isinstance(chromosomes, (str, int)):
        chromosomes = [chromosomes]
    chromosomes = [str(chrom).lower().replace("chr", "") for chrom in chromosomes]
    if not chromosomes:
        raise ValueError("At least one chromosome must be provided.")

    # The ExitStack closes (and thereby deletes) every temp file even if a step fails
    with ExitStack() as stack:
        def _write_id_list(ids: List[str]) -> str:
            # Write one ID per line to a temp file and return its path for PLINK
            handle = stack.enter_context(tempfile.NamedTemporaryFile(mode='w'))
            handle.write("\n".join(ids))
            handle.flush()
            return handle.name

        keep_args = ["--keep", _write_id_list(sample_ids)] if sample_ids is not None else []
        extract_args = ["--extract", _write_id_list(variants_ids)] if variants_ids is not None else []
        merge_list_txt = stack.enter_context(tempfile.NamedTemporaryFile(mode='w'))

        data_path = get_data_home() / name
        data_path.mkdir(parents=True, exist_ok=True)

        out_files = []
        for chrom in chromosomes:
            vcf_name = chr_urls[name][chrom]
            chr_path = data_path / vcf_name
            if not Path(chr_path).exists():
                log.info(f"Downloading chromosome {chrom}...")
                download_url(f"{base_urls[name]}/{vcf_name}", chr_path, show_progress=verbose)
            else:
                log.info(f"Chromosome {chrom} already exists. Skipping download.")

            # Filter and convert to PGEN
            log.info(f"Processing chromosome {chrom}...")
            out_file = vcf_name.replace('.vcf.gz', '')
            execute_plink_cmd(
                ["--vcf", f"{vcf_name}"]
                + keep_args
                + extract_args
                + [
                    "--set-missing-var-ids", "@:#",
                    "--make-pgen",
                    "--out", out_file,
                ], cwd=data_path)
            merge_list_txt.write(f"{out_file}\n")
            out_files.append(out_file)

        if len(chromosomes) > 1:
            # Merge the PGEN files into single PGEN fileset
            log.info("Merging PGEN files...")
            merge_list_txt.flush()
            log.debug(f"Merge list file contents: {out_files}")
            execute_plink_cmd(["--pmerge-list", merge_list_txt.name, "--make-pgen", "--out", "1kgp"],
                              cwd=data_path)
        else:
            # Rename the single PGEN file; `replace` overwrites a fileset left by
            # an earlier call on every platform (`rename` fails on Windows).
            for ext in ["pgen", "psam", "pvar"]:
                Path(data_path / f"{out_files[0]}.{ext}").replace(data_path / f"1kgp.{ext}")

        # Read PGEN fileset with PGENReader into SNPObject
        log.info("Reading PGEN fileset...")
        snpobj = PGENReader(data_path / "1kgp").read(**read_kwargs)

    return snpobj
Load a genome dataset.
Arguments:
- name (str): Name of the dataset to load. Call available_datasets_list() to get the list of available datasets.
- chromosomes (List[str] | List[int] | str | int): Chromosomes to load.
- variants_ids (List[str]): List of variant IDs to load.
- sample_ids (List[str]): List of sample IDs to load.
- verbose (bool): Whether to show progress.
- **read_kwargs: Keyword arguments to pass to PGENReader.read().
Returns:
SNPObject: SNPObject containing the loaded dataset.