snputils.processing.maasmds
import pathlib
import numpy as np
import copy
from typing import Optional, Dict, List, Union

from snputils.snp.genobj.snpobj import SNPObject
from snputils.ancestry.genobj.local import LocalAncestryObject
from ._utils.mds_distance import distance_mat, mds_transform
from ._utils.gen_tools import array_process, process_labels_weights


class maasMDS:
    """
    A class for multiple array ancestry-specific multidimensional scaling (maasMDS).

    This class supports both separate and averaged strand processing for SNP data. If the `snpobj`,
    `laiobj`, `labels_file`, and `ancestry` parameters are all provided during instantiation,
    the `fit_transform` method will be automatically called, applying the specified maasMDS method to transform
    the data upon instantiation.
    """
    def __init__(
        self,
        snpobj: Optional['SNPObject'] = None,
        laiobj: Optional['LocalAncestryObject'] = None,
        labels_file: Optional[str] = None,
        ancestry: Optional[str] = None,
        is_masked: bool = True,
        prob_thresh: float = 0,
        average_strands: bool = False,
        is_weighted: bool = False,
        groups_to_remove: Optional[Dict[int, List[str]]] = None,
        min_percent_snps: float = 4,
        save_masks: bool = False,
        load_masks: bool = False,
        masks_file: Union[str, pathlib.Path] = 'masks.npz',
        distance_type: str = 'AP',
        n_components: int = 2,
        rsid_or_chrompos: int = 2
    ):
        """
        Args:
            snpobj (SNPObject, optional):
                A SNPObject instance.
            laiobj (LAIObject, optional):
                A LAIObject instance.
            labels_file (str, optional):
                Path to the labels file in .tsv format. The first column, `indID`, contains the individual identifiers, and the second
                column, `label`, specifies the groups for all individuals. If `is_weighted=True`, a `weight` column with individual
                weights is required. Optionally, `combination` and `combination_weight` columns can specify sets of individuals to be
                combined into groups, with respective weights.
            ancestry (str, optional):
                Ancestry for which dimensionality reduction is to be performed. Ancestry counter starts at `0`.
            is_masked (bool, default=True):
                True if an ancestry file is passed for ancestry-specific masking, or False otherwise.
            prob_thresh (float, default=0.0):
                Minimum probability threshold for a SNP to belong to an ancestry.
            average_strands (bool, default=False):
                True if the haplotypes from the two parents are to be combined (averaged) for each individual, or False otherwise.
            is_weighted (bool, default=False):
                True if weights are provided in the labels file, or False otherwise.
            groups_to_remove (dict of int to list of str, optional):
                Dictionary specifying groups to exclude from analysis. Keys are array numbers, and values are
                lists of groups to remove for each array. `None` (the default) is stored as a new empty dictionary.
                Example: `{1: ['group1', 'group2'], 2: [], 3: ['group3']}`.
            min_percent_snps (float, default=4.0):
                Minimum percentage of SNPs to be known in an individual for an individual to be included in the analysis.
                All individuals with fewer percent of unmasked SNPs than this threshold will be excluded.
            save_masks (bool, default=False):
                True if the masked matrices are to be saved in a `.npz` file, or False otherwise.
            load_masks (bool, default=False):
                True if the masked matrices are to be loaded from a pre-existing `.npz` file specified by `masks_file`, or False otherwise.
            masks_file (str or pathlib.Path, default='masks.npz'):
                Path to the `.npz` file used for saving/loading masked matrices.
            distance_type (str, default='AP'):
                Distance metric to use. Options to choose from are: 'Manhattan', 'RMS' (Root Mean Square), 'AP' (Average Pairwise).
                If `average_strands=True`, use 'distance_type=AP'.
            n_components (int, default=2):
                The number of principal components.
            rsid_or_chrompos (int, default=2):
                Format indicator for SNP IDs in the SNP data. Use 1 for `rsID` format or 2 for `chromosome_position`.
        """
        self.__snpobj = snpobj
        self.__laiobj = laiobj
        self.__labels_file = labels_file
        self.__ancestry = ancestry
        self.__is_masked = is_masked
        self.__prob_thresh = prob_thresh
        self.__average_strands = average_strands
        # A fresh dict per instance: a `{}` literal in the signature would be shared by every instance.
        self.__groups_to_remove = {} if groups_to_remove is None else groups_to_remove
        self.__min_percent_snps = min_percent_snps
        self.__is_weighted = is_weighted
        self.__save_masks = save_masks
        self.__load_masks = load_masks
        self.__masks_file = masks_file
        self.__distance_type = distance_type
        self.__n_components = n_components
        self.__rsid_or_chrompos = rsid_or_chrompos
        self.__X_new_ = None  # Store transformed SNP data
        self.__haplotypes_ = None  # Store haplotype identifiers kept by the last fit

        # Fit and transform if a `snpobj`, `laiobj`, `labels_file`, and `ancestry` are provided
        if self.snpobj is not None and self.laiobj is not None and self.labels_file is not None and self.ancestry is not None:
            self.fit_transform(snpobj, laiobj, labels_file, ancestry)

    def __getitem__(self, key):
        """
        To access an attribute of the class using the square bracket notation,
        similar to a dictionary.

        Raises:
            KeyError: If the instance has no attribute named `key`.
        """
        try:
            return getattr(self, key)
        except AttributeError:
            raise KeyError(f'Invalid key: {key}') from None

    def __setitem__(self, key, value):
        """
        To set an attribute of the class using the square bracket notation,
        similar to a dictionary.

        Note that `setattr` creates a new attribute for an unknown `key`; `KeyError` is only
        raised when the assignment itself is rejected (e.g. a property without a setter).
        """
        try:
            setattr(self, key, value)
        except AttributeError:
            raise KeyError(f'Invalid key: {key}') from None

    def copy(self) -> 'maasMDS':
        """
        Create and return a copy of `self`.

        The copy is shallow (`copy.copy`): attribute objects are shared with the original.

        Returns:
            **maasMDS:**
                A new instance of the current object.
        """
        return copy.copy(self)

    @property
    def snpobj(self) -> Optional['SNPObject']:
        """
        Retrieve `snpobj`.

        Returns:
            **SNPObject:** A SNPObject instance.
        """
        return self.__snpobj

    @snpobj.setter
    def snpobj(self, x: 'SNPObject') -> None:
        """
        Update `snpobj`.
        """
        self.__snpobj = x

    @property
    def laiobj(self) -> Optional['LocalAncestryObject']:
        """
        Retrieve `laiobj`.

        Returns:
            **LocalAncestryObject:** A LAIObject instance.
        """
        return self.__laiobj

    @laiobj.setter
    def laiobj(self, x: 'LocalAncestryObject') -> None:
        """
        Update `laiobj`.
        """
        self.__laiobj = x

    @property
    def labels_file(self) -> Optional[str]:
        """
        Retrieve `labels_file`.

        Returns:
            **str:**
                Path to the labels file in `.tsv` format.
        """
        return self.__labels_file

    @labels_file.setter
    def labels_file(self, x: str) -> None:
        """
        Update `labels_file`.
        """
        self.__labels_file = x

    @property
    def ancestry(self) -> Optional[str]:
        """
        Retrieve `ancestry`.

        Returns:
            **str:** Ancestry for which dimensionality reduction is to be performed. Ancestry counter starts at `0`.
        """
        return self.__ancestry

    @ancestry.setter
    def ancestry(self, x: str) -> None:
        """
        Update `ancestry`.
        """
        self.__ancestry = x

    @property
    def is_masked(self) -> bool:
        """
        Retrieve `is_masked`.

        Returns:
            **bool:** True if an ancestry file is passed for ancestry-specific masking, or False otherwise.
        """
        return self.__is_masked

    @is_masked.setter
    def is_masked(self, x: bool) -> None:
        """
        Update `is_masked`.
        """
        self.__is_masked = x

    @property
    def prob_thresh(self) -> float:
        """
        Retrieve `prob_thresh`.

        Returns:
            **float:** Minimum probability threshold for a SNP to belong to an ancestry.
        """
        return self.__prob_thresh

    @prob_thresh.setter
    def prob_thresh(self, x: float) -> None:
        """
        Update `prob_thresh`.
        """
        self.__prob_thresh = x

    @property
    def average_strands(self) -> bool:
        """
        Retrieve `average_strands`.

        Returns:
            **bool:** True if the haplotypes from the two parents are to be combined (averaged) for each individual, or False otherwise.
        """
        return self.__average_strands

    @average_strands.setter
    def average_strands(self, x: bool) -> None:
        """
        Update `average_strands`.
        """
        self.__average_strands = x

    @property
    def is_weighted(self) -> bool:
        """
        Retrieve `is_weighted`.

        Returns:
            **bool:** True if weights are provided in the labels file, or False otherwise.
        """
        return self.__is_weighted

    @is_weighted.setter
    def is_weighted(self, x: bool) -> None:
        """
        Update `is_weighted`.
        """
        self.__is_weighted = x

    @property
    def groups_to_remove(self) -> Dict[int, List[str]]:
        """
        Retrieve `groups_to_remove`.

        Returns:
            **dict of int to list of str:** Dictionary specifying groups to exclude from analysis. Keys are array numbers, and values are
                lists of groups to remove for each array. Example: `{1: ['group1', 'group2'], 2: [], 3: ['group3']}`.
        """
        return self.__groups_to_remove

    @groups_to_remove.setter
    def groups_to_remove(self, x: Dict[int, List[str]]) -> None:
        """
        Update `groups_to_remove`.
        """
        self.__groups_to_remove = x

    @property
    def min_percent_snps(self) -> float:
        """
        Retrieve `min_percent_snps`.

        Returns:
            **float:**
                Minimum percentage of SNPs to be known in an individual for an individual to be included in the analysis.
                All individuals with fewer percent of unmasked SNPs than this threshold will be excluded.
        """
        return self.__min_percent_snps

    @min_percent_snps.setter
    def min_percent_snps(self, x: float) -> None:
        """
        Update `min_percent_snps`.
        """
        self.__min_percent_snps = x

    @property
    def save_masks(self) -> bool:
        """
        Retrieve `save_masks`.

        Returns:
            **bool:** True if the masked matrices are to be saved in a `.npz` file, or False otherwise.
        """
        return self.__save_masks

    @save_masks.setter
    def save_masks(self, x: bool) -> None:
        """
        Update `save_masks`.
        """
        self.__save_masks = x

    @property
    def load_masks(self) -> bool:
        """
        Retrieve `load_masks`.

        Returns:
            **bool:**
                True if the masked matrices are to be loaded from a pre-existing `.npz` file specified
                by `masks_file`, or False otherwise.
        """
        return self.__load_masks

    @load_masks.setter
    def load_masks(self, x: bool) -> None:
        """
        Update `load_masks`.
        """
        self.__load_masks = x

    @property
    def masks_file(self) -> Union[str, pathlib.Path]:
        """
        Retrieve `masks_file`.

        Returns:
            **str or pathlib.Path:** Path to the `.npz` file used for saving/loading masked matrices.
        """
        return self.__masks_file

    @masks_file.setter
    def masks_file(self, x: Union[str, pathlib.Path]) -> None:
        """
        Update `masks_file`.
        """
        self.__masks_file = x

    @property
    def distance_type(self) -> str:
        """
        Retrieve `distance_type`.

        Returns:
            **str:**
                Distance metric to use. Options to choose from are: 'Manhattan', 'RMS' (Root Mean Square), 'AP' (Average Pairwise).
                If `average_strands=True`, use 'distance_type=AP'.
        """
        return self.__distance_type

    @distance_type.setter
    def distance_type(self, x: str) -> None:
        """
        Update `distance_type`.
        """
        self.__distance_type = x

    @property
    def n_components(self) -> int:
        """
        Retrieve `n_components`.

        Returns:
            **int:** The number of principal components.
        """
        return self.__n_components

    @n_components.setter
    def n_components(self, x: int) -> None:
        """
        Update `n_components`.
        """
        self.__n_components = x

    @property
    def rsid_or_chrompos(self) -> int:
        """
        Retrieve `rsid_or_chrompos`.

        Returns:
            **int:** Format indicator for SNP IDs in the SNP data. Use 1 for `rsID` format or 2 for `chromosome_position`.
        """
        return self.__rsid_or_chrompos

    @rsid_or_chrompos.setter
    def rsid_or_chrompos(self, x: int) -> None:
        """
        Update `rsid_or_chrompos`.
        """
        self.__rsid_or_chrompos = x

    @property
    def X_new_(self) -> Optional[np.ndarray]:
        """
        Retrieve `X_new_`.

        Returns:
            **array of shape (n_haplotypes_, n_components):**
                The transformed SNP data projected onto the `n_components` principal components.
                n_haplotypes_ is the number of haplotypes, potentially reduced if filtering is applied
                (`min_percent_snps > 0`). For diploid individuals without filtering, the shape is
                `(n_samples * 2, n_components)`. `None` until a fit has been performed.
        """
        return self.__X_new_

    @X_new_.setter
    def X_new_(self, x: np.ndarray) -> None:
        """
        Update `X_new_`.
        """
        self.__X_new_ = x

    @property
    def haplotypes_(self) -> Optional[List[str]]:
        """
        Retrieve `haplotypes_`.

        Returns:
            list of str:
                A flat list of haplotype identifiers, or `None` if none have been set.

        Raises:
            TypeError: If the stored value is neither a list, a NumPy array, nor `None`.
        """
        stored = self.__haplotypes_
        if stored is None:
            return None  # If no haplotypes are set
        if isinstance(stored, np.ndarray):
            return stored.ravel().tolist()  # Flatten and convert NumPy array to a list
        if isinstance(stored, list):
            if len(stored) == 1 and isinstance(stored[0], np.ndarray):
                return stored[0].ravel().tolist()  # Handle list containing a single array
            return stored  # Already a flat list
        raise TypeError("`haplotypes_` must be a list or a NumPy array.")

    @haplotypes_.setter
    def haplotypes_(self, x: Union[np.ndarray, List[str]]) -> None:
        """
        Update `haplotypes_`.

        NumPy arrays (or a one-element list wrapping an array) are flattened to a plain list.

        Raises:
            TypeError: If `x` is neither a list nor a NumPy array.
        """
        if isinstance(x, np.ndarray):
            self.__haplotypes_ = x.ravel().tolist()  # Flatten and convert to a list
        elif isinstance(x, list):
            if len(x) == 1 and isinstance(x[0], np.ndarray):  # Handle list containing a single array
                self.__haplotypes_ = x[0].ravel().tolist()
            else:
                self.__haplotypes_ = x  # Use directly if already a list
        else:
            raise TypeError("`x` must be a list or a NumPy array.")

    @property
    def samples_(self) -> Optional[List[str]]:
        """
        Retrieve `samples_`.

        Returns:
            list of str:
                A list of sample identifiers based on `haplotypes_` and `average_strands`,
                or `None` if no haplotypes are set.
        """
        haplotypes = self.haplotypes_
        if haplotypes is None:
            return None
        if self.__average_strands:
            return haplotypes
        # Drop the last two characters of each haplotype identifier
        # (presumably a strand suffix appended upstream -- TODO confirm the exact format).
        return [x[:-2] for x in haplotypes]

    @property
    def n_haplotypes(self) -> Optional[int]:
        """
        Retrieve `n_haplotypes`.

        Returns:
            **int:**
                The total number of haplotypes, potentially reduced if filtering is applied
                (`min_percent_snps > 0`). `None` if no haplotypes are set.
        """
        haplotypes = self.haplotypes_
        if haplotypes is None:
            return None
        return len(haplotypes)

    @property
    def n_samples(self) -> Optional[int]:
        """
        Retrieve `n_samples`.

        Returns:
            **int:**
                The number of unique sample identifiers, potentially reduced if filtering is applied
                (`min_percent_snps > 0`). `None` if no haplotypes are set.
        """
        samples = self.samples_
        if samples is None:
            # `np.unique(None)` has length 1, which would wrongly report one sample.
            return None
        return len(np.unique(samples))

    @staticmethod
    def _load_masks_file(masks_file):
        """
        Load previously saved masks and their metadata from a `.npz` archive.

        Args:
            masks_file (str or pathlib.Path): Path to the `.npz` archive.

        Returns:
            tuple: `(masks, rs_ID_list, ind_ID_list, groups, weights)` read from the archive keys
            `'masks'`, `'rs_ID_list'`, `'ind_ID_list'`, `'labels'` and `'weights'`.
        """
        # NOTE(security): `allow_pickle=True` lets NumPy unpickle object arrays, which can run
        # arbitrary code. Only load mask files that come from a trusted source.
        with np.load(masks_file, allow_pickle=True) as mask_files:
            # Indexing materializes each array, so the values stay valid after the archive is closed.
            masks = mask_files['masks']
            rs_ID_list = mask_files['rs_ID_list']
            ind_ID_list = mask_files['ind_ID_list']
            groups = mask_files['labels']
            weights = mask_files['weights']
        return masks, rs_ID_list, ind_ID_list, groups, weights

    def fit_transform(
        self,
        snpobj: Optional['SNPObject'] = None,
        laiobj: Optional['LocalAncestryObject'] = None,
        labels_file: Optional[str] = None,
        ancestry: Optional[str] = None,
        average_strands: Optional[bool] = None
    ) -> np.ndarray:
        """
        Fit the model to the SNP data stored in the provided `snpobj` and apply the dimensionality reduction on the same SNP data.

        Any argument left as `None` falls back to the corresponding instance attribute.

        Args:
            snpobj (SNPObject, optional):
                A SNPObject instance.
            laiobj (LAIObject, optional):
                A LAIObject instance.
            labels_file (str, optional):
                Path to the labels file in .tsv format. The first column, `indID`, contains the individual identifiers, and the second
                column, `label`, specifies the groups for all individuals. If `is_weighted=True`, a `weight` column with individual
                weights is required. Optionally, `combination` and `combination_weight` columns can specify sets of individuals to be
                combined into groups, with respective weights.
            ancestry (str, optional):
                Ancestry for which dimensionality reduction is to be performed. Ancestry counter starts at 0.
                Replaced by `'1'` when `is_masked` is False.
            average_strands (bool, optional):
                True if the haplotypes from the two parents are to be combined (averaged) for each individual, or False otherwise.
                If None, defaults to `self.average_strands`. The value used is stored back on the instance.

        Returns:
            **array of shape (n_samples, n_components):**
                The transformed SNP data projected onto the `n_components` principal components, stored in `self.X_new_`.
        """
        if snpobj is None:
            snpobj = self.snpobj
        if laiobj is None:
            laiobj = self.laiobj
        if labels_file is None:
            labels_file = self.labels_file
        if ancestry is None:
            ancestry = self.ancestry
        if average_strands is None:
            average_strands = self.average_strands

        if not self.is_masked:
            # Unmasked runs always use the key '1'; the instance attribute is updated as well,
            # as the original implementation did.
            ancestry = '1'
            self.ancestry = ancestry

        # `samples_` derives sample names from `average_strands`, so remember the value actually used.
        self.average_strands = average_strands

        if self.load_masks:
            masks, rs_ID_list, ind_ID_list, groups, weights = self._load_masks_file(self.masks_file)
        else:
            masks, rs_ID_list, ind_ID_list = array_process(
                snpobj,
                laiobj,
                average_strands,
                self.prob_thresh,
                self.is_masked,
                self.rsid_or_chrompos
            )

            masks, ind_ID_list, groups, weights = process_labels_weights(
                labels_file,
                masks,
                rs_ID_list,
                ind_ID_list,
                average_strands,
                ancestry,
                self.min_percent_snps,
                self.groups_to_remove,
                self.is_weighted,
                self.save_masks,
                self.masks_file
            )

        # Single array: the distance list holds one matrix built from the first mask entry.
        distance_list = [[distance_mat(first=masks[0][ancestry], dist_func=self.distance_type)]]

        self.X_new_ = mds_transform(distance_list, groups, weights, ind_ID_list, self.n_components)
        self.haplotypes_ = ind_ID_list
        return self.X_new_
class maasMDS:
    """
    A class for multiple array ancestry-specific multidimensional scaling (maasMDS).

    This class supports both separate and averaged strand processing for SNP data. If the `snpobj`,
    `laiobj`, `labels_file`, and `ancestry` parameters are all provided during instantiation,
    the `fit_transform` method will be automatically called, applying the specified maasMDS method to transform
    the data upon instantiation.
    """
    def __init__(
        self,
        snpobj: Optional['SNPObject'] = None,
        laiobj: Optional['LocalAncestryObject'] = None,
        labels_file: Optional[str] = None,
        ancestry: Optional[str] = None,
        is_masked: bool = True,
        prob_thresh: float = 0,
        average_strands: bool = False,
        is_weighted: bool = False,
        groups_to_remove: Optional[Dict[int, List[str]]] = None,
        min_percent_snps: float = 4,
        save_masks: bool = False,
        load_masks: bool = False,
        masks_file: Union[str, pathlib.Path] = 'masks.npz',
        distance_type: str = 'AP',
        n_components: int = 2,
        rsid_or_chrompos: int = 2
    ):
        """
        Args:
            snpobj (SNPObject, optional):
                A SNPObject instance.
            laiobj (LAIObject, optional):
                A LAIObject instance.
            labels_file (str, optional):
                Path to the labels file in .tsv format. The first column, `indID`, contains the individual identifiers, and the second
                column, `label`, specifies the groups for all individuals. If `is_weighted=True`, a `weight` column with individual
                weights is required. Optionally, `combination` and `combination_weight` columns can specify sets of individuals to be
                combined into groups, with respective weights.
            ancestry (str, optional):
                Ancestry for which dimensionality reduction is to be performed. Ancestry counter starts at `0`.
            is_masked (bool, default=True):
                True if an ancestry file is passed for ancestry-specific masking, or False otherwise.
            prob_thresh (float, default=0.0):
                Minimum probability threshold for a SNP to belong to an ancestry.
            average_strands (bool, default=False):
                True if the haplotypes from the two parents are to be combined (averaged) for each individual, or False otherwise.
            is_weighted (bool, default=False):
                True if weights are provided in the labels file, or False otherwise.
            groups_to_remove (dict of int to list of str, optional):
                Dictionary specifying groups to exclude from analysis. Keys are array numbers, and values are
                lists of groups to remove for each array. `None` (the default) is stored as a new empty dictionary.
                Example: `{1: ['group1', 'group2'], 2: [], 3: ['group3']}`.
            min_percent_snps (float, default=4.0):
                Minimum percentage of SNPs to be known in an individual for an individual to be included in the analysis.
                All individuals with fewer percent of unmasked SNPs than this threshold will be excluded.
            save_masks (bool, default=False):
                True if the masked matrices are to be saved in a `.npz` file, or False otherwise.
            load_masks (bool, default=False):
                True if the masked matrices are to be loaded from a pre-existing `.npz` file specified by `masks_file`, or False otherwise.
            masks_file (str or pathlib.Path, default='masks.npz'):
                Path to the `.npz` file used for saving/loading masked matrices.
            distance_type (str, default='AP'):
                Distance metric to use. Options to choose from are: 'Manhattan', 'RMS' (Root Mean Square), 'AP' (Average Pairwise).
                If `average_strands=True`, use 'distance_type=AP'.
            n_components (int, default=2):
                The number of principal components.
            rsid_or_chrompos (int, default=2):
                Format indicator for SNP IDs in the SNP data. Use 1 for `rsID` format or 2 for `chromosome_position`.
        """
        self.__snpobj = snpobj
        self.__laiobj = laiobj
        self.__labels_file = labels_file
        self.__ancestry = ancestry
        self.__is_masked = is_masked
        self.__prob_thresh = prob_thresh
        self.__average_strands = average_strands
        # A fresh dict per instance: a `{}` literal in the signature would be shared by every instance.
        self.__groups_to_remove = {} if groups_to_remove is None else groups_to_remove
        self.__min_percent_snps = min_percent_snps
        self.__is_weighted = is_weighted
        self.__save_masks = save_masks
        self.__load_masks = load_masks
        self.__masks_file = masks_file
        self.__distance_type = distance_type
        self.__n_components = n_components
        self.__rsid_or_chrompos = rsid_or_chrompos
        self.__X_new_ = None  # Store transformed SNP data
        self.__haplotypes_ = None  # Store haplotype identifiers kept by the last fit

        # Fit and transform if a `snpobj`, `laiobj`, `labels_file`, and `ancestry` are provided
        if self.snpobj is not None and self.laiobj is not None and self.labels_file is not None and self.ancestry is not None:
            self.fit_transform(snpobj, laiobj, labels_file, ancestry)

    def __getitem__(self, key):
        """
        To access an attribute of the class using the square bracket notation,
        similar to a dictionary.

        Raises:
            KeyError: If the instance has no attribute named `key`.
        """
        try:
            return getattr(self, key)
        except AttributeError:
            raise KeyError(f'Invalid key: {key}') from None

    def __setitem__(self, key, value):
        """
        To set an attribute of the class using the square bracket notation,
        similar to a dictionary.

        Note that `setattr` creates a new attribute for an unknown `key`; `KeyError` is only
        raised when the assignment itself is rejected (e.g. a property without a setter).
        """
        try:
            setattr(self, key, value)
        except AttributeError:
            raise KeyError(f'Invalid key: {key}') from None

    def copy(self) -> 'maasMDS':
        """
        Create and return a copy of `self`.

        The copy is shallow (`copy.copy`): attribute objects are shared with the original.

        Returns:
            **maasMDS:**
                A new instance of the current object.
        """
        return copy.copy(self)

    @property
    def snpobj(self) -> Optional['SNPObject']:
        """
        Retrieve `snpobj`.

        Returns:
            **SNPObject:** A SNPObject instance.
        """
        return self.__snpobj

    @snpobj.setter
    def snpobj(self, x: 'SNPObject') -> None:
        """
        Update `snpobj`.
        """
        self.__snpobj = x

    @property
    def laiobj(self) -> Optional['LocalAncestryObject']:
        """
        Retrieve `laiobj`.

        Returns:
            **LocalAncestryObject:** A LAIObject instance.
        """
        return self.__laiobj

    @laiobj.setter
    def laiobj(self, x: 'LocalAncestryObject') -> None:
        """
        Update `laiobj`.
        """
        self.__laiobj = x

    @property
    def labels_file(self) -> Optional[str]:
        """
        Retrieve `labels_file`.

        Returns:
            **str:**
                Path to the labels file in `.tsv` format.
        """
        return self.__labels_file

    @labels_file.setter
    def labels_file(self, x: str) -> None:
        """
        Update `labels_file`.
        """
        self.__labels_file = x

    @property
    def ancestry(self) -> Optional[str]:
        """
        Retrieve `ancestry`.

        Returns:
            **str:** Ancestry for which dimensionality reduction is to be performed. Ancestry counter starts at `0`.
        """
        return self.__ancestry

    @ancestry.setter
    def ancestry(self, x: str) -> None:
        """
        Update `ancestry`.
        """
        self.__ancestry = x

    @property
    def is_masked(self) -> bool:
        """
        Retrieve `is_masked`.

        Returns:
            **bool:** True if an ancestry file is passed for ancestry-specific masking, or False otherwise.
        """
        return self.__is_masked

    @is_masked.setter
    def is_masked(self, x: bool) -> None:
        """
        Update `is_masked`.
        """
        self.__is_masked = x

    @property
    def prob_thresh(self) -> float:
        """
        Retrieve `prob_thresh`.

        Returns:
            **float:** Minimum probability threshold for a SNP to belong to an ancestry.
        """
        return self.__prob_thresh

    @prob_thresh.setter
    def prob_thresh(self, x: float) -> None:
        """
        Update `prob_thresh`.
        """
        self.__prob_thresh = x

    @property
    def average_strands(self) -> bool:
        """
        Retrieve `average_strands`.

        Returns:
            **bool:** True if the haplotypes from the two parents are to be combined (averaged) for each individual, or False otherwise.
        """
        return self.__average_strands

    @average_strands.setter
    def average_strands(self, x: bool) -> None:
        """
        Update `average_strands`.
        """
        self.__average_strands = x

    @property
    def is_weighted(self) -> bool:
        """
        Retrieve `is_weighted`.

        Returns:
            **bool:** True if weights are provided in the labels file, or False otherwise.
        """
        return self.__is_weighted

    @is_weighted.setter
    def is_weighted(self, x: bool) -> None:
        """
        Update `is_weighted`.
        """
        self.__is_weighted = x

    @property
    def groups_to_remove(self) -> Dict[int, List[str]]:
        """
        Retrieve `groups_to_remove`.

        Returns:
            **dict of int to list of str:** Dictionary specifying groups to exclude from analysis. Keys are array numbers, and values are
                lists of groups to remove for each array. Example: `{1: ['group1', 'group2'], 2: [], 3: ['group3']}`.
        """
        return self.__groups_to_remove

    @groups_to_remove.setter
    def groups_to_remove(self, x: Dict[int, List[str]]) -> None:
        """
        Update `groups_to_remove`.
        """
        self.__groups_to_remove = x

    @property
    def min_percent_snps(self) -> float:
        """
        Retrieve `min_percent_snps`.

        Returns:
            **float:**
                Minimum percentage of SNPs to be known in an individual for an individual to be included in the analysis.
                All individuals with fewer percent of unmasked SNPs than this threshold will be excluded.
        """
        return self.__min_percent_snps

    @min_percent_snps.setter
    def min_percent_snps(self, x: float) -> None:
        """
        Update `min_percent_snps`.
        """
        self.__min_percent_snps = x

    @property
    def save_masks(self) -> bool:
        """
        Retrieve `save_masks`.

        Returns:
            **bool:** True if the masked matrices are to be saved in a `.npz` file, or False otherwise.
        """
        return self.__save_masks

    @save_masks.setter
    def save_masks(self, x: bool) -> None:
        """
        Update `save_masks`.
        """
        self.__save_masks = x

    @property
    def load_masks(self) -> bool:
        """
        Retrieve `load_masks`.

        Returns:
            **bool:**
                True if the masked matrices are to be loaded from a pre-existing `.npz` file specified
                by `masks_file`, or False otherwise.
        """
        return self.__load_masks

    @load_masks.setter
    def load_masks(self, x: bool) -> None:
        """
        Update `load_masks`.
        """
        self.__load_masks = x

    @property
    def masks_file(self) -> Union[str, pathlib.Path]:
        """
        Retrieve `masks_file`.

        Returns:
            **str or pathlib.Path:** Path to the `.npz` file used for saving/loading masked matrices.
        """
        return self.__masks_file

    @masks_file.setter
    def masks_file(self, x: Union[str, pathlib.Path]) -> None:
        """
        Update `masks_file`.
        """
        self.__masks_file = x

    @property
    def distance_type(self) -> str:
        """
        Retrieve `distance_type`.

        Returns:
            **str:**
                Distance metric to use. Options to choose from are: 'Manhattan', 'RMS' (Root Mean Square), 'AP' (Average Pairwise).
                If `average_strands=True`, use 'distance_type=AP'.
        """
        return self.__distance_type

    @distance_type.setter
    def distance_type(self, x: str) -> None:
        """
        Update `distance_type`.
        """
        self.__distance_type = x

    @property
    def n_components(self) -> int:
        """
        Retrieve `n_components`.

        Returns:
            **int:** The number of principal components.
        """
        return self.__n_components

    @n_components.setter
    def n_components(self, x: int) -> None:
        """
        Update `n_components`.
        """
        self.__n_components = x

    @property
    def rsid_or_chrompos(self) -> int:
        """
        Retrieve `rsid_or_chrompos`.

        Returns:
            **int:** Format indicator for SNP IDs in the SNP data. Use 1 for `rsID` format or 2 for `chromosome_position`.
        """
        return self.__rsid_or_chrompos

    @rsid_or_chrompos.setter
    def rsid_or_chrompos(self, x: int) -> None:
        """
        Update `rsid_or_chrompos`.
        """
        self.__rsid_or_chrompos = x

    @property
    def X_new_(self) -> Optional[np.ndarray]:
        """
        Retrieve `X_new_`.

        Returns:
            **array of shape (n_haplotypes_, n_components):**
                The transformed SNP data projected onto the `n_components` principal components.
                n_haplotypes_ is the number of haplotypes, potentially reduced if filtering is applied
                (`min_percent_snps > 0`). For diploid individuals without filtering, the shape is
                `(n_samples * 2, n_components)`. `None` until a fit has been performed.
        """
        return self.__X_new_

    @X_new_.setter
    def X_new_(self, x: np.ndarray) -> None:
        """
        Update `X_new_`.
        """
        self.__X_new_ = x

    @property
    def haplotypes_(self) -> Optional[List[str]]:
        """
        Retrieve `haplotypes_`.

        Returns:
            list of str:
                A flat list of haplotype identifiers, or `None` if none have been set.

        Raises:
            TypeError: If the stored value is neither a list, a NumPy array, nor `None`.
        """
        stored = self.__haplotypes_
        if stored is None:
            return None  # If no haplotypes are set
        if isinstance(stored, np.ndarray):
            return stored.ravel().tolist()  # Flatten and convert NumPy array to a list
        if isinstance(stored, list):
            if len(stored) == 1 and isinstance(stored[0], np.ndarray):
                return stored[0].ravel().tolist()  # Handle list containing a single array
            return stored  # Already a flat list
        raise TypeError("`haplotypes_` must be a list or a NumPy array.")

    @haplotypes_.setter
    def haplotypes_(self, x: Union[np.ndarray, List[str]]) -> None:
        """
        Update `haplotypes_`.

        NumPy arrays (or a one-element list wrapping an array) are flattened to a plain list.

        Raises:
            TypeError: If `x` is neither a list nor a NumPy array.
        """
        if isinstance(x, np.ndarray):
            self.__haplotypes_ = x.ravel().tolist()  # Flatten and convert to a list
        elif isinstance(x, list):
            if len(x) == 1 and isinstance(x[0], np.ndarray):  # Handle list containing a single array
                self.__haplotypes_ = x[0].ravel().tolist()
            else:
                self.__haplotypes_ = x  # Use directly if already a list
        else:
            raise TypeError("`x` must be a list or a NumPy array.")

    @property
    def samples_(self) -> Optional[List[str]]:
        """
        Retrieve `samples_`.

        Returns:
            list of str:
                A list of sample identifiers based on `haplotypes_` and `average_strands`,
                or `None` if no haplotypes are set.
        """
        haplotypes = self.haplotypes_
        if haplotypes is None:
            return None
        if self.__average_strands:
            return haplotypes
        # Drop the last two characters of each haplotype identifier
        # (presumably a strand suffix appended upstream -- TODO confirm the exact format).
        return [x[:-2] for x in haplotypes]

    @property
    def n_haplotypes(self) -> Optional[int]:
        """
        Retrieve `n_haplotypes`.

        Returns:
            **int:**
                The total number of haplotypes, potentially reduced if filtering is applied
                (`min_percent_snps > 0`). `None` if no haplotypes are set.
        """
        haplotypes = self.haplotypes_
        if haplotypes is None:
            return None
        return len(haplotypes)

    @property
    def n_samples(self) -> Optional[int]:
        """
        Retrieve `n_samples`.

        Returns:
            **int:**
                The number of unique sample identifiers, potentially reduced if filtering is applied
                (`min_percent_snps > 0`). `None` if no haplotypes are set.
        """
        samples = self.samples_
        if samples is None:
            # `np.unique(None)` has length 1, which would wrongly report one sample.
            return None
        return len(np.unique(samples))

    @staticmethod
    def _load_masks_file(masks_file):
        """
        Load previously saved masks and their metadata from a `.npz` archive.

        Args:
            masks_file (str or pathlib.Path): Path to the `.npz` archive.

        Returns:
            tuple: `(masks, rs_ID_list, ind_ID_list, groups, weights)` read from the archive keys
            `'masks'`, `'rs_ID_list'`, `'ind_ID_list'`, `'labels'` and `'weights'`.
        """
        # NOTE(security): `allow_pickle=True` lets NumPy unpickle object arrays, which can run
        # arbitrary code. Only load mask files that come from a trusted source.
        with np.load(masks_file, allow_pickle=True) as mask_files:
            # Indexing materializes each array, so the values stay valid after the archive is closed.
            masks = mask_files['masks']
            rs_ID_list = mask_files['rs_ID_list']
            ind_ID_list = mask_files['ind_ID_list']
            groups = mask_files['labels']
            weights = mask_files['weights']
        return masks, rs_ID_list, ind_ID_list, groups, weights

    def fit_transform(
        self,
        snpobj: Optional['SNPObject'] = None,
        laiobj: Optional['LocalAncestryObject'] = None,
        labels_file: Optional[str] = None,
        ancestry: Optional[str] = None,
        average_strands: Optional[bool] = None
    ) -> np.ndarray:
        """
        Fit the model to the SNP data stored in the provided `snpobj` and apply the dimensionality reduction on the same SNP data.

        Any argument left as `None` falls back to the corresponding instance attribute.

        Args:
            snpobj (SNPObject, optional):
                A SNPObject instance.
            laiobj (LAIObject, optional):
                A LAIObject instance.
            labels_file (str, optional):
                Path to the labels file in .tsv format. The first column, `indID`, contains the individual identifiers, and the second
                column, `label`, specifies the groups for all individuals. If `is_weighted=True`, a `weight` column with individual
                weights is required. Optionally, `combination` and `combination_weight` columns can specify sets of individuals to be
                combined into groups, with respective weights.
            ancestry (str, optional):
                Ancestry for which dimensionality reduction is to be performed. Ancestry counter starts at 0.
                Replaced by `'1'` when `is_masked` is False.
            average_strands (bool, optional):
                True if the haplotypes from the two parents are to be combined (averaged) for each individual, or False otherwise.
                If None, defaults to `self.average_strands`. The value used is stored back on the instance.

        Returns:
            **array of shape (n_samples, n_components):**
                The transformed SNP data projected onto the `n_components` principal components, stored in `self.X_new_`.
        """
        if snpobj is None:
            snpobj = self.snpobj
        if laiobj is None:
            laiobj = self.laiobj
        if labels_file is None:
            labels_file = self.labels_file
        if ancestry is None:
            ancestry = self.ancestry
        if average_strands is None:
            average_strands = self.average_strands

        if not self.is_masked:
            # Unmasked runs always use the key '1'; the instance attribute is updated as well,
            # as the original implementation did.
            ancestry = '1'
            self.ancestry = ancestry

        # `samples_` derives sample names from `average_strands`, so remember the value actually used.
        self.average_strands = average_strands

        if self.load_masks:
            masks, rs_ID_list, ind_ID_list, groups, weights = self._load_masks_file(self.masks_file)
        else:
            masks, rs_ID_list, ind_ID_list = array_process(
                snpobj,
                laiobj,
                average_strands,
                self.prob_thresh,
                self.is_masked,
                self.rsid_or_chrompos
            )

            masks, ind_ID_list, groups, weights = process_labels_weights(
                labels_file,
                masks,
                rs_ID_list,
                ind_ID_list,
                average_strands,
                ancestry,
                self.min_percent_snps,
                self.groups_to_remove,
                self.is_weighted,
                self.save_masks,
                self.masks_file
            )

        # Single array: the distance list holds one matrix built from the first mask entry.
        distance_list = [[distance_mat(first=masks[0][ancestry], dist_func=self.distance_type)]]

        self.X_new_ = mds_transform(distance_list, groups, weights, ind_ID_list, self.n_components)
        self.haplotypes_ = ind_ID_list
        return self.X_new_
A class for multiple array ancestry-specific multidimensional scaling (maasMDS).
This class supports both separate and averaged strand processing for SNP data. If the `snpobj`, `laiobj`, `labels_file`, and `ancestry` parameters are all provided during instantiation, the `fit_transform` method will be automatically called, applying the specified maasMDS method to transform the data upon instantiation.
def __init__(
    self,
    snpobj: Optional['SNPObject'] = None,
    laiobj: Optional['LocalAncestryObject'] = None,
    labels_file: Optional[str] = None,
    ancestry: Optional[str] = None,
    is_masked: bool = True,
    prob_thresh: float = 0,
    average_strands: bool = False,
    is_weighted: bool = False,
    groups_to_remove: Optional[Dict[int, List[str]]] = None,
    min_percent_snps: float = 4,
    save_masks: bool = False,
    load_masks: bool = False,
    masks_file: Union[str, pathlib.Path] = 'masks.npz',
    distance_type: str = 'AP',
    n_components: int = 2,
    rsid_or_chrompos: int = 2
):
    """
    Store the maasMDS configuration and, when all four inputs are given, run `fit_transform` immediately.

    Args:
        snpobj (SNPObject, optional):
            A SNPObject instance.
        laiobj (LAIObject, optional):
            A LAIObject instance.
        labels_file (str, optional):
            Path to the labels file in .tsv format. The first column, `indID`, contains the individual identifiers, and the second
            column, `label`, specifies the groups for all individuals. If `is_weighted=True`, a `weight` column with individual
            weights is required. Optionally, `combination` and `combination_weight` columns can specify sets of individuals to be
            combined into groups, with respective weights.
        ancestry (str, optional):
            Ancestry for which dimensionality reduction is to be performed. Ancestry counter starts at `0`.
        is_masked (bool, default=True):
            True if an ancestry file is passed for ancestry-specific masking, or False otherwise.
        prob_thresh (float, default=0.0):
            Minimum probability threshold for a SNP to belong to an ancestry.
        average_strands (bool, default=False):
            True if the haplotypes from the two parents are to be combined (averaged) for each individual, or False otherwise.
        is_weighted (bool, default=False):
            True if weights are provided in the labels file, or False otherwise.
        groups_to_remove (dict of int to list of str, optional):
            Dictionary specifying groups to exclude from analysis. Keys are array numbers, and values are
            lists of groups to remove for each array. `None` (the default) is treated as an empty dictionary.
            Example: `{1: ['group1', 'group2'], 2: [], 3: ['group3']}`.
        min_percent_snps (float, default=4.0):
            Minimum percentage of SNPs to be known in an individual for an individual to be included in the analysis.
            All individuals with fewer percent of unmasked SNPs than this threshold will be excluded.
        save_masks (bool, default=False):
            True if the masked matrices are to be saved in a `.npz` file, or False otherwise.
        load_masks (bool, default=False):
            True if the masked matrices are to be loaded from a pre-existing `.npz` file specified by `masks_file`, or False otherwise.
        masks_file (str or pathlib.Path, default='masks.npz'):
            Path to the `.npz` file used for saving/loading masked matrices.
        distance_type (str, default='AP'):
            Distance metric to use. Options to choose from are: 'Manhattan', 'RMS' (Root Mean Square), 'AP' (Average Pairwise).
            If `average_strands=True`, use `distance_type='AP'`.
        n_components (int, default=2):
            The number of components of the embedding.
        rsid_or_chrompos (int, default=2):
            Format indicator for SNP IDs in the SNP data. Use 1 for `rsID` format or 2 for `chromosome_position`.
    """
    self.__snpobj = snpobj
    self.__laiobj = laiobj
    self.__labels_file = labels_file
    self.__ancestry = ancestry
    self.__is_masked = is_masked
    self.__prob_thresh = prob_thresh
    self.__average_strands = average_strands
    # A fresh dict per instance: a `{}` literal in the signature would be one
    # object shared (and potentially mutated) across every instance.
    self.__groups_to_remove = {} if groups_to_remove is None else groups_to_remove
    self.__min_percent_snps = min_percent_snps
    self.__is_weighted = is_weighted
    self.__save_masks = save_masks
    self.__load_masks = load_masks
    self.__masks_file = masks_file
    self.__distance_type = distance_type
    self.__n_components = n_components
    self.__rsid_or_chrompos = rsid_or_chrompos
    self.__X_new_ = None  # Transformed SNP data, populated by `fit_transform`
    self.__haplotypes_ = None  # Haplotype IDs kept after filtering, populated by `fit_transform`
    self.__samples_ = None  # Reserved slot; the `samples_` property derives its value from `haplotypes_`

    # Fit and transform right away only when every required input is available.
    inputs = (self.snpobj, self.laiobj, self.labels_file, self.ancestry)
    if all(value is not None for value in inputs):
        self.fit_transform(snpobj, laiobj, labels_file, ancestry)
Arguments:
- snpobj (SNPObject, optional): A SNPObject instance.
- laiobj (LAIObject, optional): A LAIObject instance.
- labels_file (str, optional): Path to the labels file in .tsv format. The first column, `indID`, contains the individual identifiers, and the second column, `label`, specifies the groups for all individuals. If `is_weighted=True`, a `weight` column with individual weights is required. Optionally, `combination` and `combination_weight` columns can specify sets of individuals to be combined into groups, with respective weights.
- ancestry (str, optional): Ancestry for which dimensionality reduction is to be performed. Ancestry counter starts at `0`.
- is_masked (bool, default=True): True if an ancestry file is passed for ancestry-specific masking, or False otherwise.
- prob_thresh (float, default=0.0): Minimum probability threshold for a SNP to belong to an ancestry.
- average_strands (bool, default=False): True if the haplotypes from the two parents are to be combined (averaged) for each individual, or False otherwise.
- is_weighted (bool, default=False): True if weights are provided in the labels file, or False otherwise.
- groups_to_remove (dict of int to list of str, default={}): Dictionary specifying groups to exclude from analysis. Keys are array numbers, and values are lists of groups to remove for each array. Example: `{1: ['group1', 'group2'], 2: [], 3: ['group3']}`.
- min_percent_snps (float, default=4.0): Minimum percentage of SNPs to be known in an individual for an individual to be included in the analysis. All individuals with fewer percent of unmasked SNPs than this threshold will be excluded.
- save_masks (bool, default=False): True if the masked matrices are to be saved in a `.npz` file, or False otherwise.
- load_masks (bool, default=False): True if the masked matrices are to be loaded from a pre-existing `.npz` file specified by `masks_file`, or False otherwise.
- masks_file (str or pathlib.Path, default='masks.npz'): Path to the `.npz` file used for saving/loading masked matrices.
- distance_type (str, default='AP'): Distance metric to use. Options to choose from are: 'Manhattan', 'RMS' (Root Mean Square), 'AP' (Average Pairwise). If `average_strands=True`, use `distance_type='AP'`.
- n_components (int, default=2): The number of principal components.
- rsid_or_chrompos (int, default=2): Format indicator for SNP IDs in the SNP data. Use 1 for `rsID` format or 2 for `chromosome_position`.
127 def copy(self) -> 'maasMDS': 128 """ 129 Create and return a copy of `self`. 130 131 Returns: 132 **maasMDS:** 133 A new instance of the current object. 134 """ 135 return copy.copy(self)
Create and return a copy of self
.
Returns:
maasMDS: A new instance of the current object.
189 @property 190 def ancestry(self) -> Optional[str]: 191 """ 192 Retrieve `ancestry`. 193 194 Returns: 195 **str:** Ancestry for which dimensionality reduction is to be performed. Ancestry counter starts at `0`. 196 """ 197 return self.__ancestry
Retrieve ancestry
.
Returns:
str: Ancestry for which dimensionality reduction is to be performed. Ancestry counter starts at
0
.
206 @property 207 def is_masked(self) -> bool: 208 """ 209 Retrieve `is_masked`. 210 211 Returns: 212 **bool:** True if an ancestry file is passed for ancestry-specific masking, or False otherwise. 213 """ 214 return self.__is_masked
Retrieve is_masked
.
Returns:
bool: True if an ancestry file is passed for ancestry-specific masking, or False otherwise.
223 @property 224 def prob_thresh(self) -> float: 225 """ 226 Retrieve `prob_thresh`. 227 228 Returns: 229 **float:** Minimum probability threshold for a SNP to belong to an ancestry. 230 """ 231 return self.__prob_thresh
Retrieve prob_thresh
.
Returns:
float: Minimum probability threshold for a SNP to belong to an ancestry.
240 @property 241 def average_strands(self) -> bool: 242 """ 243 Retrieve `average_strands`. 244 245 Returns: 246 **bool:** True if the haplotypes from the two parents are to be combined (averaged) for each individual, or False otherwise. 247 """ 248 return self.__average_strands
Retrieve average_strands
.
Returns:
bool: True if the haplotypes from the two parents are to be combined (averaged) for each individual, or False otherwise.
257 @property 258 def is_weighted(self) -> bool: 259 """ 260 Retrieve `is_weighted`. 261 262 Returns: 263 **bool:** True if weights are provided in the labels file, or False otherwise. 264 """ 265 return self.__is_weighted
Retrieve is_weighted
.
Returns:
bool: True if weights are provided in the labels file, or False otherwise.
274 @property 275 def groups_to_remove(self) -> Dict[int, List[str]]: 276 """ 277 Retrieve `groups_to_remove`. 278 279 Returns: 280 **dict of int to list of str:** Dictionary specifying groups to exclude from analysis. Keys are array numbers, and values are 281 lists of groups to remove for each array. Example: `{1: ['group1', 'group2'], 2: [], 3: ['group3']}`. 282 """ 283 return self.__groups_to_remove
Retrieve groups_to_remove
.
Returns:
dict of int to list of str: Dictionary specifying groups to exclude from analysis. Keys are array numbers, and values are lists of groups to remove for each array. Example:
{1: ['group1', 'group2'], 2: [], 3: ['group3']}
.
292 @property 293 def min_percent_snps(self) -> float: 294 """ 295 Retrieve `min_percent_snps`. 296 297 Returns: 298 **float:** 299 Minimum percentage of SNPs to be known in an individual for an individual to be included in the analysis. 300 All individuals with fewer percent of unmasked SNPs than this threshold will be excluded. 301 """ 302 return self.__min_percent_snps
Retrieve min_percent_snps
.
Returns:
float: Minimum percentage of SNPs to be known in an individual for an individual to be included in the analysis. All individuals with fewer percent of unmasked SNPs than this threshold will be excluded.
311 @property 312 def save_masks(self) -> bool: 313 """ 314 Retrieve `save_masks`. 315 316 Returns: 317 **bool:** True if the masked matrices are to be saved in a `.npz` file, or False otherwise. 318 """ 319 return self.__save_masks
Retrieve save_masks
.
Returns:
bool: True if the masked matrices are to be saved in a `.npz` file, or False otherwise.
328 @property 329 def load_masks(self) -> bool: 330 """ 331 Retrieve `load_masks`. 332 333 Returns: 334 **bool:** 335 True if the masked matrices are to be loaded from a pre-existing `.npz` file specified 336 by `masks_file`, or False otherwise. 337 """ 338 return self.__load_masks
Retrieve load_masks
.
Returns:
bool: True if the masked matrices are to be loaded from a pre-existing `.npz` file specified by `masks_file`, or False otherwise.
347 @property 348 def masks_file(self) -> Union[str, pathlib.Path]: 349 """ 350 Retrieve `masks_file`. 351 352 Returns: 353 **str or pathlib.Path:** Path to the `.npz` file used for saving/loading masked matrices. 354 """ 355 return self.__masks_file
Retrieve masks_file
.
Returns:
str or pathlib.Path: Path to the
.npz
file used for saving/loading masked matrices.
364 @property 365 def distance_type(self) -> str: 366 """ 367 Retrieve `distance_type`. 368 369 Returns: 370 **str:** 371 Distance metric to use. Options to choose from are: 'Manhattan', 'RMS' (Root Mean Square), 'AP' (Average Pairwise). 372 If `average_strands=True`, use 'distance_type=AP'. 373 """ 374 return self.__distance_type
Retrieve distance_type
.
Returns:
str: Distance metric to use. Options to choose from are: 'Manhattan', 'RMS' (Root Mean Square), 'AP' (Average Pairwise). If
average_strands=True
, use 'distance_type=AP'.
400 @property 401 def rsid_or_chrompos(self) -> int: 402 """ 403 Retrieve `rsid_or_chrompos`. 404 405 Returns: 406 **int:** Format indicator for SNP IDs in the SNP data. Use 1 for `rsID` format or 2 for `chromosome_position`. 407 """ 408 return self.__rsid_or_chrompos
Retrieve rsid_or_chrompos
.
Returns:
int: Format indicator for SNP IDs in the SNP data. Use 1 for `rsID` format or 2 for `chromosome_position`.
417 @property 418 def X_new_(self) -> Optional[np.ndarray]: 419 """ 420 Retrieve `X_new_`. 421 422 Returns: 423 **array of shape (n_haplotypes_, n_components):** 424 The transformed SNP data projected onto the `n_components` principal components. 425 n_haplotypes_ is the number of haplotypes, potentially reduced if filtering is applied 426 (`min_percent_snps > 0`). For diploid individuals without filtering, the shape is 427 `(n_samples * 2, n_components)`. 428 """ 429 return self.__X_new_
Retrieve X_new_
.
Returns:
array of shape (n_haplotypes_, n_components): The transformed SNP data projected onto the `n_components` principal components. n_haplotypes_ is the number of haplotypes, potentially reduced if filtering is applied (`min_percent_snps > 0`). For diploid individuals without filtering, the shape is `(n_samples * 2, n_components)`.
438 @property 439 def haplotypes_(self) -> Optional[List[str]]: 440 """ 441 Retrieve `haplotypes_`. 442 443 Returns: 444 list of str: 445 A list of unique haplotype identifiers. 446 """ 447 if isinstance(self.__haplotypes_, np.ndarray): 448 return self.__haplotypes_.ravel().tolist() # Flatten and convert NumPy array to a list 449 elif isinstance(self.__haplotypes_, list): 450 if len(self.__haplotypes_) == 1 and isinstance(self.__haplotypes_[0], np.ndarray): 451 return self.__haplotypes_[0].ravel().tolist() # Handle list containing a single array 452 return self.__haplotypes_ # Already a flat list 453 elif self.__haplotypes_ is None: 454 return None # If no haplotypes are set 455 else: 456 raise TypeError("`haplotypes_` must be a list or a NumPy array.")
473 @property 474 def samples_(self) -> Optional[List[str]]: 475 """ 476 Retrieve `samples_`. 477 478 Returns: 479 list of str: 480 A list of sample identifiers based on `haplotypes_` and `average_strands`. 481 """ 482 haplotypes = self.haplotypes_ 483 if haplotypes is None: 484 return None 485 if self.__average_strands: 486 return haplotypes 487 else: 488 return [x[:-2] for x in haplotypes]
Retrieve samples_
.
Returns:
list of str: A list of sample identifiers based on `haplotypes_` and `average_strands`.
490 @property 491 def n_haplotypes(self) -> Optional[int]: 492 """ 493 Retrieve `n_haplotypes`. 494 495 Returns: 496 **int:** 497 The total number of haplotypes, potentially reduced if filtering is applied 498 (`min_percent_snps > 0`). 499 """ 500 return len(self.__haplotypes_)
Retrieve n_haplotypes
.
Returns:
int: The total number of haplotypes, potentially reduced if filtering is applied (
min_percent_snps > 0
).
502 @property 503 def n_samples(self) -> Optional[int]: 504 """ 505 Retrieve `n_samples`. 506 507 Returns: 508 **int:** 509 The total number of samples, potentially reduced if filtering is applied 510 (`min_percent_snps > 0`). 511 """ 512 return len(np.unique(self.samples_))
Retrieve n_samples
.
Returns:
int: The total number of samples, potentially reduced if filtering is applied (
min_percent_snps > 0
).
def fit_transform(
    self,
    snpobj: Optional['SNPObject'] = None,
    laiobj: Optional['LocalAncestryObject'] = None,
    labels_file: Optional[str] = None,
    ancestry: Optional[str] = None,
    average_strands: Optional[bool] = None
) -> np.ndarray:
    """
    Fit the model to the SNP data stored in the provided `snpobj` and apply the dimensionality reduction on the same SNP data.

    Any argument left as None falls back to the value stored on the instance. Explicitly
    passed arguments are used for this call.

    Args:
        snpobj (SNPObject, optional):
            A SNPObject instance.
        laiobj (LAIObject, optional):
            A LAIObject instance.
        labels_file (str, optional):
            Path to the labels file in .tsv format. The first column, `indID`, contains the individual identifiers, and the second
            column, `label`, specifies the groups for all individuals. If `is_weighted=True`, a `weight` column with individual
            weights is required. Optionally, `combination` and `combination_weight` columns can specify sets of individuals to be
            combined into groups, with respective weights.
        ancestry (str, optional):
            Ancestry for which dimensionality reduction is to be performed. Ancestry counter starts at 0.
        average_strands (bool, optional):
            True if the haplotypes from the two parents are to be combined (averaged) for each individual, or False otherwise.
            If None, defaults to `self.average_strands`.

    Returns:
        **array:**
            The transformed data produced by `mds_transform` with `n_components` components, also stored
            in `self.X_new_`. The matching identifiers are stored in `self.haplotypes_`.
    """
    if snpobj is None:
        snpobj = self.snpobj
    if laiobj is None:
        laiobj = self.laiobj
    if labels_file is None:
        labels_file = self.labels_file
    if ancestry is None:
        ancestry = self.ancestry
    if average_strands is None:
        average_strands = self.average_strands

    # `samples_` derives sample IDs from `haplotypes_` using the stored flag, so
    # keep it in sync with the value actually used for this fit.
    self.__average_strands = average_strands

    if not self.is_masked:
        # Without masking the ancestry key is forced to '1' (kept from the original
        # implementation; presumably the key under which unmasked data is stored —
        # TODO confirm against `array_process`).
        self.ancestry = '1'
        ancestry = '1'

    if self.load_masks:
        # Reuse previously saved masks, IDs, groups and weights.
        masks, rs_ID_list, ind_ID_list, groups, weights = self._load_masks_file(self.masks_file)
    else:
        masks, rs_ID_list, ind_ID_list = array_process(
            snpobj,
            laiobj,
            average_strands,
            self.prob_thresh,
            self.is_masked,
            self.rsid_or_chrompos
        )

        masks, ind_ID_list, groups, weights = process_labels_weights(
            labels_file,
            masks,
            rs_ID_list,
            ind_ID_list,
            average_strands,
            ancestry,
            self.min_percent_snps,
            self.groups_to_remove,
            self.is_weighted,
            self.save_masks,
            self.masks_file
        )

    # Single-array case: one distance matrix wrapped in the nested list passed to `mds_transform`.
    distance_list = [[distance_mat(first=masks[0][ancestry], dist_func=self.distance_type)]]

    self.X_new_ = mds_transform(distance_list, groups, weights, ind_ID_list, self.n_components)
    self.haplotypes_ = ind_ID_list

    # The signature and docstring promise the embedding; previously nothing was returned.
    return self.X_new_
Fit the model to the SNP data stored in the provided `snpobj` and apply the dimensionality reduction on the same SNP data.
Arguments:
- snpobj (SNPObject, optional): A SNPObject instance.
- laiobj (LAIObject, optional): A LAIObject instance.
- labels_file (str, optional): Path to the labels file in .tsv format. The first column, `indID`, contains the individual identifiers, and the second column, `label`, specifies the groups for all individuals. If `is_weighted=True`, a `weight` column with individual weights is required. Optionally, `combination` and `combination_weight` columns can specify sets of individuals to be combined into groups, with respective weights.
- ancestry (str, optional): Ancestry for which dimensionality reduction is to be performed. Ancestry counter starts at 0.
- average_strands (bool, optional): True if the haplotypes from the two parents are to be combined (averaged) for each individual, or False otherwise. If None, defaults to `self.average_strands`.
Returns:
array of shape (n_samples, n_components): The transformed SNP data projected onto the `n_components` principal components, stored in `self.X_new_`.