snputils.processing.maasmds

  1import pathlib
  2import numpy as np
  3import copy
  4from typing import Optional, Dict, List, Union
  5
  6from snputils.snp.genobj.snpobj import SNPObject
  7from snputils.ancestry.genobj.local import LocalAncestryObject
  8from ._utils.mds_distance import distance_mat, mds_transform
  9from ._utils.gen_tools import array_process, process_labels_weights
 10
 11
 12class maasMDS:
 13    """
 14    A class for multiple array ancestry-specific multidimensional scaling (maasMDS).
 15
 16    This class supports both separate and averaged strand processing for SNP data. If the `snpobj`, 
 17    `laiobj`, `labels_file`, and `ancestry` parameters are all provided during instantiation, 
 18    the `fit_transform` method will be automatically called, applying the specified maasMDS method to transform 
 19    the data upon instantiation.
 20    """
 21    def __init__(
 22            self, 
 23            snpobj, 
 24            laiobj,
 25            labels_file,
 26            ancestry,
 27            is_masked: bool = True,
 28            prob_thresh: float = 0,
 29            average_strands: bool = False,
 30            is_weighted: bool = False,
 31            groups_to_remove: Dict[int, List[str]] = {},
 32            min_percent_snps: float = 4,
 33            save_masks: bool = False,
 34            load_masks: bool = False,
 35            masks_file: Union[str, pathlib.Path] = 'masks.npz',
 36            distance_type: str = 'AP',
 37            n_components: int = 2,
 38            rsid_or_chrompos: int = 2
 39        ):
 40        """
 41        Args:
 42            snpobj (SNPObject, optional): 
 43                A SNPObject instance.
 44            laiobj (LAIObject, optional): 
 45                A LAIObject instance.
 46            labels_file (str, optional): 
 47                Path to the labels file in .tsv format. The first column, `indID`, contains the individual identifiers, and the second 
 48                column, `label`, specifies the groups for all individuals. If `is_weighted=True`, a `weight` column with individual 
 49                weights is required. Optionally, `combination` and `combination_weight` columns can specify sets of individuals to be 
 50                combined into groups, with respective weights.
 51            ancestry (str, optional): 
 52                Ancestry for which dimensionality reduction is to be performed. Ancestry counter starts at `0`.
 53            is_masked (bool, default=True): 
 54                True if an ancestry file is passed for ancestry-specific masking, or False otherwise.
 55            prob_thresh (float, default=0.0): 
 56                Minimum probability threshold for a SNP to belong to an ancestry.
 57            average_strands (bool, default=False): 
 58                True if the haplotypes from the two parents are to be combined (averaged) for each individual, or False otherwise.
 59            is_weighted (bool, default=False): 
 60                True if weights are provided in the labels file, or False otherwise.
 61            groups_to_remove (dict of int to list of str, default={}): 
 62                Dictionary specifying groups to exclude from analysis. Keys are array numbers, and values are 
 63                lists of groups to remove for each array.
 64                Example: `{1: ['group1', 'group2'], 2: [], 3: ['group3']}`.
 65            min_percent_snps (float, default=4.0): 
 66                Minimum percentage of SNPs to be known in an individual for an individual to be included in the analysis. 
 67                All individuals with fewer percent of unmasked SNPs than this threshold will be excluded.
 68            save_masks (bool, default=False): 
 69                True if the masked matrices are to be saved in a `.npz` file, or False otherwise.
 70            load_masks (bool, default=False): 
 71                True if the masked matrices are to be loaded from a pre-existing `.npz` file specified by `masks_file`, or False otherwise.
 72            masks_file (str or pathlib.Path, default='masks.npz'): 
 73                Path to the `.npz` file used for saving/loading masked matrices.
 74            distance_type (str, default='AP'): 
 75                Distance metric to use. Options to choose from are: 'Manhattan', 'RMS' (Root Mean Square), 'AP' (Average Pairwise).
 76                If `average_strands=True`, use 'distance_type=AP'.
 77            n_components (int, default=2): 
 78                The number of principal components.
 79            rsid_or_chrompos (int, default=2): 
 80                Format indicator for SNP IDs in the SNP data. Use 1 for `rsID` format or 2 for `chromosome_position`.
 81        """
 82        self.__snpobj = snpobj
 83        self.__laiobj = laiobj
 84        self.__labels_file = labels_file
 85        self.__ancestry = ancestry
 86        self.__is_masked = is_masked
 87        self.__prob_thresh = prob_thresh
 88        self.__average_strands = average_strands
 89        self.__groups_to_remove = groups_to_remove
 90        self.__min_percent_snps = min_percent_snps
 91        self.__is_weighted = is_weighted
 92        self.__save_masks = save_masks
 93        self.__load_masks = load_masks
 94        self.__masks_file = masks_file
 95        self.__distance_type = distance_type
 96        self.__n_components = n_components
 97        self.__rsid_or_chrompos = rsid_or_chrompos
 98        self.__X_new_ = None  # Store transformed SNP data
 99        self.__haplotypes_ = None  # Store haplotypes after filtering if min_percent_snps > 0
100        self.__samples_ = None  # Store samples after filtering if min_percent_snps > 0
101
102        # Fit and transform if a `snpobj`, `laiobj`, `labels_file`, and `ancestry` are provided
103        if self.snpobj is not None and self.laiobj is not None and self.labels_file is not None and self.ancestry is not None:
104            self.fit_transform(snpobj, laiobj, labels_file, ancestry)
105
106    def __getitem__(self, key):
107        """
108        To access an attribute of the class using the square bracket notation,
109        similar to a dictionary.
110        """
111        try:
112            return getattr(self, key)
113        except AttributeError:
114            raise KeyError(f'Invalid key: {key}')
115
116    def __setitem__(self, key, value):
117        """
118        To set an attribute of the class using the square bracket notation,
119        similar to a dictionary.
120        """
121        try:
122            setattr(self, key, value)
123        except AttributeError:
124            raise KeyError(f'Invalid key: {key}')
125        
126    def copy(self) -> 'maasMDS':
127        """
128        Create and return a copy of `self`.
129
130        Returns:
131            **maasMDS:** 
132                A new instance of the current object.
133        """
134        return copy.copy(self)
135
136    @property
137    def snpobj(self) -> Optional['SNPObject']:
138        """
139        Retrieve `snpobj`.
140        
141        Returns:
142            **SNPObject:** A SNPObject instance.
143        """
144        return self.__snpobj
145
146    @snpobj.setter
147    def snpobj(self, x: 'SNPObject') -> None:
148        """
149        Update `snpobj`.
150        """
151        self.__snpobj = x
152
153    @property
154    def laiobj(self) -> Optional['LocalAncestryObject']:
155        """
156        Retrieve `laiobj`.
157        
158        Returns:
159            **LocalAncestryObject:** A LAIObject instance.
160        """
161        return self.__laiobj
162
163    @laiobj.setter
164    def laiobj(self, x: 'LocalAncestryObject') -> None:
165        """
166        Update `laiobj`.
167        """
168        self.__laiobj = x
169
170    @property
171    def labels_file(self) -> Optional[str]:
172        """
173        Retrieve `labels_file`.
174        
175        Returns:
176            **str:** 
177                Path to the labels file in `.tsv` format.
178        """
179        return self.__labels_file
180
181    @labels_file.setter
182    def labels_file(self, x: str) -> None:
183        """
184        Update `labels_file`.
185        """
186        self.__labels_file = x
187
188    @property
189    def ancestry(self) -> Optional[str]:
190        """
191        Retrieve `ancestry`.
192        
193        Returns:
194            **str:** Ancestry for which dimensionality reduction is to be performed. Ancestry counter starts at `0`.
195        """
196        return self.__ancestry
197
198    @ancestry.setter
199    def ancestry(self, x: str) -> None:
200        """
201        Update `ancestry`.
202        """
203        self.__ancestry = x
204
205    @property
206    def is_masked(self) -> bool:
207        """
208        Retrieve `is_masked`.
209        
210        Returns:
211            **bool:** True if an ancestry file is passed for ancestry-specific masking, or False otherwise.
212        """
213        return self.__is_masked
214
215    @is_masked.setter
216    def is_masked(self, x: bool) -> None:
217        """
218        Update `is_masked`.
219        """
220        self.__is_masked = x
221
222    @property
223    def prob_thresh(self) -> float:
224        """
225        Retrieve `prob_thresh`.
226        
227        Returns:
228            **float:** Minimum probability threshold for a SNP to belong to an ancestry.
229        """
230        return self.__prob_thresh
231
232    @prob_thresh.setter
233    def prob_thresh(self, x: float) -> None:
234        """
235        Update `prob_thresh`.
236        """
237        self.__prob_thresh = x
238
239    @property
240    def average_strands(self) -> bool:
241        """
242        Retrieve `average_strands`.
243        
244        Returns:
245            **bool:** True if the haplotypes from the two parents are to be combined (averaged) for each individual, or False otherwise.
246        """
247        return self.__average_strands
248
249    @average_strands.setter
250    def average_strands(self, x: bool) -> None:
251        """
252        Update `average_strands`.
253        """
254        self.__average_strands = x
255
256    @property
257    def is_weighted(self) -> bool:
258        """
259        Retrieve `is_weighted`.
260        
261        Returns:
262            **bool:** True if weights are provided in the labels file, or False otherwise.
263        """
264        return self.__is_weighted
265
266    @is_weighted.setter
267    def is_weighted(self, x: bool) -> None:
268        """
269        Update `is_weighted`.
270        """
271        self.__is_weighted = x
272
273    @property
274    def groups_to_remove(self) -> Dict[int, List[str]]:
275        """
276        Retrieve `groups_to_remove`.
277        
278        Returns:
279            **dict of int to list of str:** Dictionary specifying groups to exclude from analysis. Keys are array numbers, and values are 
280                lists of groups to remove for each array. Example: `{1: ['group1', 'group2'], 2: [], 3: ['group3']}`.
281        """
282        return self.__groups_to_remove
283
284    @groups_to_remove.setter
285    def groups_to_remove(self, x: Dict[int, List[str]]) -> None:
286        """
287        Update `groups_to_remove`.
288        """
289        self.__groups_to_remove = x
290
291    @property
292    def min_percent_snps(self) -> float:
293        """
294        Retrieve `min_percent_snps`.
295        
296        Returns:
297            **float:** 
298                Minimum percentage of SNPs to be known in an individual for an individual to be included in the analysis. 
299                All individuals with fewer percent of unmasked SNPs than this threshold will be excluded.
300        """
301        return self.__min_percent_snps
302
303    @min_percent_snps.setter
304    def min_percent_snps(self, x: float) -> None:
305        """
306        Update `min_percent_snps`.
307        """
308        self.__min_percent_snps = x
309
310    @property
311    def save_masks(self) -> bool:
312        """
313        Retrieve `save_masks`.
314        
315        Returns:
316            **bool:** True if the masked matrices are to be saved in a `.npz` file, or False otherwise.
317        """
318        return self.__save_masks
319
320    @save_masks.setter
321    def save_masks(self, x: bool) -> None:
322        """
323        Update `save_masks`.
324        """
325        self.__save_masks = x
326
327    @property
328    def load_masks(self) -> bool:
329        """
330        Retrieve `load_masks`.
331        
332        Returns:
333            **bool:** 
334                True if the masked matrices are to be loaded from a pre-existing `.npz` file specified 
335                by `masks_file`, or False otherwise.
336        """
337        return self.__load_masks
338
339    @load_masks.setter
340    def load_masks(self, x: bool) -> None:
341        """
342        Update `load_masks`.
343        """
344        self.__load_masks = x
345
346    @property
347    def masks_file(self) -> Union[str, pathlib.Path]:
348        """
349        Retrieve `masks_file`.
350        
351        Returns:
352            **str or pathlib.Path:** Path to the `.npz` file used for saving/loading masked matrices.
353        """
354        return self.__masks_file
355
356    @masks_file.setter
357    def masks_file(self, x: Union[str, pathlib.Path]) -> None:
358        """
359        Update `masks_file`.
360        """
361        self.__masks_file = x
362
363    @property
364    def distance_type(self) -> str:
365        """
366        Retrieve `distance_type`.
367        
368        Returns:
369            **str:** 
370                Distance metric to use. Options to choose from are: 'Manhattan', 'RMS' (Root Mean Square), 'AP' (Average Pairwise).
371                If `average_strands=True`, use 'distance_type=AP'.
372        """
373        return self.__distance_type
374
375    @distance_type.setter
376    def distance_type(self, x: str) -> None:
377        """
378        Update `distance_type`.
379        """
380        self.__distance_type = x
381
382    @property
383    def n_components(self) -> int:
384        """
385        Retrieve `n_components`.
386        
387        Returns:
388            **int:** The number of principal components.
389        """
390        return self.__n_components
391
392    @n_components.setter
393    def n_components(self, x: int) -> None:
394        """
395        Update `n_components`.
396        """
397        self.__n_components = x
398
399    @property
400    def rsid_or_chrompos(self) -> int:
401        """
402        Retrieve `rsid_or_chrompos`.
403        
404        Returns:
405            **int:** Format indicator for SNP IDs in the SNP data. Use 1 for `rsID` format or 2 for `chromosome_position`.
406        """
407        return self.__rsid_or_chrompos
408
409    @rsid_or_chrompos.setter
410    def rsid_or_chrompos(self, x: int) -> None:
411        """
412        Update `rsid_or_chrompos`.
413        """
414        self.__rsid_or_chrompos = x
415
416    @property
417    def X_new_(self) -> Optional[np.ndarray]:
418        """
419        Retrieve `X_new_`.
420
421        Returns:
422            **array of shape (n_haplotypes_, n_components):** 
423                The transformed SNP data projected onto the `n_components` principal components.
424                n_haplotypes_ is the number of haplotypes, potentially reduced if filtering is applied 
425                (`min_percent_snps > 0`). For diploid individuals without filtering, the shape is 
426                `(n_samples * 2, n_components)`.
427        """
428        return self.__X_new_
429
430    @X_new_.setter
431    def X_new_(self, x: np.ndarray) -> None:
432        """
433        Update `X_new_`.
434        """
435        self.__X_new_ = x
436
437    @property
438    def haplotypes_(self) -> Optional[List[str]]:
439        """
440        Retrieve `haplotypes_`.
441
442        Returns:
443            list of str:
444                A list of unique haplotype identifiers.
445        """
446        if isinstance(self.__haplotypes_, np.ndarray):
447            return self.__haplotypes_.ravel().tolist()  # Flatten and convert NumPy array to a list
448        elif isinstance(self.__haplotypes_, list):
449            if len(self.__haplotypes_) == 1 and isinstance(self.__haplotypes_[0], np.ndarray):
450                return self.__haplotypes_[0].ravel().tolist()  # Handle list containing a single array
451            return self.__haplotypes_  # Already a flat list
452        elif self.__haplotypes_ is None:
453            return None  # If no haplotypes are set
454        else:
455            raise TypeError("`haplotypes_` must be a list or a NumPy array.")
456
457    @haplotypes_.setter
458    def haplotypes_(self, x: Union[np.ndarray, List[str]]) -> None:
459        """
460        Update `haplotypes_`.
461        """
462        if isinstance(x, np.ndarray):
463            self.__haplotypes_ = x.ravel().tolist()  # Flatten and convert to a list
464        elif isinstance(x, list):
465            if len(x) == 1 and isinstance(x[0], np.ndarray):  # Handle list containing a single array
466                self.__haplotypes_ = x[0].ravel().tolist()
467            else:
468                self.__haplotypes_ = x  # Use directly if already a list
469        else:
470            raise TypeError("`x` must be a list or a NumPy array.")
471
472    @property
473    def samples_(self) -> Optional[List[str]]:
474        """
475        Retrieve `samples_`.
476
477        Returns:
478            list of str:
479                A list of sample identifiers based on `haplotypes_` and `average_strands`.
480        """
481        haplotypes = self.haplotypes_
482        if haplotypes is None:
483            return None
484        if self.__average_strands:
485            return haplotypes
486        else:
487            return [x[:-2] for x in haplotypes]
488
489    @property
490    def n_haplotypes(self) -> Optional[int]:
491        """
492        Retrieve `n_haplotypes`.
493
494        Returns:
495            **int:**
496                The total number of haplotypes, potentially reduced if filtering is applied 
497                (`min_percent_snps > 0`).
498        """
499        return len(self.__haplotypes_)
500
501    @property
502    def n_samples(self) -> Optional[int]:
503        """
504        Retrieve `n_samples`.
505
506        Returns:
507            **int:**
508                The total number of samples, potentially reduced if filtering is applied 
509                (`min_percent_snps > 0`).
510        """
511        return len(np.unique(self.samples_))
512
513    @staticmethod
514    def _load_masks_file(masks_file):
515        mask_files = np.load(masks_file, allow_pickle=True)
516        masks = mask_files['masks']
517        rs_ID_list = mask_files['rs_ID_list']
518        ind_ID_list = mask_files['ind_ID_list']
519        groups = mask_files['labels']
520        weights = mask_files['weights']
521        return masks, rs_ID_list, ind_ID_list, groups, weights
522
523    def fit_transform(
524            self,
525            snpobj: Optional['SNPObject'] = None, 
526            laiobj: Optional['LocalAncestryObject'] = None,
527            labels_file: Optional[str] = None,
528            ancestry: Optional[str] = None,
529            average_strands: Optional[bool] = None
530        ) -> np.ndarray:
531        """
532        Fit the model to the SNP data stored in the provided `snpobj` and apply the dimensionality reduction on the same SNP data.
533
534        Args:
535            snpobj (SNPObject, optional): 
536                A SNPObject instance.
537            laiobj (LAIObject, optional): 
538                A LAIObject instance.
539            labels_file (str, optional): 
540                Path to the labels file in .tsv format. The first column, `indID`, contains the individual identifiers, and the second 
541                column, `label`, specifies the groups for all individuals. If `is_weighted=True`, a `weight` column with individual 
542                weights is required. Optionally, `combination` and `combination_weight` columns can specify sets of individuals to be 
543                combined into groups, with respective weights.
544            ancestry (str, optional): 
545                Ancestry for which dimensionality reduction is to be performed. Ancestry counter starts at 0.
546            average_strands (bool, optional): 
547                True if the haplotypes from the two parents are to be combined (averaged) for each individual, or False otherwise.
548                If None, defaults to `self.average_strands`.
549
550        Returns:
551            **array of shape (n_samples, n_components):** 
552                The transformed SNP data projected onto the `n_components` principal components, stored in `self.X_new_`.
553        """
554        if snpobj is None:
555            snpobj = self.snpobj
556        if laiobj is None:
557            laiobj = self.laiobj
558        if labels_file is None:
559            labels_file = self.labels_file
560        if ancestry is None:
561            ancestry = self.ancestry
562        if average_strands is None:
563            average_strands = self.average_strands
564        
565        if not self.is_masked:
566            self.ancestry = '1'
567        if self.load_masks:
568            masks, rs_ID_list, ind_ID_list, groups, weights = self._load_masks_file(self.masks_file)
569        else:
570            masks, rs_ID_list, ind_ID_list = array_process(
571                self.snpobj,
572                self.laiobj,
573                self.average_strands,
574                self.prob_thresh, 
575                self.is_masked, 
576                self.rsid_or_chrompos
577            )
578
579            masks, ind_ID_list, groups, weights = process_labels_weights(
580                self.labels_file, 
581                masks, 
582                rs_ID_list,
583                ind_ID_list, 
584                self.average_strands, 
585                self.ancestry, 
586                self.min_percent_snps, 
587                self.groups_to_remove,
588                self.is_weighted, 
589                self.save_masks, 
590                self.masks_file
591            )
592        
593        distance_list = [[distance_mat(first=masks[0][self.ancestry], dist_func=self.distance_type)]]
594        
595        self.X_new_ = mds_transform(distance_list, groups, weights, ind_ID_list, self.n_components)
596        self.haplotypes_ = ind_ID_list
class maasMDS:
 13class maasMDS:
 14    """
 15    A class for multiple array ancestry-specific multidimensional scaling (maasMDS).
 16
 17    This class supports both separate and averaged strand processing for SNP data. If the `snpobj`, 
 18    `laiobj`, `labels_file`, and `ancestry` parameters are all provided during instantiation, 
 19    the `fit_transform` method will be automatically called, applying the specified maasMDS method to transform 
 20    the data upon instantiation.
 21    """
 22    def __init__(
 23            self, 
 24            snpobj, 
 25            laiobj,
 26            labels_file,
 27            ancestry,
 28            is_masked: bool = True,
 29            prob_thresh: float = 0,
 30            average_strands: bool = False,
 31            is_weighted: bool = False,
 32            groups_to_remove: Dict[int, List[str]] = {},
 33            min_percent_snps: float = 4,
 34            save_masks: bool = False,
 35            load_masks: bool = False,
 36            masks_file: Union[str, pathlib.Path] = 'masks.npz',
 37            distance_type: str = 'AP',
 38            n_components: int = 2,
 39            rsid_or_chrompos: int = 2
 40        ):
 41        """
 42        Args:
 43            snpobj (SNPObject, optional): 
 44                A SNPObject instance.
 45            laiobj (LAIObject, optional): 
 46                A LAIObject instance.
 47            labels_file (str, optional): 
 48                Path to the labels file in .tsv format. The first column, `indID`, contains the individual identifiers, and the second 
 49                column, `label`, specifies the groups for all individuals. If `is_weighted=True`, a `weight` column with individual 
 50                weights is required. Optionally, `combination` and `combination_weight` columns can specify sets of individuals to be 
 51                combined into groups, with respective weights.
 52            ancestry (str, optional): 
 53                Ancestry for which dimensionality reduction is to be performed. Ancestry counter starts at `0`.
 54            is_masked (bool, default=True): 
 55                True if an ancestry file is passed for ancestry-specific masking, or False otherwise.
 56            prob_thresh (float, default=0.0): 
 57                Minimum probability threshold for a SNP to belong to an ancestry.
 58            average_strands (bool, default=False): 
 59                True if the haplotypes from the two parents are to be combined (averaged) for each individual, or False otherwise.
 60            is_weighted (bool, default=False): 
 61                True if weights are provided in the labels file, or False otherwise.
 62            groups_to_remove (dict of int to list of str, default={}): 
 63                Dictionary specifying groups to exclude from analysis. Keys are array numbers, and values are 
 64                lists of groups to remove for each array.
 65                Example: `{1: ['group1', 'group2'], 2: [], 3: ['group3']}`.
 66            min_percent_snps (float, default=4.0): 
 67                Minimum percentage of SNPs to be known in an individual for an individual to be included in the analysis. 
 68                All individuals with fewer percent of unmasked SNPs than this threshold will be excluded.
 69            save_masks (bool, default=False): 
 70                True if the masked matrices are to be saved in a `.npz` file, or False otherwise.
 71            load_masks (bool, default=False): 
 72                True if the masked matrices are to be loaded from a pre-existing `.npz` file specified by `masks_file`, or False otherwise.
 73            masks_file (str or pathlib.Path, default='masks.npz'): 
 74                Path to the `.npz` file used for saving/loading masked matrices.
 75            distance_type (str, default='AP'): 
 76                Distance metric to use. Options to choose from are: 'Manhattan', 'RMS' (Root Mean Square), 'AP' (Average Pairwise).
 77                If `average_strands=True`, use 'distance_type=AP'.
 78            n_components (int, default=2): 
 79                The number of principal components.
 80            rsid_or_chrompos (int, default=2): 
 81                Format indicator for SNP IDs in the SNP data. Use 1 for `rsID` format or 2 for `chromosome_position`.
 82        """
 83        self.__snpobj = snpobj
 84        self.__laiobj = laiobj
 85        self.__labels_file = labels_file
 86        self.__ancestry = ancestry
 87        self.__is_masked = is_masked
 88        self.__prob_thresh = prob_thresh
 89        self.__average_strands = average_strands
 90        self.__groups_to_remove = groups_to_remove
 91        self.__min_percent_snps = min_percent_snps
 92        self.__is_weighted = is_weighted
 93        self.__save_masks = save_masks
 94        self.__load_masks = load_masks
 95        self.__masks_file = masks_file
 96        self.__distance_type = distance_type
 97        self.__n_components = n_components
 98        self.__rsid_or_chrompos = rsid_or_chrompos
 99        self.__X_new_ = None  # Store transformed SNP data
100        self.__haplotypes_ = None  # Store haplotypes after filtering if min_percent_snps > 0
101        self.__samples_ = None  # Store samples after filtering if min_percent_snps > 0
102
103        # Fit and transform if a `snpobj`, `laiobj`, `labels_file`, and `ancestry` are provided
104        if self.snpobj is not None and self.laiobj is not None and self.labels_file is not None and self.ancestry is not None:
105            self.fit_transform(snpobj, laiobj, labels_file, ancestry)
106
107    def __getitem__(self, key):
108        """
109        To access an attribute of the class using the square bracket notation,
110        similar to a dictionary.
111        """
112        try:
113            return getattr(self, key)
114        except AttributeError:
115            raise KeyError(f'Invalid key: {key}')
116
117    def __setitem__(self, key, value):
118        """
119        To set an attribute of the class using the square bracket notation,
120        similar to a dictionary.
121        """
122        try:
123            setattr(self, key, value)
124        except AttributeError:
125            raise KeyError(f'Invalid key: {key}')
126        
127    def copy(self) -> 'maasMDS':
128        """
129        Create and return a copy of `self`.
130
131        Returns:
132            **maasMDS:** 
133                A new instance of the current object.
134        """
135        return copy.copy(self)
136
137    @property
138    def snpobj(self) -> Optional['SNPObject']:
139        """
140        Retrieve `snpobj`.
141        
142        Returns:
143            **SNPObject:** A SNPObject instance.
144        """
145        return self.__snpobj
146
147    @snpobj.setter
148    def snpobj(self, x: 'SNPObject') -> None:
149        """
150        Update `snpobj`.
151        """
152        self.__snpobj = x
153
154    @property
155    def laiobj(self) -> Optional['LocalAncestryObject']:
156        """
157        Retrieve `laiobj`.
158        
159        Returns:
160            **LocalAncestryObject:** A LAIObject instance.
161        """
162        return self.__laiobj
163
164    @laiobj.setter
165    def laiobj(self, x: 'LocalAncestryObject') -> None:
166        """
167        Update `laiobj`.
168        """
169        self.__laiobj = x
170
171    @property
172    def labels_file(self) -> Optional[str]:
173        """
174        Retrieve `labels_file`.
175        
176        Returns:
177            **str:** 
178                Path to the labels file in `.tsv` format.
179        """
180        return self.__labels_file
181
182    @labels_file.setter
183    def labels_file(self, x: str) -> None:
184        """
185        Update `labels_file`.
186        """
187        self.__labels_file = x
188
189    @property
190    def ancestry(self) -> Optional[str]:
191        """
192        Retrieve `ancestry`.
193        
194        Returns:
195            **str:** Ancestry for which dimensionality reduction is to be performed. Ancestry counter starts at `0`.
196        """
197        return self.__ancestry
198
199    @ancestry.setter
200    def ancestry(self, x: str) -> None:
201        """
202        Update `ancestry`.
203        """
204        self.__ancestry = x
205
206    @property
207    def is_masked(self) -> bool:
208        """
209        Retrieve `is_masked`.
210        
211        Returns:
212            **bool:** True if an ancestry file is passed for ancestry-specific masking, or False otherwise.
213        """
214        return self.__is_masked
215
216    @is_masked.setter
217    def is_masked(self, x: bool) -> None:
218        """
219        Update `is_masked`.
220        """
221        self.__is_masked = x
222
223    @property
224    def prob_thresh(self) -> float:
225        """
226        Retrieve `prob_thresh`.
227        
228        Returns:
229            **float:** Minimum probability threshold for a SNP to belong to an ancestry.
230        """
231        return self.__prob_thresh
232
233    @prob_thresh.setter
234    def prob_thresh(self, x: float) -> None:
235        """
236        Update `prob_thresh`.
237        """
238        self.__prob_thresh = x
239
240    @property
241    def average_strands(self) -> bool:
242        """
243        Retrieve `average_strands`.
244        
245        Returns:
246            **bool:** True if the haplotypes from the two parents are to be combined (averaged) for each individual, or False otherwise.
247        """
248        return self.__average_strands
249
250    @average_strands.setter
251    def average_strands(self, x: bool) -> None:
252        """
253        Update `average_strands`.
254        """
255        self.__average_strands = x
256
257    @property
258    def is_weighted(self) -> bool:
259        """
260        Retrieve `is_weighted`.
261        
262        Returns:
263            **bool:** True if weights are provided in the labels file, or False otherwise.
264        """
265        return self.__is_weighted
266
267    @is_weighted.setter
268    def is_weighted(self, x: bool) -> None:
269        """
270        Update `is_weighted`.
271        """
272        self.__is_weighted = x
273
274    @property
275    def groups_to_remove(self) -> Dict[int, List[str]]:
276        """
277        Retrieve `groups_to_remove`.
278        
279        Returns:
280            **dict of int to list of str:** Dictionary specifying groups to exclude from analysis. Keys are array numbers, and values are 
281                lists of groups to remove for each array. Example: `{1: ['group1', 'group2'], 2: [], 3: ['group3']}`.
282        """
283        return self.__groups_to_remove
284
285    @groups_to_remove.setter
286    def groups_to_remove(self, x: Dict[int, List[str]]) -> None:
287        """
288        Update `groups_to_remove`.
289        """
290        self.__groups_to_remove = x
291
292    @property
293    def min_percent_snps(self) -> float:
294        """
295        Retrieve `min_percent_snps`.
296        
297        Returns:
298            **float:** 
299                Minimum percentage of SNPs to be known in an individual for an individual to be included in the analysis. 
300                All individuals with fewer percent of unmasked SNPs than this threshold will be excluded.
301        """
302        return self.__min_percent_snps
303
304    @min_percent_snps.setter
305    def min_percent_snps(self, x: float) -> None:
306        """
307        Update `min_percent_snps`.
308        """
309        self.__min_percent_snps = x
310
311    @property
312    def save_masks(self) -> bool:
313        """
314        Retrieve `save_masks`.
315        
316        Returns:
317            **bool:** True if the masked matrices are to be saved in a `.npz` file, or False otherwise.
318        """
319        return self.__save_masks
320
321    @save_masks.setter
322    def save_masks(self, x: bool) -> None:
323        """
324        Update `save_masks`.
325        """
326        self.__save_masks = x
327
328    @property
329    def load_masks(self) -> bool:
330        """
331        Retrieve `load_masks`.
332        
333        Returns:
334            **bool:** 
335                True if the masked matrices are to be loaded from a pre-existing `.npz` file specified 
336                by `masks_file`, or False otherwise.
337        """
338        return self.__load_masks
339
340    @load_masks.setter
341    def load_masks(self, x: bool) -> None:
342        """
343        Update `load_masks`.
344        """
345        self.__load_masks = x
346
347    @property
348    def masks_file(self) -> Union[str, pathlib.Path]:
349        """
350        Retrieve `masks_file`.
351        
352        Returns:
353            **str or pathlib.Path:** Path to the `.npz` file used for saving/loading masked matrices.
354        """
355        return self.__masks_file
356
357    @masks_file.setter
358    def masks_file(self, x: Union[str, pathlib.Path]) -> None:
359        """
360        Update `masks_file`.
361        """
362        self.__masks_file = x
363
364    @property
365    def distance_type(self) -> str:
366        """
367        Retrieve `distance_type`.
368        
369        Returns:
370            **str:** 
371                Distance metric to use. Options to choose from are: 'Manhattan', 'RMS' (Root Mean Square), 'AP' (Average Pairwise).
372                If `average_strands=True`, use 'distance_type=AP'.
373        """
374        return self.__distance_type
375
376    @distance_type.setter
377    def distance_type(self, x: str) -> None:
378        """
379        Update `distance_type`.
380        """
381        self.__distance_type = x
382
383    @property
384    def n_components(self) -> int:
385        """
386        Retrieve `n_components`.
387        
388        Returns:
389            **int:** The number of principal components.
390        """
391        return self.__n_components
392
393    @n_components.setter
394    def n_components(self, x: int) -> None:
395        """
396        Update `n_components`.
397        """
398        self.__n_components = x
399
400    @property
401    def rsid_or_chrompos(self) -> int:
402        """
403        Retrieve `rsid_or_chrompos`.
404        
405        Returns:
406            **int:** Format indicator for SNP IDs in the SNP data. Use 1 for `rsID` format or 2 for `chromosome_position`.
407        """
408        return self.__rsid_or_chrompos
409
410    @rsid_or_chrompos.setter
411    def rsid_or_chrompos(self, x: int) -> None:
412        """
413        Update `rsid_or_chrompos`.
414        """
415        self.__rsid_or_chrompos = x
416
417    @property
418    def X_new_(self) -> Optional[np.ndarray]:
419        """
420        Retrieve `X_new_`.
421
422        Returns:
423            **array of shape (n_haplotypes_, n_components):** 
424                The transformed SNP data projected onto the `n_components` principal components.
425                n_haplotypes_ is the number of haplotypes, potentially reduced if filtering is applied 
426                (`min_percent_snps > 0`). For diploid individuals without filtering, the shape is 
427                `(n_samples * 2, n_components)`.
428        """
429        return self.__X_new_
430
431    @X_new_.setter
432    def X_new_(self, x: np.ndarray) -> None:
433        """
434        Update `X_new_`.
435        """
436        self.__X_new_ = x
437
438    @property
439    def haplotypes_(self) -> Optional[List[str]]:
440        """
441        Retrieve `haplotypes_`.
442
443        Returns:
444            list of str:
445                A list of unique haplotype identifiers.
446        """
447        if isinstance(self.__haplotypes_, np.ndarray):
448            return self.__haplotypes_.ravel().tolist()  # Flatten and convert NumPy array to a list
449        elif isinstance(self.__haplotypes_, list):
450            if len(self.__haplotypes_) == 1 and isinstance(self.__haplotypes_[0], np.ndarray):
451                return self.__haplotypes_[0].ravel().tolist()  # Handle list containing a single array
452            return self.__haplotypes_  # Already a flat list
453        elif self.__haplotypes_ is None:
454            return None  # If no haplotypes are set
455        else:
456            raise TypeError("`haplotypes_` must be a list or a NumPy array.")
457
458    @haplotypes_.setter
459    def haplotypes_(self, x: Union[np.ndarray, List[str]]) -> None:
460        """
461        Update `haplotypes_`.
462        """
463        if isinstance(x, np.ndarray):
464            self.__haplotypes_ = x.ravel().tolist()  # Flatten and convert to a list
465        elif isinstance(x, list):
466            if len(x) == 1 and isinstance(x[0], np.ndarray):  # Handle list containing a single array
467                self.__haplotypes_ = x[0].ravel().tolist()
468            else:
469                self.__haplotypes_ = x  # Use directly if already a list
470        else:
471            raise TypeError("`x` must be a list or a NumPy array.")
472
473    @property
474    def samples_(self) -> Optional[List[str]]:
475        """
476        Retrieve `samples_`.
477
478        Returns:
479            list of str:
480                A list of sample identifiers based on `haplotypes_` and `average_strands`.
481        """
482        haplotypes = self.haplotypes_
483        if haplotypes is None:
484            return None
485        if self.__average_strands:
486            return haplotypes
487        else:
488            return [x[:-2] for x in haplotypes]
489
490    @property
491    def n_haplotypes(self) -> Optional[int]:
492        """
493        Retrieve `n_haplotypes`.
494
495        Returns:
496            **int:**
497                The total number of haplotypes, potentially reduced if filtering is applied 
498                (`min_percent_snps > 0`).
499        """
500        return len(self.__haplotypes_)
501
502    @property
503    def n_samples(self) -> Optional[int]:
504        """
505        Retrieve `n_samples`.
506
507        Returns:
508            **int:**
509                The total number of samples, potentially reduced if filtering is applied 
510                (`min_percent_snps > 0`).
511        """
512        return len(np.unique(self.samples_))
513
514    @staticmethod
515    def _load_masks_file(masks_file):
516        mask_files = np.load(masks_file, allow_pickle=True)
517        masks = mask_files['masks']
518        rs_ID_list = mask_files['rs_ID_list']
519        ind_ID_list = mask_files['ind_ID_list']
520        groups = mask_files['labels']
521        weights = mask_files['weights']
522        return masks, rs_ID_list, ind_ID_list, groups, weights
523
524    def fit_transform(
525            self,
526            snpobj: Optional['SNPObject'] = None, 
527            laiobj: Optional['LocalAncestryObject'] = None,
528            labels_file: Optional[str] = None,
529            ancestry: Optional[str] = None,
530            average_strands: Optional[bool] = None
531        ) -> np.ndarray:
532        """
533        Fit the model to the SNP data stored in the provided `snpobj` and apply the dimensionality reduction on the same SNP data.
534
535        Args:
536            snpobj (SNPObject, optional): 
537                A SNPObject instance.
538            laiobj (LAIObject, optional): 
539                A LAIObject instance.
540            labels_file (str, optional): 
541                Path to the labels file in .tsv format. The first column, `indID`, contains the individual identifiers, and the second 
542                column, `label`, specifies the groups for all individuals. If `is_weighted=True`, a `weight` column with individual 
543                weights is required. Optionally, `combination` and `combination_weight` columns can specify sets of individuals to be 
544                combined into groups, with respective weights.
545            ancestry (str, optional): 
546                Ancestry for which dimensionality reduction is to be performed. Ancestry counter starts at 0.
547            average_strands (bool, optional): 
548                True if the haplotypes from the two parents are to be combined (averaged) for each individual, or False otherwise.
549                If None, defaults to `self.average_strands`.
550
551        Returns:
552            **array of shape (n_samples, n_components):** 
553                The transformed SNP data projected onto the `n_components` principal components, stored in `self.X_new_`.
554        """
555        if snpobj is None:
556            snpobj = self.snpobj
557        if laiobj is None:
558            laiobj = self.laiobj
559        if labels_file is None:
560            labels_file = self.labels_file
561        if ancestry is None:
562            ancestry = self.ancestry
563        if average_strands is None:
564            average_strands = self.average_strands
565        
566        if not self.is_masked:
567            self.ancestry = '1'
568        if self.load_masks:
569            masks, rs_ID_list, ind_ID_list, groups, weights = self._load_masks_file(self.masks_file)
570        else:
571            masks, rs_ID_list, ind_ID_list = array_process(
572                self.snpobj,
573                self.laiobj,
574                self.average_strands,
575                self.prob_thresh, 
576                self.is_masked, 
577                self.rsid_or_chrompos
578            )
579
580            masks, ind_ID_list, groups, weights = process_labels_weights(
581                self.labels_file, 
582                masks, 
583                rs_ID_list,
584                ind_ID_list, 
585                self.average_strands, 
586                self.ancestry, 
587                self.min_percent_snps, 
588                self.groups_to_remove,
589                self.is_weighted, 
590                self.save_masks, 
591                self.masks_file
592            )
593        
594        distance_list = [[distance_mat(first=masks[0][self.ancestry], dist_func=self.distance_type)]]
595        
596        self.X_new_ = mds_transform(distance_list, groups, weights, ind_ID_list, self.n_components)
597        self.haplotypes_ = ind_ID_list

A class for multiple array ancestry-specific multidimensional scaling (maasMDS).

This class supports both separate and averaged strand processing for SNP data. If the snpobj, laiobj, labels_file, and ancestry parameters are all provided during instantiation, the fit_transform method will be automatically called, applying the specified maasMDS method to transform the data upon instantiation.

maasMDS( snpobj, laiobj, labels_file, ancestry, is_masked: bool = True, prob_thresh: float = 0, average_strands: bool = False, is_weighted: bool = False, groups_to_remove: Dict[int, List[str]] = {}, min_percent_snps: float = 4, save_masks: bool = False, load_masks: bool = False, masks_file: Union[str, pathlib._local.Path] = 'masks.npz', distance_type: str = 'AP', n_components: int = 2, rsid_or_chrompos: int = 2)
 22    def __init__(
 23            self, 
 24            snpobj, 
 25            laiobj,
 26            labels_file,
 27            ancestry,
 28            is_masked: bool = True,
 29            prob_thresh: float = 0,
 30            average_strands: bool = False,
 31            is_weighted: bool = False,
 32            groups_to_remove: Dict[int, List[str]] = {},
 33            min_percent_snps: float = 4,
 34            save_masks: bool = False,
 35            load_masks: bool = False,
 36            masks_file: Union[str, pathlib.Path] = 'masks.npz',
 37            distance_type: str = 'AP',
 38            n_components: int = 2,
 39            rsid_or_chrompos: int = 2
 40        ):
 41        """
 42        Args:
 43            snpobj (SNPObject, optional): 
 44                A SNPObject instance.
 45            laiobj (LAIObject, optional): 
 46                A LAIObject instance.
 47            labels_file (str, optional): 
 48                Path to the labels file in .tsv format. The first column, `indID`, contains the individual identifiers, and the second 
 49                column, `label`, specifies the groups for all individuals. If `is_weighted=True`, a `weight` column with individual 
 50                weights is required. Optionally, `combination` and `combination_weight` columns can specify sets of individuals to be 
 51                combined into groups, with respective weights.
 52            ancestry (str, optional): 
 53                Ancestry for which dimensionality reduction is to be performed. Ancestry counter starts at `0`.
 54            is_masked (bool, default=True): 
 55                True if an ancestry file is passed for ancestry-specific masking, or False otherwise.
 56            prob_thresh (float, default=0.0): 
 57                Minimum probability threshold for a SNP to belong to an ancestry.
 58            average_strands (bool, default=False): 
 59                True if the haplotypes from the two parents are to be combined (averaged) for each individual, or False otherwise.
 60            is_weighted (bool, default=False): 
 61                True if weights are provided in the labels file, or False otherwise.
 62            groups_to_remove (dict of int to list of str, default={}): 
 63                Dictionary specifying groups to exclude from analysis. Keys are array numbers, and values are 
 64                lists of groups to remove for each array.
 65                Example: `{1: ['group1', 'group2'], 2: [], 3: ['group3']}`.
 66            min_percent_snps (float, default=4.0): 
 67                Minimum percentage of SNPs to be known in an individual for an individual to be included in the analysis. 
 68                All individuals with fewer percent of unmasked SNPs than this threshold will be excluded.
 69            save_masks (bool, default=False): 
 70                True if the masked matrices are to be saved in a `.npz` file, or False otherwise.
 71            load_masks (bool, default=False): 
 72                True if the masked matrices are to be loaded from a pre-existing `.npz` file specified by `masks_file`, or False otherwise.
 73            masks_file (str or pathlib.Path, default='masks.npz'): 
 74                Path to the `.npz` file used for saving/loading masked matrices.
 75            distance_type (str, default='AP'): 
 76                Distance metric to use. Options to choose from are: 'Manhattan', 'RMS' (Root Mean Square), 'AP' (Average Pairwise).
 77                If `average_strands=True`, use 'distance_type=AP'.
 78            n_components (int, default=2): 
 79                The number of principal components.
 80            rsid_or_chrompos (int, default=2): 
 81                Format indicator for SNP IDs in the SNP data. Use 1 for `rsID` format or 2 for `chromosome_position`.
 82        """
 83        self.__snpobj = snpobj
 84        self.__laiobj = laiobj
 85        self.__labels_file = labels_file
 86        self.__ancestry = ancestry
 87        self.__is_masked = is_masked
 88        self.__prob_thresh = prob_thresh
 89        self.__average_strands = average_strands
 90        self.__groups_to_remove = groups_to_remove
 91        self.__min_percent_snps = min_percent_snps
 92        self.__is_weighted = is_weighted
 93        self.__save_masks = save_masks
 94        self.__load_masks = load_masks
 95        self.__masks_file = masks_file
 96        self.__distance_type = distance_type
 97        self.__n_components = n_components
 98        self.__rsid_or_chrompos = rsid_or_chrompos
 99        self.__X_new_ = None  # Store transformed SNP data
100        self.__haplotypes_ = None  # Store haplotypes after filtering if min_percent_snps > 0
101        self.__samples_ = None  # Store samples after filtering if min_percent_snps > 0
102
103        # Fit and transform if a `snpobj`, `laiobj`, `labels_file`, and `ancestry` are provided
104        if self.snpobj is not None and self.laiobj is not None and self.labels_file is not None and self.ancestry is not None:
105            self.fit_transform(snpobj, laiobj, labels_file, ancestry)
Arguments:
  • snpobj (SNPObject, optional): A SNPObject instance.
  • laiobj (LAIObject, optional): A LAIObject instance.
  • labels_file (str, optional): Path to the labels file in .tsv format. The first column, indID, contains the individual identifiers, and the second column, label, specifies the groups for all individuals. If is_weighted=True, a weight column with individual weights is required. Optionally, combination and combination_weight columns can specify sets of individuals to be combined into groups, with respective weights.
  • ancestry (str, optional): Ancestry for which dimensionality reduction is to be performed. Ancestry counter starts at 0.
  • is_masked (bool, default=True): True if an ancestry file is passed for ancestry-specific masking, or False otherwise.
  • prob_thresh (float, default=0.0): Minimum probability threshold for a SNP to belong to an ancestry.
  • average_strands (bool, default=False): True if the haplotypes from the two parents are to be combined (averaged) for each individual, or False otherwise.
  • is_weighted (bool, default=False): True if weights are provided in the labels file, or False otherwise.
  • groups_to_remove (dict of int to list of str, default={}): Dictionary specifying groups to exclude from analysis. Keys are array numbers, and values are lists of groups to remove for each array. Example: {1: ['group1', 'group2'], 2: [], 3: ['group3']}.
  • min_percent_snps (float, default=4.0): Minimum percentage of SNPs to be known in an individual for an individual to be included in the analysis. All individuals with fewer percent of unmasked SNPs than this threshold will be excluded.
  • save_masks (bool, default=False): True if the masked matrices are to be saved in a .npz file, or False otherwise.
  • load_masks (bool, default=False): True if the masked matrices are to be loaded from a pre-existing .npz file specified by masks_file, or False otherwise.
  • masks_file (str or pathlib.Path, default='masks.npz'): Path to the .npz file used for saving/loading masked matrices.
  • distance_type (str, default='AP'): Distance metric to use. Options to choose from are: 'Manhattan', 'RMS' (Root Mean Square), 'AP' (Average Pairwise). If average_strands=True, use 'distance_type=AP'.
  • n_components (int, default=2): The number of principal components.
  • rsid_or_chrompos (int, default=2): Format indicator for SNP IDs in the SNP data. Use 1 for rsID format or 2 for chromosome_position.
def copy(self) -> maasMDS:
127    def copy(self) -> 'maasMDS':
128        """
129        Create and return a copy of `self`.
130
131        Returns:
132            **maasMDS:** 
133                A new instance of the current object.
134        """
135        return copy.copy(self)

Create and return a copy of self.

Returns:

maasMDS: A new instance of the current object.

snpobj: Optional[snputils.snp.genobj.SNPObject]
137    @property
138    def snpobj(self) -> Optional['SNPObject']:
139        """
140        Retrieve `snpobj`.
141        
142        Returns:
143            **SNPObject:** A SNPObject instance.
144        """
145        return self.__snpobj

Retrieve snpobj.

Returns:

SNPObject: A SNPObject instance.

laiobj: Optional[snputils.ancestry.genobj.LocalAncestryObject]
154    @property
155    def laiobj(self) -> Optional['LocalAncestryObject']:
156        """
157        Retrieve `laiobj`.
158        
159        Returns:
160            **LocalAncestryObject:** A LAIObject instance.
161        """
162        return self.__laiobj

Retrieve laiobj.

Returns:

LocalAncestryObject: A LAIObject instance.

labels_file: Optional[str]
171    @property
172    def labels_file(self) -> Optional[str]:
173        """
174        Retrieve `labels_file`.
175        
176        Returns:
177            **str:** 
178                Path to the labels file in `.tsv` format.
179        """
180        return self.__labels_file

Retrieve labels_file.

Returns:

str: Path to the labels file in .tsv format.

ancestry: Optional[str]
189    @property
190    def ancestry(self) -> Optional[str]:
191        """
192        Retrieve `ancestry`.
193        
194        Returns:
195            **str:** Ancestry for which dimensionality reduction is to be performed. Ancestry counter starts at `0`.
196        """
197        return self.__ancestry

Retrieve ancestry.

Returns:

str: Ancestry for which dimensionality reduction is to be performed. Ancestry counter starts at 0.

is_masked: bool
206    @property
207    def is_masked(self) -> bool:
208        """
209        Retrieve `is_masked`.
210        
211        Returns:
212            **bool:** True if an ancestry file is passed for ancestry-specific masking, or False otherwise.
213        """
214        return self.__is_masked

Retrieve is_masked.

Returns:

bool: True if an ancestry file is passed for ancestry-specific masking, or False otherwise.

prob_thresh: float
223    @property
224    def prob_thresh(self) -> float:
225        """
226        Retrieve `prob_thresh`.
227        
228        Returns:
229            **float:** Minimum probability threshold for a SNP to belong to an ancestry.
230        """
231        return self.__prob_thresh

Retrieve prob_thresh.

Returns:

float: Minimum probability threshold for a SNP to belong to an ancestry.

average_strands: bool
240    @property
241    def average_strands(self) -> bool:
242        """
243        Retrieve `average_strands`.
244        
245        Returns:
246            **bool:** True if the haplotypes from the two parents are to be combined (averaged) for each individual, or False otherwise.
247        """
248        return self.__average_strands

Retrieve average_strands.

Returns:

bool: True if the haplotypes from the two parents are to be combined (averaged) for each individual, or False otherwise.

is_weighted: bool
257    @property
258    def is_weighted(self) -> bool:
259        """
260        Retrieve `is_weighted`.
261        
262        Returns:
263            **bool:** True if weights are provided in the labels file, or False otherwise.
264        """
265        return self.__is_weighted

Retrieve is_weighted.

Returns:

bool: True if weights are provided in the labels file, or False otherwise.

groups_to_remove: Dict[int, List[str]]
274    @property
275    def groups_to_remove(self) -> Dict[int, List[str]]:
276        """
277        Retrieve `groups_to_remove`.
278        
279        Returns:
280            **dict of int to list of str:** Dictionary specifying groups to exclude from analysis. Keys are array numbers, and values are 
281                lists of groups to remove for each array. Example: `{1: ['group1', 'group2'], 2: [], 3: ['group3']}`.
282        """
283        return self.__groups_to_remove

Retrieve groups_to_remove.

Returns:

dict of int to list of str: Dictionary specifying groups to exclude from analysis. Keys are array numbers, and values are lists of groups to remove for each array. Example: {1: ['group1', 'group2'], 2: [], 3: ['group3']}.

min_percent_snps: float
292    @property
293    def min_percent_snps(self) -> float:
294        """
295        Retrieve `min_percent_snps`.
296        
297        Returns:
298            **float:** 
299                Minimum percentage of SNPs to be known in an individual for an individual to be included in the analysis. 
300                All individuals with fewer percent of unmasked SNPs than this threshold will be excluded.
301        """
302        return self.__min_percent_snps

Retrieve min_percent_snps.

Returns:

float: Minimum percentage of SNPs to be known in an individual for an individual to be included in the analysis. All individuals with fewer percent of unmasked SNPs than this threshold will be excluded.

save_masks: bool
311    @property
312    def save_masks(self) -> bool:
313        """
314        Retrieve `save_masks`.
315        
316        Returns:
317            **bool:** True if the masked matrices are to be saved in a `.npz` file, or False otherwise.
318        """
319        return self.__save_masks

Retrieve save_masks.

Returns:

bool: True if the masked matrices are to be saved in a .npz file, or False otherwise.

load_masks: bool
328    @property
329    def load_masks(self) -> bool:
330        """
331        Retrieve `load_masks`.
332        
333        Returns:
334            **bool:** 
335                True if the masked matrices are to be loaded from a pre-existing `.npz` file specified 
336                by `masks_file`, or False otherwise.
337        """
338        return self.__load_masks

Retrieve load_masks.

Returns:

bool: True if the masked matrices are to be loaded from a pre-existing .npz file specified by masks_file, or False otherwise.

masks_file: Union[str, pathlib._local.Path]
347    @property
348    def masks_file(self) -> Union[str, pathlib.Path]:
349        """
350        Retrieve `masks_file`.
351        
352        Returns:
353            **str or pathlib.Path:** Path to the `.npz` file used for saving/loading masked matrices.
354        """
355        return self.__masks_file

Retrieve masks_file.

Returns:

str or pathlib.Path: Path to the .npz file used for saving/loading masked matrices.

distance_type: str
364    @property
365    def distance_type(self) -> str:
366        """
367        Retrieve `distance_type`.
368        
369        Returns:
370            **str:** 
371                Distance metric to use. Options to choose from are: 'Manhattan', 'RMS' (Root Mean Square), 'AP' (Average Pairwise).
372                If `average_strands=True`, use 'distance_type=AP'.
373        """
374        return self.__distance_type

Retrieve distance_type.

Returns:

str: Distance metric to use. Options to choose from are: 'Manhattan', 'RMS' (Root Mean Square), 'AP' (Average Pairwise). If average_strands=True, use 'distance_type=AP'.

n_components: int
383    @property
384    def n_components(self) -> int:
385        """
386        Retrieve `n_components`.
387        
388        Returns:
389            **int:** The number of principal components.
390        """
391        return self.__n_components

Retrieve n_components.

Returns:

int: The number of principal components.

rsid_or_chrompos: int
400    @property
401    def rsid_or_chrompos(self) -> int:
402        """
403        Retrieve `rsid_or_chrompos`.
404        
405        Returns:
406            **int:** Format indicator for SNP IDs in the SNP data. Use 1 for `rsID` format or 2 for `chromosome_position`.
407        """
408        return self.__rsid_or_chrompos

Retrieve rsid_or_chrompos.

Returns:

int: Format indicator for SNP IDs in the SNP data. Use 1 for rsID format or 2 for chromosome_position.

X_new_: Optional[numpy.ndarray]
417    @property
418    def X_new_(self) -> Optional[np.ndarray]:
419        """
420        Retrieve `X_new_`.
421
422        Returns:
423            **array of shape (n_haplotypes_, n_components):** 
424                The transformed SNP data projected onto the `n_components` principal components.
425                n_haplotypes_ is the number of haplotypes, potentially reduced if filtering is applied 
426                (`min_percent_snps > 0`). For diploid individuals without filtering, the shape is 
427                `(n_samples * 2, n_components)`.
428        """
429        return self.__X_new_

Retrieve X_new_.

Returns:

array of shape (n_haplotypes_, n_components): The transformed SNP data projected onto the n_components principal components. n_haplotypes_ is the number of haplotypes, potentially reduced if filtering is applied (min_percent_snps > 0). For diploid individuals without filtering, the shape is (n_samples * 2, n_components).

haplotypes_: Optional[List[str]]
438    @property
439    def haplotypes_(self) -> Optional[List[str]]:
440        """
441        Retrieve `haplotypes_`.
442
443        Returns:
444            list of str:
445                A list of unique haplotype identifiers.
446        """
447        if isinstance(self.__haplotypes_, np.ndarray):
448            return self.__haplotypes_.ravel().tolist()  # Flatten and convert NumPy array to a list
449        elif isinstance(self.__haplotypes_, list):
450            if len(self.__haplotypes_) == 1 and isinstance(self.__haplotypes_[0], np.ndarray):
451                return self.__haplotypes_[0].ravel().tolist()  # Handle list containing a single array
452            return self.__haplotypes_  # Already a flat list
453        elif self.__haplotypes_ is None:
454            return None  # If no haplotypes are set
455        else:
456            raise TypeError("`haplotypes_` must be a list or a NumPy array.")

Retrieve haplotypes_.

Returns:

list of str: A list of unique haplotype identifiers.

samples_: Optional[List[str]]
473    @property
474    def samples_(self) -> Optional[List[str]]:
475        """
476        Retrieve `samples_`.
477
478        Returns:
479            list of str:
480                A list of sample identifiers based on `haplotypes_` and `average_strands`.
481        """
482        haplotypes = self.haplotypes_
483        if haplotypes is None:
484            return None
485        if self.__average_strands:
486            return haplotypes
487        else:
488            return [x[:-2] for x in haplotypes]

Retrieve samples_.

Returns:

list of str: A list of sample identifiers based on haplotypes_ and average_strands.

n_haplotypes: Optional[int]
490    @property
491    def n_haplotypes(self) -> Optional[int]:
492        """
493        Retrieve `n_haplotypes`.
494
495        Returns:
496            **int:**
497                The total number of haplotypes, potentially reduced if filtering is applied 
498                (`min_percent_snps > 0`).
499        """
500        return len(self.__haplotypes_)

Retrieve n_haplotypes.

Returns:

int: The total number of haplotypes, potentially reduced if filtering is applied (min_percent_snps > 0).

n_samples: Optional[int]
502    @property
503    def n_samples(self) -> Optional[int]:
504        """
505        Retrieve `n_samples`.
506
507        Returns:
508            **int:**
509                The total number of samples, potentially reduced if filtering is applied 
510                (`min_percent_snps > 0`).
511        """
512        return len(np.unique(self.samples_))

Retrieve n_samples.

Returns:

int: The total number of samples, potentially reduced if filtering is applied (min_percent_snps > 0).

def fit_transform( self, snpobj: Optional[snputils.snp.genobj.SNPObject] = None, laiobj: Optional[snputils.ancestry.genobj.LocalAncestryObject] = None, labels_file: Optional[str] = None, ancestry: Optional[str] = None, average_strands: Optional[bool] = None) -> numpy.ndarray:
524    def fit_transform(
525            self,
526            snpobj: Optional['SNPObject'] = None, 
527            laiobj: Optional['LocalAncestryObject'] = None,
528            labels_file: Optional[str] = None,
529            ancestry: Optional[str] = None,
530            average_strands: Optional[bool] = None
531        ) -> np.ndarray:
532        """
533        Fit the model to the SNP data stored in the provided `snpobj` and apply the dimensionality reduction on the same SNP data.
534
535        Args:
536            snpobj (SNPObject, optional): 
537                A SNPObject instance.
538            laiobj (LAIObject, optional): 
539                A LAIObject instance.
540            labels_file (str, optional): 
541                Path to the labels file in .tsv format. The first column, `indID`, contains the individual identifiers, and the second 
542                column, `label`, specifies the groups for all individuals. If `is_weighted=True`, a `weight` column with individual 
543                weights is required. Optionally, `combination` and `combination_weight` columns can specify sets of individuals to be 
544                combined into groups, with respective weights.
545            ancestry (str, optional): 
546                Ancestry for which dimensionality reduction is to be performed. Ancestry counter starts at 0.
547            average_strands (bool, optional): 
548                True if the haplotypes from the two parents are to be combined (averaged) for each individual, or False otherwise.
549                If None, defaults to `self.average_strands`.
550
551        Returns:
552            **array of shape (n_samples, n_components):** 
553                The transformed SNP data projected onto the `n_components` principal components, stored in `self.X_new_`.
554        """
555        if snpobj is None:
556            snpobj = self.snpobj
557        if laiobj is None:
558            laiobj = self.laiobj
559        if labels_file is None:
560            labels_file = self.labels_file
561        if ancestry is None:
562            ancestry = self.ancestry
563        if average_strands is None:
564            average_strands = self.average_strands
565        
566        if not self.is_masked:
567            self.ancestry = '1'
568        if self.load_masks:
569            masks, rs_ID_list, ind_ID_list, groups, weights = self._load_masks_file(self.masks_file)
570        else:
571            masks, rs_ID_list, ind_ID_list = array_process(
572                self.snpobj,
573                self.laiobj,
574                self.average_strands,
575                self.prob_thresh, 
576                self.is_masked, 
577                self.rsid_or_chrompos
578            )
579
580            masks, ind_ID_list, groups, weights = process_labels_weights(
581                self.labels_file, 
582                masks, 
583                rs_ID_list,
584                ind_ID_list, 
585                self.average_strands, 
586                self.ancestry, 
587                self.min_percent_snps, 
588                self.groups_to_remove,
589                self.is_weighted, 
590                self.save_masks, 
591                self.masks_file
592            )
593        
594        distance_list = [[distance_mat(first=masks[0][self.ancestry], dist_func=self.distance_type)]]
595        
596        self.X_new_ = mds_transform(distance_list, groups, weights, ind_ID_list, self.n_components)
597        self.haplotypes_ = ind_ID_list

Fit the model to the SNP data stored in the provided snpobj and apply the dimensionality reduction on the same SNP data.

Arguments:
  • snpobj (SNPObject, optional): A SNPObject instance.
  • laiobj (LAIObject, optional): A LAIObject instance.
  • labels_file (str, optional): Path to the labels file in .tsv format. The first column, indID, contains the individual identifiers, and the second column, label, specifies the groups for all individuals. If is_weighted=True, a weight column with individual weights is required. Optionally, combination and combination_weight columns can specify sets of individuals to be combined into groups, with respective weights.
  • ancestry (str, optional): Ancestry for which dimensionality reduction is to be performed. Ancestry counter starts at 0.
  • average_strands (bool, optional): True if the haplotypes from the two parents are to be combined (averaged) for each individual, or False otherwise. If None, defaults to self.average_strands.
Returns:

array of shape (n_samples, n_components): The transformed SNP data projected onto the n_components MDS dimensions, stored in self.X_new_.