snputils.ancestry.genobj

1from .wide import GlobalAncestryObject
2from .local import LocalAncestryObject
3
4__all__ = ['GlobalAncestryObject', 'LocalAncestryObject']
class GlobalAncestryObject(snputils.ancestry.genobj.base.AncestryObject):
 10class GlobalAncestryObject(AncestryObject):
 11    """
 12    A class for Global Ancestry Inference (GAI) data.
 13    """
 14    def __init__(
 15        self,
 16        Q: np.ndarray,
 17        P: Optional[np.ndarray] = None,
 18        samples: Optional[Sequence] = None,
 19        snps: Optional[Sequence] = None,
 20        ancestries: Optional[Sequence] = None
 21    ) -> None:
 22        """
 23        Args:
 24            Q (array of shape (n_samples, n_ancestries)):
 25                A 2D array containing per-sample ancestry proportions. Each row corresponds to a sample,
 26                and each column corresponds to an ancestry.
 27            P (array of shape (n_snps, n_ancestries)):
 28                A 2D array containing per-ancestry SNP frequencies. Each row corresponds to a SNP,
 29                and each column corresponds to an ancestry.
 30            samples (sequence of length n_samples, optional):
 31                A sequence containing unique identifiers for each sample. If None, sample identifiers 
 32                are assigned as integers from `0` to `n_samples - 1`.
 33            snps (sequence of length n_snps, optional):
 34                A sequence containing identifiers for each SNP. If None, SNPs are assigned as integers 
 35                from `0` to `n_snps - 1`.
 36            ancestries (sequence of length n_samples, optional):
 37                A sequence containing ancestry labels for each sample.
 38        """
 39        # Determine dimensions
 40        n_samples, n_ancestries_Q = Q.shape
 41        if P is not None:
 42            n_snps, n_ancestries_P = P.shape
 43            if n_ancestries_Q != n_ancestries_P:
 44                raise ValueError(
 45                    f"The number of ancestries in Q ({n_ancestries_Q}) and P ({n_ancestries_P}) must be the same."
 46                )
 47
 48        n_ancestries = n_ancestries_Q
 49
 50        # Assign default sample identifiers if none provided
 51        if samples is None:
 52            samples = list(range(n_samples))
 53        else:
 54            samples = list(samples)
 55            if len(samples) != n_samples:
 56                raise ValueError(
 57                    f"Length of samples ({len(samples)}) does not match number of samples ({n_samples})."
 58                )
 59
 60        # Assign default SNP identifiers if none provided
 61        if P is None:
 62            snps = None
 63        else:
 64            if snps is None:
 65                snps = list(range(n_snps))
 66            else:
 67                snps = list(snps)
 68                if len(snps) != n_snps:
 69                    raise ValueError(
 70                        f"Length of snps ({len(snps)}) does not match number of SNPs ({n_snps})."
 71                    )
 72
 73        if ancestries is not None:
 74            if len(ancestries) != n_samples:
 75                raise ValueError(
 76                    f"Length of ancestries ({len(ancestries)}) does not match number of samples ({n_samples})."
 77                )
 78
 79        super().__init__(n_samples, n_ancestries)
 80
 81        # Store attributes
 82        self.__Q = Q
 83        self.__P = P
 84        self.__samples = np.asarray(samples)
 85        self.__snps = np.asarray(snps) if snps is not None else None
 86        self.__ancestries = np.asarray(ancestries) if ancestries is not None else None
 87
 88        # Perform sanity checks
 89        self._sanity_check()
 90
 91    def __getitem__(self, key):
 92        """
 93        To access an attribute of the class using the square bracket notation,
 94        similar to a dictionary.
 95        """
 96        try:
 97            return getattr(self, key)
 98        except AttributeError:
 99            raise KeyError(f'Invalid key: {key}')
100
101    def __setitem__(self, key, value):
102        """
103        To set an attribute of the class using the square bracket notation,
104        similar to a dictionary.
105        """
106        try:
107            setattr(self, key, value)
108        except AttributeError:
109            raise KeyError(f'Invalid key: {key}')
110
111    @property
112    def Q(self) -> np.ndarray:
113        """
114        Retrieve `Q`.
115
116        Returns:
117            **array of shape (n_samples, n_ancestries):** 
118                A 2D array containing per-sample ancestry proportions. Each row corresponds to a sample,
119                and each column corresponds to an ancestry.
120        """
121        return self.__Q
122    
123    @Q.setter
124    def Q(self, x: np.ndarray):
125        """
126        Update `Q`.
127        """
128        if x.shape != (self.n_samples, self.n_ancestries):
129            raise ValueError(
130                f"Q must have shape ({self.n_samples}, {self.n_ancestries}); got {x.shape}."
131            )
132        self.__Q = x
133    
134    @property
135    def P(self) -> np.ndarray:
136        """
137        Retrieve `P`.
138
139        Returns:
140            **array of shape (n_snps, n_ancestries):** 
141                A 2D array containing per-ancestry SNP frequencies. Each row corresponds to a SNP,
142                and each column corresponds to an ancestry.
143        """
144        return self.__P
145
146    @P.setter
147    def P(self, x: np.ndarray):
148        """
149        Update `P`.
150        """
151        if x.shape[1] != self.n_ancestries:
152            raise ValueError(
153                f"P must have {self.n_ancestries} columns (one per ancestry); got shape {x.shape}."
154            )
155        self.__P = x
156        self._sanity_check()
157    
158    @property
159    def F(self) -> np.ndarray:
160        """
161        Alias for `P`.
162
163        Returns:
164            **array of shape (n_snps, n_ancestries):** 
165                A 2D array containing per-ancestry SNP frequencies. Each row corresponds to a SNP,
166                and each column corresponds to an ancestry.
167        """
168        return self.P
169
170    @F.setter
171    def F(self, x: np.ndarray):
172        """
173        Update `F`.
174        """
175        if x.shape[1] != self.n_ancestries:
176            raise ValueError(
177                f"F must have {self.n_ancestries} columns (one per ancestry); got shape {x.shape}."
178            )
179        self.__P = x
180    
181    @property
182    def samples(self) -> Optional[np.ndarray]:
183        """
184        Retrieve `samples`.
185
186        Returns:
187            **array of shape (n_samples,):** 
188                An array containing unique identifiers for each sample. If None, sample 
189                identifiers are assigned as integers from `0` to `n_samples - 1`.
190        """
191        return self.__samples
192        
193    @samples.setter
194    def samples(self, x: Sequence):
195        """
196        Update `samples`.
197        """
198        x = list(x)
199        if len(x) != self.n_samples:
200            raise ValueError(
201                f"samples must have length {self.n_samples}; got length {len(x)}."
202            )
203        self.__samples = x
204
205    @property
206    def snps(self) -> Optional[np.ndarray]:
207        """
208        Retrieve `snps`.
209
210        Returns:
211            **array of shape (n_snps,):** 
212                An array containing identifiers for each SNP. If None, SNPs are assigned as integers 
213                from `0` to `n_snps - 1`.
214        """
215        return self.__snps
216
217    @snps.setter
218    def snps(self, x: Sequence):
219        """
220        Update `snps`.
221        """
222        x = list(x)
223        if len(x) != self.n_snps:
224            raise ValueError(
225                f"snps must have length {self.n_snps}; got length {len(x)}."
226            )
227        self.__snps = np.asarray(x)
228
229    @property
230    def ancestries(self) -> Optional[np.ndarray]:
231        """
232        Retrieve `ancestries`.
233
234        Returns:
235            **array of shape (n_samples,):** 
236                An array containing ancestry labels for each sample.
237        """
238        return self.__ancestries
239    
240    @ancestries.setter
241    def ancestries(self, x: Sequence):
242        """
243        Update `ancestries`.
244        """
245        x = list(x)
246        num_x = len(x)
247        num_unique_x = len(np.unique(x))
248
249        if num_x != self.n_samples:
250            raise ValueError(
251                f"ancestries must have length {self.n_samples}; got length {num_x}."
252            )
253        if num_unique_x > self.n_ancestries:
254            raise ValueError(
255                f"Number of unique ancestry labels must be less than or equal to {self.n_ancestries}; got {num_unique_x} unique labels."
256            )
257        self.__ancestries = np.asarray(x)
258    
259    @property
260    def n_samples(self) -> int:
261        """
262        Retrieve `n_samples`.
263
264        Returns:
265            **int:** The total number of samples.
266        """
267        return self.__Q.shape[0]
268
269    @property
270    def n_snps(self) -> int:
271        """
272        Retrieve `n_snps`.
273
274        Returns:
275            **int:** The total number of SNPs.
276        """
277        return 0 if self.__P is None else self.__P.shape[0]
278
279    @property
280    def n_ancestries(self) -> int:
281        """
282        Retrieve `n_ancestries`.
283
284        Returns:
285            **int:** The total number of unique ancestries.
286        """
287        return self.__Q.shape[1]
288
289    def copy(self) -> 'GlobalAncestryObject':
290        """
291        Create and return a copy of `self`.
292
293        Returns:
294            **GlobalAncestryObject:** A new instance of the current object.
295        """
296        return copy.copy(self)
297
298    def keys(self) -> List[str]:
299        """
300        Retrieve a list of public attribute names for `self`.
301
302        Returns:
303            **list of str:** 
304                A list of attribute names, with internal name-mangling removed, 
305                for easier reference to public attributes in the instance.
306        """
307        return [attr.replace('_GlobalAncestryObject__', '').replace('_AncestryObject__', '') for attr in vars(self)]
308
309    def _sanity_check(self) -> None:
310        """
311        Perform sanity checks to ensure that matrix dimensions are consistent with expected sizes.
312        
313        Raises:
314            **ValueError:** If any of the matrix dimensions do not match the expected sizes.
315        """       
316        # Check that the Q matrix has the correct shape
317        if self.__Q.shape != (self.n_samples, self.n_ancestries):
318            raise ValueError(
319                f"Q must have shape ({self.n_samples}, {self.n_ancestries}); got {self.__Q.shape}."
320            )
321
322        # Check that the P matrix has the correct shape (if provided)
323        if self.__P is not None:
324            if self.__P.shape != (self.n_snps, self.n_ancestries):
325                raise ValueError(
326                    f"P must have shape ({self.n_snps}, {self.n_ancestries}); got {self.__P.shape}."
327                )
328
329        # Check that samples length matches n_samples
330        if self.samples is not None:
331            if len(self.__samples) != self.n_samples:
332                raise ValueError(
333                    f"samples must have length {self.n_samples}; got length {len(self.__samples)}."
334                )
335
336        # Check that snps length matches n_snps
337        if self.snps is not None:
338            if len(self.__snps) != self.n_snps:
339                raise ValueError(
340                    f"snps must have length {self.n_snps}; got length {len(self.__snps)}."
341                )
342
343        # Check that ancestries length matches n_samples
344        if self.ancestries is not None:
345            if len(self.__ancestries) != self.n_samples:
346                raise ValueError(
347                    f"ancestries must have length {self.n_samples}; got length {len(self.__ancestries)}."
348                )
349
350            # Check number of unique ancestry labels
351            num_unique_ancestries = len(np.unique(self.__ancestries))
352            if num_unique_ancestries > self.n_ancestries:
353                raise ValueError(
354                    f"Number of unique ancestry labels must be less than or equal to {self.n_ancestries}; got {num_unique_ancestries} unique labels."
355                )
356
357    def save(self, file: Union[str, Path]) -> None:
358        """
359        Save the data stored in `self` to a specified file or set of files.
360
361        The format of the saved file(s) is determined by the file extension provided in the `file` 
362        argument. If the extension is `.pkl`, the object is serialized as a pickle file. Otherwise, 
363        the file is treated as a prefix for saving ADMIXTURE files.
364
365        **Supported formats:**
366
367        - `.pkl`: Pickle format for saving `self` in serialized form.
368        - Any other extension or no extension: Treated as a prefix for ADMIXTURE files.
369
370        Args:
371            file (str or pathlib.Path): 
372                Path to the file where the data will be saved. If the extension is `.pkl`, the object
373                is serialized. Otherwise, it is treated as a prefix for ADMIXTURE files.
374        """
375        path = Path(file)
376        suffix = path.suffix.lower()
377
378        if suffix == '.pkl':
379            self.save_pickle(path)
380        else:
381            self.save_admixture(path)
382
383    def save_admixture(self, file_prefix: Union[str, Path]) -> None:
384        """
385        Save the data stored in `self` into multiple ADMIXTURE files.
386        If the file already exists, it will be overwritten.
387
388        **Output files:**
389
390        - `<file_prefix>.K.Q`: Q matrix file. The file uses space (' ') as the delimiter.
391        - `<file_prefix>.K.P`: P matrix file. The file uses space (' ') as the delimiter.
392        - `<file_prefix>.sample_ids.txt`: Sample IDs file (if sample IDs are available).
393        - `<file_prefix>.snp_ids.txt`: SNP IDs file (if SNP IDs are available).
394        - `<file_prefix>.map`: Ancestry file (if ancestries information is available).
395        
396        Args:
397            file_prefix (str or pathlib.Path): 
398                The base prefix for output file names, including directory path but excluding file extensions. 
399                The prefix is used to generate specific file names for each output, with file-specific 
400                suffixes appended as described above (e.g., `file_prefix.n_ancestries.Q` for the Q matrix file).
401        """
402        from snputils.ancestry.io.wide.write.admixture import AdmixtureWriter
403
404        AdmixtureWriter(self, file_prefix).write()
405
406    def save_pickle(self, file: Union[str, Path]) -> None:
407        """
408        Save `self` in serialized form to a `.pkl` file.
409        If the file already exists, it will be overwritten.
410
411        Args:
412            file (str or pathlib.Path): 
413                Path to the file where the data will be saved. It should end with `.pkl`. 
414                If the provided path does not have this extension, it will be appended.
415        """
416        import pickle
417        with open(file, 'wb') as file:
418            pickle.dump(self, file)

A class for Global Ancestry Inference (GAI) data.

GlobalAncestryObject( Q: numpy.ndarray, P: numpy.ndarray | None = None, samples: Sequence | None = None, snps: Sequence | None = None, ancestries: Sequence | None = None)
14    def __init__(
15        self,
16        Q: np.ndarray,
17        P: Optional[np.ndarray] = None,
18        samples: Optional[Sequence] = None,
19        snps: Optional[Sequence] = None,
20        ancestries: Optional[Sequence] = None
21    ) -> None:
22        """
23        Args:
24            Q (array of shape (n_samples, n_ancestries)):
25                A 2D array containing per-sample ancestry proportions. Each row corresponds to a sample,
26                and each column corresponds to an ancestry.
27            P (array of shape (n_snps, n_ancestries)):
28                A 2D array containing per-ancestry SNP frequencies. Each row corresponds to a SNP,
29                and each column corresponds to an ancestry.
30            samples (sequence of length n_samples, optional):
31                A sequence containing unique identifiers for each sample. If None, sample identifiers 
32                are assigned as integers from `0` to `n_samples - 1`.
33            snps (sequence of length n_snps, optional):
34                A sequence containing identifiers for each SNP. If None, SNPs are assigned as integers 
35                from `0` to `n_snps - 1`.
36            ancestries (sequence of length n_samples, optional):
37                A sequence containing ancestry labels for each sample.
38        """
39        # Determine dimensions
40        n_samples, n_ancestries_Q = Q.shape
41        if P is not None:
42            n_snps, n_ancestries_P = P.shape
43            if n_ancestries_Q != n_ancestries_P:
44                raise ValueError(
45                    f"The number of ancestries in Q ({n_ancestries_Q}) and P ({n_ancestries_P}) must be the same."
46                )
47
48        n_ancestries = n_ancestries_Q
49
50        # Assign default sample identifiers if none provided
51        if samples is None:
52            samples = list(range(n_samples))
53        else:
54            samples = list(samples)
55            if len(samples) != n_samples:
56                raise ValueError(
57                    f"Length of samples ({len(samples)}) does not match number of samples ({n_samples})."
58                )
59
60        # Assign default SNP identifiers if none provided
61        if P is None:
62            snps = None
63        else:
64            if snps is None:
65                snps = list(range(n_snps))
66            else:
67                snps = list(snps)
68                if len(snps) != n_snps:
69                    raise ValueError(
70                        f"Length of snps ({len(snps)}) does not match number of SNPs ({n_snps})."
71                    )
72
73        if ancestries is not None:
74            if len(ancestries) != n_samples:
75                raise ValueError(
76                    f"Length of ancestries ({len(ancestries)}) does not match number of samples ({n_samples})."
77                )
78
79        super().__init__(n_samples, n_ancestries)
80
81        # Store attributes
82        self.__Q = Q
83        self.__P = P
84        self.__samples = np.asarray(samples)
85        self.__snps = np.asarray(snps) if snps is not None else None
86        self.__ancestries = np.asarray(ancestries) if ancestries is not None else None
87
88        # Perform sanity checks
89        self._sanity_check()
Arguments:
  • Q (array of shape (n_samples, n_ancestries)): A 2D array containing per-sample ancestry proportions. Each row corresponds to a sample, and each column corresponds to an ancestry.
  • P (array of shape (n_snps, n_ancestries)): A 2D array containing per-ancestry SNP frequencies. Each row corresponds to a SNP, and each column corresponds to an ancestry.
  • samples (sequence of length n_samples, optional): A sequence containing unique identifiers for each sample. If None, sample identifiers are assigned as integers from 0 to n_samples - 1.
  • snps (sequence of length n_snps, optional): A sequence containing identifiers for each SNP. If None, SNPs are assigned as integers from 0 to n_snps - 1.
  • ancestries (sequence of length n_samples, optional): A sequence containing ancestry labels for each sample.
Q: numpy.ndarray
111    @property
112    def Q(self) -> np.ndarray:
113        """
114        Retrieve `Q`.
115
116        Returns:
117            **array of shape (n_samples, n_ancestries):** 
118                A 2D array containing per-sample ancestry proportions. Each row corresponds to a sample,
119                and each column corresponds to an ancestry.
120        """
121        return self.__Q

Retrieve Q.

Returns:

array of shape (n_samples, n_ancestries): A 2D array containing per-sample ancestry proportions. Each row corresponds to a sample, and each column corresponds to an ancestry.

P: numpy.ndarray
134    @property
135    def P(self) -> np.ndarray:
136        """
137        Retrieve `P`.
138
139        Returns:
140            **array of shape (n_snps, n_ancestries):** 
141                A 2D array containing per-ancestry SNP frequencies. Each row corresponds to a SNP,
142                and each column corresponds to an ancestry.
143        """
144        return self.__P

Retrieve P.

Returns:

array of shape (n_snps, n_ancestries): A 2D array containing per-ancestry SNP frequencies. Each row corresponds to a SNP, and each column corresponds to an ancestry.

F: numpy.ndarray
158    @property
159    def F(self) -> np.ndarray:
160        """
161        Alias for `P`.
162
163        Returns:
164            **array of shape (n_snps, n_ancestries):** 
165                A 2D array containing per-ancestry SNP frequencies. Each row corresponds to a SNP,
166                and each column corresponds to an ancestry.
167        """
168        return self.P

Alias for P.

Returns:

array of shape (n_snps, n_ancestries): A 2D array containing per-ancestry SNP frequencies. Each row corresponds to a SNP, and each column corresponds to an ancestry.

samples: numpy.ndarray | None
181    @property
182    def samples(self) -> Optional[np.ndarray]:
183        """
184        Retrieve `samples`.
185
186        Returns:
187            **array of shape (n_samples,):** 
188                An array containing unique identifiers for each sample. If None, sample 
189                identifiers are assigned as integers from `0` to `n_samples - 1`.
190        """
191        return self.__samples

Retrieve samples.

Returns:

array of shape (n_samples,): An array containing unique identifiers for each sample. If None, sample identifiers are assigned as integers from 0 to n_samples - 1.

snps: numpy.ndarray | None
205    @property
206    def snps(self) -> Optional[np.ndarray]:
207        """
208        Retrieve `snps`.
209
210        Returns:
211            **array of shape (n_snps,):** 
212                An array containing identifiers for each SNP. If None, SNPs are assigned as integers 
213                from `0` to `n_snps - 1`.
214        """
215        return self.__snps

Retrieve snps.

Returns:

array of shape (n_snps,): An array containing identifiers for each SNP. If None, SNPs are assigned as integers from 0 to n_snps - 1.

ancestries: numpy.ndarray | None
229    @property
230    def ancestries(self) -> Optional[np.ndarray]:
231        """
232        Retrieve `ancestries`.
233
234        Returns:
235            **array of shape (n_samples,):** 
236                An array containing ancestry labels for each sample.
237        """
238        return self.__ancestries

Retrieve ancestries.

Returns:

array of shape (n_samples,): An array containing ancestry labels for each sample.

n_samples: int
259    @property
260    def n_samples(self) -> int:
261        """
262        Retrieve `n_samples`.
263
264        Returns:
265            **int:** The total number of samples.
266        """
267        return self.__Q.shape[0]

Retrieve n_samples.

Returns:

int: The total number of samples.

n_snps: int
269    @property
270    def n_snps(self) -> int:
271        """
272        Retrieve `n_snps`.
273
274        Returns:
275            **int:** The total number of SNPs.
276        """
277        return 0 if self.__P is None else self.__P.shape[0]

Retrieve n_snps.

Returns:

int: The total number of SNPs.

n_ancestries: int
279    @property
280    def n_ancestries(self) -> int:
281        """
282        Retrieve `n_ancestries`.
283
284        Returns:
285            **int:** The total number of unique ancestries.
286        """
287        return self.__Q.shape[1]

Retrieve n_ancestries.

Returns:

int: The total number of unique ancestries.

def copy(self) -> 'GlobalAncestryObject':
289    def copy(self) -> 'GlobalAncestryObject':
290        """
291        Create and return a copy of `self`.
292
293        Returns:
294            **GlobalAncestryObject:** A new instance of the current object.
295        """
296        return copy.copy(self)

Create and return a copy of self.

Returns:

GlobalAncestryObject: A new instance of the current object.

def keys(self) -> List[str]:
298    def keys(self) -> List[str]:
299        """
300        Retrieve a list of public attribute names for `self`.
301
302        Returns:
303            **list of str:** 
304                A list of attribute names, with internal name-mangling removed, 
305                for easier reference to public attributes in the instance.
306        """
307        return [attr.replace('_GlobalAncestryObject__', '').replace('_AncestryObject__', '') for attr in vars(self)]

Retrieve a list of public attribute names for self.

Returns:

list of str: A list of attribute names, with internal name-mangling removed, for easier reference to public attributes in the instance.

def save(self, file: str | pathlib.Path) -> None:
357    def save(self, file: Union[str, Path]) -> None:
358        """
359        Save the data stored in `self` to a specified file or set of files.
360
361        The format of the saved file(s) is determined by the file extension provided in the `file` 
362        argument. If the extension is `.pkl`, the object is serialized as a pickle file. Otherwise, 
363        the file is treated as a prefix for saving ADMIXTURE files.
364
365        **Supported formats:**
366
367        - `.pkl`: Pickle format for saving `self` in serialized form.
368        - Any other extension or no extension: Treated as a prefix for ADMIXTURE files.
369
370        Args:
371            file (str or pathlib.Path): 
372                Path to the file where the data will be saved. If the extension is `.pkl`, the object
373                is serialized. Otherwise, it is treated as a prefix for ADMIXTURE files.
374        """
375        path = Path(file)
376        suffix = path.suffix.lower()
377
378        if suffix == '.pkl':
379            self.save_pickle(path)
380        else:
381            self.save_admixture(path)

Save the data stored in self to a specified file or set of files.

The format of the saved file(s) is determined by the file extension provided in the file argument. If the extension is .pkl, the object is serialized as a pickle file. Otherwise, the file is treated as a prefix for saving ADMIXTURE files.

Supported formats:

  • .pkl: Pickle format for saving self in serialized form.
  • Any other extension or no extension: Treated as a prefix for ADMIXTURE files.
Arguments:
  • file (str or pathlib.Path): Path to the file where the data will be saved. If the extension is .pkl, the object is serialized. Otherwise, it is treated as a prefix for ADMIXTURE files.
def save_admixture(self, file_prefix: str | pathlib.Path) -> None:
383    def save_admixture(self, file_prefix: Union[str, Path]) -> None:
384        """
385        Save the data stored in `self` into multiple ADMIXTURE files.
386        If the file already exists, it will be overwritten.
387
388        **Output files:**
389
390        - `<file_prefix>.K.Q`: Q matrix file. The file uses space (' ') as the delimiter.
391        - `<file_prefix>.K.P`: P matrix file. The file uses space (' ') as the delimiter.
392        - `<file_prefix>.sample_ids.txt`: Sample IDs file (if sample IDs are available).
393        - `<file_prefix>.snp_ids.txt`: SNP IDs file (if SNP IDs are available).
394        - `<file_prefix>.map`: Ancestry file (if ancestries information is available).
395        
396        Args:
397            file_prefix (str or pathlib.Path): 
398                The base prefix for output file names, including directory path but excluding file extensions. 
399                The prefix is used to generate specific file names for each output, with file-specific 
400                suffixes appended as described above (e.g., `file_prefix.n_ancestries.Q` for the Q matrix file).
401        """
402        from snputils.ancestry.io.wide.write.admixture import AdmixtureWriter
403
404        AdmixtureWriter(self, file_prefix).write()

Save the data stored in self into multiple ADMIXTURE files. If the file already exists, it will be overwritten.

Output files:

  • <file_prefix>.K.Q: Q matrix file. The file uses space (' ') as the delimiter.
  • <file_prefix>.K.P: P matrix file. The file uses space (' ') as the delimiter.
  • <file_prefix>.sample_ids.txt: Sample IDs file (if sample IDs are available).
  • <file_prefix>.snp_ids.txt: SNP IDs file (if SNP IDs are available).
  • <file_prefix>.map: Ancestry file (if ancestries information is available).
Arguments:
  • file_prefix (str or pathlib.Path): The base prefix for output file names, including directory path but excluding file extensions. The prefix is used to generate specific file names for each output, with file-specific suffixes appended as described above (e.g., file_prefix.n_ancestries.Q for the Q matrix file).
def save_pickle(self, file: str | pathlib.Path) -> None:
406    def save_pickle(self, file: Union[str, Path]) -> None:
407        """
408        Save `self` in serialized form to a `.pkl` file.
409        If the file already exists, it will be overwritten.
410
411        Args:
412            file (str or pathlib.Path): 
413                Path to the file where the data will be saved. It should end with `.pkl`. 
414                If the provided path does not have this extension, it will be appended.
415        """
416        import pickle
417        with open(file, 'wb') as file:
418            pickle.dump(self, file)

Save self in serialized form to a .pkl file. If the file already exists, it will be overwritten.

Arguments:
  • file (str or pathlib.Path): Path to the file where the data will be saved. It should end with .pkl. If the provided path does not have this extension, it will be appended.
class LocalAncestryObject(snputils.ancestry.genobj.base.AncestryObject):
 17class LocalAncestryObject(AncestryObject):
 18    """
 19    A class for window-level Local Ancestry Inference (LAI) data.
 20    """
 21    def __init__(
 22        self,
 23        haplotypes: List[str], 
 24        lai: np.ndarray,
 25        samples: Optional[List[str]] = None, 
 26        ancestry_map: Optional[Dict[str, str]] = None, 
 27        window_sizes: Optional[np.ndarray] = None,
 28        centimorgan_pos: Optional[np.ndarray] = None,
 29        chromosomes: Optional[np.ndarray] = None,
 30        physical_pos: Optional[np.ndarray] = None
 31    ) -> None:
 32        """
 33        Args:
 34            haplotypes (list of str of length n_haplotypes):
 35                A list of unique haplotype identifiers.
 36            lai (array of shape (n_windows, n_haplotypes)): 
 37                A 2D array containing local ancestry inference values, where each row represents a 
 38                genomic window, and each column corresponds to a haplotype phase for each sample.
 39            samples (list of str of length n_samples, optional):
 40                A list of unique sample identifiers.
 41            ancestry_map (dict of str to str, optional):
 42                A dictionary mapping ancestry codes to region names.
 43            window_sizes (array of shape (n_windows,), optional): 
 44                An array specifying the number of SNPs in each genomic window.
 45            centimorgan_pos (array of shape (n_windows, 2), optional): 
 46                A 2D array containing the start and end centimorgan positions for each window.
 47            chromosomes (array of shape (n_windows,), optional): 
 48                An array with chromosome numbers corresponding to each genomic window.
 49            physical_pos (array of shape (n_windows, 2), optional): 
 50                A 2D array containing the start and end physical positions for each window.
 51        """
 52        if lai.ndim != 2:
 53            raise ValueError("`lai` must be a 2D array with shape (n_windows, n_haplotypes).")
 54        
 55        # Determine the number of unique ancestries and samples from the LAI array
 56        n_ancestries = len(np.unique(lai))
 57        n_haplotypes = lai.shape[1]
 58        n_samples = n_haplotypes // 2
 59
 60        super(LocalAncestryObject, self).__init__(n_samples, n_ancestries)
 61
 62        self.__haplotypes = haplotypes
 63        self.__lai = lai
 64        self.__window_sizes = window_sizes
 65        self.__centimorgan_pos = centimorgan_pos
 66        self.__samples = samples
 67        self.__chromosomes = chromosomes
 68        self.__physical_pos = physical_pos
 69        self.__ancestry_map = ancestry_map
 70
 71        # Perform sanity check to ensure all unique ancestries in LAI data are represented in the ancestry map
 72        self._sanity_check()
 73
 74    def __getitem__(self, key):
 75        """
 76        To access an attribute of the class using the square bracket notation,
 77        similar to a dictionary.
 78        """
 79        try:
 80            return getattr(self, key)
 81        except AttributeError:
 82            raise KeyError(f'Invalid key: {key}')
 83
 84    def __setitem__(self, key, value):
 85        """
 86        To set an attribute of the class using the square bracket notation,
 87        similar to a dictionary.
 88        """
 89        try:
 90            setattr(self, key, value)
 91        except AttributeError:
 92            raise KeyError(f'Invalid key: {key}')
 93
 94    @property
 95    def haplotypes(self) -> List[str]:
 96        """
 97        Retrieve `haplotypes`.
 98
 99        Returns:
100            **list of length n_haplotypes:** A list of unique haplotype identifiers.
101        """
102        return self.__haplotypes
103
104    @haplotypes.setter
105    def haplotypes(self, x):
106        """
107        Update `haplotypes`.
108        """
109        self.__haplotypes = x
110
111    @property
112    def lai(self) -> np.ndarray:
113        """
114        Retrieve `lai`.
115
116        Returns:
117            **array of shape (n_windows, n_haplotypes):** 
118                A 2D array containing local ancestry inference values, where each row represents a 
119                genomic window, and each column corresponds to a haplotype phase for each sample.
120        """
121        return self.__lai
122
123    @lai.setter
124    def lai(self, x):
125        """
126        Update `lai`.
127        """
128        self.__lai = x
129
130    @property
131    def samples(self) -> Optional[List[str]]:
132        """
133        Retrieve `samples`.
134
135        Returns:
136            **list of str:** A list of unique sample identifiers.
137        """
138        if self.__samples is not None:
139            return self.__samples
140        elif self.__haplotypes is not None:
141            return [hap.split('.')[0] for hap in self.__haplotypes][::2]
142        else:
143            return None
144    
145    @samples.setter
146    def samples(self, x):
147        """
148        Update `samples`.
149        """
150        self.__samples = x
151
152    @property
153    def ancestry_map(self) -> Optional[Dict[str, str]]:
154        """
155        Retrieve `ancestry_map`.
156
157        Returns:
158            **dict of str to str:** A dictionary mapping ancestry codes to region names.
159        """
160        return self.__ancestry_map
161
162    @ancestry_map.setter
163    def ancestry_map(self, x):
164        """
165        Update `ancestry_map`.
166        """
167        self.__ancestry_map = x
168
169    @property
170    def window_sizes(self) -> Optional[np.ndarray]:
171        """
172        Retrieve `window_sizes`.
173
174        Returns:
175            **array of shape (n_windows,):** 
176                An array specifying the number of SNPs in each genomic window.
177        """
178        return self.__window_sizes
179        
180    @window_sizes.setter
181    def window_sizes(self, x):
182        """
183        Update `window_sizes`.
184        """
185        self.__window_sizes = x
186
187    @property
188    def centimorgan_pos(self) -> Optional[np.ndarray]:
189        """
190        Retrieve `centimorgan_pos`.
191
192        Returns:
193            **array of shape (n_windows, 2):** 
194                A 2D array containing the start and end centimorgan positions for each window.
195        """
196        return self.__centimorgan_pos
197
198    @centimorgan_pos.setter
199    def centimorgan_pos(self, x):
200        """
201        Update `centimorgan_pos`.
202        """
203        self.__centimorgan_pos = x
204
205    @property
206    def chromosomes(self) -> Optional[np.ndarray]:
207        """
208        Retrieve `chromosomes`.
209
210        Returns:
211            **array of shape (n_windows,):** 
212                An array with chromosome numbers corresponding to each genomic window.
213        """
214        return self.__chromosomes
215        
216    @chromosomes.setter
217    def chromosomes(self, x):
218        """
219        Update `chromosomes`.
220        """
221        self.__chromosomes = x
222
223    @property
224    def physical_pos(self) -> Optional[np.ndarray]:
225        """
226        Retrieve `physical_pos`.
227
228        Returns:
229            **array of shape (n_windows, 2):** 
230                A 2D array containing the start and end physical positions for each window.
231        """
232        return self.__physical_pos
233
234    @physical_pos.setter
235    def physical_pos(self, x):
236        """
237        Update `physical_pos`.
238        """
239        self.__physical_pos = x
240
241    @property
242    def n_samples(self) -> int:
243        """
244        Retrieve `n_samples`.
245
246        Returns:
247            **int:** 
248                The total number of samples.
249        """
250        if self.__samples is not None:
251            return len(self.__samples)
252        elif self.__haplotypes is not None:
253            # Divide by 2 because each sample has two associated haplotypes
254            return len(self.__haplotypes) // 2
255        else:
256            # Divide by 2 because columns represent haplotypes
257            return self.__lai.shape[1] // 2
258
259    @property
260    def n_ancestries(self) -> int:
261        """
262        Retrieve `n_ancestries`.
263
264        Returns:
265            **int:** The total number of unique ancestries.
266        """
267        return len(np.unique(self.__lai))
268    
269    @property
270    def n_haplotypes(self) -> int:
271        """
272        Retrieve `n_haplotypes`.
273
274        Returns:
275            **int:** The total number of haplotypes.
276        """
277        if self.__haplotypes is not None:
278            return len(self.__haplotypes)
279        else:
280            return self.__lai.shape[1]
281
282    @property
283    def n_windows(self) -> int:
284        """
285        Retrieve `n_windows`.
286
287        Returns:
288            **int:** The total number of genomic windows.
289        """
290        return self.__lai.shape[0]
291
292    def copy(self) -> 'LocalAncestryObject':
293        """
294        Create and return a copy of `self`.
295
296        Returns:
297            **LocalAncestryObject:** 
298                A new instance of the current object.
299        """
300        return copy.copy(self)
301
302    def keys(self) -> List[str]:
303        """
304        Retrieve a list of public attribute names for `self`.
305
306        Returns:
307            **list of str:** 
308                A list of attribute names, with internal name-mangling removed, 
309                for easier reference to public attributes in the instance.
310        """
311        return [attr.replace('_LocalAncestryObject__', '').replace('_AncestryObject__', '') for attr in vars(self)]
312
313    def filter_windows(
314            self,
315            indexes: Union[int, Sequence[int], np.ndarray],
316            include: bool = True,
317            inplace: bool = False
318        ) -> Optional['LocalAncestryObject']:
319        """
320        Filter genomic windows based on specified indexes. 
321
322        This method updates the `lai` attribute to include or exclude the specified genomic windows. 
323        Attributes such as `window_sizes`, `centimorgan_pos`, `chromosomes`, and `physical_pos` will also be 
324        updated accordingly if they are not None. The order of genomic windows is preserved.
325
326        Negative indexes are supported and follow 
327        [NumPy's indexing conventions](https://numpy.org/doc/stable/user/basics.indexing.html). 
328
329        Args:
330            indexes (int or array-like of int): 
331                Index(es) of the windows to include or exclude. Can be a single integer or a
332                sequence of integers. Negative indexes are supported.
333            include (bool, default=True): 
334                If True, includes only the specified windows. If False, excludes the specified
335                windows. Default is True.
336            inplace (bool, default=False): 
337                If True, modifies `self` in place. If False, returns a new `LocalAncestryObject` with 
338                the windows filtered. Default is False.
339
340        Returns:
341            **Optional[LocalAncestryObject]:** 
342                A new `LocalAncestryObject` with the specified windows filtered if `inplace=False`. 
343                If `inplace=True`, modifies `self` in place and returns None.
344        """
345        # Convert indexes to a NumPy array
346        indexes = np.atleast_1d(indexes)
347
348        # Get total number of windows
349        n_windows = self.n_windows
350
351        # Validate indexes, allowing negative indexes
352        if np.any((indexes < -n_windows) | (indexes >= n_windows)):
353            raise IndexError("One or more indexes are out of bounds.")
354
355        # Create boolean mask
356        mask = np.zeros(n_windows, dtype=bool)
357        mask[indexes] = True
358
359        # Invert mask if `include=False`
360        if not include:
361            mask = ~mask
362        
363        # Filter `lai`
364        filtered_lai = self['lai'][mask, :] 
365        
366        # Filter `window_sizes`, `chromosomes`, `centimorgan_pos`, and `physical_pos`, checking if they are None before filtering
367        filtered_window_sizes = self['window_sizes'][mask] if self['window_sizes'] is not None else None
368        filtered_chromosomes = self['chromosomes'][mask] if self['chromosomes'] is not None else None
369        filtered_centimorgan_pos = self['centimorgan_pos'][mask, :] if self['centimorgan_pos'] is not None else None
370        filtered_physical_pos = self['physical_pos'][mask, :] if self['physical_pos'] is not None else None
371
372        # Modify the original object if `inplace=True`, otherwise create and return a copy
373        if inplace:
374            self['lai'] = filtered_lai
375            if filtered_window_sizes is not None:
376                self['window_sizes'] = filtered_window_sizes
377            if filtered_chromosomes is not None:
378                self['chromosomes'] = filtered_chromosomes
379            if filtered_centimorgan_pos is not None:
380                self['centimorgan_pos'] = filtered_centimorgan_pos
381            if filtered_physical_pos is not None:
382                self['physical_pos'] = filtered_physical_pos
383            return None
384        else:
385            laiobj = self.copy()
386            laiobj['lai'] = filtered_lai
387            if filtered_window_sizes is not None:
388                laiobj['window_sizes'] = filtered_window_sizes
389            if filtered_chromosomes is not None:
390                laiobj['chromosomes'] = filtered_chromosomes
391            if filtered_centimorgan_pos is not None:
392                laiobj['centimorgan_pos'] = filtered_centimorgan_pos
393            if filtered_physical_pos is not None:
394                laiobj['physical_pos'] = filtered_physical_pos
395            return laiobj
396
397    def filter_samples(
398        self,
399        samples: Optional[Union[str, Sequence[str], np.ndarray, None]] = None,
400        indexes: Optional[Union[int, Sequence[int], np.ndarray, None]] = None,
401        include: bool = True,
402        reorder: bool = False,
403        inplace: bool = False
404    ) -> Optional['LocalAncestryObject']:
405        """
406        Filter samples based on specified names or indexes.
407
408        This method updates the `lai`, `haplotypes`, and `samples` attributes to include or exclude the specified 
409        samples. Each sample is associated with two haplotypes, which are included or excluded together.
410        The order of the samples is preserved. Set `reorder=True` to match the ordering of the
411        provided `samples` and/or `indexes` lists when including.
412
413        If both samples and indexes are provided, any sample matching either a name in samples or an index in 
414        indexes will be included or excluded.
415        
416        Negative indexes are supported and follow 
417        [NumPy's indexing conventions](https://numpy.org/doc/stable/user/basics.indexing.html).
418
419        Args:
420            samples (str or array_like of str, optional): 
421                 Name(s) of the samples to include or exclude. Can be a single sample name or a
422                 sequence of sample names. Default is None.
423            indexes (int or array_like of int, optional):
424                Index(es) of the samples to include or exclude. Can be a single index or a sequence
425                of indexes. Negative indexes are supported. Default is None.
426            include (bool, default=True): 
427                If True, includes only the specified samples. If False, excludes the specified
428                samples. Default is True.
429            inplace (bool, default=False): 
430                If True, modifies `self` in place. If False, returns a new `LocalAncestryObject` with the 
431                samples filtered. Default is False.
432
433        Returns:
434            **Optional[LocalAncestryObject]:** 
435                A new `LocalAncestryObject` with the specified samples filtered if `inplace=False`. 
436                If `inplace=True`, modifies `self` in place and returns None.
437        """
438        if samples is None and indexes is None:
439            raise UserWarning("At least one of 'samples' or 'indexes' must be provided.")
440
441        n_haplotypes = self.n_haplotypes
442        n_samples = self.n_samples
443
444        # Create mask based on sample names
445        if samples is not None:
446            samples = np.asarray(samples).ravel()
447            # Extract sample names from haplotype identifiers
448            haplotype_ids = np.array(self['haplotypes'])
449            sample_names = np.array([hap.split('.')[0] for hap in haplotype_ids])
450            # Create mask for haplotypes belonging to specified samples
451            mask_samples = np.isin(sample_names, samples)
452        else:
453            mask_samples = np.zeros(n_haplotypes, dtype=bool)
454
455        # Create mask based on sample indexes
456        if indexes is not None:
457            indexes = np.asarray(indexes).ravel()
458
459            # Validate indexes, allowing negative indexes
460            out_of_bounds_indexes = indexes[(indexes < -n_samples) | (indexes >= n_samples)]
461            if out_of_bounds_indexes.size > 0:
462                raise ValueError(f"One or more sample indexes are out of bounds.")
463
464            # Adjust negative indexes
465            indexes = np.mod(indexes, n_samples)
466            
467            # Get haplotype indexes for the specified sample indexes
468            haplotype_indexes = np.concatenate([2*indexes, 2*indexes+1])
469            # Create mask for haplotypes
470            mask_indexes = np.zeros(n_haplotypes, dtype=bool)
471            mask_indexes[haplotype_indexes] = True
472        else:
473            mask_indexes = np.zeros(n_haplotypes, dtype=bool)
474
475        # Combine masks using logical OR (union of samples)
476        mask_combined = mask_samples | mask_indexes
477
478        if not include:
479            mask_combined = ~mask_combined
480
481        # Optionally compute an ordering of selected samples that follows the provided lists
482        ordered_sample_indices = None
483        sample_mask = mask_combined.reshape(-1, 2).any(axis=1)
484        if include and reorder:
485            sel_sample_indices = np.where(sample_mask)[0]
486            ordered_list: List[int] = []
487            added = np.zeros(self.n_samples, dtype=bool)
488
489            # Source of sample names for ordering logic
490            haplotype_ids = np.array(self['haplotypes'])
491            sample_names_by_sample = np.array([hap.split('.')[0] for hap in haplotype_ids])[::2]
492
493            # Respect the order in `samples`
494            if samples is not None:
495                for s in np.atleast_1d(samples):
496                    # Find the sample index by name (first occurrence)
497                    matches = np.where(sample_names_by_sample == s)[0]
498                    for idx in matches:
499                        if sample_mask[idx] and not added[idx]:
500                            ordered_list.append(int(idx))
501                            added[idx] = True
502
503            # Then respect the order in `indexes`
504            if indexes is not None:
505                adj_idx = np.mod(np.atleast_1d(indexes), self.n_samples)
506                for idx in adj_idx:
507                    if sample_mask[idx] and not added[idx]:
508                        ordered_list.append(int(idx))
509                        added[idx] = True
510
511            # Append any remaining selected samples in their original order
512            for idx in sel_sample_indices:
513                if not added[idx]:
514                    ordered_list.append(int(idx))
515
516            ordered_sample_indices = np.asarray(ordered_list, dtype=int)
517
518        # Filter / reorder arrays
519        if ordered_sample_indices is not None:
520            hap_idx = np.concatenate([2*ordered_sample_indices, 2*ordered_sample_indices + 1])
521            filtered_lai = self['lai'][:, hap_idx]
522            filtered_haplotypes = np.array(self['haplotypes'])[hap_idx].tolist()
523            filtered_samples = (
524                np.array(self['samples'])[ordered_sample_indices].tolist()
525                if self['samples'] is not None else None
526            )
527        else:
528            # Filter `lai`
529            filtered_lai = self['lai'][:, mask_combined]
530            # Filter `haplotypes`
531            filtered_haplotypes = np.array(self['haplotypes'])[mask_combined].tolist()
532            # Filter `samples`, checking if they are None before filtering
533            filtered_samples = np.array(self['samples'])[sample_mask].tolist() if self['samples'] is not None else None
534
535        if inplace:
536            self['haplotypes'] = filtered_haplotypes
537            self['samples'] = filtered_samples
538            self['lai'] = filtered_lai
539            return None
540        else:
541            laiobj = self.copy()
542            laiobj['haplotypes'] = filtered_haplotypes
543            laiobj['samples'] = filtered_samples
544            laiobj['lai'] = filtered_lai
545            return laiobj
546
547    def convert_to_snp_level(
548        self,
549        snpobject: Optional['SNPObject'] = None,
550        variants_chrom: Optional[np.ndarray] = None,
551        variants_pos: Optional[np.ndarray] = None,
552        variants_ref: Optional[np.ndarray] = None,
553        variants_alt: Optional[np.ndarray] = None,
554        variants_filter_pass: Optional[np.ndarray] = None,
555        variants_id: Optional[np.ndarray] = None,
556        variants_qual: Optional[np.ndarray] = None,
557        lai_format: str = "3D"
558    ) -> 'SNPObject':
559        """
560        Convert `self` into a `snputils.snp.genobj.SNPObject` SNP-level Local Ancestry Information (LAI), 
561        with optional support for SNP data.
562        
563        If SNP positions (`variants_pos`) and/or chromosomes (`variants_chrom`) are not specified, the method generates 
564        SNPs uniformly across the start and end positions of each genomic window. Otherwise, the provided SNP 
565        coordinates are used to assign ancestry values based on their respective windows.
566
567        If a `SNPObject` is provided, its attributes are used unless explicitly overridden by the function arguments.
568        In that case, the SNPObject is updated with the (optional) new attributes and the computed `calldata_lai`, then returned.
569
570        Args:
571            snpobject (SNPObject, optional):
572                An existing `SNPObject` to extract SNP attributes from.
573            variants_chrom (array of shape (n_snps,), optional): 
574                An array containing the chromosome for each SNP.
575            variants_pos (array of shape (n_snps,), optional): 
576                An array containing the chromosomal positions for each SNP.
577            variants_ref (array of shape (n_snps,), optional): 
578                An array containing the reference allele for each SNP.
579            variants_alt (array of shape (n_snps,), optional): 
580                An array containing the alternate allele for each SNP.
581            variants_filter_pass (array of shape (n_snps,), optional): 
582                An array indicating whether each SNP passed control checks.
583            variants_id (array of shape (n_snps,), optional): 
584                An array containing unique identifiers (IDs) for each SNP.
585            variants_qual (array of shape (n_snps,), optional): 
586                An array containing the Phred-scaled quality score for each SNP.
587            lai_format (str, optional):
588                Determines the shape of `calldata_lai`:
589                    - `"3D"` (default): Shape `(n_snps, n_samples, 2)`.
590                    - `"2D"`: Shape `(n_snps, n_samples * 2)`.
591
592        Returns:
593            **SNPObject**: 
594                A `SNPObject` containing SNP-level ancestry data and updated SNP attributes.
595        """
596        from snputils.snp.genobj.snpobj import SNPObject
597
598        assert lai_format in {"2D", "3D"}, "Invalid `lai_format`. Must be '2D' or '3D'."
599
600        # Extract attributes from SNPObject if provided
601        if snpobject is not None:
602            variants_chrom = variants_chrom or snpobject.variants_chrom
603            variants_pos = variants_pos or snpobject.variants_pos
604            variants_ref = variants_ref or snpobject.variants_ref
605            variants_alt = variants_alt or snpobject.variants_alt
606            variants_filter_pass = variants_filter_pass or snpobject.variants_filter_pass
607            variants_id = variants_id or snpobject.variants_id
608            variants_qual = variants_qual or snpobject.variants_qual
609
610        n_samples = self.n_samples
611        lai_reshaped = self.lai.reshape(self.n_windows, n_samples, 2).astype(int) if lai_format == "3D" else None
612
613        if variants_pos is None or variants_chrom is None:
614            # Generate all SNP positions and corresponding chromosome labels between window boundaries
615            variants_pos_list = []
616            variants_chrom_list = []
617            ancestry_list = []
618
619            for i in range(self.n_windows):
620                start = int(self.physical_pos[i, 0])
621                end = int(self.physical_pos[i, 1])
622                chrom = self.chromosomes[i]
623
624                # Generate SNP positions at each base pair within the window range
625                positions_in_window = np.arange(start, end + 1)
626                if positions_in_window.size == 0:
627                    continue  # Skip windows that contain no valid SNP positions
628
629                n_positions = positions_in_window.size
630                variants_pos_list.append(positions_in_window)
631                variants_chrom_list.append(np.full(n_positions, chrom))
632
633                ancestry_repeated = (
634                    np.repeat(lai_reshaped[i, np.newaxis, :, :], n_positions, axis=0)
635                    if lai_format == "3D" else np.repeat(self.lai[i, np.newaxis, :], n_positions, axis=0)
636                )
637                ancestry_list.append(ancestry_repeated)
638
639            # Store SNP positions, their corresponding chromosome labels, and their associated ancestry
640            variants_pos = np.concatenate(variants_pos_list)
641            variants_chrom = np.concatenate(variants_chrom_list)
642            calldata_lai = np.concatenate(ancestry_list)
643        else:
644            # Use the provided SNP positions and chromosomes
645            n_snps = len(variants_pos)
646            if len(variants_chrom) != n_snps:
647                raise ValueError("`variants_pos` and `variants_chrom` must have the same length.")
648
649            # Initialize an array to store the corresponding window index for each SNP
650            # Default value is -1, meaning no matching window found
651            snp_to_window_indices = np.full(n_snps, -1, dtype=int)
652
653            # Identify unique chromosome names sorted by order of appearence
654            _, idx = np.unique(variants_chrom, return_index=True)
655            unique_chroms = variants_chrom[np.sort(idx)]
656
657            # Iterate through each unique chromosome to map SNPs to windows
658            for chrom in unique_chroms:
659                # Get indices of SNPs that belong to the current chromosome
660                snp_indices = np.where(variants_chrom == chrom)[0]
661                snp_pos_chr = variants_pos[snp_indices]
662                
663                # Get indices of windows that belong to the current chromosome
664                window_indices = np.where(self.chromosomes == chrom)[0]
665                if window_indices.size == 0:
666                    continue  # Skip if no windows exist for this chromosome
667                
668                # Extract start and end positions of the windows on this chromosome
669                window_starts_chr = self.physical_pos[:, 0][window_indices]
670                window_ends_chr = self.physical_pos[:, 1][window_indices]
671                
672                # Find the right-most window that a SNP would fit into (sorted order)
673                inds = np.searchsorted(window_starts_chr, snp_pos_chr, side='right') - 1
674                
675                # Mask valid SNPs: ensure they are within a valid range and fall inside window boundaries
676                valid_mask = (inds >= 0) & (inds < len(window_starts_chr)) & (snp_pos_chr <= window_ends_chr[inds])
677
678                # Assign valid SNPs to their corresponding window indices
679                snp_to_window_indices[snp_indices[valid_mask]] = window_indices[inds[valid_mask]]
680                log.debug(f"Number of SNPs within window ranges for chromosome {chrom}: {valid_mask.sum()}")
681
682            # Initialize SNP-level ancestry array with NaNs as default
683            shape = (n_snps, n_samples, 2) if lai_format == "3D" else (n_snps, n_samples * 2)
684            calldata_lai = np.full(shape, np.nan, dtype='uint8')
685
686            # Assign ancestry values to SNPs with valid window assignments
687            valid_mask = (snp_to_window_indices != -1)
688            snp_indices = np.where(valid_mask)[0]
689            snp_to_window_indices = snp_to_window_indices[snp_indices]
690
691            if lai_format == "3D":
692                calldata_lai[snp_indices] = lai_reshaped[snp_to_window_indices]
693            else:  # "2D"
694                calldata_lai[snp_indices] = self.lai[snp_to_window_indices]
695
696        if snpobject is not None:
697            # If a SNPObject was provided, update its attributes with any new values and add `calldata_lai``
698            snpobject.variants_chrom = variants_chrom
699            snpobject.variants_pos = variants_pos
700            snpobject.variants_ref = variants_ref
701            snpobject.variants_alt = variants_alt
702            snpobject.variants_filter_pass = variants_filter_pass
703            snpobject.variants_id = variants_id
704            snpobject.variants_qual = variants_qual
705            snpobject.calldata_lai = calldata_lai
706            snpobject.ancestry_map = self.ancestry_map
707            return snpobject
708        else:
709            # Otherwise, create a new SNPObject
710            return SNPObject(
711                calldata_lai=calldata_lai.view(),
712                samples=self.samples,
713                variants_ref=variants_ref.view() if isinstance(variants_ref, np.ndarray) else variants_ref,
714                variants_alt=variants_alt.view() if isinstance(variants_alt, np.ndarray) else variants_alt,
715                variants_filter_pass=variants_filter_pass.view() if isinstance(variants_filter_pass, np.ndarray) else variants_filter_pass,
716                variants_chrom=variants_chrom.view(),
717                variants_id=variants_id.view() if isinstance(variants_id, np.ndarray) else variants_id,
718                variants_pos=variants_pos.view(),
719                variants_qual=variants_qual.view() if isinstance(variants_qual, np.ndarray) else variants_qual,
720                ancestry_map=self.ancestry_map
721            )
722
723    def _sanity_check(self) -> None:
724        """
725        Perform sanity checks on the parsed data to ensure data integrity.
726
727        This method checks that all unique ancestries in LAI are represented 
728        in the ancestry map.
729
730        Args:
731            lai (np.ndarray): The LAI data array.
732            ancestry_map (dict, optional): A dictionary mapping ancestry codes to region names, if available.
733        """
734        # Get unique ancestries from LAI data
735        unique_ancestries = np.unique(self.lai)
736
737        if self.ancestry_map is not None:
738            # Check if all unique ancestries in the LAI are present in the ancestry map
739            for ancestry in unique_ancestries:
740                if str(ancestry) not in self.ancestry_map:
741                    warnings.warn(
742                        f"Ancestry '{ancestry}' found in LAI data is not represented in the ancestry map."
743                    )
744
745    def save(self, file: Union[str, Path]) -> None:
746        """
747        Save the data stored in `self` to a specified file.
748        If the file already exists, it will be overwritten.
749
750        The format of the saved file is determined by the file extension provided in the `file` 
751        argument.
752
753        **Supported formats:**
754
755        - `.msp`: Text-based MSP format.
756        - `.msp.tsv`: Text-based MSP format with TSV extension.
757        - `.pkl`: Pickle format for saving `self` in serialized form.
758
759        Args:
760            file (str or pathlib.Path): 
761                Path to the file where the data will be saved. The extension of the file determines the save format. 
762                Supported extensions: `.msp`, `.msp.tsv`, `.pkl`.
763        """
764        path = Path(file)
765        suffixes = [suffix.lower() for suffix in path.suffixes]
766
767        if suffixes[-2:] == ['.msp', '.tsv'] or suffixes[-1] == '.msp':
768            self.save_msp(file)
769        elif suffixes[-1] == '.pkl':
770            self.save_pickle(file)
771        else:
772            raise ValueError(
773                f"Unsupported file extension: {suffixes[-1]}"
774                "Supported extensions are: .msp, .msp.tsv, .pkl."
775            )
776
777    def save_msp(self, file: Union[str, Path]) -> None:
778        """
779        Save the data stored in `self` to a `.msp` file.
780        If the file already exists, it will be overwritten.
781
782        Args:
783            file (str or pathlib.Path): 
784                Path to the file where the data will be saved. It should end with `.msp` or `.msp.tsv`. 
785                If the provided path does not have one of these extensions, the `.msp` extension will be appended.
786        """
787        from snputils.ancestry.io.local.write import MSPWriter
788
789        MSPWriter(self, file).write()
790
791    def save_pickle(self, file: Union[str, Path]) -> None:
792        """
793        Save `self` in serialized form to a `.pkl` file.
794        If the file already exists, it will be overwritten.
795
796        Args:
797            file (str or pathlib.Path): 
798                Path to the file where the data will be saved. It should end with `.pkl`. 
799                If the provided path does not have this extension, it will be appended.
800        """
801        import pickle
802        with open(file, 'wb') as file:
803            pickle.dump(self, file)

A class for window-level Local Ancestry Inference (LAI) data.

LocalAncestryObject( haplotypes: List[str], lai: numpy.ndarray, samples: List[str] | None = None, ancestry_map: Dict[str, str] | None = None, window_sizes: numpy.ndarray | None = None, centimorgan_pos: numpy.ndarray | None = None, chromosomes: numpy.ndarray | None = None, physical_pos: numpy.ndarray | None = None)
21    def __init__(
22        self,
23        haplotypes: List[str], 
24        lai: np.ndarray,
25        samples: Optional[List[str]] = None, 
26        ancestry_map: Optional[Dict[str, str]] = None, 
27        window_sizes: Optional[np.ndarray] = None,
28        centimorgan_pos: Optional[np.ndarray] = None,
29        chromosomes: Optional[np.ndarray] = None,
30        physical_pos: Optional[np.ndarray] = None
31    ) -> None:
32        """
33        Args:
34            haplotypes (list of str of length n_haplotypes):
35                A list of unique haplotype identifiers.
36            lai (array of shape (n_windows, n_haplotypes)): 
37                A 2D array containing local ancestry inference values, where each row represents a 
38                genomic window, and each column corresponds to a haplotype phase for each sample.
39            samples (list of str of length n_samples, optional):
40                A list of unique sample identifiers.
41            ancestry_map (dict of str to str, optional):
42                A dictionary mapping ancestry codes to region names.
43            window_sizes (array of shape (n_windows,), optional): 
44                An array specifying the number of SNPs in each genomic window.
45            centimorgan_pos (array of shape (n_windows, 2), optional): 
46                A 2D array containing the start and end centimorgan positions for each window.
47            chromosomes (array of shape (n_windows,), optional): 
48                An array with chromosome numbers corresponding to each genomic window.
49            physical_pos (array of shape (n_windows, 2), optional): 
50                A 2D array containing the start and end physical positions for each window.
51        """
52        if lai.ndim != 2:
53            raise ValueError("`lai` must be a 2D array with shape (n_windows, n_haplotypes).")
54        
55        # Determine the number of unique ancestries and samples from the LAI array
56        n_ancestries = len(np.unique(lai))
57        n_haplotypes = lai.shape[1]
58        n_samples = n_haplotypes // 2
59
60        super(LocalAncestryObject, self).__init__(n_samples, n_ancestries)
61
62        self.__haplotypes = haplotypes
63        self.__lai = lai
64        self.__window_sizes = window_sizes
65        self.__centimorgan_pos = centimorgan_pos
66        self.__samples = samples
67        self.__chromosomes = chromosomes
68        self.__physical_pos = physical_pos
69        self.__ancestry_map = ancestry_map
70
71        # Perform sanity check to ensure all unique ancestries in LAI data are represented in the ancestry map
72        self._sanity_check()
Arguments:
  • haplotypes (list of str of length n_haplotypes): A list of unique haplotype identifiers.
  • lai (array of shape (n_windows, n_haplotypes)): A 2D array containing local ancestry inference values, where each row represents a genomic window, and each column corresponds to a haplotype phase for each sample.
  • samples (list of str of length n_samples, optional): A list of unique sample identifiers.
  • ancestry_map (dict of str to str, optional): A dictionary mapping ancestry codes to region names.
  • window_sizes (array of shape (n_windows,), optional): An array specifying the number of SNPs in each genomic window.
  • centimorgan_pos (array of shape (n_windows, 2), optional): A 2D array containing the start and end centimorgan positions for each window.
  • chromosomes (array of shape (n_windows,), optional): An array with chromosome numbers corresponding to each genomic window.
  • physical_pos (array of shape (n_windows, 2), optional): A 2D array containing the start and end physical positions for each window.
haplotypes: List[str]
 94    @property
 95    def haplotypes(self) -> List[str]:
 96        """
 97        Retrieve `haplotypes`.
 98
 99        Returns:
100            **list of length n_haplotypes:** A list of unique haplotype identifiers.
101        """
102        return self.__haplotypes

Retrieve haplotypes.

Returns:

list of length n_haplotypes: A list of unique haplotype identifiers.

lai: numpy.ndarray
111    @property
112    def lai(self) -> np.ndarray:
113        """
114        Retrieve `lai`.
115
116        Returns:
117            **array of shape (n_windows, n_haplotypes):** 
118                A 2D array containing local ancestry inference values, where each row represents a 
119                genomic window, and each column corresponds to a haplotype phase for each sample.
120        """
121        return self.__lai

Retrieve lai.

Returns:

array of shape (n_windows, n_haplotypes): A 2D array containing local ancestry inference values, where each row represents a genomic window, and each column corresponds to a haplotype phase for each sample.

samples: List[str] | None
130    @property
131    def samples(self) -> Optional[List[str]]:
132        """
133        Retrieve `samples`.
134
135        Returns:
136            **list of str:** A list of unique sample identifiers.
137        """
138        if self.__samples is not None:
139            return self.__samples
140        elif self.__haplotypes is not None:
141            return [hap.split('.')[0] for hap in self.__haplotypes][::2]
142        else:
143            return None

Retrieve samples.

Returns:

list of str: A list of unique sample identifiers.

ancestry_map: Dict[str, str] | None
152    @property
153    def ancestry_map(self) -> Optional[Dict[str, str]]:
154        """
155        Retrieve `ancestry_map`.
156
157        Returns:
158            **dict of str to str:** A dictionary mapping ancestry codes to region names.
159        """
160        return self.__ancestry_map

Retrieve ancestry_map.

Returns:

dict of str to str: A dictionary mapping ancestry codes to region names.

window_sizes: numpy.ndarray | None
169    @property
170    def window_sizes(self) -> Optional[np.ndarray]:
171        """
172        Retrieve `window_sizes`.
173
174        Returns:
175            **array of shape (n_windows,):** 
176                An array specifying the number of SNPs in each genomic window.
177        """
178        return self.__window_sizes

Retrieve window_sizes.

Returns:

array of shape (n_windows,): An array specifying the number of SNPs in each genomic window.

centimorgan_pos: numpy.ndarray | None
187    @property
188    def centimorgan_pos(self) -> Optional[np.ndarray]:
189        """
190        Retrieve `centimorgan_pos`.
191
192        Returns:
193            **array of shape (n_windows, 2):** 
194                A 2D array containing the start and end centimorgan positions for each window.
195        """
196        return self.__centimorgan_pos

Retrieve centimorgan_pos.

Returns:

array of shape (n_windows, 2): A 2D array containing the start and end centimorgan positions for each window.

chromosomes: numpy.ndarray | None
205    @property
206    def chromosomes(self) -> Optional[np.ndarray]:
207        """
208        Retrieve `chromosomes`.
209
210        Returns:
211            **array of shape (n_windows,):** 
212                An array with chromosome numbers corresponding to each genomic window.
213        """
214        return self.__chromosomes

Retrieve chromosomes.

Returns:

array of shape (n_windows,): An array with chromosome numbers corresponding to each genomic window.

physical_pos: numpy.ndarray | None
223    @property
224    def physical_pos(self) -> Optional[np.ndarray]:
225        """
226        Retrieve `physical_pos`.
227
228        Returns:
229            **array of shape (n_windows, 2):** 
230                A 2D array containing the start and end physical positions for each window.
231        """
232        return self.__physical_pos

Retrieve physical_pos.

Returns:

array of shape (n_windows, 2): A 2D array containing the start and end physical positions for each window.

n_samples: int
241    @property
242    def n_samples(self) -> int:
243        """
244        Retrieve `n_samples`.
245
246        Returns:
247            **int:** 
248                The total number of samples.
249        """
250        if self.__samples is not None:
251            return len(self.__samples)
252        elif self.__haplotypes is not None:
253            # Divide by 2 because each sample has two associated haplotypes
254            return len(self.__haplotypes) // 2
255        else:
256            # Divide by 2 because columns represent haplotypes
257            return self.__lai.shape[1] // 2

Retrieve n_samples.

Returns:

int: The total number of samples.

n_ancestries: int
259    @property
260    def n_ancestries(self) -> int:
261        """
262        Retrieve `n_ancestries`.
263
264        Returns:
265            **int:** The total number of unique ancestries.
266        """
267        return len(np.unique(self.__lai))

Retrieve n_ancestries.

Returns:

int: The total number of unique ancestries.

n_haplotypes: int
269    @property
270    def n_haplotypes(self) -> int:
271        """
272        Retrieve `n_haplotypes`.
273
274        Returns:
275            **int:** The total number of haplotypes.
276        """
277        if self.__haplotypes is not None:
278            return len(self.__haplotypes)
279        else:
280            return self.__lai.shape[1]

Retrieve n_haplotypes.

Returns:

int: The total number of haplotypes.

n_windows: int
282    @property
283    def n_windows(self) -> int:
284        """
285        Retrieve `n_windows`.
286
287        Returns:
288            **int:** The total number of genomic windows.
289        """
290        return self.__lai.shape[0]

Retrieve n_windows.

Returns:

int: The total number of genomic windows.

def copy(self) -> 'LocalAncestryObject':
292    def copy(self) -> 'LocalAncestryObject':
293        """
294        Create and return a copy of `self`.
295
296        Returns:
297            **LocalAncestryObject:** 
298                A new instance of the current object.
299        """
300        return copy.copy(self)

Create and return a copy of self.

Returns:

LocalAncestryObject: A new instance of the current object.

def keys(self) -> List[str]:
302    def keys(self) -> List[str]:
303        """
304        Retrieve a list of public attribute names for `self`.
305
306        Returns:
307            **list of str:** 
308                A list of attribute names, with internal name-mangling removed, 
309                for easier reference to public attributes in the instance.
310        """
311        return [attr.replace('_LocalAncestryObject__', '').replace('_AncestryObject__', '') for attr in vars(self)]

Retrieve a list of public attribute names for self.

Returns:

list of str: A list of attribute names, with internal name-mangling removed, for easier reference to public attributes in the instance.

def filter_windows( self, indexes: int | Sequence[int] | numpy.ndarray, include: bool = True, inplace: bool = False) -> ForwardRef('LocalAncestryObject') | None:
313    def filter_windows(
314            self,
315            indexes: Union[int, Sequence[int], np.ndarray],
316            include: bool = True,
317            inplace: bool = False
318        ) -> Optional['LocalAncestryObject']:
319        """
320        Filter genomic windows based on specified indexes. 
321
322        This method updates the `lai` attribute to include or exclude the specified genomic windows. 
323        Attributes such as `window_sizes`, `centimorgan_pos`, `chromosomes`, and `physical_pos` will also be 
324        updated accordingly if they are not None. The order of genomic windows is preserved.
325
326        Negative indexes are supported and follow 
327        [NumPy's indexing conventions](https://numpy.org/doc/stable/user/basics.indexing.html). 
328
329        Args:
330            indexes (int or array-like of int): 
331                Index(es) of the windows to include or exclude. Can be a single integer or a
332                sequence of integers. Negative indexes are supported.
333            include (bool, default=True): 
334                If True, includes only the specified windows. If False, excludes the specified
335                windows. Default is True.
336            inplace (bool, default=False): 
337                If True, modifies `self` in place. If False, returns a new `LocalAncestryObject` with 
338                the windows filtered. Default is False.
339
340        Returns:
341            **Optional[LocalAncestryObject]:** 
342                A new `LocalAncestryObject` with the specified windows filtered if `inplace=False`. 
343                If `inplace=True`, modifies `self` in place and returns None.
344        """
345        # Convert indexes to a NumPy array
346        indexes = np.atleast_1d(indexes)
347
348        # Get total number of windows
349        n_windows = self.n_windows
350
351        # Validate indexes, allowing negative indexes
352        if np.any((indexes < -n_windows) | (indexes >= n_windows)):
353            raise IndexError("One or more indexes are out of bounds.")
354
355        # Create boolean mask
356        mask = np.zeros(n_windows, dtype=bool)
357        mask[indexes] = True
358
359        # Invert mask if `include=False`
360        if not include:
361            mask = ~mask
362        
363        # Filter `lai`
364        filtered_lai = self['lai'][mask, :] 
365        
366        # Filter `window_sizes`, `chromosomes`, `centimorgan_pos`, and `physical_pos`, checking if they are None before filtering
367        filtered_window_sizes = self['window_sizes'][mask] if self['window_sizes'] is not None else None
368        filtered_chromosomes = self['chromosomes'][mask] if self['chromosomes'] is not None else None
369        filtered_centimorgan_pos = self['centimorgan_pos'][mask, :] if self['centimorgan_pos'] is not None else None
370        filtered_physical_pos = self['physical_pos'][mask, :] if self['physical_pos'] is not None else None
371
372        # Modify the original object if `inplace=True`, otherwise create and return a copy
373        if inplace:
374            self['lai'] = filtered_lai
375            if filtered_window_sizes is not None:
376                self['window_sizes'] = filtered_window_sizes
377            if filtered_chromosomes is not None:
378                self['chromosomes'] = filtered_chromosomes
379            if filtered_centimorgan_pos is not None:
380                self['centimorgan_pos'] = filtered_centimorgan_pos
381            if filtered_physical_pos is not None:
382                self['physical_pos'] = filtered_physical_pos
383            return None
384        else:
385            laiobj = self.copy()
386            laiobj['lai'] = filtered_lai
387            if filtered_window_sizes is not None:
388                laiobj['window_sizes'] = filtered_window_sizes
389            if filtered_chromosomes is not None:
390                laiobj['chromosomes'] = filtered_chromosomes
391            if filtered_centimorgan_pos is not None:
392                laiobj['centimorgan_pos'] = filtered_centimorgan_pos
393            if filtered_physical_pos is not None:
394                laiobj['physical_pos'] = filtered_physical_pos
395            return laiobj

Filter genomic windows based on specified indexes.

This method updates the lai attribute to include or exclude the specified genomic windows. Attributes such as window_sizes, centimorgan_pos, chromosomes, and physical_pos will also be updated accordingly if they are not None. The order of genomic windows is preserved.

Negative indexes are supported and follow NumPy's indexing conventions.

Arguments:
  • indexes (int or array-like of int): Index(es) of the windows to include or exclude. Can be a single integer or a sequence of integers. Negative indexes are supported.
  • include (bool, default=True): If True, includes only the specified windows. If False, excludes the specified windows. Default is True.
  • inplace (bool, default=False): If True, modifies self in place. If False, returns a new LocalAncestryObject with the windows filtered. Default is False.
Returns:

Optional[LocalAncestryObject]: A new LocalAncestryObject with the specified windows filtered if inplace=False. If inplace=True, modifies self in place and returns None.

def filter_samples( self, samples: str | Sequence[str] | numpy.ndarray | None = None, indexes: int | Sequence[int] | numpy.ndarray | None = None, include: bool = True, reorder: bool = False, inplace: bool = False) -> ForwardRef('LocalAncestryObject') | None:
397    def filter_samples(
398        self,
399        samples: Optional[Union[str, Sequence[str], np.ndarray, None]] = None,
400        indexes: Optional[Union[int, Sequence[int], np.ndarray, None]] = None,
401        include: bool = True,
402        reorder: bool = False,
403        inplace: bool = False
404    ) -> Optional['LocalAncestryObject']:
405        """
406        Filter samples based on specified names or indexes.
407
408        This method updates the `lai`, `haplotypes`, and `samples` attributes to include or exclude the specified 
409        samples. Each sample is associated with two haplotypes, which are included or excluded together.
410        The order of the samples is preserved. Set `reorder=True` to match the ordering of the
411        provided `samples` and/or `indexes` lists when including.
412
413        If both samples and indexes are provided, any sample matching either a name in samples or an index in 
414        indexes will be included or excluded.
415        
416        Negative indexes are supported and follow 
417        [NumPy's indexing conventions](https://numpy.org/doc/stable/user/basics.indexing.html).
418
419        Args:
420            samples (str or array_like of str, optional): 
421                 Name(s) of the samples to include or exclude. Can be a single sample name or a
422                 sequence of sample names. Default is None.
423            indexes (int or array_like of int, optional):
424                Index(es) of the samples to include or exclude. Can be a single index or a sequence
425                of indexes. Negative indexes are supported. Default is None.
426            include (bool, default=True): 
427                If True, includes only the specified samples. If False, excludes the specified
428                samples. Default is True.
429            inplace (bool, default=False): 
430                If True, modifies `self` in place. If False, returns a new `LocalAncestryObject` with the 
431                samples filtered. Default is False.
432
433        Returns:
434            **Optional[LocalAncestryObject]:** 
435                A new `LocalAncestryObject` with the specified samples filtered if `inplace=False`. 
436                If `inplace=True`, modifies `self` in place and returns None.
437        """
438        if samples is None and indexes is None:
439            raise UserWarning("At least one of 'samples' or 'indexes' must be provided.")
440
441        n_haplotypes = self.n_haplotypes
442        n_samples = self.n_samples
443
444        # Create mask based on sample names
445        if samples is not None:
446            samples = np.asarray(samples).ravel()
447            # Extract sample names from haplotype identifiers
448            haplotype_ids = np.array(self['haplotypes'])
449            sample_names = np.array([hap.split('.')[0] for hap in haplotype_ids])
450            # Create mask for haplotypes belonging to specified samples
451            mask_samples = np.isin(sample_names, samples)
452        else:
453            mask_samples = np.zeros(n_haplotypes, dtype=bool)
454
455        # Create mask based on sample indexes
456        if indexes is not None:
457            indexes = np.asarray(indexes).ravel()
458
459            # Validate indexes, allowing negative indexes
460            out_of_bounds_indexes = indexes[(indexes < -n_samples) | (indexes >= n_samples)]
461            if out_of_bounds_indexes.size > 0:
462                raise ValueError(f"One or more sample indexes are out of bounds.")
463
464            # Adjust negative indexes
465            indexes = np.mod(indexes, n_samples)
466            
467            # Get haplotype indexes for the specified sample indexes
468            haplotype_indexes = np.concatenate([2*indexes, 2*indexes+1])
469            # Create mask for haplotypes
470            mask_indexes = np.zeros(n_haplotypes, dtype=bool)
471            mask_indexes[haplotype_indexes] = True
472        else:
473            mask_indexes = np.zeros(n_haplotypes, dtype=bool)
474
475        # Combine masks using logical OR (union of samples)
476        mask_combined = mask_samples | mask_indexes
477
478        if not include:
479            mask_combined = ~mask_combined
480
481        # Optionally compute an ordering of selected samples that follows the provided lists
482        ordered_sample_indices = None
483        sample_mask = mask_combined.reshape(-1, 2).any(axis=1)
484        if include and reorder:
485            sel_sample_indices = np.where(sample_mask)[0]
486            ordered_list: List[int] = []
487            added = np.zeros(self.n_samples, dtype=bool)
488
489            # Source of sample names for ordering logic
490            haplotype_ids = np.array(self['haplotypes'])
491            sample_names_by_sample = np.array([hap.split('.')[0] for hap in haplotype_ids])[::2]
492
493            # Respect the order in `samples`
494            if samples is not None:
495                for s in np.atleast_1d(samples):
496                    # Find the sample index by name (first occurrence)
497                    matches = np.where(sample_names_by_sample == s)[0]
498                    for idx in matches:
499                        if sample_mask[idx] and not added[idx]:
500                            ordered_list.append(int(idx))
501                            added[idx] = True
502
503            # Then respect the order in `indexes`
504            if indexes is not None:
505                adj_idx = np.mod(np.atleast_1d(indexes), self.n_samples)
506                for idx in adj_idx:
507                    if sample_mask[idx] and not added[idx]:
508                        ordered_list.append(int(idx))
509                        added[idx] = True
510
511            # Append any remaining selected samples in their original order
512            for idx in sel_sample_indices:
513                if not added[idx]:
514                    ordered_list.append(int(idx))
515
516            ordered_sample_indices = np.asarray(ordered_list, dtype=int)
517
518        # Filter / reorder arrays
519        if ordered_sample_indices is not None:
520            hap_idx = np.concatenate([2*ordered_sample_indices, 2*ordered_sample_indices + 1])
521            filtered_lai = self['lai'][:, hap_idx]
522            filtered_haplotypes = np.array(self['haplotypes'])[hap_idx].tolist()
523            filtered_samples = (
524                np.array(self['samples'])[ordered_sample_indices].tolist()
525                if self['samples'] is not None else None
526            )
527        else:
528            # Filter `lai`
529            filtered_lai = self['lai'][:, mask_combined]
530            # Filter `haplotypes`
531            filtered_haplotypes = np.array(self['haplotypes'])[mask_combined].tolist()
532            # Filter `samples`, checking if they are None before filtering
533            filtered_samples = np.array(self['samples'])[sample_mask].tolist() if self['samples'] is not None else None
534
535        if inplace:
536            self['haplotypes'] = filtered_haplotypes
537            self['samples'] = filtered_samples
538            self['lai'] = filtered_lai
539            return None
540        else:
541            laiobj = self.copy()
542            laiobj['haplotypes'] = filtered_haplotypes
543            laiobj['samples'] = filtered_samples
544            laiobj['lai'] = filtered_lai
545            return laiobj

Filter samples based on specified names or indexes.

This method updates the lai, haplotypes, and samples attributes to include or exclude the specified samples. Each sample is associated with two haplotypes, which are included or excluded together. The order of the samples is preserved. Set reorder=True to match the ordering of the provided samples and/or indexes lists when including.

If both samples and indexes are provided, any sample matching either a name in samples or an index in indexes will be included or excluded.

Negative indexes are supported and follow NumPy's indexing conventions.

Arguments:
  • samples (str or array_like of str, optional): Name(s) of the samples to include or exclude. Can be a single sample name or a sequence of sample names. Default is None.
  • indexes (int or array_like of int, optional): Index(es) of the samples to include or exclude. Can be a single index or a sequence of indexes. Negative indexes are supported. Default is None.
  • include (bool, default=True): If True, includes only the specified samples. If False, excludes the specified samples. Default is True.
  • inplace (bool, default=False): If True, modifies self in place. If False, returns a new LocalAncestryObject with the samples filtered. Default is False.
Returns:

Optional[LocalAncestryObject]: A new LocalAncestryObject with the specified samples filtered if inplace=False. If inplace=True, modifies self in place and returns None.

def convert_to_snp_level( self, snpobject: ForwardRef('SNPObject') | None = None, variants_chrom: numpy.ndarray | None = None, variants_pos: numpy.ndarray | None = None, variants_ref: numpy.ndarray | None = None, variants_alt: numpy.ndarray | None = None, variants_filter_pass: numpy.ndarray | None = None, variants_id: numpy.ndarray | None = None, variants_qual: numpy.ndarray | None = None, lai_format: str = '3D') -> 'SNPObject':
547    def convert_to_snp_level(
548        self,
549        snpobject: Optional['SNPObject'] = None,
550        variants_chrom: Optional[np.ndarray] = None,
551        variants_pos: Optional[np.ndarray] = None,
552        variants_ref: Optional[np.ndarray] = None,
553        variants_alt: Optional[np.ndarray] = None,
554        variants_filter_pass: Optional[np.ndarray] = None,
555        variants_id: Optional[np.ndarray] = None,
556        variants_qual: Optional[np.ndarray] = None,
557        lai_format: str = "3D"
558    ) -> 'SNPObject':
559        """
560        Convert `self` into a `snputils.snp.genobj.SNPObject` SNP-level Local Ancestry Information (LAI), 
561        with optional support for SNP data.
562        
563        If SNP positions (`variants_pos`) and/or chromosomes (`variants_chrom`) are not specified, the method generates 
564        SNPs uniformly across the start and end positions of each genomic window. Otherwise, the provided SNP 
565        coordinates are used to assign ancestry values based on their respective windows.
566
567        If a `SNPObject` is provided, its attributes are used unless explicitly overridden by the function arguments.
568        In that case, the SNPObject is updated with the (optional) new attributes and the computed `calldata_lai`, then returned.
569
570        Args:
571            snpobject (SNPObject, optional):
572                An existing `SNPObject` to extract SNP attributes from.
573            variants_chrom (array of shape (n_snps,), optional): 
574                An array containing the chromosome for each SNP.
575            variants_pos (array of shape (n_snps,), optional): 
576                An array containing the chromosomal positions for each SNP.
577            variants_ref (array of shape (n_snps,), optional): 
578                An array containing the reference allele for each SNP.
579            variants_alt (array of shape (n_snps,), optional): 
580                An array containing the alternate allele for each SNP.
581            variants_filter_pass (array of shape (n_snps,), optional): 
582                An array indicating whether each SNP passed control checks.
583            variants_id (array of shape (n_snps,), optional): 
584                An array containing unique identifiers (IDs) for each SNP.
585            variants_qual (array of shape (n_snps,), optional): 
586                An array containing the Phred-scaled quality score for each SNP.
587            lai_format (str, optional):
588                Determines the shape of `calldata_lai`:
589                    - `"3D"` (default): Shape `(n_snps, n_samples, 2)`.
590                    - `"2D"`: Shape `(n_snps, n_samples * 2)`.
591
592        Returns:
593            **SNPObject**: 
594                A `SNPObject` containing SNP-level ancestry data and updated SNP attributes.
595        """
596        from snputils.snp.genobj.snpobj import SNPObject
597
598        assert lai_format in {"2D", "3D"}, "Invalid `lai_format`. Must be '2D' or '3D'."
599
600        # Extract attributes from SNPObject if provided
601        if snpobject is not None:
602            variants_chrom = variants_chrom or snpobject.variants_chrom
603            variants_pos = variants_pos or snpobject.variants_pos
604            variants_ref = variants_ref or snpobject.variants_ref
605            variants_alt = variants_alt or snpobject.variants_alt
606            variants_filter_pass = variants_filter_pass or snpobject.variants_filter_pass
607            variants_id = variants_id or snpobject.variants_id
608            variants_qual = variants_qual or snpobject.variants_qual
609
610        n_samples = self.n_samples
611        lai_reshaped = self.lai.reshape(self.n_windows, n_samples, 2).astype(int) if lai_format == "3D" else None
612
613        if variants_pos is None or variants_chrom is None:
614            # Generate all SNP positions and corresponding chromosome labels between window boundaries
615            variants_pos_list = []
616            variants_chrom_list = []
617            ancestry_list = []
618
619            for i in range(self.n_windows):
620                start = int(self.physical_pos[i, 0])
621                end = int(self.physical_pos[i, 1])
622                chrom = self.chromosomes[i]
623
624                # Generate SNP positions at each base pair within the window range
625                positions_in_window = np.arange(start, end + 1)
626                if positions_in_window.size == 0:
627                    continue  # Skip windows that contain no valid SNP positions
628
629                n_positions = positions_in_window.size
630                variants_pos_list.append(positions_in_window)
631                variants_chrom_list.append(np.full(n_positions, chrom))
632
633                ancestry_repeated = (
634                    np.repeat(lai_reshaped[i, np.newaxis, :, :], n_positions, axis=0)
635                    if lai_format == "3D" else np.repeat(self.lai[i, np.newaxis, :], n_positions, axis=0)
636                )
637                ancestry_list.append(ancestry_repeated)
638
639            # Store SNP positions, their corresponding chromosome labels, and their associated ancestry
640            variants_pos = np.concatenate(variants_pos_list)
641            variants_chrom = np.concatenate(variants_chrom_list)
642            calldata_lai = np.concatenate(ancestry_list)
643        else:
644            # Use the provided SNP positions and chromosomes
645            n_snps = len(variants_pos)
646            if len(variants_chrom) != n_snps:
647                raise ValueError("`variants_pos` and `variants_chrom` must have the same length.")
648
649            # Initialize an array to store the corresponding window index for each SNP
650            # Default value is -1, meaning no matching window found
651            snp_to_window_indices = np.full(n_snps, -1, dtype=int)
652
653            # Identify unique chromosome names sorted by order of appearence
654            _, idx = np.unique(variants_chrom, return_index=True)
655            unique_chroms = variants_chrom[np.sort(idx)]
656
657            # Iterate through each unique chromosome to map SNPs to windows
658            for chrom in unique_chroms:
659                # Get indices of SNPs that belong to the current chromosome
660                snp_indices = np.where(variants_chrom == chrom)[0]
661                snp_pos_chr = variants_pos[snp_indices]
662                
663                # Get indices of windows that belong to the current chromosome
664                window_indices = np.where(self.chromosomes == chrom)[0]
665                if window_indices.size == 0:
666                    continue  # Skip if no windows exist for this chromosome
667                
668                # Extract start and end positions of the windows on this chromosome
669                window_starts_chr = self.physical_pos[:, 0][window_indices]
670                window_ends_chr = self.physical_pos[:, 1][window_indices]
671                
672                # Find the right-most window that a SNP would fit into (sorted order)
673                inds = np.searchsorted(window_starts_chr, snp_pos_chr, side='right') - 1
674                
675                # Mask valid SNPs: ensure they are within a valid range and fall inside window boundaries
676                valid_mask = (inds >= 0) & (inds < len(window_starts_chr)) & (snp_pos_chr <= window_ends_chr[inds])
677
678                # Assign valid SNPs to their corresponding window indices
679                snp_to_window_indices[snp_indices[valid_mask]] = window_indices[inds[valid_mask]]
680                log.debug(f"Number of SNPs within window ranges for chromosome {chrom}: {valid_mask.sum()}")
681
682            # Initialize SNP-level ancestry array with NaNs as default
683            shape = (n_snps, n_samples, 2) if lai_format == "3D" else (n_snps, n_samples * 2)
684            calldata_lai = np.full(shape, np.nan, dtype='uint8')
685
686            # Assign ancestry values to SNPs with valid window assignments
687            valid_mask = (snp_to_window_indices != -1)
688            snp_indices = np.where(valid_mask)[0]
689            snp_to_window_indices = snp_to_window_indices[snp_indices]
690
691            if lai_format == "3D":
692                calldata_lai[snp_indices] = lai_reshaped[snp_to_window_indices]
693            else:  # "2D"
694                calldata_lai[snp_indices] = self.lai[snp_to_window_indices]
695
696        if snpobject is not None:
697            # If a SNPObject was provided, update its attributes with any new values and add `calldata_lai``
698            snpobject.variants_chrom = variants_chrom
699            snpobject.variants_pos = variants_pos
700            snpobject.variants_ref = variants_ref
701            snpobject.variants_alt = variants_alt
702            snpobject.variants_filter_pass = variants_filter_pass
703            snpobject.variants_id = variants_id
704            snpobject.variants_qual = variants_qual
705            snpobject.calldata_lai = calldata_lai
706            snpobject.ancestry_map = self.ancestry_map
707            return snpobject
708        else:
709            # Otherwise, create a new SNPObject
710            return SNPObject(
711                calldata_lai=calldata_lai.view(),
712                samples=self.samples,
713                variants_ref=variants_ref.view() if isinstance(variants_ref, np.ndarray) else variants_ref,
714                variants_alt=variants_alt.view() if isinstance(variants_alt, np.ndarray) else variants_alt,
715                variants_filter_pass=variants_filter_pass.view() if isinstance(variants_filter_pass, np.ndarray) else variants_filter_pass,
716                variants_chrom=variants_chrom.view(),
717                variants_id=variants_id.view() if isinstance(variants_id, np.ndarray) else variants_id,
718                variants_pos=variants_pos.view(),
719                variants_qual=variants_qual.view() if isinstance(variants_qual, np.ndarray) else variants_qual,
720                ancestry_map=self.ancestry_map
721            )

Convert self into a snputils.snp.genobj.SNPObject SNP-level Local Ancestry Information (LAI), with optional support for SNP data.

If SNP positions (variants_pos) and/or chromosomes (variants_chrom) are not specified, the method generates SNPs uniformly across the start and end positions of each genomic window. Otherwise, the provided SNP coordinates are used to assign ancestry values based on their respective windows.

If a SNPObject is provided, its attributes are used unless explicitly overridden by the function arguments. In that case, the SNPObject is updated with the (optional) new attributes and the computed calldata_lai, then returned.

Arguments:
  • snpobject (SNPObject, optional): An existing SNPObject to extract SNP attributes from.
  • variants_chrom (array of shape (n_snps,), optional): An array containing the chromosome for each SNP.
  • variants_pos (array of shape (n_snps,), optional): An array containing the chromosomal positions for each SNP.
  • variants_ref (array of shape (n_snps,), optional): An array containing the reference allele for each SNP.
  • variants_alt (array of shape (n_snps,), optional): An array containing the alternate allele for each SNP.
  • variants_filter_pass (array of shape (n_snps,), optional): An array indicating whether each SNP passed control checks.
  • variants_id (array of shape (n_snps,), optional): An array containing unique identifiers (IDs) for each SNP.
  • variants_qual (array of shape (n_snps,), optional): An array containing the Phred-scaled quality score for each SNP.
  • lai_format (str, optional): Determines the shape of calldata_lai:
    • "3D" (default): Shape (n_snps, n_samples, 2).
    • "2D": Shape (n_snps, n_samples * 2).
Returns:

SNPObject: A SNPObject containing SNP-level ancestry data and updated SNP attributes.

def save(self, file: str | pathlib.Path) -> None:
745    def save(self, file: Union[str, Path]) -> None:
746        """
747        Save the data stored in `self` to a specified file.
748        If the file already exists, it will be overwritten.
749
750        The format of the saved file is determined by the file extension provided in the `file` 
751        argument.
752
753        **Supported formats:**
754
755        - `.msp`: Text-based MSP format.
756        - `.msp.tsv`: Text-based MSP format with TSV extension.
757        - `.pkl`: Pickle format for saving `self` in serialized form.
758
759        Args:
760            file (str or pathlib.Path): 
761                Path to the file where the data will be saved. The extension of the file determines the save format. 
762                Supported extensions: `.msp`, `.msp.tsv`, `.pkl`.
763        """
764        path = Path(file)
765        suffixes = [suffix.lower() for suffix in path.suffixes]
766
767        if suffixes[-2:] == ['.msp', '.tsv'] or suffixes[-1] == '.msp':
768            self.save_msp(file)
769        elif suffixes[-1] == '.pkl':
770            self.save_pickle(file)
771        else:
772            raise ValueError(
773                f"Unsupported file extension: {suffixes[-1]}"
774                "Supported extensions are: .msp, .msp.tsv, .pkl."
775            )

Save the data stored in self to a specified file. If the file already exists, it will be overwritten.

The format of the saved file is determined by the file extension provided in the file argument.

Supported formats:

  • .msp: Text-based MSP format.
  • .msp.tsv: Text-based MSP format with TSV extension.
  • .pkl: Pickle format for saving self in serialized form.
Arguments:
  • file (str or pathlib.Path): Path to the file where the data will be saved. The extension of the file determines the save format. Supported extensions: .msp, .msp.tsv, .pkl.
def save_msp(self, file: str | pathlib.Path) -> None:
777    def save_msp(self, file: Union[str, Path]) -> None:
778        """
779        Save the data stored in `self` to a `.msp` file.
780        If the file already exists, it will be overwritten.
781
782        Args:
783            file (str or pathlib.Path): 
784                Path to the file where the data will be saved. It should end with `.msp` or `.msp.tsv`. 
785                If the provided path does not have one of these extensions, the `.msp` extension will be appended.
786        """
787        from snputils.ancestry.io.local.write import MSPWriter
788
789        MSPWriter(self, file).write()

Save the data stored in self to a .msp file. If the file already exists, it will be overwritten.

Arguments:
  • file (str or pathlib.Path): Path to the file where the data will be saved. It should end with .msp or .msp.tsv. If the provided path does not have one of these extensions, the .msp extension will be appended.
def save_pickle(self, file: str | pathlib.Path) -> None:
791    def save_pickle(self, file: Union[str, Path]) -> None:
792        """
793        Save `self` in serialized form to a `.pkl` file.
794        If the file already exists, it will be overwritten.
795
796        Args:
797            file (str or pathlib.Path): 
798                Path to the file where the data will be saved. It should end with `.pkl`. 
799                If the provided path does not have this extension, it will be appended.
800        """
801        import pickle
802        with open(file, 'wb') as file:
803            pickle.dump(self, file)

Save self in serialized form to a .pkl file. If the file already exists, it will be overwritten.

Arguments:
  • file (str or pathlib.Path): Path to the file where the data will be saved. It should end with .pkl. If the provided path does not have this extension, it will be appended.