snputils.ancestry.genobj
10class GlobalAncestryObject(AncestryObject): 11 """ 12 A class for Global Ancestry Inference (GAI) data. 13 """ 14 def __init__( 15 self, 16 Q: np.ndarray, 17 P: Optional[np.ndarray] = None, 18 samples: Optional[Sequence] = None, 19 snps: Optional[Sequence] = None, 20 ancestries: Optional[Sequence] = None 21 ) -> None: 22 """ 23 Args: 24 Q (array of shape (n_samples, n_ancestries)): 25 A 2D array containing per-sample ancestry proportions. Each row corresponds to a sample, 26 and each column corresponds to an ancestry. 27 P (array of shape (n_snps, n_ancestries)): 28 A 2D array containing per-ancestry SNP frequencies. Each row corresponds to a SNP, 29 and each column corresponds to an ancestry. 30 samples (sequence of length n_samples, optional): 31 A sequence containing unique identifiers for each sample. If None, sample identifiers 32 are assigned as integers from `0` to `n_samples - 1`. 33 snps (sequence of length n_snps, optional): 34 A sequence containing identifiers for each SNP. If None, SNPs are assigned as integers 35 from `0` to `n_snps - 1`. 36 ancestries (sequence of length n_samples, optional): 37 A sequence containing ancestry labels for each sample. 38 """ 39 # Determine dimensions 40 n_samples, n_ancestries_Q = Q.shape 41 if P is not None: 42 n_snps, n_ancestries_P = P.shape 43 if n_ancestries_Q != n_ancestries_P: 44 raise ValueError( 45 f"The number of ancestries in Q ({n_ancestries_Q}) and P ({n_ancestries_P}) must be the same." 46 ) 47 48 n_ancestries = n_ancestries_Q 49 50 # Assign default sample identifiers if none provided 51 if samples is None: 52 samples = list(range(n_samples)) 53 else: 54 samples = list(samples) 55 if len(samples) != n_samples: 56 raise ValueError( 57 f"Length of samples ({len(samples)}) does not match number of samples ({n_samples})." 58 ) 59 60 # Assign default SNP identifiers if none provided 61 if P is None: 62 snps = None 63 else: 64 if snps is None: 65 snps = list(range(n_snps)) 66 else: 67 snps = list(snps) 68 if len(snps) != n_snps: 69 raise ValueError( 70 f"Length of snps ({len(snps)}) does not match number of SNPs ({n_snps})." 71 ) 72 73 if ancestries is not None: 74 if len(ancestries) != n_samples: 75 raise ValueError( 76 f"Length of ancestries ({len(ancestries)}) does not match number of samples ({n_samples})." 77 ) 78 79 super().__init__(n_samples, n_ancestries) 80 81 # Store attributes 82 self.__Q = Q 83 self.__P = P 84 self.__samples = np.asarray(samples) 85 self.__snps = np.asarray(snps) if snps is not None else None 86 self.__ancestries = np.asarray(ancestries) if ancestries is not None else None 87 88 # Perform sanity checks 89 self._sanity_check() 90 91 def __getitem__(self, key): 92 """ 93 To access an attribute of the class using the square bracket notation, 94 similar to a dictionary. 95 """ 96 try: 97 return getattr(self, key) 98 except AttributeError: 99 raise KeyError(f'Invalid key: {key}') 100 101 def __setitem__(self, key, value): 102 """ 103 To set an attribute of the class using the square bracket notation, 104 similar to a dictionary. 105 """ 106 try: 107 setattr(self, key, value) 108 except AttributeError: 109 raise KeyError(f'Invalid key: {key}') 110 111 @property 112 def Q(self) -> np.ndarray: 113 """ 114 Retrieve `Q`. 115 116 Returns: 117 **array of shape (n_samples, n_ancestries):** 118 A 2D array containing per-sample ancestry proportions. Each row corresponds to a sample, 119 and each column corresponds to an ancestry. 120 """ 121 return self.__Q 122 123 @Q.setter 124 def Q(self, x: np.ndarray): 125 """ 126 Update `Q`. 127 """ 128 if x.shape != (self.n_samples, self.n_ancestries): 129 raise ValueError( 130 f"Q must have shape ({self.n_samples}, {self.n_ancestries}); got {x.shape}." 131 ) 132 self.__Q = x 133 134 @property 135 def P(self) -> np.ndarray: 136 """ 137 Retrieve `P`. 138 139 Returns: 140 **array of shape (n_snps, n_ancestries):** 141 A 2D array containing per-ancestry SNP frequencies. Each row corresponds to a SNP, 142 and each column corresponds to an ancestry. 143 """ 144 return self.__P 145 146 @P.setter 147 def P(self, x: np.ndarray): 148 """ 149 Update `P`. 150 """ 151 if x.shape[1] != self.n_ancestries: 152 raise ValueError( 153 f"P must have {self.n_ancestries} columns (one per ancestry); got shape {x.shape}." 154 ) 155 self.__P = x 156 self._sanity_check() 157 158 @property 159 def F(self) -> np.ndarray: 160 """ 161 Alias for `P`. 162 163 Returns: 164 **array of shape (n_snps, n_ancestries):** 165 A 2D array containing per-ancestry SNP frequencies. Each row corresponds to a SNP, 166 and each column corresponds to an ancestry. 167 """ 168 return self.P 169 170 @F.setter 171 def F(self, x: np.ndarray): 172 """ 173 Update `F`. 174 """ 175 if x.shape[1] != self.n_ancestries: 176 raise ValueError( 177 f"F must have {self.n_ancestries} columns (one per ancestry); got shape {x.shape}." 178 ) 179 self.__P = x 180 181 @property 182 def samples(self) -> Optional[np.ndarray]: 183 """ 184 Retrieve `samples`. 185 186 Returns: 187 **array of shape (n_samples,):** 188 An array containing unique identifiers for each sample. If None, sample 189 identifiers are assigned as integers from `0` to `n_samples - 1`. 190 """ 191 return self.__samples 192 193 @samples.setter 194 def samples(self, x: Sequence): 195 """ 196 Update `samples`. 197 """ 198 x = list(x) 199 if len(x) != self.n_samples: 200 raise ValueError( 201 f"samples must have length {self.n_samples}; got length {len(x)}." 202 ) 203 self.__samples = x 204 205 @property 206 def snps(self) -> Optional[np.ndarray]: 207 """ 208 Retrieve `snps`. 209 210 Returns: 211 **array of shape (n_snps,):** 212 An array containing identifiers for each SNP. If None, SNPs are assigned as integers 213 from `0` to `n_snps - 1`. 214 """ 215 return self.__snps 216 217 @snps.setter 218 def snps(self, x: Sequence): 219 """ 220 Update `snps`. 221 """ 222 x = list(x) 223 if len(x) != self.n_snps: 224 raise ValueError( 225 f"snps must have length {self.n_snps}; got length {len(x)}." 226 ) 227 self.__snps = np.asarray(x) 228 229 @property 230 def ancestries(self) -> Optional[np.ndarray]: 231 """ 232 Retrieve `ancestries`. 233 234 Returns: 235 **array of shape (n_samples,):** 236 An array containing ancestry labels for each sample. 237 """ 238 return self.__ancestries 239 240 @ancestries.setter 241 def ancestries(self, x: Sequence): 242 """ 243 Update `ancestries`. 244 """ 245 x = list(x) 246 num_x = len(x) 247 num_unique_x = len(np.unique(x)) 248 249 if num_x != self.n_samples: 250 raise ValueError( 251 f"ancestries must have length {self.n_samples}; got length {num_x}." 252 ) 253 if num_unique_x > self.n_ancestries: 254 raise ValueError( 255 f"Number of unique ancestry labels must be less than or equal to {self.n_ancestries}; got {num_unique_x} unique labels." 256 ) 257 self.__ancestries = np.asarray(x) 258 259 @property 260 def n_samples(self) -> int: 261 """ 262 Retrieve `n_samples`. 263 264 Returns: 265 **int:** The total number of samples. 266 """ 267 return self.__Q.shape[0] 268 269 @property 270 def n_snps(self) -> int: 271 """ 272 Retrieve `n_snps`. 273 274 Returns: 275 **int:** The total number of SNPs. 276 """ 277 return 0 if self.__P is None else self.__P.shape[0] 278 279 @property 280 def n_ancestries(self) -> int: 281 """ 282 Retrieve `n_ancestries`. 283 284 Returns: 285 **int:** The total number of unique ancestries. 286 """ 287 return self.__Q.shape[1] 288 289 def copy(self) -> 'GlobalAncestryObject': 290 """ 291 Create and return a copy of `self`. 292 293 Returns: 294 **GlobalAncestryObject:** A new instance of the current object. 295 """ 296 return copy.copy(self) 297 298 def keys(self) -> List[str]: 299 """ 300 Retrieve a list of public attribute names for `self`. 301 302 Returns: 303 **list of str:** 304 A list of attribute names, with internal name-mangling removed, 305 for easier reference to public attributes in the instance. 306 """ 307 return [attr.replace('_GlobalAncestryObject__', '').replace('_AncestryObject__', '') for attr in vars(self)] 308 309 def _sanity_check(self) -> None: 310 """ 311 Perform sanity checks to ensure that matrix dimensions are consistent with expected sizes. 312 313 Raises: 314 **ValueError:** If any of the matrix dimensions do not match the expected sizes. 315 """ 316 # Check that the Q matrix has the correct shape 317 if self.__Q.shape != (self.n_samples, self.n_ancestries): 318 raise ValueError( 319 f"Q must have shape ({self.n_samples}, {self.n_ancestries}); got {self.__Q.shape}." 320 ) 321 322 # Check that the P matrix has the correct shape (if provided) 323 if self.__P is not None: 324 if self.__P.shape != (self.n_snps, self.n_ancestries): 325 raise ValueError( 326 f"P must have shape ({self.n_snps}, {self.n_ancestries}); got {self.__P.shape}." 327 ) 328 329 # Check that samples length matches n_samples 330 if self.samples is not None: 331 if len(self.__samples) != self.n_samples: 332 raise ValueError( 333 f"samples must have length {self.n_samples}; got length {len(self.__samples)}." 334 ) 335 336 # Check that snps length matches n_snps 337 if self.snps is not None: 338 if len(self.__snps) != self.n_snps: 339 raise ValueError( 340 f"snps must have length {self.n_snps}; got length {len(self.__snps)}." 341 ) 342 343 # Check that ancestries length matches n_samples 344 if self.ancestries is not None: 345 if len(self.__ancestries) != self.n_samples: 346 raise ValueError( 347 f"ancestries must have length {self.n_samples}; got length {len(self.__ancestries)}." 348 ) 349 350 # Check number of unique ancestry labels 351 num_unique_ancestries = len(np.unique(self.__ancestries)) 352 if num_unique_ancestries > self.n_ancestries: 353 raise ValueError( 354 f"Number of unique ancestry labels must be less than or equal to {self.n_ancestries}; got {num_unique_ancestries} unique labels." 355 ) 356 357 def save(self, file: Union[str, Path]) -> None: 358 """ 359 Save the data stored in `self` to a specified file or set of files. 360 361 The format of the saved file(s) is determined by the file extension provided in the `file` 362 argument. If the extension is `.pkl`, the object is serialized as a pickle file. Otherwise, 363 the file is treated as a prefix for saving ADMIXTURE files. 364 365 **Supported formats:** 366 367 - `.pkl`: Pickle format for saving `self` in serialized form. 368 - Any other extension or no extension: Treated as a prefix for ADMIXTURE files. 369 370 Args: 371 file (str or pathlib.Path): 372 Path to the file where the data will be saved. If the extension is `.pkl`, the object 373 is serialized. Otherwise, it is treated as a prefix for ADMIXTURE files. 374 """ 375 path = Path(file) 376 suffix = path.suffix.lower() 377 378 if suffix == '.pkl': 379 self.save_pickle(path) 380 else: 381 self.save_admixture(path) 382 383 def save_admixture(self, file_prefix: Union[str, Path]) -> None: 384 """ 385 Save the data stored in `self` into multiple ADMIXTURE files. 386 If the file already exists, it will be overwritten. 387 388 **Output files:** 389 390 - `<file_prefix>.K.Q`: Q matrix file. The file uses space (' ') as the delimiter. 391 - `<file_prefix>.K.P`: P matrix file. The file uses space (' ') as the delimiter. 392 - `<file_prefix>.sample_ids.txt`: Sample IDs file (if sample IDs are available). 393 - `<file_prefix>.snp_ids.txt`: SNP IDs file (if SNP IDs are available). 394 - `<file_prefix>.map`: Ancestry file (if ancestries information is available). 395 396 Args: 397 file_prefix (str or pathlib.Path): 398 The base prefix for output file names, including directory path but excluding file extensions. 399 The prefix is used to generate specific file names for each output, with file-specific 400 suffixes appended as described above (e.g., `file_prefix.n_ancestries.Q` for the Q matrix file). 401 """ 402 from snputils.ancestry.io.wide.write.admixture import AdmixtureWriter 403 404 AdmixtureWriter(self, file_prefix).write() 405 406 def save_pickle(self, file: Union[str, Path]) -> None: 407 """ 408 Save `self` in serialized form to a `.pkl` file. 409 If the file already exists, it will be overwritten. 410 411 Args: 412 file (str or pathlib.Path): 413 Path to the file where the data will be saved. It should end with `.pkl`. 414 If the provided path does not have this extension, it will be appended. 415 """ 416 import pickle 417 with open(file, 'wb') as file: 418 pickle.dump(self, file)
A class for Global Ancestry Inference (GAI) data.
14 def __init__( 15 self, 16 Q: np.ndarray, 17 P: Optional[np.ndarray] = None, 18 samples: Optional[Sequence] = None, 19 snps: Optional[Sequence] = None, 20 ancestries: Optional[Sequence] = None 21 ) -> None: 22 """ 23 Args: 24 Q (array of shape (n_samples, n_ancestries)): 25 A 2D array containing per-sample ancestry proportions. Each row corresponds to a sample, 26 and each column corresponds to an ancestry. 27 P (array of shape (n_snps, n_ancestries)): 28 A 2D array containing per-ancestry SNP frequencies. Each row corresponds to a SNP, 29 and each column corresponds to an ancestry. 30 samples (sequence of length n_samples, optional): 31 A sequence containing unique identifiers for each sample. If None, sample identifiers 32 are assigned as integers from `0` to `n_samples - 1`. 33 snps (sequence of length n_snps, optional): 34 A sequence containing identifiers for each SNP. If None, SNPs are assigned as integers 35 from `0` to `n_snps - 1`. 36 ancestries (sequence of length n_samples, optional): 37 A sequence containing ancestry labels for each sample. 38 """ 39 # Determine dimensions 40 n_samples, n_ancestries_Q = Q.shape 41 if P is not None: 42 n_snps, n_ancestries_P = P.shape 43 if n_ancestries_Q != n_ancestries_P: 44 raise ValueError( 45 f"The number of ancestries in Q ({n_ancestries_Q}) and P ({n_ancestries_P}) must be the same." 46 ) 47 48 n_ancestries = n_ancestries_Q 49 50 # Assign default sample identifiers if none provided 51 if samples is None: 52 samples = list(range(n_samples)) 53 else: 54 samples = list(samples) 55 if len(samples) != n_samples: 56 raise ValueError( 57 f"Length of samples ({len(samples)}) does not match number of samples ({n_samples})." 58 ) 59 60 # Assign default SNP identifiers if none provided 61 if P is None: 62 snps = None 63 else: 64 if snps is None: 65 snps = list(range(n_snps)) 66 else: 67 snps = list(snps) 68 if len(snps) != n_snps: 69 raise ValueError( 70 f"Length of snps ({len(snps)}) does not match number of SNPs ({n_snps})." 71 ) 72 73 if ancestries is not None: 74 if len(ancestries) != n_samples: 75 raise ValueError( 76 f"Length of ancestries ({len(ancestries)}) does not match number of samples ({n_samples})." 77 ) 78 79 super().__init__(n_samples, n_ancestries) 80 81 # Store attributes 82 self.__Q = Q 83 self.__P = P 84 self.__samples = np.asarray(samples) 85 self.__snps = np.asarray(snps) if snps is not None else None 86 self.__ancestries = np.asarray(ancestries) if ancestries is not None else None 87 88 # Perform sanity checks 89 self._sanity_check()
Arguments:
- Q (array of shape (n_samples, n_ancestries)): A 2D array containing per-sample ancestry proportions. Each row corresponds to a sample, and each column corresponds to an ancestry.
- P (array of shape (n_snps, n_ancestries)): A 2D array containing per-ancestry SNP frequencies. Each row corresponds to a SNP, and each column corresponds to an ancestry.
- samples (sequence of length n_samples, optional): A sequence containing unique identifiers for each sample. If None, sample identifiers
are assigned as integers from
0ton_samples - 1. - snps (sequence of length n_snps, optional): A sequence containing identifiers for each SNP. If None, SNPs are assigned as integers
from
0ton_snps - 1. - ancestries (sequence of length n_samples, optional): A sequence containing ancestry labels for each sample.
111 @property 112 def Q(self) -> np.ndarray: 113 """ 114 Retrieve `Q`. 115 116 Returns: 117 **array of shape (n_samples, n_ancestries):** 118 A 2D array containing per-sample ancestry proportions. Each row corresponds to a sample, 119 and each column corresponds to an ancestry. 120 """ 121 return self.__Q
Retrieve Q.
Returns:
array of shape (n_samples, n_ancestries): A 2D array containing per-sample ancestry proportions. Each row corresponds to a sample, and each column corresponds to an ancestry.
134 @property 135 def P(self) -> np.ndarray: 136 """ 137 Retrieve `P`. 138 139 Returns: 140 **array of shape (n_snps, n_ancestries):** 141 A 2D array containing per-ancestry SNP frequencies. Each row corresponds to a SNP, 142 and each column corresponds to an ancestry. 143 """ 144 return self.__P
Retrieve P.
Returns:
array of shape (n_snps, n_ancestries): A 2D array containing per-ancestry SNP frequencies. Each row corresponds to a SNP, and each column corresponds to an ancestry.
158 @property 159 def F(self) -> np.ndarray: 160 """ 161 Alias for `P`. 162 163 Returns: 164 **array of shape (n_snps, n_ancestries):** 165 A 2D array containing per-ancestry SNP frequencies. Each row corresponds to a SNP, 166 and each column corresponds to an ancestry. 167 """ 168 return self.P
Alias for P.
Returns:
array of shape (n_snps, n_ancestries): A 2D array containing per-ancestry SNP frequencies. Each row corresponds to a SNP, and each column corresponds to an ancestry.
181 @property 182 def samples(self) -> Optional[np.ndarray]: 183 """ 184 Retrieve `samples`. 185 186 Returns: 187 **array of shape (n_samples,):** 188 An array containing unique identifiers for each sample. If None, sample 189 identifiers are assigned as integers from `0` to `n_samples - 1`. 190 """ 191 return self.__samples
Retrieve samples.
Returns:
array of shape (n_samples,): An array containing unique identifiers for each sample. If None, sample identifiers are assigned as integers from
0ton_samples - 1.
205 @property 206 def snps(self) -> Optional[np.ndarray]: 207 """ 208 Retrieve `snps`. 209 210 Returns: 211 **array of shape (n_snps,):** 212 An array containing identifiers for each SNP. If None, SNPs are assigned as integers 213 from `0` to `n_snps - 1`. 214 """ 215 return self.__snps
Retrieve snps.
Returns:
array of shape (n_snps,): An array containing identifiers for each SNP. If None, SNPs are assigned as integers from
0ton_snps - 1.
229 @property 230 def ancestries(self) -> Optional[np.ndarray]: 231 """ 232 Retrieve `ancestries`. 233 234 Returns: 235 **array of shape (n_samples,):** 236 An array containing ancestry labels for each sample. 237 """ 238 return self.__ancestries
Retrieve ancestries.
Returns:
array of shape (n_samples,): An array containing ancestry labels for each sample.
289 def copy(self) -> 'GlobalAncestryObject': 290 """ 291 Create and return a copy of `self`. 292 293 Returns: 294 **GlobalAncestryObject:** A new instance of the current object. 295 """ 296 return copy.copy(self)
Create and return a copy of self.
Returns:
GlobalAncestryObject: A new instance of the current object.
298 def keys(self) -> List[str]: 299 """ 300 Retrieve a list of public attribute names for `self`. 301 302 Returns: 303 **list of str:** 304 A list of attribute names, with internal name-mangling removed, 305 for easier reference to public attributes in the instance. 306 """ 307 return [attr.replace('_GlobalAncestryObject__', '').replace('_AncestryObject__', '') for attr in vars(self)]
Retrieve a list of public attribute names for self.
Returns:
list of str: A list of attribute names, with internal name-mangling removed, for easier reference to public attributes in the instance.
357 def save(self, file: Union[str, Path]) -> None: 358 """ 359 Save the data stored in `self` to a specified file or set of files. 360 361 The format of the saved file(s) is determined by the file extension provided in the `file` 362 argument. If the extension is `.pkl`, the object is serialized as a pickle file. Otherwise, 363 the file is treated as a prefix for saving ADMIXTURE files. 364 365 **Supported formats:** 366 367 - `.pkl`: Pickle format for saving `self` in serialized form. 368 - Any other extension or no extension: Treated as a prefix for ADMIXTURE files. 369 370 Args: 371 file (str or pathlib.Path): 372 Path to the file where the data will be saved. If the extension is `.pkl`, the object 373 is serialized. Otherwise, it is treated as a prefix for ADMIXTURE files. 374 """ 375 path = Path(file) 376 suffix = path.suffix.lower() 377 378 if suffix == '.pkl': 379 self.save_pickle(path) 380 else: 381 self.save_admixture(path)
Save the data stored in self to a specified file or set of files.
The format of the saved file(s) is determined by the file extension provided in the file
argument. If the extension is .pkl, the object is serialized as a pickle file. Otherwise,
the file is treated as a prefix for saving ADMIXTURE files.
Supported formats:
.pkl: Pickle format for savingselfin serialized form.- Any other extension or no extension: Treated as a prefix for ADMIXTURE files.
Arguments:
- file (str or pathlib.Path): Path to the file where the data will be saved. If the extension is
.pkl, the object is serialized. Otherwise, it is treated as a prefix for ADMIXTURE files.
383 def save_admixture(self, file_prefix: Union[str, Path]) -> None: 384 """ 385 Save the data stored in `self` into multiple ADMIXTURE files. 386 If the file already exists, it will be overwritten. 387 388 **Output files:** 389 390 - `<file_prefix>.K.Q`: Q matrix file. The file uses space (' ') as the delimiter. 391 - `<file_prefix>.K.P`: P matrix file. The file uses space (' ') as the delimiter. 392 - `<file_prefix>.sample_ids.txt`: Sample IDs file (if sample IDs are available). 393 - `<file_prefix>.snp_ids.txt`: SNP IDs file (if SNP IDs are available). 394 - `<file_prefix>.map`: Ancestry file (if ancestries information is available). 395 396 Args: 397 file_prefix (str or pathlib.Path): 398 The base prefix for output file names, including directory path but excluding file extensions. 399 The prefix is used to generate specific file names for each output, with file-specific 400 suffixes appended as described above (e.g., `file_prefix.n_ancestries.Q` for the Q matrix file). 401 """ 402 from snputils.ancestry.io.wide.write.admixture import AdmixtureWriter 403 404 AdmixtureWriter(self, file_prefix).write()
Save the data stored in self into multiple ADMIXTURE files.
If the file already exists, it will be overwritten.
Output files:
<file_prefix>.K.Q: Q matrix file. The file uses space (' ') as the delimiter.<file_prefix>.K.P: P matrix file. The file uses space (' ') as the delimiter.<file_prefix>.sample_ids.txt: Sample IDs file (if sample IDs are available).<file_prefix>.snp_ids.txt: SNP IDs file (if SNP IDs are available).<file_prefix>.map: Ancestry file (if ancestries information is available).
Arguments:
- file_prefix (str or pathlib.Path): The base prefix for output file names, including directory path but excluding file extensions.
The prefix is used to generate specific file names for each output, with file-specific
suffixes appended as described above (e.g.,
file_prefix.n_ancestries.Qfor the Q matrix file).
406 def save_pickle(self, file: Union[str, Path]) -> None: 407 """ 408 Save `self` in serialized form to a `.pkl` file. 409 If the file already exists, it will be overwritten. 410 411 Args: 412 file (str or pathlib.Path): 413 Path to the file where the data will be saved. It should end with `.pkl`. 414 If the provided path does not have this extension, it will be appended. 415 """ 416 import pickle 417 with open(file, 'wb') as file: 418 pickle.dump(self, file)
Save self in serialized form to a .pkl file.
If the file already exists, it will be overwritten.
Arguments:
- file (str or pathlib.Path): Path to the file where the data will be saved. It should end with
.pkl. If the provided path does not have this extension, it will be appended.
17class LocalAncestryObject(AncestryObject): 18 """ 19 A class for window-level Local Ancestry Inference (LAI) data. 20 """ 21 def __init__( 22 self, 23 haplotypes: List[str], 24 lai: np.ndarray, 25 samples: Optional[List[str]] = None, 26 ancestry_map: Optional[Dict[str, str]] = None, 27 window_sizes: Optional[np.ndarray] = None, 28 centimorgan_pos: Optional[np.ndarray] = None, 29 chromosomes: Optional[np.ndarray] = None, 30 physical_pos: Optional[np.ndarray] = None 31 ) -> None: 32 """ 33 Args: 34 haplotypes (list of str of length n_haplotypes): 35 A list of unique haplotype identifiers. 36 lai (array of shape (n_windows, n_haplotypes)): 37 A 2D array containing local ancestry inference values, where each row represents a 38 genomic window, and each column corresponds to a haplotype phase for each sample. 39 samples (list of str of length n_samples, optional): 40 A list of unique sample identifiers. 41 ancestry_map (dict of str to str, optional): 42 A dictionary mapping ancestry codes to region names. 43 window_sizes (array of shape (n_windows,), optional): 44 An array specifying the number of SNPs in each genomic window. 45 centimorgan_pos (array of shape (n_windows, 2), optional): 46 A 2D array containing the start and end centimorgan positions for each window. 47 chromosomes (array of shape (n_windows,), optional): 48 An array with chromosome numbers corresponding to each genomic window. 49 physical_pos (array of shape (n_windows, 2), optional): 50 A 2D array containing the start and end physical positions for each window. 51 """ 52 if lai.ndim != 2: 53 raise ValueError("`lai` must be a 2D array with shape (n_windows, n_haplotypes).") 54 55 # Determine the number of unique ancestries and samples from the LAI array 56 n_ancestries = len(np.unique(lai)) 57 n_haplotypes = lai.shape[1] 58 n_samples = n_haplotypes // 2 59 60 super(LocalAncestryObject, self).__init__(n_samples, n_ancestries) 61 62 self.__haplotypes = haplotypes 63 self.__lai = lai 64 self.__window_sizes = window_sizes 65 self.__centimorgan_pos = centimorgan_pos 66 self.__samples = samples 67 self.__chromosomes = chromosomes 68 self.__physical_pos = physical_pos 69 self.__ancestry_map = ancestry_map 70 71 # Perform sanity check to ensure all unique ancestries in LAI data are represented in the ancestry map 72 self._sanity_check() 73 74 def __getitem__(self, key): 75 """ 76 To access an attribute of the class using the square bracket notation, 77 similar to a dictionary. 78 """ 79 try: 80 return getattr(self, key) 81 except AttributeError: 82 raise KeyError(f'Invalid key: {key}') 83 84 def __setitem__(self, key, value): 85 """ 86 To set an attribute of the class using the square bracket notation, 87 similar to a dictionary. 88 """ 89 try: 90 setattr(self, key, value) 91 except AttributeError: 92 raise KeyError(f'Invalid key: {key}') 93 94 @property 95 def haplotypes(self) -> List[str]: 96 """ 97 Retrieve `haplotypes`. 98 99 Returns: 100 **list of length n_haplotypes:** A list of unique haplotype identifiers. 101 """ 102 return self.__haplotypes 103 104 @haplotypes.setter 105 def haplotypes(self, x): 106 """ 107 Update `haplotypes`. 108 """ 109 self.__haplotypes = x 110 111 @property 112 def lai(self) -> np.ndarray: 113 """ 114 Retrieve `lai`. 115 116 Returns: 117 **array of shape (n_windows, n_haplotypes):** 118 A 2D array containing local ancestry inference values, where each row represents a 119 genomic window, and each column corresponds to a haplotype phase for each sample. 120 """ 121 return self.__lai 122 123 @lai.setter 124 def lai(self, x): 125 """ 126 Update `lai`. 127 """ 128 self.__lai = x 129 130 @property 131 def samples(self) -> Optional[List[str]]: 132 """ 133 Retrieve `samples`. 134 135 Returns: 136 **list of str:** A list of unique sample identifiers. 137 """ 138 if self.__samples is not None: 139 return self.__samples 140 elif self.__haplotypes is not None: 141 return [hap.split('.')[0] for hap in self.__haplotypes][::2] 142 else: 143 return None 144 145 @samples.setter 146 def samples(self, x): 147 """ 148 Update `samples`. 149 """ 150 self.__samples = x 151 152 @property 153 def ancestry_map(self) -> Optional[Dict[str, str]]: 154 """ 155 Retrieve `ancestry_map`. 156 157 Returns: 158 **dict of str to str:** A dictionary mapping ancestry codes to region names. 159 """ 160 return self.__ancestry_map 161 162 @ancestry_map.setter 163 def ancestry_map(self, x): 164 """ 165 Update `ancestry_map`. 166 """ 167 self.__ancestry_map = x 168 169 @property 170 def window_sizes(self) -> Optional[np.ndarray]: 171 """ 172 Retrieve `window_sizes`. 173 174 Returns: 175 **array of shape (n_windows,):** 176 An array specifying the number of SNPs in each genomic window. 177 """ 178 return self.__window_sizes 179 180 @window_sizes.setter 181 def window_sizes(self, x): 182 """ 183 Update `window_sizes`. 184 """ 185 self.__window_sizes = x 186 187 @property 188 def centimorgan_pos(self) -> Optional[np.ndarray]: 189 """ 190 Retrieve `centimorgan_pos`. 191 192 Returns: 193 **array of shape (n_windows, 2):** 194 A 2D array containing the start and end centimorgan positions for each window. 195 """ 196 return self.__centimorgan_pos 197 198 @centimorgan_pos.setter 199 def centimorgan_pos(self, x): 200 """ 201 Update `centimorgan_pos`. 202 """ 203 self.__centimorgan_pos = x 204 205 @property 206 def chromosomes(self) -> Optional[np.ndarray]: 207 """ 208 Retrieve `chromosomes`. 209 210 Returns: 211 **array of shape (n_windows,):** 212 An array with chromosome numbers corresponding to each genomic window. 213 """ 214 return self.__chromosomes 215 216 @chromosomes.setter 217 def chromosomes(self, x): 218 """ 219 Update `chromosomes`. 220 """ 221 self.__chromosomes = x 222 223 @property 224 def physical_pos(self) -> Optional[np.ndarray]: 225 """ 226 Retrieve `physical_pos`. 227 228 Returns: 229 **array of shape (n_windows, 2):** 230 A 2D array containing the start and end physical positions for each window. 231 """ 232 return self.__physical_pos 233 234 @physical_pos.setter 235 def physical_pos(self, x): 236 """ 237 Update `physical_pos`. 238 """ 239 self.__physical_pos = x 240 241 @property 242 def n_samples(self) -> int: 243 """ 244 Retrieve `n_samples`. 245 246 Returns: 247 **int:** 248 The total number of samples. 249 """ 250 if self.__samples is not None: 251 return len(self.__samples) 252 elif self.__haplotypes is not None: 253 # Divide by 2 because each sample has two associated haplotypes 254 return len(self.__haplotypes) // 2 255 else: 256 #Â Divide by 2 because columns represent haplotypes 257 return self.__lai.shape[1] // 2 258 259 @property 260 def n_ancestries(self) -> int: 261 """ 262 Retrieve `n_ancestries`. 263 264 Returns: 265 **int:** The total number of unique ancestries. 266 """ 267 return len(np.unique(self.__lai)) 268 269 @property 270 def n_haplotypes(self) -> int: 271 """ 272 Retrieve `n_haplotypes`. 273 274 Returns: 275 **int:** The total number of haplotypes. 276 """ 277 if self.__haplotypes is not None: 278 return len(self.__haplotypes) 279 else: 280 return self.__lai.shape[1] 281 282 @property 283 def n_windows(self) -> int: 284 """ 285 Retrieve `n_windows`. 286 287 Returns: 288 **int:** The total number of genomic windows. 289 """ 290 return self.__lai.shape[0] 291 292 def copy(self) -> 'LocalAncestryObject': 293 """ 294 Create and return a copy of `self`. 295 296 Returns: 297 **LocalAncestryObject:** 298 A new instance of the current object. 299 """ 300 return copy.copy(self) 301 302 def keys(self) -> List[str]: 303 """ 304 Retrieve a list of public attribute names for `self`. 305 306 Returns: 307 **list of str:** 308 A list of attribute names, with internal name-mangling removed, 309 for easier reference to public attributes in the instance. 310 """ 311 return [attr.replace('_LocalAncestryObject__', '').replace('_AncestryObject__', '') for attr in vars(self)] 312 313 def filter_windows( 314 self, 315 indexes: Union[int, Sequence[int], np.ndarray], 316 include: bool = True, 317 inplace: bool = False 318 ) -> Optional['LocalAncestryObject']: 319 """ 320 Filter genomic windows based on specified indexes. 321 322 This method updates the `lai` attribute to include or exclude the specified genomic windows. 323 Attributes such as `window_sizes`, `centimorgan_pos`, `chromosomes`, and `physical_pos` will also be 324 updated accordingly if they are not None. The order of genomic windows is preserved. 325 326 Negative indexes are supported and follow 327 [NumPy's indexing conventions](https://numpy.org/doc/stable/user/basics.indexing.html). 328 329 Args: 330 indexes (int or array-like of int): 331 Index(es) of the windows to include or exclude. Can be a single integer or a 332 sequence of integers. Negative indexes are supported. 333 include (bool, default=True): 334 If True, includes only the specified windows. If False, excludes the specified 335 windows. Default is True. 336 inplace (bool, default=False): 337 If True, modifies `self` in place. If False, returns a new `LocalAncestryObject` with 338 the windows filtered. Default is False. 339 340 Returns: 341 **Optional[LocalAncestryObject]:** 342 A new `LocalAncestryObject` with the specified windows filtered if `inplace=False`. 343 If `inplace=True`, modifies `self` in place and returns None. 344 """ 345 # Convert indexes to a NumPy array 346 indexes = np.atleast_1d(indexes) 347 348 # Get total number of windows 349 n_windows = self.n_windows 350 351 # Validate indexes, allowing negative indexes 352 if np.any((indexes < -n_windows) | (indexes >= n_windows)): 353 raise IndexError("One or more indexes are out of bounds.") 354 355 # Create boolean mask 356 mask = np.zeros(n_windows, dtype=bool) 357 mask[indexes] = True 358 359 # Invert mask if `include=False` 360 if not include: 361 mask = ~mask 362 363 # Filter `lai` 364 filtered_lai = self['lai'][mask, :] 365 366 # Filter `window_sizes`, `chromosomes`, `centimorgan_pos`, and `physical_pos`, checking if they are None before filtering 367 filtered_window_sizes = self['window_sizes'][mask] if self['window_sizes'] is not None else None 368 filtered_chromosomes = self['chromosomes'][mask] if self['chromosomes'] is not None else None 369 filtered_centimorgan_pos = self['centimorgan_pos'][mask, :] if self['centimorgan_pos'] is not None else None 370 filtered_physical_pos = self['physical_pos'][mask, :] if self['physical_pos'] is not None else None 371 372 # Modify the original object if `inplace=True`, otherwise create and return a copy 373 if inplace: 374 self['lai'] = filtered_lai 375 if filtered_window_sizes is not None: 376 self['window_sizes'] = filtered_window_sizes 377 if filtered_chromosomes is not None: 378 self['chromosomes'] = filtered_chromosomes 379 if filtered_centimorgan_pos is not None: 380 self['centimorgan_pos'] = filtered_centimorgan_pos 381 if filtered_physical_pos is not None: 382 self['physical_pos'] = filtered_physical_pos 383 return None 384 else: 385 laiobj = self.copy() 386 laiobj['lai'] = filtered_lai 387 if filtered_window_sizes is not None: 388 laiobj['window_sizes'] = filtered_window_sizes 389 if filtered_chromosomes is not None: 390 laiobj['chromosomes'] = filtered_chromosomes 391 if filtered_centimorgan_pos is not None: 392 laiobj['centimorgan_pos'] = filtered_centimorgan_pos 393 if filtered_physical_pos is not None: 394 laiobj['physical_pos'] = filtered_physical_pos 395 return laiobj 396 397 def filter_samples( 398 self, 399 samples: Optional[Union[str, Sequence[str], np.ndarray, None]] = None, 400 indexes: Optional[Union[int, Sequence[int], np.ndarray, None]] = None, 401 include: bool = True, 402 reorder: bool = False, 403 inplace: bool = False 404 ) -> Optional['LocalAncestryObject']: 405 """ 406 Filter samples based on specified names or indexes. 407 408 This method updates the `lai`, `haplotypes`, and `samples` attributes to include or exclude the specified 409 samples. Each sample is associated with two haplotypes, which are included or excluded together. 410 The order of the samples is preserved. Set `reorder=True` to match the ordering of the 411 provided `samples` and/or `indexes` lists when including. 412 413 If both samples and indexes are provided, any sample matching either a name in samples or an index in 414 indexes will be included or excluded. 415 416 Negative indexes are supported and follow 417 [NumPy's indexing conventions](https://numpy.org/doc/stable/user/basics.indexing.html). 418 419 Args: 420 samples (str or array_like of str, optional): 421 Name(s) of the samples to include or exclude. Can be a single sample name or a 422 sequence of sample names. Default is None. 423 indexes (int or array_like of int, optional): 424 Index(es) of the samples to include or exclude. Can be a single index or a sequence 425 of indexes. Negative indexes are supported. Default is None. 426 include (bool, default=True): 427 If True, includes only the specified samples. If False, excludes the specified 428 samples. Default is True. 429 inplace (bool, default=False): 430 If True, modifies `self` in place. If False, returns a new `LocalAncestryObject` with the 431 samples filtered. Default is False. 432 433 Returns: 434 **Optional[LocalAncestryObject]:** 435 A new `LocalAncestryObject` with the specified samples filtered if `inplace=False`. 436 If `inplace=True`, modifies `self` in place and returns None. 437 """ 438 if samples is None and indexes is None: 439 raise UserWarning("At least one of 'samples' or 'indexes' must be provided.") 440 441 n_haplotypes = self.n_haplotypes 442 n_samples = self.n_samples 443 444 # Create mask based on sample names 445 if samples is not None: 446 samples = np.asarray(samples).ravel() 447 # Extract sample names from haplotype identifiers 448 haplotype_ids = np.array(self['haplotypes']) 449 sample_names = np.array([hap.split('.')[0] for hap in haplotype_ids]) 450 # Create mask for haplotypes belonging to specified samples 451 mask_samples = np.isin(sample_names, samples) 452 else: 453 mask_samples = np.zeros(n_haplotypes, dtype=bool) 454 455 # Create mask based on sample indexes 456 if indexes is not None: 457 indexes = np.asarray(indexes).ravel() 458 459 # Validate indexes, allowing negative indexes 460 out_of_bounds_indexes = indexes[(indexes < -n_samples) | (indexes >= n_samples)] 461 if out_of_bounds_indexes.size > 0: 462 raise ValueError(f"One or more sample indexes are out of bounds.") 463 464 # Adjust negative indexes 465 indexes = np.mod(indexes, n_samples) 466 467 # Get haplotype indexes for the specified sample indexes 468 haplotype_indexes = np.concatenate([2*indexes, 2*indexes+1]) 469 # Create mask for haplotypes 470 mask_indexes = np.zeros(n_haplotypes, dtype=bool) 471 mask_indexes[haplotype_indexes] = True 472 else: 473 mask_indexes = np.zeros(n_haplotypes, dtype=bool) 474 475 # Combine masks using logical OR (union of samples) 476 mask_combined = mask_samples | mask_indexes 477 478 if not include: 479 mask_combined = ~mask_combined 480 481 # Optionally compute an ordering of selected samples that follows the provided lists 482 ordered_sample_indices = None 483 sample_mask = mask_combined.reshape(-1, 2).any(axis=1) 484 if include and reorder: 485 sel_sample_indices = np.where(sample_mask)[0] 486 ordered_list: List[int] = [] 487 added = np.zeros(self.n_samples, dtype=bool) 488 489 # Source of sample names for ordering logic 490 haplotype_ids = np.array(self['haplotypes']) 491 sample_names_by_sample = np.array([hap.split('.')[0] for hap in haplotype_ids])[::2] 492 493 # Respect the order in `samples` 494 if samples is not None: 495 for s in np.atleast_1d(samples): 496 # Find the sample index by name (first occurrence) 497 matches = np.where(sample_names_by_sample == s)[0] 498 for idx in matches: 499 if sample_mask[idx] and not added[idx]: 500 ordered_list.append(int(idx)) 501 added[idx] = True 502 503 # Then respect the order in `indexes` 504 if indexes is not None: 505 adj_idx = np.mod(np.atleast_1d(indexes), self.n_samples) 506 for idx in adj_idx: 507 if sample_mask[idx] and not added[idx]: 508 ordered_list.append(int(idx)) 509 added[idx] = True 510 511 # Append any remaining selected samples in their original order 512 for idx in sel_sample_indices: 513 if not added[idx]: 514 ordered_list.append(int(idx)) 515 516 ordered_sample_indices = np.asarray(ordered_list, dtype=int) 517 518 # Filter / reorder arrays 519 if ordered_sample_indices is not None: 520 hap_idx = np.concatenate([2*ordered_sample_indices, 2*ordered_sample_indices + 1]) 521 filtered_lai = self['lai'][:, hap_idx] 522 filtered_haplotypes = np.array(self['haplotypes'])[hap_idx].tolist() 523 filtered_samples = ( 524 np.array(self['samples'])[ordered_sample_indices].tolist() 525 if self['samples'] is not None else None 526 ) 527 else: 528 # Filter `lai` 529 filtered_lai = self['lai'][:, mask_combined] 530 # Filter `haplotypes` 531 filtered_haplotypes = np.array(self['haplotypes'])[mask_combined].tolist() 532 # Filter `samples`, checking if they are None before filtering 533 filtered_samples = np.array(self['samples'])[sample_mask].tolist() if self['samples'] is not None else None 534 535 if inplace: 536 self['haplotypes'] = filtered_haplotypes 537 self['samples'] = filtered_samples 538 self['lai'] = filtered_lai 539 return None 540 else: 541 laiobj = self.copy() 542 laiobj['haplotypes'] = filtered_haplotypes 543 laiobj['samples'] = filtered_samples 544 laiobj['lai'] = filtered_lai 545 return laiobj 546 547 def convert_to_snp_level( 548 self, 549 snpobject: Optional['SNPObject'] = None, 550 variants_chrom: Optional[np.ndarray] = None, 551 variants_pos: Optional[np.ndarray] = None, 552 variants_ref: Optional[np.ndarray] = None, 553 variants_alt: Optional[np.ndarray] = None, 554 variants_filter_pass: Optional[np.ndarray] = None, 555 variants_id: Optional[np.ndarray] = None, 556 variants_qual: Optional[np.ndarray] = None, 557 lai_format: str = "3D" 558 ) -> 'SNPObject': 559 """ 560 Convert `self` into a `snputils.snp.genobj.SNPObject` SNP-level Local Ancestry Information (LAI), 561 with optional support for SNP data. 562 563 If SNP positions (`variants_pos`) and/or chromosomes (`variants_chrom`) are not specified, the method generates 564 SNPs uniformly across the start and end positions of each genomic window. Otherwise, the provided SNP 565 coordinates are used to assign ancestry values based on their respective windows. 566 567 If a `SNPObject` is provided, its attributes are used unless explicitly overridden by the function arguments. 568 In that case, the SNPObject is updated with the (optional) new attributes and the computed `calldata_lai`, then returned. 569 570 Args: 571 snpobject (SNPObject, optional): 572 An existing `SNPObject` to extract SNP attributes from. 573 variants_chrom (array of shape (n_snps,), optional): 574 An array containing the chromosome for each SNP. 575 variants_pos (array of shape (n_snps,), optional): 576 An array containing the chromosomal positions for each SNP. 577 variants_ref (array of shape (n_snps,), optional): 578 An array containing the reference allele for each SNP. 579 variants_alt (array of shape (n_snps,), optional): 580 An array containing the alternate allele for each SNP. 581 variants_filter_pass (array of shape (n_snps,), optional): 582 An array indicating whether each SNP passed control checks. 583 variants_id (array of shape (n_snps,), optional): 584 An array containing unique identifiers (IDs) for each SNP. 585 variants_qual (array of shape (n_snps,), optional): 586 An array containing the Phred-scaled quality score for each SNP. 587 lai_format (str, optional): 588 Determines the shape of `calldata_lai`: 589 - `"3D"` (default): Shape `(n_snps, n_samples, 2)`. 590 - `"2D"`: Shape `(n_snps, n_samples * 2)`. 591 592 Returns: 593 **SNPObject**: 594 A `SNPObject` containing SNP-level ancestry data and updated SNP attributes. 595 """ 596 from snputils.snp.genobj.snpobj import SNPObject 597 598 assert lai_format in {"2D", "3D"}, "Invalid `lai_format`. Must be '2D' or '3D'." 599 600 # Extract attributes from SNPObject if provided 601 if snpobject is not None: 602 variants_chrom = variants_chrom or snpobject.variants_chrom 603 variants_pos = variants_pos or snpobject.variants_pos 604 variants_ref = variants_ref or snpobject.variants_ref 605 variants_alt = variants_alt or snpobject.variants_alt 606 variants_filter_pass = variants_filter_pass or snpobject.variants_filter_pass 607 variants_id = variants_id or snpobject.variants_id 608 variants_qual = variants_qual or snpobject.variants_qual 609 610 n_samples = self.n_samples 611 lai_reshaped = self.lai.reshape(self.n_windows, n_samples, 2).astype(int) if lai_format == "3D" else None 612 613 if variants_pos is None or variants_chrom is None: 614 # Generate all SNP positions and corresponding chromosome labels between window boundaries 615 variants_pos_list = [] 616 variants_chrom_list = [] 617 ancestry_list = [] 618 619 for i in range(self.n_windows): 620 start = int(self.physical_pos[i, 0]) 621 end = int(self.physical_pos[i, 1]) 622 chrom = self.chromosomes[i] 623 624 # Generate SNP positions at each base pair within the window range 625 positions_in_window = np.arange(start, end + 1) 626 if positions_in_window.size == 0: 627 continue # Skip windows that contain no valid SNP positions 628 629 n_positions = positions_in_window.size 630 variants_pos_list.append(positions_in_window) 631 variants_chrom_list.append(np.full(n_positions, chrom)) 632 633 ancestry_repeated = ( 634 np.repeat(lai_reshaped[i, np.newaxis, :, :], n_positions, axis=0) 635 if lai_format == "3D" else np.repeat(self.lai[i, np.newaxis, :], n_positions, axis=0) 636 ) 637 ancestry_list.append(ancestry_repeated) 638 639 # Store SNP positions, their corresponding chromosome labels, and their associated ancestry 640 variants_pos = np.concatenate(variants_pos_list) 641 variants_chrom = np.concatenate(variants_chrom_list) 642 calldata_lai = np.concatenate(ancestry_list) 643 else: 644 # Use the provided SNP positions and chromosomes 645 n_snps = len(variants_pos) 646 if len(variants_chrom) != n_snps: 647 raise ValueError("`variants_pos` and `variants_chrom` must have the same length.") 648 649 # Initialize an array to store the corresponding window index for each SNP 650 # Default value is -1, meaning no matching window found 651 snp_to_window_indices = np.full(n_snps, -1, dtype=int) 652 653 # Identify unique chromosome names sorted by order of appearence 654 _, idx = np.unique(variants_chrom, return_index=True) 655 unique_chroms = variants_chrom[np.sort(idx)] 656 657 # Iterate through each unique chromosome to map SNPs to windows 658 for chrom in unique_chroms: 659 # Get indices of SNPs that belong to the current chromosome 660 snp_indices = np.where(variants_chrom == chrom)[0] 661 snp_pos_chr = variants_pos[snp_indices] 662 663 # Get indices of windows that belong to the current chromosome 664 window_indices = np.where(self.chromosomes == chrom)[0] 665 if window_indices.size == 0: 666 continue # Skip if no windows exist for this chromosome 667 668 # Extract start and end positions of the windows on this chromosome 669 window_starts_chr = self.physical_pos[:, 0][window_indices] 670 window_ends_chr = self.physical_pos[:, 1][window_indices] 671 672 # Find the right-most window that a SNP would fit into (sorted order) 673 inds = np.searchsorted(window_starts_chr, snp_pos_chr, side='right') - 1 674 675 # Mask valid SNPs: ensure they are within a valid range and fall inside window boundaries 676 valid_mask = (inds >= 0) & (inds < len(window_starts_chr)) & (snp_pos_chr <= window_ends_chr[inds]) 677 678 # Assign valid SNPs to their corresponding window indices 679 snp_to_window_indices[snp_indices[valid_mask]] = window_indices[inds[valid_mask]] 680 log.debug(f"Number of SNPs within window ranges for chromosome {chrom}: {valid_mask.sum()}") 681 682 # Initialize SNP-level ancestry array with NaNs as default 683 shape = (n_snps, n_samples, 2) if lai_format == "3D" else (n_snps, n_samples * 2) 684 calldata_lai = np.full(shape, np.nan, dtype='uint8') 685 686 # Assign ancestry values to SNPs with valid window assignments 687 valid_mask = (snp_to_window_indices != -1) 688 snp_indices = np.where(valid_mask)[0] 689 snp_to_window_indices = snp_to_window_indices[snp_indices] 690 691 if lai_format == "3D": 692 calldata_lai[snp_indices] = lai_reshaped[snp_to_window_indices] 693 else: # "2D" 694 calldata_lai[snp_indices] = self.lai[snp_to_window_indices] 695 696 if snpobject is not None: 697 # If a SNPObject was provided, update its attributes with any new values and add `calldata_lai`` 698 snpobject.variants_chrom = variants_chrom 699 snpobject.variants_pos = variants_pos 700 snpobject.variants_ref = variants_ref 701 snpobject.variants_alt = variants_alt 702 snpobject.variants_filter_pass = variants_filter_pass 703 snpobject.variants_id = variants_id 704 snpobject.variants_qual = variants_qual 705 snpobject.calldata_lai = calldata_lai 706 snpobject.ancestry_map = self.ancestry_map 707 return snpobject 708 else: 709 # Otherwise, create a new SNPObject 710 return SNPObject( 711 calldata_lai=calldata_lai.view(), 712 samples=self.samples, 713 variants_ref=variants_ref.view() if isinstance(variants_ref, np.ndarray) else variants_ref, 714 variants_alt=variants_alt.view() if isinstance(variants_alt, np.ndarray) else variants_alt, 715 variants_filter_pass=variants_filter_pass.view() if isinstance(variants_filter_pass, np.ndarray) else variants_filter_pass, 716 variants_chrom=variants_chrom.view(), 717 variants_id=variants_id.view() if isinstance(variants_id, np.ndarray) else variants_id, 718 variants_pos=variants_pos.view(), 719 variants_qual=variants_qual.view() if isinstance(variants_qual, np.ndarray) else variants_qual, 720 ancestry_map=self.ancestry_map 721 ) 722 723 def _sanity_check(self) -> None: 724 """ 725 Perform sanity checks on the parsed data to ensure data integrity. 726 727 This method checks that all unique ancestries in LAI are represented 728 in the ancestry map. 729 730 Args: 731 lai (np.ndarray): The LAI data array. 732 ancestry_map (dict, optional): A dictionary mapping ancestry codes to region names, if available. 733 """ 734 # Get unique ancestries from LAI data 735 unique_ancestries = np.unique(self.lai) 736 737 if self.ancestry_map is not None: 738 # Check if all unique ancestries in the LAI are present in the ancestry map 739 for ancestry in unique_ancestries: 740 if str(ancestry) not in self.ancestry_map: 741 warnings.warn( 742 f"Ancestry '{ancestry}' found in LAI data is not represented in the ancestry map." 743 ) 744 745 def save(self, file: Union[str, Path]) -> None: 746 """ 747 Save the data stored in `self` to a specified file. 748 If the file already exists, it will be overwritten. 749 750 The format of the saved file is determined by the file extension provided in the `file` 751 argument. 752 753 **Supported formats:** 754 755 - `.msp`: Text-based MSP format. 756 - `.msp.tsv`: Text-based MSP format with TSV extension. 757 - `.pkl`: Pickle format for saving `self` in serialized form. 758 759 Args: 760 file (str or pathlib.Path): 761 Path to the file where the data will be saved. The extension of the file determines the save format. 762 Supported extensions: `.msp`, `.msp.tsv`, `.pkl`. 763 """ 764 path = Path(file) 765 suffixes = [suffix.lower() for suffix in path.suffixes] 766 767 if suffixes[-2:] == ['.msp', '.tsv'] or suffixes[-1] == '.msp': 768 self.save_msp(file) 769 elif suffixes[-1] == '.pkl': 770 self.save_pickle(file) 771 else: 772 raise ValueError( 773 f"Unsupported file extension: {suffixes[-1]}" 774 "Supported extensions are: .msp, .msp.tsv, .pkl." 775 ) 776 777 def save_msp(self, file: Union[str, Path]) -> None: 778 """ 779 Save the data stored in `self` to a `.msp` file. 780 If the file already exists, it will be overwritten. 781 782 Args: 783 file (str or pathlib.Path): 784 Path to the file where the data will be saved. It should end with `.msp` or `.msp.tsv`. 785 If the provided path does not have one of these extensions, the `.msp` extension will be appended. 786 """ 787 from snputils.ancestry.io.local.write import MSPWriter 788 789 MSPWriter(self, file).write() 790 791 def save_pickle(self, file: Union[str, Path]) -> None: 792 """ 793 Save `self` in serialized form to a `.pkl` file. 794 If the file already exists, it will be overwritten. 795 796 Args: 797 file (str or pathlib.Path): 798 Path to the file where the data will be saved. It should end with `.pkl`. 799 If the provided path does not have this extension, it will be appended. 800 """ 801 import pickle 802 with open(file, 'wb') as file: 803 pickle.dump(self, file)
A class for window-level Local Ancestry Inference (LAI) data.
21 def __init__( 22 self, 23 haplotypes: List[str], 24 lai: np.ndarray, 25 samples: Optional[List[str]] = None, 26 ancestry_map: Optional[Dict[str, str]] = None, 27 window_sizes: Optional[np.ndarray] = None, 28 centimorgan_pos: Optional[np.ndarray] = None, 29 chromosomes: Optional[np.ndarray] = None, 30 physical_pos: Optional[np.ndarray] = None 31 ) -> None: 32 """ 33 Args: 34 haplotypes (list of str of length n_haplotypes): 35 A list of unique haplotype identifiers. 36 lai (array of shape (n_windows, n_haplotypes)): 37 A 2D array containing local ancestry inference values, where each row represents a 38 genomic window, and each column corresponds to a haplotype phase for each sample. 39 samples (list of str of length n_samples, optional): 40 A list of unique sample identifiers. 41 ancestry_map (dict of str to str, optional): 42 A dictionary mapping ancestry codes to region names. 43 window_sizes (array of shape (n_windows,), optional): 44 An array specifying the number of SNPs in each genomic window. 45 centimorgan_pos (array of shape (n_windows, 2), optional): 46 A 2D array containing the start and end centimorgan positions for each window. 47 chromosomes (array of shape (n_windows,), optional): 48 An array with chromosome numbers corresponding to each genomic window. 49 physical_pos (array of shape (n_windows, 2), optional): 50 A 2D array containing the start and end physical positions for each window. 51 """ 52 if lai.ndim != 2: 53 raise ValueError("`lai` must be a 2D array with shape (n_windows, n_haplotypes).") 54 55 # Determine the number of unique ancestries and samples from the LAI array 56 n_ancestries = len(np.unique(lai)) 57 n_haplotypes = lai.shape[1] 58 n_samples = n_haplotypes // 2 59 60 super(LocalAncestryObject, self).__init__(n_samples, n_ancestries) 61 62 self.__haplotypes = haplotypes 63 self.__lai = lai 64 self.__window_sizes = window_sizes 65 self.__centimorgan_pos = centimorgan_pos 66 self.__samples = samples 67 self.__chromosomes = chromosomes 68 self.__physical_pos = physical_pos 69 self.__ancestry_map = ancestry_map 70 71 # Perform sanity check to ensure all unique ancestries in LAI data are represented in the ancestry map 72 self._sanity_check()
Arguments:
- haplotypes (list of str of length n_haplotypes): A list of unique haplotype identifiers.
- lai (array of shape (n_windows, n_haplotypes)): A 2D array containing local ancestry inference values, where each row represents a genomic window, and each column corresponds to a haplotype phase for each sample.
- samples (list of str of length n_samples, optional): A list of unique sample identifiers.
- ancestry_map (dict of str to str, optional): A dictionary mapping ancestry codes to region names.
- window_sizes (array of shape (n_windows,), optional): An array specifying the number of SNPs in each genomic window.
- centimorgan_pos (array of shape (n_windows, 2), optional): A 2D array containing the start and end centimorgan positions for each window.
- chromosomes (array of shape (n_windows,), optional): An array with chromosome numbers corresponding to each genomic window.
- physical_pos (array of shape (n_windows, 2), optional): A 2D array containing the start and end physical positions for each window.
111 @property 112 def lai(self) -> np.ndarray: 113 """ 114 Retrieve `lai`. 115 116 Returns: 117 **array of shape (n_windows, n_haplotypes):** 118 A 2D array containing local ancestry inference values, where each row represents a 119 genomic window, and each column corresponds to a haplotype phase for each sample. 120 """ 121 return self.__lai
Retrieve lai.
Returns:
array of shape (n_windows, n_haplotypes): A 2D array containing local ancestry inference values, where each row represents a genomic window, and each column corresponds to a haplotype phase for each sample.
130 @property 131 def samples(self) -> Optional[List[str]]: 132 """ 133 Retrieve `samples`. 134 135 Returns: 136 **list of str:** A list of unique sample identifiers. 137 """ 138 if self.__samples is not None: 139 return self.__samples 140 elif self.__haplotypes is not None: 141 return [hap.split('.')[0] for hap in self.__haplotypes][::2] 142 else: 143 return None
152 @property 153 def ancestry_map(self) -> Optional[Dict[str, str]]: 154 """ 155 Retrieve `ancestry_map`. 156 157 Returns: 158 **dict of str to str:** A dictionary mapping ancestry codes to region names. 159 """ 160 return self.__ancestry_map
Retrieve ancestry_map.
Returns:
dict of str to str: A dictionary mapping ancestry codes to region names.
169 @property 170 def window_sizes(self) -> Optional[np.ndarray]: 171 """ 172 Retrieve `window_sizes`. 173 174 Returns: 175 **array of shape (n_windows,):** 176 An array specifying the number of SNPs in each genomic window. 177 """ 178 return self.__window_sizes
Retrieve window_sizes.
Returns:
array of shape (n_windows,): An array specifying the number of SNPs in each genomic window.
187 @property 188 def centimorgan_pos(self) -> Optional[np.ndarray]: 189 """ 190 Retrieve `centimorgan_pos`. 191 192 Returns: 193 **array of shape (n_windows, 2):** 194 A 2D array containing the start and end centimorgan positions for each window. 195 """ 196 return self.__centimorgan_pos
Retrieve centimorgan_pos.
Returns:
array of shape (n_windows, 2): A 2D array containing the start and end centimorgan positions for each window.
205 @property 206 def chromosomes(self) -> Optional[np.ndarray]: 207 """ 208 Retrieve `chromosomes`. 209 210 Returns: 211 **array of shape (n_windows,):** 212 An array with chromosome numbers corresponding to each genomic window. 213 """ 214 return self.__chromosomes
Retrieve chromosomes.
Returns:
array of shape (n_windows,): An array with chromosome numbers corresponding to each genomic window.
223 @property 224 def physical_pos(self) -> Optional[np.ndarray]: 225 """ 226 Retrieve `physical_pos`. 227 228 Returns: 229 **array of shape (n_windows, 2):** 230 A 2D array containing the start and end physical positions for each window. 231 """ 232 return self.__physical_pos
Retrieve physical_pos.
Returns:
array of shape (n_windows, 2): A 2D array containing the start and end physical positions for each window.
241 @property 242 def n_samples(self) -> int: 243 """ 244 Retrieve `n_samples`. 245 246 Returns: 247 **int:** 248 The total number of samples. 249 """ 250 if self.__samples is not None: 251 return len(self.__samples) 252 elif self.__haplotypes is not None: 253 # Divide by 2 because each sample has two associated haplotypes 254 return len(self.__haplotypes) // 2 255 else: 256 #Â Divide by 2 because columns represent haplotypes 257 return self.__lai.shape[1] // 2
292 def copy(self) -> 'LocalAncestryObject': 293 """ 294 Create and return a copy of `self`. 295 296 Returns: 297 **LocalAncestryObject:** 298 A new instance of the current object. 299 """ 300 return copy.copy(self)
Create and return a copy of self.
Returns:
LocalAncestryObject: A new instance of the current object.
302 def keys(self) -> List[str]: 303 """ 304 Retrieve a list of public attribute names for `self`. 305 306 Returns: 307 **list of str:** 308 A list of attribute names, with internal name-mangling removed, 309 for easier reference to public attributes in the instance. 310 """ 311 return [attr.replace('_LocalAncestryObject__', '').replace('_AncestryObject__', '') for attr in vars(self)]
Retrieve a list of public attribute names for self.
Returns:
list of str: A list of attribute names, with internal name-mangling removed, for easier reference to public attributes in the instance.
313 def filter_windows( 314 self, 315 indexes: Union[int, Sequence[int], np.ndarray], 316 include: bool = True, 317 inplace: bool = False 318 ) -> Optional['LocalAncestryObject']: 319 """ 320 Filter genomic windows based on specified indexes. 321 322 This method updates the `lai` attribute to include or exclude the specified genomic windows. 323 Attributes such as `window_sizes`, `centimorgan_pos`, `chromosomes`, and `physical_pos` will also be 324 updated accordingly if they are not None. The order of genomic windows is preserved. 325 326 Negative indexes are supported and follow 327 [NumPy's indexing conventions](https://numpy.org/doc/stable/user/basics.indexing.html). 328 329 Args: 330 indexes (int or array-like of int): 331 Index(es) of the windows to include or exclude. Can be a single integer or a 332 sequence of integers. Negative indexes are supported. 333 include (bool, default=True): 334 If True, includes only the specified windows. If False, excludes the specified 335 windows. Default is True. 336 inplace (bool, default=False): 337 If True, modifies `self` in place. If False, returns a new `LocalAncestryObject` with 338 the windows filtered. Default is False. 339 340 Returns: 341 **Optional[LocalAncestryObject]:** 342 A new `LocalAncestryObject` with the specified windows filtered if `inplace=False`. 343 If `inplace=True`, modifies `self` in place and returns None. 344 """ 345 # Convert indexes to a NumPy array 346 indexes = np.atleast_1d(indexes) 347 348 # Get total number of windows 349 n_windows = self.n_windows 350 351 # Validate indexes, allowing negative indexes 352 if np.any((indexes < -n_windows) | (indexes >= n_windows)): 353 raise IndexError("One or more indexes are out of bounds.") 354 355 # Create boolean mask 356 mask = np.zeros(n_windows, dtype=bool) 357 mask[indexes] = True 358 359 # Invert mask if `include=False` 360 if not include: 361 mask = ~mask 362 363 # Filter `lai` 364 filtered_lai = self['lai'][mask, :] 365 366 # Filter `window_sizes`, `chromosomes`, `centimorgan_pos`, and `physical_pos`, checking if they are None before filtering 367 filtered_window_sizes = self['window_sizes'][mask] if self['window_sizes'] is not None else None 368 filtered_chromosomes = self['chromosomes'][mask] if self['chromosomes'] is not None else None 369 filtered_centimorgan_pos = self['centimorgan_pos'][mask, :] if self['centimorgan_pos'] is not None else None 370 filtered_physical_pos = self['physical_pos'][mask, :] if self['physical_pos'] is not None else None 371 372 # Modify the original object if `inplace=True`, otherwise create and return a copy 373 if inplace: 374 self['lai'] = filtered_lai 375 if filtered_window_sizes is not None: 376 self['window_sizes'] = filtered_window_sizes 377 if filtered_chromosomes is not None: 378 self['chromosomes'] = filtered_chromosomes 379 if filtered_centimorgan_pos is not None: 380 self['centimorgan_pos'] = filtered_centimorgan_pos 381 if filtered_physical_pos is not None: 382 self['physical_pos'] = filtered_physical_pos 383 return None 384 else: 385 laiobj = self.copy() 386 laiobj['lai'] = filtered_lai 387 if filtered_window_sizes is not None: 388 laiobj['window_sizes'] = filtered_window_sizes 389 if filtered_chromosomes is not None: 390 laiobj['chromosomes'] = filtered_chromosomes 391 if filtered_centimorgan_pos is not None: 392 laiobj['centimorgan_pos'] = filtered_centimorgan_pos 393 if filtered_physical_pos is not None: 394 laiobj['physical_pos'] = filtered_physical_pos 395 return laiobj
Filter genomic windows based on specified indexes.
This method updates the lai attribute to include or exclude the specified genomic windows.
Attributes such as window_sizes, centimorgan_pos, chromosomes, and physical_pos will also be
updated accordingly if they are not None. The order of genomic windows is preserved.
Negative indexes are supported and follow NumPy's indexing conventions.
Arguments:
- indexes (int or array-like of int): Index(es) of the windows to include or exclude. Can be a single integer or a sequence of integers. Negative indexes are supported.
- include (bool, default=True): If True, includes only the specified windows. If False, excludes the specified windows. Default is True.
- inplace (bool, default=False): If True, modifies
selfin place. If False, returns a newLocalAncestryObjectwith the windows filtered. Default is False.
Returns:
Optional[LocalAncestryObject]: A new
LocalAncestryObjectwith the specified windows filtered ifinplace=False. Ifinplace=True, modifiesselfin place and returns None.
397 def filter_samples( 398 self, 399 samples: Optional[Union[str, Sequence[str], np.ndarray, None]] = None, 400 indexes: Optional[Union[int, Sequence[int], np.ndarray, None]] = None, 401 include: bool = True, 402 reorder: bool = False, 403 inplace: bool = False 404 ) -> Optional['LocalAncestryObject']: 405 """ 406 Filter samples based on specified names or indexes. 407 408 This method updates the `lai`, `haplotypes`, and `samples` attributes to include or exclude the specified 409 samples. Each sample is associated with two haplotypes, which are included or excluded together. 410 The order of the samples is preserved. Set `reorder=True` to match the ordering of the 411 provided `samples` and/or `indexes` lists when including. 412 413 If both samples and indexes are provided, any sample matching either a name in samples or an index in 414 indexes will be included or excluded. 415 416 Negative indexes are supported and follow 417 [NumPy's indexing conventions](https://numpy.org/doc/stable/user/basics.indexing.html). 418 419 Args: 420 samples (str or array_like of str, optional): 421 Name(s) of the samples to include or exclude. Can be a single sample name or a 422 sequence of sample names. Default is None. 423 indexes (int or array_like of int, optional): 424 Index(es) of the samples to include or exclude. Can be a single index or a sequence 425 of indexes. Negative indexes are supported. Default is None. 426 include (bool, default=True): 427 If True, includes only the specified samples. If False, excludes the specified 428 samples. Default is True. 429 inplace (bool, default=False): 430 If True, modifies `self` in place. If False, returns a new `LocalAncestryObject` with the 431 samples filtered. Default is False. 432 433 Returns: 434 **Optional[LocalAncestryObject]:** 435 A new `LocalAncestryObject` with the specified samples filtered if `inplace=False`. 436 If `inplace=True`, modifies `self` in place and returns None. 437 """ 438 if samples is None and indexes is None: 439 raise UserWarning("At least one of 'samples' or 'indexes' must be provided.") 440 441 n_haplotypes = self.n_haplotypes 442 n_samples = self.n_samples 443 444 # Create mask based on sample names 445 if samples is not None: 446 samples = np.asarray(samples).ravel() 447 # Extract sample names from haplotype identifiers 448 haplotype_ids = np.array(self['haplotypes']) 449 sample_names = np.array([hap.split('.')[0] for hap in haplotype_ids]) 450 # Create mask for haplotypes belonging to specified samples 451 mask_samples = np.isin(sample_names, samples) 452 else: 453 mask_samples = np.zeros(n_haplotypes, dtype=bool) 454 455 # Create mask based on sample indexes 456 if indexes is not None: 457 indexes = np.asarray(indexes).ravel() 458 459 # Validate indexes, allowing negative indexes 460 out_of_bounds_indexes = indexes[(indexes < -n_samples) | (indexes >= n_samples)] 461 if out_of_bounds_indexes.size > 0: 462 raise ValueError(f"One or more sample indexes are out of bounds.") 463 464 # Adjust negative indexes 465 indexes = np.mod(indexes, n_samples) 466 467 # Get haplotype indexes for the specified sample indexes 468 haplotype_indexes = np.concatenate([2*indexes, 2*indexes+1]) 469 # Create mask for haplotypes 470 mask_indexes = np.zeros(n_haplotypes, dtype=bool) 471 mask_indexes[haplotype_indexes] = True 472 else: 473 mask_indexes = np.zeros(n_haplotypes, dtype=bool) 474 475 # Combine masks using logical OR (union of samples) 476 mask_combined = mask_samples | mask_indexes 477 478 if not include: 479 mask_combined = ~mask_combined 480 481 # Optionally compute an ordering of selected samples that follows the provided lists 482 ordered_sample_indices = None 483 sample_mask = mask_combined.reshape(-1, 2).any(axis=1) 484 if include and reorder: 485 sel_sample_indices = np.where(sample_mask)[0] 486 ordered_list: List[int] = [] 487 added = np.zeros(self.n_samples, dtype=bool) 488 489 # Source of sample names for ordering logic 490 haplotype_ids = np.array(self['haplotypes']) 491 sample_names_by_sample = np.array([hap.split('.')[0] for hap in haplotype_ids])[::2] 492 493 # Respect the order in `samples` 494 if samples is not None: 495 for s in np.atleast_1d(samples): 496 # Find the sample index by name (first occurrence) 497 matches = np.where(sample_names_by_sample == s)[0] 498 for idx in matches: 499 if sample_mask[idx] and not added[idx]: 500 ordered_list.append(int(idx)) 501 added[idx] = True 502 503 # Then respect the order in `indexes` 504 if indexes is not None: 505 adj_idx = np.mod(np.atleast_1d(indexes), self.n_samples) 506 for idx in adj_idx: 507 if sample_mask[idx] and not added[idx]: 508 ordered_list.append(int(idx)) 509 added[idx] = True 510 511 # Append any remaining selected samples in their original order 512 for idx in sel_sample_indices: 513 if not added[idx]: 514 ordered_list.append(int(idx)) 515 516 ordered_sample_indices = np.asarray(ordered_list, dtype=int) 517 518 # Filter / reorder arrays 519 if ordered_sample_indices is not None: 520 hap_idx = np.concatenate([2*ordered_sample_indices, 2*ordered_sample_indices + 1]) 521 filtered_lai = self['lai'][:, hap_idx] 522 filtered_haplotypes = np.array(self['haplotypes'])[hap_idx].tolist() 523 filtered_samples = ( 524 np.array(self['samples'])[ordered_sample_indices].tolist() 525 if self['samples'] is not None else None 526 ) 527 else: 528 # Filter `lai` 529 filtered_lai = self['lai'][:, mask_combined] 530 # Filter `haplotypes` 531 filtered_haplotypes = np.array(self['haplotypes'])[mask_combined].tolist() 532 # Filter `samples`, checking if they are None before filtering 533 filtered_samples = np.array(self['samples'])[sample_mask].tolist() if self['samples'] is not None else None 534 535 if inplace: 536 self['haplotypes'] = filtered_haplotypes 537 self['samples'] = filtered_samples 538 self['lai'] = filtered_lai 539 return None 540 else: 541 laiobj = self.copy() 542 laiobj['haplotypes'] = filtered_haplotypes 543 laiobj['samples'] = filtered_samples 544 laiobj['lai'] = filtered_lai 545 return laiobj
Filter samples based on specified names or indexes.
This method updates the lai, haplotypes, and samples attributes to include or exclude the specified
samples. Each sample is associated with two haplotypes, which are included or excluded together.
The order of the samples is preserved. Set reorder=True to match the ordering of the
provided samples and/or indexes lists when including.
If both samples and indexes are provided, any sample matching either a name in samples or an index in indexes will be included or excluded.
Negative indexes are supported and follow NumPy's indexing conventions.
Arguments:
- samples (str or array_like of str, optional): Name(s) of the samples to include or exclude. Can be a single sample name or a sequence of sample names. Default is None.
- indexes (int or array_like of int, optional): Index(es) of the samples to include or exclude. Can be a single index or a sequence of indexes. Negative indexes are supported. Default is None.
- include (bool, default=True): If True, includes only the specified samples. If False, excludes the specified samples. Default is True.
- inplace (bool, default=False): If True, modifies
selfin place. If False, returns a newLocalAncestryObjectwith the samples filtered. Default is False.
Returns:
Optional[LocalAncestryObject]: A new
LocalAncestryObjectwith the specified samples filtered ifinplace=False. Ifinplace=True, modifiesselfin place and returns None.
547 def convert_to_snp_level( 548 self, 549 snpobject: Optional['SNPObject'] = None, 550 variants_chrom: Optional[np.ndarray] = None, 551 variants_pos: Optional[np.ndarray] = None, 552 variants_ref: Optional[np.ndarray] = None, 553 variants_alt: Optional[np.ndarray] = None, 554 variants_filter_pass: Optional[np.ndarray] = None, 555 variants_id: Optional[np.ndarray] = None, 556 variants_qual: Optional[np.ndarray] = None, 557 lai_format: str = "3D" 558 ) -> 'SNPObject': 559 """ 560 Convert `self` into a `snputils.snp.genobj.SNPObject` SNP-level Local Ancestry Information (LAI), 561 with optional support for SNP data. 562 563 If SNP positions (`variants_pos`) and/or chromosomes (`variants_chrom`) are not specified, the method generates 564 SNPs uniformly across the start and end positions of each genomic window. Otherwise, the provided SNP 565 coordinates are used to assign ancestry values based on their respective windows. 566 567 If a `SNPObject` is provided, its attributes are used unless explicitly overridden by the function arguments. 568 In that case, the SNPObject is updated with the (optional) new attributes and the computed `calldata_lai`, then returned. 569 570 Args: 571 snpobject (SNPObject, optional): 572 An existing `SNPObject` to extract SNP attributes from. 573 variants_chrom (array of shape (n_snps,), optional): 574 An array containing the chromosome for each SNP. 575 variants_pos (array of shape (n_snps,), optional): 576 An array containing the chromosomal positions for each SNP. 577 variants_ref (array of shape (n_snps,), optional): 578 An array containing the reference allele for each SNP. 579 variants_alt (array of shape (n_snps,), optional): 580 An array containing the alternate allele for each SNP. 581 variants_filter_pass (array of shape (n_snps,), optional): 582 An array indicating whether each SNP passed control checks. 583 variants_id (array of shape (n_snps,), optional): 584 An array containing unique identifiers (IDs) for each SNP. 585 variants_qual (array of shape (n_snps,), optional): 586 An array containing the Phred-scaled quality score for each SNP. 587 lai_format (str, optional): 588 Determines the shape of `calldata_lai`: 589 - `"3D"` (default): Shape `(n_snps, n_samples, 2)`. 590 - `"2D"`: Shape `(n_snps, n_samples * 2)`. 591 592 Returns: 593 **SNPObject**: 594 A `SNPObject` containing SNP-level ancestry data and updated SNP attributes. 595 """ 596 from snputils.snp.genobj.snpobj import SNPObject 597 598 assert lai_format in {"2D", "3D"}, "Invalid `lai_format`. Must be '2D' or '3D'." 599 600 # Extract attributes from SNPObject if provided 601 if snpobject is not None: 602 variants_chrom = variants_chrom or snpobject.variants_chrom 603 variants_pos = variants_pos or snpobject.variants_pos 604 variants_ref = variants_ref or snpobject.variants_ref 605 variants_alt = variants_alt or snpobject.variants_alt 606 variants_filter_pass = variants_filter_pass or snpobject.variants_filter_pass 607 variants_id = variants_id or snpobject.variants_id 608 variants_qual = variants_qual or snpobject.variants_qual 609 610 n_samples = self.n_samples 611 lai_reshaped = self.lai.reshape(self.n_windows, n_samples, 2).astype(int) if lai_format == "3D" else None 612 613 if variants_pos is None or variants_chrom is None: 614 # Generate all SNP positions and corresponding chromosome labels between window boundaries 615 variants_pos_list = [] 616 variants_chrom_list = [] 617 ancestry_list = [] 618 619 for i in range(self.n_windows): 620 start = int(self.physical_pos[i, 0]) 621 end = int(self.physical_pos[i, 1]) 622 chrom = self.chromosomes[i] 623 624 # Generate SNP positions at each base pair within the window range 625 positions_in_window = np.arange(start, end + 1) 626 if positions_in_window.size == 0: 627 continue # Skip windows that contain no valid SNP positions 628 629 n_positions = positions_in_window.size 630 variants_pos_list.append(positions_in_window) 631 variants_chrom_list.append(np.full(n_positions, chrom)) 632 633 ancestry_repeated = ( 634 np.repeat(lai_reshaped[i, np.newaxis, :, :], n_positions, axis=0) 635 if lai_format == "3D" else np.repeat(self.lai[i, np.newaxis, :], n_positions, axis=0) 636 ) 637 ancestry_list.append(ancestry_repeated) 638 639 # Store SNP positions, their corresponding chromosome labels, and their associated ancestry 640 variants_pos = np.concatenate(variants_pos_list) 641 variants_chrom = np.concatenate(variants_chrom_list) 642 calldata_lai = np.concatenate(ancestry_list) 643 else: 644 # Use the provided SNP positions and chromosomes 645 n_snps = len(variants_pos) 646 if len(variants_chrom) != n_snps: 647 raise ValueError("`variants_pos` and `variants_chrom` must have the same length.") 648 649 # Initialize an array to store the corresponding window index for each SNP 650 # Default value is -1, meaning no matching window found 651 snp_to_window_indices = np.full(n_snps, -1, dtype=int) 652 653 # Identify unique chromosome names sorted by order of appearence 654 _, idx = np.unique(variants_chrom, return_index=True) 655 unique_chroms = variants_chrom[np.sort(idx)] 656 657 # Iterate through each unique chromosome to map SNPs to windows 658 for chrom in unique_chroms: 659 # Get indices of SNPs that belong to the current chromosome 660 snp_indices = np.where(variants_chrom == chrom)[0] 661 snp_pos_chr = variants_pos[snp_indices] 662 663 # Get indices of windows that belong to the current chromosome 664 window_indices = np.where(self.chromosomes == chrom)[0] 665 if window_indices.size == 0: 666 continue # Skip if no windows exist for this chromosome 667 668 # Extract start and end positions of the windows on this chromosome 669 window_starts_chr = self.physical_pos[:, 0][window_indices] 670 window_ends_chr = self.physical_pos[:, 1][window_indices] 671 672 # Find the right-most window that a SNP would fit into (sorted order) 673 inds = np.searchsorted(window_starts_chr, snp_pos_chr, side='right') - 1 674 675 # Mask valid SNPs: ensure they are within a valid range and fall inside window boundaries 676 valid_mask = (inds >= 0) & (inds < len(window_starts_chr)) & (snp_pos_chr <= window_ends_chr[inds]) 677 678 # Assign valid SNPs to their corresponding window indices 679 snp_to_window_indices[snp_indices[valid_mask]] = window_indices[inds[valid_mask]] 680 log.debug(f"Number of SNPs within window ranges for chromosome {chrom}: {valid_mask.sum()}") 681 682 # Initialize SNP-level ancestry array with NaNs as default 683 shape = (n_snps, n_samples, 2) if lai_format == "3D" else (n_snps, n_samples * 2) 684 calldata_lai = np.full(shape, np.nan, dtype='uint8') 685 686 # Assign ancestry values to SNPs with valid window assignments 687 valid_mask = (snp_to_window_indices != -1) 688 snp_indices = np.where(valid_mask)[0] 689 snp_to_window_indices = snp_to_window_indices[snp_indices] 690 691 if lai_format == "3D": 692 calldata_lai[snp_indices] = lai_reshaped[snp_to_window_indices] 693 else: # "2D" 694 calldata_lai[snp_indices] = self.lai[snp_to_window_indices] 695 696 if snpobject is not None: 697 # If a SNPObject was provided, update its attributes with any new values and add `calldata_lai`` 698 snpobject.variants_chrom = variants_chrom 699 snpobject.variants_pos = variants_pos 700 snpobject.variants_ref = variants_ref 701 snpobject.variants_alt = variants_alt 702 snpobject.variants_filter_pass = variants_filter_pass 703 snpobject.variants_id = variants_id 704 snpobject.variants_qual = variants_qual 705 snpobject.calldata_lai = calldata_lai 706 snpobject.ancestry_map = self.ancestry_map 707 return snpobject 708 else: 709 # Otherwise, create a new SNPObject 710 return SNPObject( 711 calldata_lai=calldata_lai.view(), 712 samples=self.samples, 713 variants_ref=variants_ref.view() if isinstance(variants_ref, np.ndarray) else variants_ref, 714 variants_alt=variants_alt.view() if isinstance(variants_alt, np.ndarray) else variants_alt, 715 variants_filter_pass=variants_filter_pass.view() if isinstance(variants_filter_pass, np.ndarray) else variants_filter_pass, 716 variants_chrom=variants_chrom.view(), 717 variants_id=variants_id.view() if isinstance(variants_id, np.ndarray) else variants_id, 718 variants_pos=variants_pos.view(), 719 variants_qual=variants_qual.view() if isinstance(variants_qual, np.ndarray) else variants_qual, 720 ancestry_map=self.ancestry_map 721 )
Convert self into a snputils.snp.genobj.SNPObject SNP-level Local Ancestry Information (LAI),
with optional support for SNP data.
If SNP positions (variants_pos) and/or chromosomes (variants_chrom) are not specified, the method generates
SNPs uniformly across the start and end positions of each genomic window. Otherwise, the provided SNP
coordinates are used to assign ancestry values based on their respective windows.
If a SNPObject is provided, its attributes are used unless explicitly overridden by the function arguments.
In that case, the SNPObject is updated with the (optional) new attributes and the computed calldata_lai, then returned.
Arguments:
- snpobject (SNPObject, optional): An existing
SNPObjectto extract SNP attributes from. - variants_chrom (array of shape (n_snps,), optional): An array containing the chromosome for each SNP.
- variants_pos (array of shape (n_snps,), optional): An array containing the chromosomal positions for each SNP.
- variants_ref (array of shape (n_snps,), optional): An array containing the reference allele for each SNP.
- variants_alt (array of shape (n_snps,), optional): An array containing the alternate allele for each SNP.
- variants_filter_pass (array of shape (n_snps,), optional): An array indicating whether each SNP passed control checks.
- variants_id (array of shape (n_snps,), optional): An array containing unique identifiers (IDs) for each SNP.
- variants_qual (array of shape (n_snps,), optional): An array containing the Phred-scaled quality score for each SNP.
- lai_format (str, optional): Determines the shape of
calldata_lai:"3D"(default): Shape(n_snps, n_samples, 2)."2D": Shape(n_snps, n_samples * 2).
Returns:
SNPObject: A
SNPObjectcontaining SNP-level ancestry data and updated SNP attributes.
745 def save(self, file: Union[str, Path]) -> None: 746 """ 747 Save the data stored in `self` to a specified file. 748 If the file already exists, it will be overwritten. 749 750 The format of the saved file is determined by the file extension provided in the `file` 751 argument. 752 753 **Supported formats:** 754 755 - `.msp`: Text-based MSP format. 756 - `.msp.tsv`: Text-based MSP format with TSV extension. 757 - `.pkl`: Pickle format for saving `self` in serialized form. 758 759 Args: 760 file (str or pathlib.Path): 761 Path to the file where the data will be saved. The extension of the file determines the save format. 762 Supported extensions: `.msp`, `.msp.tsv`, `.pkl`. 763 """ 764 path = Path(file) 765 suffixes = [suffix.lower() for suffix in path.suffixes] 766 767 if suffixes[-2:] == ['.msp', '.tsv'] or suffixes[-1] == '.msp': 768 self.save_msp(file) 769 elif suffixes[-1] == '.pkl': 770 self.save_pickle(file) 771 else: 772 raise ValueError( 773 f"Unsupported file extension: {suffixes[-1]}" 774 "Supported extensions are: .msp, .msp.tsv, .pkl." 775 )
Save the data stored in self to a specified file.
If the file already exists, it will be overwritten.
The format of the saved file is determined by the file extension provided in the file
argument.
Supported formats:
.msp: Text-based MSP format..msp.tsv: Text-based MSP format with TSV extension..pkl: Pickle format for savingselfin serialized form.
Arguments:
- file (str or pathlib.Path): Path to the file where the data will be saved. The extension of the file determines the save format.
Supported extensions:
.msp,.msp.tsv,.pkl.
777 def save_msp(self, file: Union[str, Path]) -> None: 778 """ 779 Save the data stored in `self` to a `.msp` file. 780 If the file already exists, it will be overwritten. 781 782 Args: 783 file (str or pathlib.Path): 784 Path to the file where the data will be saved. It should end with `.msp` or `.msp.tsv`. 785 If the provided path does not have one of these extensions, the `.msp` extension will be appended. 786 """ 787 from snputils.ancestry.io.local.write import MSPWriter 788 789 MSPWriter(self, file).write()
Save the data stored in self to a .msp file.
If the file already exists, it will be overwritten.
Arguments:
- file (str or pathlib.Path): Path to the file where the data will be saved. It should end with
.mspor.msp.tsv. If the provided path does not have one of these extensions, the.mspextension will be appended.
791 def save_pickle(self, file: Union[str, Path]) -> None: 792 """ 793 Save `self` in serialized form to a `.pkl` file. 794 If the file already exists, it will be overwritten. 795 796 Args: 797 file (str or pathlib.Path): 798 Path to the file where the data will be saved. It should end with `.pkl`. 799 If the provided path does not have this extension, it will be appended. 800 """ 801 import pickle 802 with open(file, 'wb') as file: 803 pickle.dump(self, file)
Save self in serialized form to a .pkl file.
If the file already exists, it will be overwritten.
Arguments:
- file (str or pathlib.Path): Path to the file where the data will be saved. It should end with
.pkl. If the provided path does not have this extension, it will be appended.