snputils.ancestry.genobj
10class GlobalAncestryObject(AncestryObject): 11 """ 12 A class for Global Ancestry Inference (GAI) data. 13 """ 14 def __init__( 15 self, 16 Q: np.ndarray, 17 P: np.ndarray, 18 samples: Optional[Sequence] = None, 19 snps: Optional[Sequence] = None, 20 ancestries: Optional[Sequence] = None 21 ) -> None: 22 """ 23 Args: 24 Q (array of shape (n_samples, n_ancestries)): 25 A 2D array containing per-sample ancestry proportions. Each row corresponds to a sample, 26 and each column corresponds to an ancestry. 27 P (array of shape (n_snps, n_ancestries)): 28 A 2D array containing per-ancestry SNP frequencies. Each row corresponds to a SNP, 29 and each column corresponds to an ancestry. 30 samples (sequence of length n_samples, optional): 31 A sequence containing unique identifiers for each sample. If None, sample identifiers 32 are assigned as integers from `0` to `n_samples - 1`. 33 snps (sequence of length n_snps, optional): 34 A sequence containing identifiers for each SNP. If None, SNPs are assigned as integers 35 from `0` to `n_snps - 1`. 36 ancestries (sequence of length n_samples, optional): 37 A sequence containing ancestry labels for each sample. 38 """ 39 # Determine dimensions 40 n_samples, n_ancestries_Q = Q.shape 41 n_snps, n_ancestries_P = P.shape 42 43 if n_ancestries_Q != n_ancestries_P: 44 raise ValueError( 45 f"The number of ancestries in Q ({n_ancestries_Q}) and P ({n_ancestries_P}) must be the same." 46 ) 47 48 n_ancestries = n_ancestries_Q 49 50 # Assign default sample identifiers if none provided 51 if samples is None: 52 samples = list(range(n_samples)) 53 else: 54 samples = list(samples) 55 if len(samples) != n_samples: 56 raise ValueError( 57 f"Length of samples ({len(samples)}) does not match number of samples ({n_samples})." 58 ) 59 60 # Assign default SNP identifiers if none provided 61 if snps is None: 62 snps = list(range(n_snps)) 63 else: 64 snps = list(snps) 65 if len(snps) != n_snps: 66 raise ValueError( 67 f"Length of snps ({len(snps)}) does not match number of SNPs ({n_snps})." 68 ) 69 70 if len(ancestries) != n_samples: 71 raise ValueError( 72 f"Length of ancestries ({len(ancestries)}) does not match number of samples ({n_samples})." 73 ) 74 75 super().__init__(n_samples, n_ancestries) 76 77 # Store attributes 78 self.__Q = Q 79 self.__P = P 80 self.__samples = np.asarray(samples) 81 self.__snps = np.asarray(snps) 82 self.__ancestries = np.asarray(ancestries) 83 84 # Perform sanity checks 85 self._sanity_check() 86 87 def __getitem__(self, key): 88 """ 89 To access an attribute of the class using the square bracket notation, 90 similar to a dictionary. 91 """ 92 try: 93 return getattr(self, key) 94 except AttributeError: 95 raise KeyError(f'Invalid key: {key}') 96 97 def __setitem__(self, key, value): 98 """ 99 To set an attribute of the class using the square bracket notation, 100 similar to a dictionary. 101 """ 102 try: 103 setattr(self, key, value) 104 except AttributeError: 105 raise KeyError(f'Invalid key: {key}') 106 107 @property 108 def Q(self) -> np.ndarray: 109 """ 110 Retrieve `Q`. 111 112 Returns: 113 **array of shape (n_samples, n_ancestries):** 114 A 2D array containing per-sample ancestry proportions. Each row corresponds to a sample, 115 and each column corresponds to an ancestry. 116 """ 117 return self.__Q 118 119 @Q.setter 120 def Q(self, x: np.ndarray): 121 """ 122 Update `Q`. 123 """ 124 if x.shape != (self.n_samples, self.n_ancestries): 125 raise ValueError( 126 f"Q must have shape ({self.n_samples}, {self.n_ancestries}); got {x.shape}." 127 ) 128 self.__Q = x 129 130 @property 131 def P(self) -> np.ndarray: 132 """ 133 Retrieve `P`. 134 135 Returns: 136 **array of shape (n_snps, n_ancestries):** 137 A 2D array containing per-ancestry SNP frequencies. Each row corresponds to a SNP, 138 and each column corresponds to an ancestry. 139 """ 140 return self.__P 141 142 @P.setter 143 def P(self, x: np.ndarray): 144 """ 145 Update `P`. 146 """ 147 if x.shape != (self.n_snps, self.n_ancestries): 148 raise ValueError( 149 f"P must have shape ({self.n_snps}, {self.n_ancestries}). Provided shape is {x.shape}." 150 ) 151 self.__P = x 152 self._sanity_check() 153 154 @property 155 def F(self) -> np.ndarray: 156 """ 157 Alias for `P`. 158 159 Returns: 160 **array of shape (n_snps, n_ancestries):** 161 A 2D array containing per-ancestry SNP frequencies. Each row corresponds to a SNP, 162 and each column corresponds to an ancestry. 163 """ 164 return self.P 165 166 @F.setter 167 def F(self, x: np.ndarray): 168 """ 169 Update `F`. 170 """ 171 if x.shape != (self.n_snps, self.n_ancestries): 172 raise ValueError( 173 f"F must have shape ({self.n_snps}, {self.n_ancestries}). Provided shape is {x.shape}." 174 ) 175 self.__P = x 176 177 @property 178 def samples(self) -> Optional[np.ndarray]: 179 """ 180 Retrieve `samples`. 181 182 Returns: 183 **array of shape (n_samples,):** 184 An array containing unique identifiers for each sample. If None, sample 185 identifiers are assigned as integers from `0` to `n_samples - 1`. 186 """ 187 return self.__samples 188 189 @samples.setter 190 def samples(self, x: Sequence): 191 """ 192 Update `samples`. 193 """ 194 x = list(x) 195 if len(x) != self.n_samples: 196 raise ValueError( 197 f"samples must have length {self.n_samples}; got length {len(x)}." 198 ) 199 self.__samples = x 200 201 @property 202 def snps(self) -> Optional[np.ndarray]: 203 """ 204 Retrieve `snps`. 205 206 Returns: 207 **array of shape (n_snps,):** 208 An array containing identifiers for each SNP. If None, SNPs are assigned as integers 209 from `0` to `n_snps - 1`. 210 """ 211 return self.__snps 212 213 @snps.setter 214 def snps(self, x: Sequence): 215 """ 216 Update `snps`. 217 """ 218 x = list(x) 219 if len(x) != self.n_snps: 220 raise ValueError( 221 f"snps must have length {self.n_snps}; got length {len(x)}." 222 ) 223 self.__snps = np.asarray(x) 224 225 @property 226 def ancestries(self) -> Optional[np.ndarray]: 227 """ 228 Retrieve `ancestries`. 229 230 Returns: 231 **array of shape (n_samples,):** 232 An array containing ancestry labels for each sample. 233 """ 234 return self.__ancestries 235 236 @ancestries.setter 237 def ancestries(self, x: Sequence): 238 """ 239 Update `ancestries`. 240 """ 241 x = list(x) 242 num_x = len(x) 243 num_unique_x = len(np.unique(x)) 244 245 if num_x != self.n_samples: 246 raise ValueError( 247 f"ancestries must have length {self.n_samples}; got length {num_x}." 248 ) 249 if num_unique_x <= self.n_ancestries: 250 raise ValueError( 251 f"Number of unique ancestry labels must be less than or equal to {self.n_ancestries}; got {num_unique_x} unique labels." 252 ) 253 self.__ancestries = np.asarray(x) 254 255 @property 256 def n_samples(self) -> int: 257 """ 258 Retrieve `n_samples`. 259 260 Returns: 261 **int:** The total number of samples. 262 """ 263 return self.__Q.shape[0] 264 265 @property 266 def n_snps(self) -> int: 267 """ 268 Retrieve `n_snps`. 269 270 Returns: 271 **int:** The total number of SNPs. 272 """ 273 return self.__P.shape[0] 274 275 @property 276 def n_ancestries(self) -> int: 277 """ 278 Retrieve `n_ancestries`. 279 280 Returns: 281 **int:** The total number of unique ancestries. 282 """ 283 return self.__Q.shape[1] 284 285 def copy(self) -> 'GlobalAncestryObject': 286 """ 287 Create and return a copy of `self`. 288 289 Returns: 290 **GlobalAncestryObject:** A new instance of the current object. 291 """ 292 return copy.copy(self) 293 294 def keys(self) -> List[str]: 295 """ 296 Retrieve a list of public attribute names for `self`. 297 298 Returns: 299 **list of str:** 300 A list of attribute names, with internal name-mangling removed, 301 for easier reference to public attributes in the instance. 302 """ 303 return [attr.replace('_GlobalAncestryObject__', '').replace('_AncestryObject__', '') for attr in vars(self)] 304 305 def _sanity_check(self) -> None: 306 """ 307 Perform sanity checks to ensure that matrix dimensions are consistent with expected sizes. 308 309 Raises: 310 **ValueError:** If any of the matrix dimensions do not match the expected sizes. 311 """ 312 # Check that the Q matrix has the correct shape 313 if self.__Q.shape != (self.n_samples, self.n_ancestries): 314 raise ValueError( 315 f"Q must have shape ({self.n_samples}, {self.n_ancestries}); got {self.__Q.shape}." 316 ) 317 318 # Check that the P matrix has the correct shape 319 if self.__P.shape != (self.n_snps, self.n_ancestries): 320 raise ValueError( 321 f"P must have shape ({self.n_snps}, {self.n_ancestries}); got {self.__P.shape}." 322 ) 323 324 # Check that samples length matches n_samples 325 if self.samples is not None: 326 if len(self.__samples) != self.n_samples: 327 raise ValueError( 328 f"samples must have length {self.n_samples}; got length {len(self.__samples)}." 329 ) 330 331 # Check that snps length matches n_snps 332 if self.snps is not None: 333 if len(self.__snps) != self.n_snps: 334 raise ValueError( 335 f"snps must have length {self.n_snps}; got length {len(self.__snps)}." 336 ) 337 338 # Check that ancestries length matches n_samples 339 if self.ancestries is not None: 340 if len(self.__ancestries) != self.n_samples: 341 raise ValueError( 342 f"ancestries must have length {self.n_samples}; got length {len(self.__ancestries)}." 343 ) 344 345 # Check number of unique ancestry labels 346 num_unique_ancestries = len(np.unique(self.__ancestries)) 347 if num_unique_ancestries <= self.n_ancestries: 348 raise ValueError( 349 f"Number of unique ancestry labels must be less than or equal to {self.n_ancestries}; got {num_unique_ancestries} unique labels." 350 ) 351 352 def save(self, file: Union[str, Path]) -> None: 353 """ 354 Save the data stored in `self` to a specified file or set of files. 355 356 The format of the saved file(s) is determined by the file extension provided in the `file` 357 argument. If the extension is `.pkl`, the object is serialized as a pickle file. Otherwise, 358 the file is treated as a prefix for saving ADMIXTURE files. 359 360 **Supported formats:** 361 362 - `.pkl`: Pickle format for saving `self` in serialized form. 363 - Any other extension or no extension: Treated as a prefix for ADMIXTURE files. 364 365 Args: 366 file (str or pathlib.Path): 367 Path to the file where the data will be saved. If the extension is `.pkl`, the object 368 is serialized. Otherwise, it is treated as a prefix for ADMIXTURE files. 369 """ 370 path = Path(file) 371 suffix = path.suffix.lower() 372 373 if suffix == '.pkl': 374 self.save_pickle(path) 375 else: 376 self.save_admixture(path) 377 378 def save_admixture(self, file_prefix: Union[str, Path]) -> None: 379 """ 380 Save the data stored in `self` into multiple ADMIXTURE files. 381 If the file already exists, it will be overwritten. 382 383 **Output files:** 384 385 - `<file_prefix>.K.Q`: Q matrix file. The file uses space (' ') as the delimiter. 386 - `<file_prefix>.K.P`: P matrix file. The file uses space (' ') as the delimiter. 387 - `<file_prefix>.sample_ids.txt`: Sample IDs file (if sample IDs are available). 388 - `<file_prefix>.snp_ids.txt`: SNP IDs file (if SNP IDs are available). 389 - `<file_prefix>.map`: Ancestry file (if ancestries information is available). 390 391 Args: 392 file_prefix (str or pathlib.Path): 393 The base prefix for output file names, including directory path but excluding file extensions. 394 The prefix is used to generate specific file names for each output, with file-specific 395 suffixes appended as described above (e.g., `file_prefix.n_ancestries.Q` for the Q matrix file). 396 """ 397 from snputils.ancestry.io.wide.write.admixture import AdmixtureWriter 398 399 AdmixtureWriter(self, file_prefix).write() 400 401 def save_pickle(self, file: Union[str, Path]) -> None: 402 """ 403 Save `self` in serialized form to a `.pkl` file. 404 If the file already exists, it will be overwritten. 405 406 Args: 407 file (str or pathlib.Path): 408 Path to the file where the data will be saved. It should end with `.pkl`. 409 If the provided path does not have this extension, it will be appended. 410 """ 411 import pickle 412 with open(file, 'wb') as file: 413 pickle.dump(self, file)
A class for Global Ancestry Inference (GAI) data.
14 def __init__( 15 self, 16 Q: np.ndarray, 17 P: np.ndarray, 18 samples: Optional[Sequence] = None, 19 snps: Optional[Sequence] = None, 20 ancestries: Optional[Sequence] = None 21 ) -> None: 22 """ 23 Args: 24 Q (array of shape (n_samples, n_ancestries)): 25 A 2D array containing per-sample ancestry proportions. Each row corresponds to a sample, 26 and each column corresponds to an ancestry. 27 P (array of shape (n_snps, n_ancestries)): 28 A 2D array containing per-ancestry SNP frequencies. Each row corresponds to a SNP, 29 and each column corresponds to an ancestry. 30 samples (sequence of length n_samples, optional): 31 A sequence containing unique identifiers for each sample. If None, sample identifiers 32 are assigned as integers from `0` to `n_samples - 1`. 33 snps (sequence of length n_snps, optional): 34 A sequence containing identifiers for each SNP. If None, SNPs are assigned as integers 35 from `0` to `n_snps - 1`. 36 ancestries (sequence of length n_samples, optional): 37 A sequence containing ancestry labels for each sample. 38 """ 39 # Determine dimensions 40 n_samples, n_ancestries_Q = Q.shape 41 n_snps, n_ancestries_P = P.shape 42 43 if n_ancestries_Q != n_ancestries_P: 44 raise ValueError( 45 f"The number of ancestries in Q ({n_ancestries_Q}) and P ({n_ancestries_P}) must be the same." 46 ) 47 48 n_ancestries = n_ancestries_Q 49 50 # Assign default sample identifiers if none provided 51 if samples is None: 52 samples = list(range(n_samples)) 53 else: 54 samples = list(samples) 55 if len(samples) != n_samples: 56 raise ValueError( 57 f"Length of samples ({len(samples)}) does not match number of samples ({n_samples})." 58 ) 59 60 # Assign default SNP identifiers if none provided 61 if snps is None: 62 snps = list(range(n_snps)) 63 else: 64 snps = list(snps) 65 if len(snps) != n_snps: 66 raise ValueError( 67 f"Length of snps ({len(snps)}) does not match number of SNPs ({n_snps})." 68 ) 69 70 if len(ancestries) != n_samples: 71 raise ValueError( 72 f"Length of ancestries ({len(ancestries)}) does not match number of samples ({n_samples})." 73 ) 74 75 super().__init__(n_samples, n_ancestries) 76 77 # Store attributes 78 self.__Q = Q 79 self.__P = P 80 self.__samples = np.asarray(samples) 81 self.__snps = np.asarray(snps) 82 self.__ancestries = np.asarray(ancestries) 83 84 # Perform sanity checks 85 self._sanity_check()
Arguments:
- Q (array of shape (n_samples, n_ancestries)): A 2D array containing per-sample ancestry proportions. Each row corresponds to a sample, and each column corresponds to an ancestry.
- P (array of shape (n_snps, n_ancestries)): A 2D array containing per-ancestry SNP frequencies. Each row corresponds to a SNP, and each column corresponds to an ancestry.
- samples (sequence of length n_samples, optional): A sequence containing unique identifiers for each sample. If None, sample identifiers
are assigned as integers from
0
ton_samples - 1
. - snps (sequence of length n_snps, optional): A sequence containing identifiers for each SNP. If None, SNPs are assigned as integers
from
0
ton_snps - 1
. - ancestries (sequence of length n_samples, optional): A sequence containing ancestry labels for each sample.
107 @property 108 def Q(self) -> np.ndarray: 109 """ 110 Retrieve `Q`. 111 112 Returns: 113 **array of shape (n_samples, n_ancestries):** 114 A 2D array containing per-sample ancestry proportions. Each row corresponds to a sample, 115 and each column corresponds to an ancestry. 116 """ 117 return self.__Q
Retrieve Q
.
Returns:
array of shape (n_samples, n_ancestries): A 2D array containing per-sample ancestry proportions. Each row corresponds to a sample, and each column corresponds to an ancestry.
130 @property 131 def P(self) -> np.ndarray: 132 """ 133 Retrieve `P`. 134 135 Returns: 136 **array of shape (n_snps, n_ancestries):** 137 A 2D array containing per-ancestry SNP frequencies. Each row corresponds to a SNP, 138 and each column corresponds to an ancestry. 139 """ 140 return self.__P
Retrieve P
.
Returns:
array of shape (n_snps, n_ancestries): A 2D array containing per-ancestry SNP frequencies. Each row corresponds to a SNP, and each column corresponds to an ancestry.
154 @property 155 def F(self) -> np.ndarray: 156 """ 157 Alias for `P`. 158 159 Returns: 160 **array of shape (n_snps, n_ancestries):** 161 A 2D array containing per-ancestry SNP frequencies. Each row corresponds to a SNP, 162 and each column corresponds to an ancestry. 163 """ 164 return self.P
Alias for P
.
Returns:
array of shape (n_snps, n_ancestries): A 2D array containing per-ancestry SNP frequencies. Each row corresponds to a SNP, and each column corresponds to an ancestry.
177 @property 178 def samples(self) -> Optional[np.ndarray]: 179 """ 180 Retrieve `samples`. 181 182 Returns: 183 **array of shape (n_samples,):** 184 An array containing unique identifiers for each sample. If None, sample 185 identifiers are assigned as integers from `0` to `n_samples - 1`. 186 """ 187 return self.__samples
Retrieve samples
.
Returns:
array of shape (n_samples,): An array containing unique identifiers for each sample. If None, sample identifiers are assigned as integers from
0
ton_samples - 1
.
201 @property 202 def snps(self) -> Optional[np.ndarray]: 203 """ 204 Retrieve `snps`. 205 206 Returns: 207 **array of shape (n_snps,):** 208 An array containing identifiers for each SNP. If None, SNPs are assigned as integers 209 from `0` to `n_snps - 1`. 210 """ 211 return self.__snps
Retrieve snps
.
Returns:
array of shape (n_snps,): An array containing identifiers for each SNP. If None, SNPs are assigned as integers from
0
ton_snps - 1
.
225 @property 226 def ancestries(self) -> Optional[np.ndarray]: 227 """ 228 Retrieve `ancestries`. 229 230 Returns: 231 **array of shape (n_samples,):** 232 An array containing ancestry labels for each sample. 233 """ 234 return self.__ancestries
Retrieve ancestries
.
Returns:
array of shape (n_samples,): An array containing ancestry labels for each sample.
285 def copy(self) -> 'GlobalAncestryObject': 286 """ 287 Create and return a copy of `self`. 288 289 Returns: 290 **GlobalAncestryObject:** A new instance of the current object. 291 """ 292 return copy.copy(self)
Create and return a copy of self
.
Returns:
GlobalAncestryObject: A new instance of the current object.
294 def keys(self) -> List[str]: 295 """ 296 Retrieve a list of public attribute names for `self`. 297 298 Returns: 299 **list of str:** 300 A list of attribute names, with internal name-mangling removed, 301 for easier reference to public attributes in the instance. 302 """ 303 return [attr.replace('_GlobalAncestryObject__', '').replace('_AncestryObject__', '') for attr in vars(self)]
Retrieve a list of public attribute names for self
.
Returns:
list of str: A list of attribute names, with internal name-mangling removed, for easier reference to public attributes in the instance.
352 def save(self, file: Union[str, Path]) -> None: 353 """ 354 Save the data stored in `self` to a specified file or set of files. 355 356 The format of the saved file(s) is determined by the file extension provided in the `file` 357 argument. If the extension is `.pkl`, the object is serialized as a pickle file. Otherwise, 358 the file is treated as a prefix for saving ADMIXTURE files. 359 360 **Supported formats:** 361 362 - `.pkl`: Pickle format for saving `self` in serialized form. 363 - Any other extension or no extension: Treated as a prefix for ADMIXTURE files. 364 365 Args: 366 file (str or pathlib.Path): 367 Path to the file where the data will be saved. If the extension is `.pkl`, the object 368 is serialized. Otherwise, it is treated as a prefix for ADMIXTURE files. 369 """ 370 path = Path(file) 371 suffix = path.suffix.lower() 372 373 if suffix == '.pkl': 374 self.save_pickle(path) 375 else: 376 self.save_admixture(path)
Save the data stored in self
to a specified file or set of files.
The format of the saved file(s) is determined by the file extension provided in the file
argument. If the extension is .pkl
, the object is serialized as a pickle file. Otherwise,
the file is treated as a prefix for saving ADMIXTURE files.
Supported formats:
.pkl
: Pickle format for savingself
in serialized form.- Any other extension or no extension: Treated as a prefix for ADMIXTURE files.
Arguments:
- file (str or pathlib.Path): Path to the file where the data will be saved. If the extension is
.pkl
, the object is serialized. Otherwise, it is treated as a prefix for ADMIXTURE files.
378 def save_admixture(self, file_prefix: Union[str, Path]) -> None: 379 """ 380 Save the data stored in `self` into multiple ADMIXTURE files. 381 If the file already exists, it will be overwritten. 382 383 **Output files:** 384 385 - `<file_prefix>.K.Q`: Q matrix file. The file uses space (' ') as the delimiter. 386 - `<file_prefix>.K.P`: P matrix file. The file uses space (' ') as the delimiter. 387 - `<file_prefix>.sample_ids.txt`: Sample IDs file (if sample IDs are available). 388 - `<file_prefix>.snp_ids.txt`: SNP IDs file (if SNP IDs are available). 389 - `<file_prefix>.map`: Ancestry file (if ancestries information is available). 390 391 Args: 392 file_prefix (str or pathlib.Path): 393 The base prefix for output file names, including directory path but excluding file extensions. 394 The prefix is used to generate specific file names for each output, with file-specific 395 suffixes appended as described above (e.g., `file_prefix.n_ancestries.Q` for the Q matrix file). 396 """ 397 from snputils.ancestry.io.wide.write.admixture import AdmixtureWriter 398 399 AdmixtureWriter(self, file_prefix).write()
Save the data stored in self
into multiple ADMIXTURE files.
If the file already exists, it will be overwritten.
Output files:
<file_prefix>.K.Q
: Q matrix file. The file uses space (' ') as the delimiter.<file_prefix>.K.P
: P matrix file. The file uses space (' ') as the delimiter.<file_prefix>.sample_ids.txt
: Sample IDs file (if sample IDs are available).<file_prefix>.snp_ids.txt
: SNP IDs file (if SNP IDs are available).<file_prefix>.map
: Ancestry file (if ancestries information is available).
Arguments:
- file_prefix (str or pathlib.Path): The base prefix for output file names, including directory path but excluding file extensions.
The prefix is used to generate specific file names for each output, with file-specific
suffixes appended as described above (e.g.,
file_prefix.n_ancestries.Q
for the Q matrix file).
401 def save_pickle(self, file: Union[str, Path]) -> None: 402 """ 403 Save `self` in serialized form to a `.pkl` file. 404 If the file already exists, it will be overwritten. 405 406 Args: 407 file (str or pathlib.Path): 408 Path to the file where the data will be saved. It should end with `.pkl`. 409 If the provided path does not have this extension, it will be appended. 410 """ 411 import pickle 412 with open(file, 'wb') as file: 413 pickle.dump(self, file)
Save self
in serialized form to a .pkl
file.
If the file already exists, it will be overwritten.
Arguments:
- file (str or pathlib.Path): Path to the file where the data will be saved. It should end with
.pkl
. If the provided path does not have this extension, it will be appended.
17class LocalAncestryObject(AncestryObject): 18 """ 19 A class for window-level Local Ancestry Inference (LAI) data. 20 """ 21 def __init__( 22 self, 23 haplotypes: List[str], 24 lai: np.ndarray, 25 samples: Optional[List[str]] = None, 26 ancestry_map: Optional[Dict[str, str]] = None, 27 window_sizes: Optional[np.ndarray] = None, 28 centimorgan_pos: Optional[np.ndarray] = None, 29 chromosomes: Optional[np.ndarray] = None, 30 physical_pos: Optional[np.ndarray] = None 31 ) -> None: 32 """ 33 Args: 34 haplotypes (list of str of length n_haplotypes): 35 A list of unique haplotype identifiers. 36 lai (array of shape (n_windows, n_haplotypes)): 37 A 2D array containing local ancestry inference values, where each row represents a 38 genomic window, and each column corresponds to a haplotype phase for each sample. 39 samples (list of str of length n_samples, optional): 40 A list of unique sample identifiers. 41 ancestry_map (dict of str to str, optional): 42 A dictionary mapping ancestry codes to region names. 43 window_sizes (array of shape (n_windows,), optional): 44 An array specifying the number of SNPs in each genomic window. 45 centimorgan_pos (array of shape (n_windows, 2), optional): 46 A 2D array containing the start and end centimorgan positions for each window. 47 chromosomes (array of shape (n_windows,), optional): 48 An array with chromosome numbers corresponding to each genomic window. 49 physical_pos (array of shape (n_windows, 2), optional): 50 A 2D array containing the start and end physical positions for each window. 51 """ 52 if lai.ndim != 2: 53 raise ValueError("`lai` must be a 2D array with shape (n_windows, n_haplotypes).") 54 55 # Determine the number of unique ancestries and samples from the LAI array 56 n_ancestries = len(np.unique(lai)) 57 n_haplotypes = lai.shape[1] 58 n_samples = n_haplotypes // 2 59 60 super(LocalAncestryObject, self).__init__(n_samples, n_ancestries) 61 62 self.__haplotypes = haplotypes 63 self.__lai = lai 64 self.__window_sizes = window_sizes 65 self.__centimorgan_pos = centimorgan_pos 66 self.__samples = samples 67 self.__chromosomes = chromosomes 68 self.__physical_pos = physical_pos 69 self.__ancestry_map = ancestry_map 70 71 # Perform sanity check to ensure all unique ancestries in LAI data are represented in the ancestry map 72 self._sanity_check() 73 74 def __getitem__(self, key): 75 """ 76 To access an attribute of the class using the square bracket notation, 77 similar to a dictionary. 78 """ 79 try: 80 return getattr(self, key) 81 except AttributeError: 82 raise KeyError(f'Invalid key: {key}') 83 84 def __setitem__(self, key, value): 85 """ 86 To set an attribute of the class using the square bracket notation, 87 similar to a dictionary. 88 """ 89 try: 90 setattr(self, key, value) 91 except AttributeError: 92 raise KeyError(f'Invalid key: {key}') 93 94 @property 95 def haplotypes(self) -> List[str]: 96 """ 97 Retrieve `haplotypes`. 98 99 Returns: 100 **list of length n_haplotypes:** A list of unique haplotype identifiers. 101 """ 102 return self.__haplotypes 103 104 @haplotypes.setter 105 def haplotypes(self, x): 106 """ 107 Update `haplotypes`. 108 """ 109 self.__haplotypes = x 110 111 @property 112 def lai(self) -> np.ndarray: 113 """ 114 Retrieve `lai`. 115 116 Returns: 117 **array of shape (n_windows, n_haplotypes):** 118 A 2D array containing local ancestry inference values, where each row represents a 119 genomic window, and each column corresponds to a haplotype phase for each sample. 120 """ 121 return self.__lai 122 123 @lai.setter 124 def lai(self, x): 125 """ 126 Update `lai`. 127 """ 128 self.__lai = x 129 130 @property 131 def samples(self) -> Optional[List[str]]: 132 """ 133 Retrieve `samples`. 134 135 Returns: 136 **list of str:** A list of unique sample identifiers. 137 """ 138 if self.__samples is not None: 139 return self.__samples 140 elif self.__haplotypes is not None: 141 return [hap.split('.')[0] for hap in self.__haplotypes][::2] 142 else: 143 return None 144 145 @samples.setter 146 def samples(self, x): 147 """ 148 Update `samples`. 149 """ 150 self.__samples = x 151 152 @property 153 def ancestry_map(self) -> Optional[Dict[str, str]]: 154 """ 155 Retrieve `ancestry_map`. 156 157 Returns: 158 **dict of str to str:** A dictionary mapping ancestry codes to region names. 159 """ 160 return self.__ancestry_map 161 162 @ancestry_map.setter 163 def ancestry_map(self, x): 164 """ 165 Update `ancestry_map`. 166 """ 167 self.__ancestry_map = x 168 169 @property 170 def window_sizes(self) -> Optional[np.ndarray]: 171 """ 172 Retrieve `window_sizes`. 173 174 Returns: 175 **array of shape (n_windows,):** 176 An array specifying the number of SNPs in each genomic window. 177 """ 178 return self.__window_sizes 179 180 @window_sizes.setter 181 def window_sizes(self, x): 182 """ 183 Update `window_sizes`. 184 """ 185 self.__window_sizes = x 186 187 @property 188 def centimorgan_pos(self) -> Optional[np.ndarray]: 189 """ 190 Retrieve `centimorgan_pos`. 191 192 Returns: 193 **array of shape (n_windows, 2):** 194 A 2D array containing the start and end centimorgan positions for each window. 195 """ 196 return self.__centimorgan_pos 197 198 @centimorgan_pos.setter 199 def centimorgan_pos(self, x): 200 """ 201 Update `centimorgan_pos`. 202 """ 203 self.__centimorgan_pos = x 204 205 @property 206 def chromosomes(self) -> Optional[np.ndarray]: 207 """ 208 Retrieve `chromosomes`. 209 210 Returns: 211 **array of shape (n_windows,):** 212 An array with chromosome numbers corresponding to each genomic window. 213 """ 214 return self.__chromosomes 215 216 @chromosomes.setter 217 def chromosomes(self, x): 218 """ 219 Update `chromosomes`. 220 """ 221 self.__chromosomes = x 222 223 @property 224 def physical_pos(self) -> Optional[np.ndarray]: 225 """ 226 Retrieve `physical_pos`. 227 228 Returns: 229 **array of shape (n_windows, 2):** 230 A 2D array containing the start and end physical positions for each window. 231 """ 232 return self.__physical_pos 233 234 @physical_pos.setter 235 def physical_pos(self, x): 236 """ 237 Update `physical_pos`. 238 """ 239 self.__physical_pos = x 240 241 @property 242 def n_samples(self) -> int: 243 """ 244 Retrieve `n_samples`. 245 246 Returns: 247 **int:** 248 The total number of samples. 249 """ 250 if self.__samples is not None: 251 return len(self.__samples) 252 elif self.__haplotypes is not None: 253 # Divide by 2 because each sample has two associated haplotypes 254 return len(self.__haplotypes) // 2 255 else: 256 #Â Divide by 2 because columns represent haplotypes 257 return self.__lai.shape[1] // 2 258 259 @property 260 def n_ancestries(self) -> int: 261 """ 262 Retrieve `n_ancestries`. 263 264 Returns: 265 **int:** The total number of unique ancestries. 266 """ 267 return len(np.unique(self.__lai)) 268 269 @property 270 def n_haplotypes(self) -> int: 271 """ 272 Retrieve `n_haplotypes`. 273 274 Returns: 275 **int:** The total number of haplotypes. 276 """ 277 if self.__haplotypes is not None: 278 return len(self.__haplotypes) 279 else: 280 return self.__lai.shape[1] 281 282 @property 283 def n_windows(self) -> int: 284 """ 285 Retrieve `n_windows`. 286 287 Returns: 288 **int:** The total number of genomic windows. 289 """ 290 return self.__lai.shape[0] 291 292 def copy(self) -> 'LocalAncestryObject': 293 """ 294 Create and return a copy of `self`. 295 296 Returns: 297 **LocalAncestryObject:** 298 A new instance of the current object. 299 """ 300 return copy.copy(self) 301 302 def keys(self) -> List[str]: 303 """ 304 Retrieve a list of public attribute names for `self`. 305 306 Returns: 307 **list of str:** 308 A list of attribute names, with internal name-mangling removed, 309 for easier reference to public attributes in the instance. 310 """ 311 return [attr.replace('_LocalAncestryObject__', '').replace('_AncestryObject__', '') for attr in vars(self)] 312 313 def filter_windows( 314 self, 315 indexes: Union[int, Sequence[int], np.ndarray], 316 include: bool = True, 317 inplace: bool = False 318 ) -> Optional['LocalAncestryObject']: 319 """ 320 Filter genomic windows based on specified indexes. 321 322 This method updates the `lai` attribute to include or exclude the specified genomic windows. 323 Attributes such as `window_sizes`, `centimorgan_pos`, `chromosomes`, and `physical_pos` will also be 324 updated accordingly if they are not None. The order of genomic windows is preserved. 325 326 Negative indexes are supported and follow 327 [NumPy's indexing conventions](https://numpy.org/doc/stable/user/basics.indexing.html). 328 329 Args: 330 indexes (int or array-like of int): 331 Index(es) of the windows to include or exclude. Can be a single integer or a 332 sequence of integers. Negative indexes are supported. 333 include (bool, default=True): 334 If True, includes only the specified windows. If False, excludes the specified 335 windows. Default is True. 336 inplace (bool, default=False): 337 If True, modifies `self` in place. If False, returns a new `LocalAncestryObject` with 338 the windows filtered. Default is False. 339 340 Returns: 341 **Optional[LocalAncestryObject]:** 342 A new `LocalAncestryObject` with the specified windows filtered if `inplace=False`. 343 If `inplace=True`, modifies `self` in place and returns None. 344 """ 345 # Convert indexes to a NumPy array 346 indexes = np.atleast_1d(indexes) 347 348 # Get total number of windows 349 n_windows = self.n_windows 350 351 # Validate indexes, allowing negative indexes 352 if np.any((indexes < -n_windows) | (indexes >= n_windows)): 353 raise IndexError("One or more indexes are out of bounds.") 354 355 # Create boolean mask 356 mask = np.zeros(n_windows, dtype=bool) 357 mask[indexes] = True 358 359 # Invert mask if `include=False` 360 if not include: 361 mask = ~mask 362 363 # Filter `lai` 364 filtered_lai = self['lai'][mask, :] 365 366 # Filter `window_sizes`, `chromosomes`, `centimorgan_pos`, and `physical_pos`, checking if they are None before filtering 367 filtered_window_sizes = self['window_sizes'][mask] if self['window_sizes'] is not None else None 368 filtered_chromosomes = self['chromosomes'][mask] if self['chromosomes'] is not None else None 369 filtered_centimorgan_pos = self['centimorgan_pos'][mask, :] if self['centimorgan_pos'] is not None else None 370 filtered_physical_pos = self['physical_pos'][mask, :] if self['physical_pos'] is not None else None 371 372 # Modify the original object if `inplace=True`, otherwise create and return a copy 373 if inplace: 374 self['lai'] = filtered_lai 375 if filtered_window_sizes is not None: 376 self['window_sizes'] = filtered_window_sizes 377 if filtered_chromosomes is not None: 378 self['chromosomes'] = filtered_chromosomes 379 if filtered_centimorgan_pos is not None: 380 self['centimorgan_pos'] = filtered_centimorgan_pos 381 if filtered_physical_pos is not None: 382 self['physical_pos'] = filtered_physical_pos 383 return None 384 else: 385 laiobj = self.copy() 386 laiobj['lai'] = filtered_lai 387 if filtered_window_sizes is not None: 388 laiobj['window_sizes'] = filtered_window_sizes 389 if filtered_chromosomes is not None: 390 laiobj['chromosomes'] = filtered_chromosomes 391 if filtered_centimorgan_pos is not None: 392 laiobj['centimorgan_pos'] = filtered_centimorgan_pos 393 if filtered_physical_pos is not None: 394 laiobj['physical_pos'] = filtered_physical_pos 395 return laiobj 396 397 def filter_samples( 398 self, 399 samples: Optional[Union[str, Sequence[str], np.ndarray, None]] = None, 400 indexes: Optional[Union[int, Sequence[int], np.ndarray, None]] = None, 401 include: bool = True, 402 inplace: bool = False 403 ) -> Optional['LocalAncestryObject']: 404 """ 405 Filter samples based on specified names or indexes. 406 407 This method updates the `lai`, `haplotypes`, and `samples` attributes to include or exclude the specified 408 samples. Each sample is associated with two haplotypes, which are included or excluded together. 409 The order of the samples is preserved. 410 411 If both samples and indexes are provided, any sample matching either a name in samples or an index in 412 indexes will be included or excluded. 413 414 Negative indexes are supported and follow 415 [NumPy's indexing conventions](https://numpy.org/doc/stable/user/basics.indexing.html). 416 417 Args: 418 samples (str or array_like of str, optional): 419 Name(s) of the samples to include or exclude. Can be a single sample name or a 420 sequence of sample names. Default is None. 421 indexes (int or array_like of int, optional): 422 Index(es) of the samples to include or exclude. Can be a single index or a sequence 423 of indexes. Negative indexes are supported. Default is None. 424 include (bool, default=True): 425 If True, includes only the specified samples. If False, excludes the specified 426 samples. Default is True. 427 inplace (bool, default=False): 428 If True, modifies `self` in place. If False, returns a new `LocalAncestryObject` with the 429 samples filtered. Default is False. 430 431 Returns: 432 **Optional[LocalAncestryObject]:** 433 A new `LocalAncestryObject` with the specified samples filtered if `inplace=False`. 434 If `inplace=True`, modifies `self` in place and returns None. 435 """ 436 if samples is None and indexes is None: 437 raise UserWarning("At least one of 'samples' or 'indexes' must be provided.") 438 439 n_haplotypes = self.n_haplotypes 440 n_samples = self.n_samples 441 442 # Create mask based on sample names 443 if samples is not None: 444 samples = np.atleast_1d(samples) 445 # Extract sample names from haplotype identifiers 446 haplotype_ids = np.array(self['haplotypes']) 447 sample_names = np.array([hap.split('.')[0] for hap in haplotype_ids]) 448 # Create mask for haplotypes belonging to specified samples 449 mask_samples = np.isin(sample_names, samples) 450 else: 451 mask_samples = np.zeros(n_haplotypes, dtype=bool) 452 453 # Create mask based on sample indexes 454 if indexes is not None: 455 indexes = np.atleast_1d(indexes) 456 457 # Validate indexes, allowing negative indexes 458 out_of_bounds_indexes = indexes[(indexes < -n_samples) | (indexes >= n_samples)] 459 if out_of_bounds_indexes.size > 0: 460 raise ValueError(f"One or more sample indexes are out of bounds.") 461 462 # Adjust negative indexes 463 indexes = np.mod(indexes, n_samples) 464 465 # Get haplotype indexes for the specified sample indexes 466 haplotype_indexes = np.concatenate([2*indexes, 2*indexes+1]) 467 # Create mask for haplotypes 468 mask_indexes = np.zeros(n_haplotypes, dtype=bool) 469 mask_indexes[haplotype_indexes] = True 470 else: 471 mask_indexes = np.zeros(n_haplotypes, dtype=bool) 472 473 # Combine masks using logical OR (union of samples) 474 mask_combined = mask_samples | mask_indexes 475 476 if not include: 477 mask_combined = ~mask_combined 478 479 # Filter `lai` 480 filtered_lai = self['lai'][:, mask_combined] 481 482 # Filter `haplotypes` 483 filtered_haplotypes = np.array(self['haplotypes'])[mask_combined].tolist() 484 485 # Filter `samples`, checking if they are None before filtering 486 sample_mask = mask_combined.reshape(-1, 2).any(axis=1) 487 filtered_samples = np.array(self['samples'])[sample_mask].tolist() if self['samples'] is not None else None 488 489 if inplace: 490 self['haplotypes'] = filtered_haplotypes 491 self['samples'] = filtered_samples 492 self['lai'] = filtered_lai 493 return None 494 else: 495 laiobj = self.copy() 496 laiobj['haplotypes'] = filtered_haplotypes 497 laiobj['samples'] = filtered_samples 498 laiobj['lai'] = filtered_lai 499 return laiobj 500 501 def convert_to_snp_level( 502 self, 503 snpobject: Optional['SNPObject'] = None, 504 variants_chrom: Optional[np.ndarray] = None, 505 variants_pos: Optional[np.ndarray] = None, 506 variants_ref: Optional[np.ndarray] = None, 507 variants_alt: Optional[np.ndarray] = None, 508 variants_filter_pass: Optional[np.ndarray] = None, 509 variants_id: Optional[np.ndarray] = None, 510 variants_qual: Optional[np.ndarray] = None 511 ) -> 'SNPObject': 512 """ 513 Convert `self` into a `snputils.snp.genobj.SNPObject` SNP-level Local Ancestry Information (LAI), 514 with optional support for SNP data. 515 516 If SNP positions (`variants_pos`) and chromosomes (`variants_chrom`) are not specified, the method generates 517 SNPs uniformly across the start and end positions of each genomic window. Otherwise, the provided SNP 518 coordinates are used to assign ancestry values based on their respective windows. 519 520 If an `SNPObject` is provided, its attributes are used unless explicitly overridden by the function arguments. 521 522 Args: 523 snpobject (SNPObject, optional): 524 An existing `SNPObject` to extract SNP attributes from. 525 variants_chrom (array of shape (n_snps,), optional): 526 An array containing the chromosome for each SNP. 527 variants_pos (array of shape (n_snps,), optional): 528 An array containing the chromosomal positions for each SNP. 529 variants_ref (array of shape (n_snps,), optional): 530 An array containing the reference allele for each SNP. 531 variants_alt (array of shape (n_snps,), optional): 532 An array containing the alternate allele for each SNP. 533 variants_filter_pass (array of shape (n_snps,), optional): 534 An array indicating whether each SNP passed control checks. 535 variants_id (array of shape (n_snps,), optional): 536 An array containing unique identifiers (IDs) for each SNP. 537 variants_qual (array of shape (n_snps,), optional): 538 An array containing the Phred-scaled quality score for each SNP. 539 540 Returns: 541 **SNPObject**: 542 A `SNPObject` containing SNP-level ancestry data, along with optional metadata. 543 """ 544 from snputils.snp.genobj.snpobj import SNPObject 545 546 # Extract attributes from SNPObject if provided 547 if snpobject is not None: 548 variants_chrom = variants_chrom if variants_chrom is not None else snpobject.variants_chrom 549 variants_pos = variants_pos if variants_pos is not None else snpobject.variants_pos 550 variants_ref = variants_ref if variants_ref is not None else snpobject.variants_ref 551 variants_alt = variants_alt if variants_alt is not None else snpobject.variants_alt 552 variants_filter_pass = variants_filter_pass if variants_filter_pass is not None else snpobject.variants_filter_pass 553 variants_id = variants_id if variants_id is not None else snpobject.variants_id 554 variants_qual = variants_qual if variants_qual is not None else snpobject.variants_qual 555 556 n_samples = self.n_samples 557 558 # Reshape lai to (n_windows, n_samples, 2) 559 lai_reshaped = self.lai.reshape(self.n_windows, n_samples, 2) 560 561 if variants_pos is None or variants_chrom is None: 562 # Generate SNP positions and chromosomes from windows 563 variants_pos_list = [] 564 variants_chrom_list = [] 565 ancestry_list = [] 566 567 for i in range(self.n_windows): 568 start = int(self.physical_pos[i, 0]) 569 end = int(self.physical_pos[i, 1]) 570 chrom = self.chromosomes[i] 571 ancestry = lai_reshaped[i, :, :] # Shape: (n_samples, 2) 572 573 # Create SNP positions between start and end with the given step size 574 positions_in_window = np.arange(start, end + 1) 575 n_positions = len(positions_in_window) 576 577 if n_positions == 0: 578 continue # Skip windows with no positions 579 580 variants_pos_list.append(positions_in_window) 581 variants_chrom_list.append(np.full(n_positions, chrom)) 582 583 # Repeat ancestry for each SNP position in the window 584 ancestry_repeated = np.repeat(ancestry[np.newaxis, :, :], n_positions, axis=0) 585 ancestry_list.append(ancestry_repeated) 586 587 # Concatenate all SNP positions, chromosomes, and ancestries 588 variants_pos = np.concatenate(variants_pos_list) 589 variants_chrom = np.concatenate(variants_chrom_list) 590 calldata_lai = np.concatenate(ancestry_list) 591 592 else: 593 # Use provided SNP positions and chromosomes 594 n_snps = len(variants_pos) 595 if len(variants_chrom) != n_snps: 596 raise ValueError("`variants_pos` and `variants_chrom` must have the same length.") 597 598 # Map SNPs to windows 599 window_starts = self.physical_pos[:, 0] 600 window_ends = self.physical_pos[:, 1] 601 window_chromosomes = self.chromosomes 602 603 # Initialize array to store window indices for each SNP 604 snp_to_window_indices = np.full(n_snps, -1, dtype=int) 605 unique_chroms = np.unique(variants_chrom) 606 607 for chrom in unique_chroms: 608 # Indices of SNPs on this chromosome 609 snp_indices = np.where(variants_chrom == chrom)[0] 610 snp_pos_chr = variants_pos[snp_indices] 611 612 # Indices of windows on this chromosome 613 window_indices = np.where(window_chromosomes == chrom)[0] 614 window_starts_chr = window_starts[window_indices] 615 window_ends_chr = window_ends[window_indices] 616 617 # Check if windows are defined 618 if len(window_starts_chr) == 0: 619 continue 620 621 # For SNP positions, find where they would be inserted in window_starts to maintain order 622 inds = np.searchsorted(window_starts_chr, snp_pos_chr, side='right') - 1 623 624 # Ensure indices are within valid range 625 valid_inds = (inds >= 0) & (inds < len(window_starts_chr)) 626 snp_inds_valid = snp_indices[valid_inds] 627 inds_valid = inds[valid_inds] 628 snp_pos_valid = snp_pos_chr[valid_inds] 629 630 # Check if SNP positions are within window ranges 631 within_window = snp_pos_valid <= window_ends_chr[inds_valid] 632 final_snp_indices = snp_inds_valid[within_window] 633 final_window_indices = window_indices[inds_valid[within_window]] 634 log.debug(f"Number of SNPs within window ranges for chromosome {chrom}: {len(final_snp_indices)}") 635 636 # Assign window indices to SNPs 637 snp_to_window_indices[final_snp_indices] = final_window_indices 638 639 # Initialize SNP-level ancestry array 640 calldata_lai = np.full((n_snps, n_samples, 2), np.nan) 641 642 # Create a boolean mask for valid SNP indices (where window_idx != -1) 643 valid_snp_mask = (snp_to_window_indices != -1) 644 645 # Get the array of valid SNP indices 646 valid_snp_indices = np.where(valid_snp_mask)[0] 647 648 # Get the corresponding window indices for valid SNPs 649 valid_window_indices = snp_to_window_indices[valid_snp_indices] 650 651 # Assign lai_values to calldata_lai for all valid SNPs at once 652 calldata_lai[valid_snp_indices] = lai_reshaped[valid_window_indices] 653 654 return SNPObject( 655 calldata_lai=calldata_lai, 656 samples=self.samples, 657 variants_ref=variants_ref, 658 variants_alt=variants_alt, 659 variants_filter_pass=variants_filter_pass, 660 variants_chrom=variants_chrom, 661 variants_id=variants_id, 662 variants_pos=variants_pos, 663 variants_qual=variants_qual, 664 ancestry_map=self.ancestry_map 665 ) 666 667 def _sanity_check(self) -> None: 668 """ 669 Perform sanity checks on the parsed data to ensure data integrity. 670 671 This method checks that all unique ancestries in LAI are represented 672 in the ancestry map. 673 674 Args: 675 lai (np.ndarray): The LAI data array. 676 ancestry_map (dict, optional): A dictionary mapping ancestry codes to region names, if available. 677 """ 678 # Get unique ancestries from LAI data 679 unique_ancestries = np.unique(self.lai) 680 681 if self.ancestry_map is not None: 682 # Check if all unique ancestries in the LAI are present in the ancestry map 683 for ancestry in unique_ancestries: 684 if str(ancestry) not in self.ancestry_map: 685 warnings.warn( 686 f"Ancestry '{ancestry}' found in LAI data is not represented in the ancestry map." 687 ) 688 689 def save(self, file: Union[str, Path]) -> None: 690 """ 691 Save the data stored in `self` to a specified file. 692 If the file already exists, it will be overwritten. 693 694 The format of the saved file is determined by the file extension provided in the `file` 695 argument. 696 697 **Supported formats:** 698 699 - `.msp`: Text-based MSP format. 700 - `.msp.tsv`: Text-based MSP format with TSV extension. 701 - `.pkl`: Pickle format for saving `self` in serialized form. 702 703 Args: 704 file (str or pathlib.Path): 705 Path to the file where the data will be saved. The extension of the file determines the save format. 706 Supported extensions: `.msp`, `.msp.tsv`, `.pkl`. 707 """ 708 path = Path(file) 709 suffixes = [suffix.lower() for suffix in path.suffixes] 710 711 if suffixes[-2:] == ['.msp', '.tsv'] or suffixes[-1] == '.msp': 712 self.save_msp(file) 713 elif suffixes[-1] == '.pkl': 714 self.save_pickle(file) 715 else: 716 raise ValueError( 717 f"Unsupported file extension: {suffixes[-1]}" 718 "Supported extensions are: .msp, .msp.tsv, .pkl." 719 ) 720 721 def save_msp(self, file: Union[str, Path]) -> None: 722 """ 723 Save the data stored in `self` to a `.msp` file. 724 If the file already exists, it will be overwritten. 725 726 Args: 727 file (str or pathlib.Path): 728 Path to the file where the data will be saved. It should end with `.msp` or `.msp.tsv`. 729 If the provided path does not have one of these extensions, the `.msp` extension will be appended. 730 """ 731 from snputils.ancestry.io.local.write import MSPWriter 732 733 MSPWriter(self, file).write() 734 735 def save_pickle(self, file: Union[str, Path]) -> None: 736 """ 737 Save `self` in serialized form to a `.pkl` file. 738 If the file already exists, it will be overwritten. 739 740 Args: 741 file (str or pathlib.Path): 742 Path to the file where the data will be saved. It should end with `.pkl`. 743 If the provided path does not have this extension, it will be appended. 744 """ 745 import pickle 746 with open(file, 'wb') as file: 747 pickle.dump(self, file)
A class for window-level Local Ancestry Inference (LAI) data.
21 def __init__( 22 self, 23 haplotypes: List[str], 24 lai: np.ndarray, 25 samples: Optional[List[str]] = None, 26 ancestry_map: Optional[Dict[str, str]] = None, 27 window_sizes: Optional[np.ndarray] = None, 28 centimorgan_pos: Optional[np.ndarray] = None, 29 chromosomes: Optional[np.ndarray] = None, 30 physical_pos: Optional[np.ndarray] = None 31 ) -> None: 32 """ 33 Args: 34 haplotypes (list of str of length n_haplotypes): 35 A list of unique haplotype identifiers. 36 lai (array of shape (n_windows, n_haplotypes)): 37 A 2D array containing local ancestry inference values, where each row represents a 38 genomic window, and each column corresponds to a haplotype phase for each sample. 39 samples (list of str of length n_samples, optional): 40 A list of unique sample identifiers. 41 ancestry_map (dict of str to str, optional): 42 A dictionary mapping ancestry codes to region names. 43 window_sizes (array of shape (n_windows,), optional): 44 An array specifying the number of SNPs in each genomic window. 45 centimorgan_pos (array of shape (n_windows, 2), optional): 46 A 2D array containing the start and end centimorgan positions for each window. 47 chromosomes (array of shape (n_windows,), optional): 48 An array with chromosome numbers corresponding to each genomic window. 49 physical_pos (array of shape (n_windows, 2), optional): 50 A 2D array containing the start and end physical positions for each window. 51 """ 52 if lai.ndim != 2: 53 raise ValueError("`lai` must be a 2D array with shape (n_windows, n_haplotypes).") 54 55 # Determine the number of unique ancestries and samples from the LAI array 56 n_ancestries = len(np.unique(lai)) 57 n_haplotypes = lai.shape[1] 58 n_samples = n_haplotypes // 2 59 60 super(LocalAncestryObject, self).__init__(n_samples, n_ancestries) 61 62 self.__haplotypes = haplotypes 63 self.__lai = lai 64 self.__window_sizes = window_sizes 65 self.__centimorgan_pos = centimorgan_pos 66 self.__samples = samples 67 self.__chromosomes = chromosomes 68 self.__physical_pos = physical_pos 69 self.__ancestry_map = ancestry_map 70 71 # Perform sanity check to ensure all unique ancestries in LAI data are represented in the ancestry map 72 self._sanity_check()
Arguments:
- haplotypes (list of str of length n_haplotypes): A list of unique haplotype identifiers.
- lai (array of shape (n_windows, n_haplotypes)): A 2D array containing local ancestry inference values, where each row represents a genomic window, and each column corresponds to a haplotype phase for each sample.
- samples (list of str of length n_samples, optional): A list of unique sample identifiers.
- ancestry_map (dict of str to str, optional): A dictionary mapping ancestry codes to region names.
- window_sizes (array of shape (n_windows,), optional): An array specifying the number of SNPs in each genomic window.
- centimorgan_pos (array of shape (n_windows, 2), optional): A 2D array containing the start and end centimorgan positions for each window.
- chromosomes (array of shape (n_windows,), optional): An array with chromosome numbers corresponding to each genomic window.
- physical_pos (array of shape (n_windows, 2), optional): A 2D array containing the start and end physical positions for each window.
111 @property 112 def lai(self) -> np.ndarray: 113 """ 114 Retrieve `lai`. 115 116 Returns: 117 **array of shape (n_windows, n_haplotypes):** 118 A 2D array containing local ancestry inference values, where each row represents a 119 genomic window, and each column corresponds to a haplotype phase for each sample. 120 """ 121 return self.__lai
Retrieve lai
.
Returns:
array of shape (n_windows, n_haplotypes): A 2D array containing local ancestry inference values, where each row represents a genomic window, and each column corresponds to a haplotype phase for each sample.
130 @property 131 def samples(self) -> Optional[List[str]]: 132 """ 133 Retrieve `samples`. 134 135 Returns: 136 **list of str:** A list of unique sample identifiers. 137 """ 138 if self.__samples is not None: 139 return self.__samples 140 elif self.__haplotypes is not None: 141 return [hap.split('.')[0] for hap in self.__haplotypes][::2] 142 else: 143 return None
152 @property 153 def ancestry_map(self) -> Optional[Dict[str, str]]: 154 """ 155 Retrieve `ancestry_map`. 156 157 Returns: 158 **dict of str to str:** A dictionary mapping ancestry codes to region names. 159 """ 160 return self.__ancestry_map
Retrieve ancestry_map
.
Returns:
dict of str to str: A dictionary mapping ancestry codes to region names.
169 @property 170 def window_sizes(self) -> Optional[np.ndarray]: 171 """ 172 Retrieve `window_sizes`. 173 174 Returns: 175 **array of shape (n_windows,):** 176 An array specifying the number of SNPs in each genomic window. 177 """ 178 return self.__window_sizes
Retrieve window_sizes
.
Returns:
array of shape (n_windows,): An array specifying the number of SNPs in each genomic window.
187 @property 188 def centimorgan_pos(self) -> Optional[np.ndarray]: 189 """ 190 Retrieve `centimorgan_pos`. 191 192 Returns: 193 **array of shape (n_windows, 2):** 194 A 2D array containing the start and end centimorgan positions for each window. 195 """ 196 return self.__centimorgan_pos
Retrieve centimorgan_pos
.
Returns:
array of shape (n_windows, 2): A 2D array containing the start and end centimorgan positions for each window.
205 @property 206 def chromosomes(self) -> Optional[np.ndarray]: 207 """ 208 Retrieve `chromosomes`. 209 210 Returns: 211 **array of shape (n_windows,):** 212 An array with chromosome numbers corresponding to each genomic window. 213 """ 214 return self.__chromosomes
Retrieve chromosomes
.
Returns:
array of shape (n_windows,): An array with chromosome numbers corresponding to each genomic window.
223 @property 224 def physical_pos(self) -> Optional[np.ndarray]: 225 """ 226 Retrieve `physical_pos`. 227 228 Returns: 229 **array of shape (n_windows, 2):** 230 A 2D array containing the start and end physical positions for each window. 231 """ 232 return self.__physical_pos
Retrieve physical_pos
.
Returns:
array of shape (n_windows, 2): A 2D array containing the start and end physical positions for each window.
241 @property 242 def n_samples(self) -> int: 243 """ 244 Retrieve `n_samples`. 245 246 Returns: 247 **int:** 248 The total number of samples. 249 """ 250 if self.__samples is not None: 251 return len(self.__samples) 252 elif self.__haplotypes is not None: 253 # Divide by 2 because each sample has two associated haplotypes 254 return len(self.__haplotypes) // 2 255 else: 256 #Â Divide by 2 because columns represent haplotypes 257 return self.__lai.shape[1] // 2
292 def copy(self) -> 'LocalAncestryObject': 293 """ 294 Create and return a copy of `self`. 295 296 Returns: 297 **LocalAncestryObject:** 298 A new instance of the current object. 299 """ 300 return copy.copy(self)
Create and return a copy of self
.
Returns:
LocalAncestryObject: A new instance of the current object.
302 def keys(self) -> List[str]: 303 """ 304 Retrieve a list of public attribute names for `self`. 305 306 Returns: 307 **list of str:** 308 A list of attribute names, with internal name-mangling removed, 309 for easier reference to public attributes in the instance. 310 """ 311 return [attr.replace('_LocalAncestryObject__', '').replace('_AncestryObject__', '') for attr in vars(self)]
Retrieve a list of public attribute names for self
.
Returns:
list of str: A list of attribute names, with internal name-mangling removed, for easier reference to public attributes in the instance.
313 def filter_windows( 314 self, 315 indexes: Union[int, Sequence[int], np.ndarray], 316 include: bool = True, 317 inplace: bool = False 318 ) -> Optional['LocalAncestryObject']: 319 """ 320 Filter genomic windows based on specified indexes. 321 322 This method updates the `lai` attribute to include or exclude the specified genomic windows. 323 Attributes such as `window_sizes`, `centimorgan_pos`, `chromosomes`, and `physical_pos` will also be 324 updated accordingly if they are not None. The order of genomic windows is preserved. 325 326 Negative indexes are supported and follow 327 [NumPy's indexing conventions](https://numpy.org/doc/stable/user/basics.indexing.html). 328 329 Args: 330 indexes (int or array-like of int): 331 Index(es) of the windows to include or exclude. Can be a single integer or a 332 sequence of integers. Negative indexes are supported. 333 include (bool, default=True): 334 If True, includes only the specified windows. If False, excludes the specified 335 windows. Default is True. 336 inplace (bool, default=False): 337 If True, modifies `self` in place. If False, returns a new `LocalAncestryObject` with 338 the windows filtered. Default is False. 339 340 Returns: 341 **Optional[LocalAncestryObject]:** 342 A new `LocalAncestryObject` with the specified windows filtered if `inplace=False`. 343 If `inplace=True`, modifies `self` in place and returns None. 344 """ 345 # Convert indexes to a NumPy array 346 indexes = np.atleast_1d(indexes) 347 348 # Get total number of windows 349 n_windows = self.n_windows 350 351 # Validate indexes, allowing negative indexes 352 if np.any((indexes < -n_windows) | (indexes >= n_windows)): 353 raise IndexError("One or more indexes are out of bounds.") 354 355 # Create boolean mask 356 mask = np.zeros(n_windows, dtype=bool) 357 mask[indexes] = True 358 359 # Invert mask if `include=False` 360 if not include: 361 mask = ~mask 362 363 # Filter `lai` 364 filtered_lai = self['lai'][mask, :] 365 366 # Filter `window_sizes`, `chromosomes`, `centimorgan_pos`, and `physical_pos`, checking if they are None before filtering 367 filtered_window_sizes = self['window_sizes'][mask] if self['window_sizes'] is not None else None 368 filtered_chromosomes = self['chromosomes'][mask] if self['chromosomes'] is not None else None 369 filtered_centimorgan_pos = self['centimorgan_pos'][mask, :] if self['centimorgan_pos'] is not None else None 370 filtered_physical_pos = self['physical_pos'][mask, :] if self['physical_pos'] is not None else None 371 372 # Modify the original object if `inplace=True`, otherwise create and return a copy 373 if inplace: 374 self['lai'] = filtered_lai 375 if filtered_window_sizes is not None: 376 self['window_sizes'] = filtered_window_sizes 377 if filtered_chromosomes is not None: 378 self['chromosomes'] = filtered_chromosomes 379 if filtered_centimorgan_pos is not None: 380 self['centimorgan_pos'] = filtered_centimorgan_pos 381 if filtered_physical_pos is not None: 382 self['physical_pos'] = filtered_physical_pos 383 return None 384 else: 385 laiobj = self.copy() 386 laiobj['lai'] = filtered_lai 387 if filtered_window_sizes is not None: 388 laiobj['window_sizes'] = filtered_window_sizes 389 if filtered_chromosomes is not None: 390 laiobj['chromosomes'] = filtered_chromosomes 391 if filtered_centimorgan_pos is not None: 392 laiobj['centimorgan_pos'] = filtered_centimorgan_pos 393 if filtered_physical_pos is not None: 394 laiobj['physical_pos'] = filtered_physical_pos 395 return laiobj
Filter genomic windows based on specified indexes.
This method updates the lai
attribute to include or exclude the specified genomic windows.
Attributes such as window_sizes
, centimorgan_pos
, chromosomes
, and physical_pos
will also be
updated accordingly if they are not None. The order of genomic windows is preserved.
Negative indexes are supported and follow NumPy's indexing conventions.
Arguments:
- indexes (int or array-like of int): Index(es) of the windows to include or exclude. Can be a single integer or a sequence of integers. Negative indexes are supported.
- include (bool, default=True): If True, includes only the specified windows. If False, excludes the specified windows. Default is True.
- inplace (bool, default=False): If True, modifies
self
in place. If False, returns a newLocalAncestryObject
with the windows filtered. Default is False.
Returns:
Optional[LocalAncestryObject]: A new
LocalAncestryObject
with the specified windows filtered ifinplace=False
. Ifinplace=True
, modifiesself
in place and returns None.
397 def filter_samples( 398 self, 399 samples: Optional[Union[str, Sequence[str], np.ndarray, None]] = None, 400 indexes: Optional[Union[int, Sequence[int], np.ndarray, None]] = None, 401 include: bool = True, 402 inplace: bool = False 403 ) -> Optional['LocalAncestryObject']: 404 """ 405 Filter samples based on specified names or indexes. 406 407 This method updates the `lai`, `haplotypes`, and `samples` attributes to include or exclude the specified 408 samples. Each sample is associated with two haplotypes, which are included or excluded together. 409 The order of the samples is preserved. 410 411 If both samples and indexes are provided, any sample matching either a name in samples or an index in 412 indexes will be included or excluded. 413 414 Negative indexes are supported and follow 415 [NumPy's indexing conventions](https://numpy.org/doc/stable/user/basics.indexing.html). 416 417 Args: 418 samples (str or array_like of str, optional): 419 Name(s) of the samples to include or exclude. Can be a single sample name or a 420 sequence of sample names. Default is None. 421 indexes (int or array_like of int, optional): 422 Index(es) of the samples to include or exclude. Can be a single index or a sequence 423 of indexes. Negative indexes are supported. Default is None. 424 include (bool, default=True): 425 If True, includes only the specified samples. If False, excludes the specified 426 samples. Default is True. 427 inplace (bool, default=False): 428 If True, modifies `self` in place. If False, returns a new `LocalAncestryObject` with the 429 samples filtered. Default is False. 430 431 Returns: 432 **Optional[LocalAncestryObject]:** 433 A new `LocalAncestryObject` with the specified samples filtered if `inplace=False`. 434 If `inplace=True`, modifies `self` in place and returns None. 435 """ 436 if samples is None and indexes is None: 437 raise UserWarning("At least one of 'samples' or 'indexes' must be provided.") 438 439 n_haplotypes = self.n_haplotypes 440 n_samples = self.n_samples 441 442 # Create mask based on sample names 443 if samples is not None: 444 samples = np.atleast_1d(samples) 445 # Extract sample names from haplotype identifiers 446 haplotype_ids = np.array(self['haplotypes']) 447 sample_names = np.array([hap.split('.')[0] for hap in haplotype_ids]) 448 # Create mask for haplotypes belonging to specified samples 449 mask_samples = np.isin(sample_names, samples) 450 else: 451 mask_samples = np.zeros(n_haplotypes, dtype=bool) 452 453 # Create mask based on sample indexes 454 if indexes is not None: 455 indexes = np.atleast_1d(indexes) 456 457 # Validate indexes, allowing negative indexes 458 out_of_bounds_indexes = indexes[(indexes < -n_samples) | (indexes >= n_samples)] 459 if out_of_bounds_indexes.size > 0: 460 raise ValueError(f"One or more sample indexes are out of bounds.") 461 462 # Adjust negative indexes 463 indexes = np.mod(indexes, n_samples) 464 465 # Get haplotype indexes for the specified sample indexes 466 haplotype_indexes = np.concatenate([2*indexes, 2*indexes+1]) 467 # Create mask for haplotypes 468 mask_indexes = np.zeros(n_haplotypes, dtype=bool) 469 mask_indexes[haplotype_indexes] = True 470 else: 471 mask_indexes = np.zeros(n_haplotypes, dtype=bool) 472 473 # Combine masks using logical OR (union of samples) 474 mask_combined = mask_samples | mask_indexes 475 476 if not include: 477 mask_combined = ~mask_combined 478 479 # Filter `lai` 480 filtered_lai = self['lai'][:, mask_combined] 481 482 # Filter `haplotypes` 483 filtered_haplotypes = np.array(self['haplotypes'])[mask_combined].tolist() 484 485 # Filter `samples`, checking if they are None before filtering 486 sample_mask = mask_combined.reshape(-1, 2).any(axis=1) 487 filtered_samples = np.array(self['samples'])[sample_mask].tolist() if self['samples'] is not None else None 488 489 if inplace: 490 self['haplotypes'] = filtered_haplotypes 491 self['samples'] = filtered_samples 492 self['lai'] = filtered_lai 493 return None 494 else: 495 laiobj = self.copy() 496 laiobj['haplotypes'] = filtered_haplotypes 497 laiobj['samples'] = filtered_samples 498 laiobj['lai'] = filtered_lai 499 return laiobj
Filter samples based on specified names or indexes.
This method updates the lai
, haplotypes
, and samples
attributes to include or exclude the specified
samples. Each sample is associated with two haplotypes, which are included or excluded together.
The order of the samples is preserved.
If both samples and indexes are provided, any sample matching either a name in samples or an index in indexes will be included or excluded.
Negative indexes are supported and follow NumPy's indexing conventions.
Arguments:
- samples (str or array_like of str, optional): Name(s) of the samples to include or exclude. Can be a single sample name or a sequence of sample names. Default is None.
- indexes (int or array_like of int, optional): Index(es) of the samples to include or exclude. Can be a single index or a sequence of indexes. Negative indexes are supported. Default is None.
- include (bool, default=True): If True, includes only the specified samples. If False, excludes the specified samples. Default is True.
- inplace (bool, default=False): If True, modifies
self
in place. If False, returns a newLocalAncestryObject
with the samples filtered. Default is False.
Returns:
Optional[LocalAncestryObject]: A new
LocalAncestryObject
with the specified samples filtered ifinplace=False
. Ifinplace=True
, modifiesself
in place and returns None.
501 def convert_to_snp_level( 502 self, 503 snpobject: Optional['SNPObject'] = None, 504 variants_chrom: Optional[np.ndarray] = None, 505 variants_pos: Optional[np.ndarray] = None, 506 variants_ref: Optional[np.ndarray] = None, 507 variants_alt: Optional[np.ndarray] = None, 508 variants_filter_pass: Optional[np.ndarray] = None, 509 variants_id: Optional[np.ndarray] = None, 510 variants_qual: Optional[np.ndarray] = None 511 ) -> 'SNPObject': 512 """ 513 Convert `self` into a `snputils.snp.genobj.SNPObject` SNP-level Local Ancestry Information (LAI), 514 with optional support for SNP data. 515 516 If SNP positions (`variants_pos`) and chromosomes (`variants_chrom`) are not specified, the method generates 517 SNPs uniformly across the start and end positions of each genomic window. Otherwise, the provided SNP 518 coordinates are used to assign ancestry values based on their respective windows. 519 520 If an `SNPObject` is provided, its attributes are used unless explicitly overridden by the function arguments. 521 522 Args: 523 snpobject (SNPObject, optional): 524 An existing `SNPObject` to extract SNP attributes from. 525 variants_chrom (array of shape (n_snps,), optional): 526 An array containing the chromosome for each SNP. 527 variants_pos (array of shape (n_snps,), optional): 528 An array containing the chromosomal positions for each SNP. 529 variants_ref (array of shape (n_snps,), optional): 530 An array containing the reference allele for each SNP. 531 variants_alt (array of shape (n_snps,), optional): 532 An array containing the alternate allele for each SNP. 533 variants_filter_pass (array of shape (n_snps,), optional): 534 An array indicating whether each SNP passed control checks. 535 variants_id (array of shape (n_snps,), optional): 536 An array containing unique identifiers (IDs) for each SNP. 537 variants_qual (array of shape (n_snps,), optional): 538 An array containing the Phred-scaled quality score for each SNP. 539 540 Returns: 541 **SNPObject**: 542 A `SNPObject` containing SNP-level ancestry data, along with optional metadata. 543 """ 544 from snputils.snp.genobj.snpobj import SNPObject 545 546 # Extract attributes from SNPObject if provided 547 if snpobject is not None: 548 variants_chrom = variants_chrom if variants_chrom is not None else snpobject.variants_chrom 549 variants_pos = variants_pos if variants_pos is not None else snpobject.variants_pos 550 variants_ref = variants_ref if variants_ref is not None else snpobject.variants_ref 551 variants_alt = variants_alt if variants_alt is not None else snpobject.variants_alt 552 variants_filter_pass = variants_filter_pass if variants_filter_pass is not None else snpobject.variants_filter_pass 553 variants_id = variants_id if variants_id is not None else snpobject.variants_id 554 variants_qual = variants_qual if variants_qual is not None else snpobject.variants_qual 555 556 n_samples = self.n_samples 557 558 # Reshape lai to (n_windows, n_samples, 2) 559 lai_reshaped = self.lai.reshape(self.n_windows, n_samples, 2) 560 561 if variants_pos is None or variants_chrom is None: 562 # Generate SNP positions and chromosomes from windows 563 variants_pos_list = [] 564 variants_chrom_list = [] 565 ancestry_list = [] 566 567 for i in range(self.n_windows): 568 start = int(self.physical_pos[i, 0]) 569 end = int(self.physical_pos[i, 1]) 570 chrom = self.chromosomes[i] 571 ancestry = lai_reshaped[i, :, :] # Shape: (n_samples, 2) 572 573 # Create SNP positions between start and end with the given step size 574 positions_in_window = np.arange(start, end + 1) 575 n_positions = len(positions_in_window) 576 577 if n_positions == 0: 578 continue # Skip windows with no positions 579 580 variants_pos_list.append(positions_in_window) 581 variants_chrom_list.append(np.full(n_positions, chrom)) 582 583 # Repeat ancestry for each SNP position in the window 584 ancestry_repeated = np.repeat(ancestry[np.newaxis, :, :], n_positions, axis=0) 585 ancestry_list.append(ancestry_repeated) 586 587 # Concatenate all SNP positions, chromosomes, and ancestries 588 variants_pos = np.concatenate(variants_pos_list) 589 variants_chrom = np.concatenate(variants_chrom_list) 590 calldata_lai = np.concatenate(ancestry_list) 591 592 else: 593 # Use provided SNP positions and chromosomes 594 n_snps = len(variants_pos) 595 if len(variants_chrom) != n_snps: 596 raise ValueError("`variants_pos` and `variants_chrom` must have the same length.") 597 598 # Map SNPs to windows 599 window_starts = self.physical_pos[:, 0] 600 window_ends = self.physical_pos[:, 1] 601 window_chromosomes = self.chromosomes 602 603 # Initialize array to store window indices for each SNP 604 snp_to_window_indices = np.full(n_snps, -1, dtype=int) 605 unique_chroms = np.unique(variants_chrom) 606 607 for chrom in unique_chroms: 608 # Indices of SNPs on this chromosome 609 snp_indices = np.where(variants_chrom == chrom)[0] 610 snp_pos_chr = variants_pos[snp_indices] 611 612 # Indices of windows on this chromosome 613 window_indices = np.where(window_chromosomes == chrom)[0] 614 window_starts_chr = window_starts[window_indices] 615 window_ends_chr = window_ends[window_indices] 616 617 # Check if windows are defined 618 if len(window_starts_chr) == 0: 619 continue 620 621 # For SNP positions, find where they would be inserted in window_starts to maintain order 622 inds = np.searchsorted(window_starts_chr, snp_pos_chr, side='right') - 1 623 624 # Ensure indices are within valid range 625 valid_inds = (inds >= 0) & (inds < len(window_starts_chr)) 626 snp_inds_valid = snp_indices[valid_inds] 627 inds_valid = inds[valid_inds] 628 snp_pos_valid = snp_pos_chr[valid_inds] 629 630 # Check if SNP positions are within window ranges 631 within_window = snp_pos_valid <= window_ends_chr[inds_valid] 632 final_snp_indices = snp_inds_valid[within_window] 633 final_window_indices = window_indices[inds_valid[within_window]] 634 log.debug(f"Number of SNPs within window ranges for chromosome {chrom}: {len(final_snp_indices)}") 635 636 # Assign window indices to SNPs 637 snp_to_window_indices[final_snp_indices] = final_window_indices 638 639 # Initialize SNP-level ancestry array 640 calldata_lai = np.full((n_snps, n_samples, 2), np.nan) 641 642 # Create a boolean mask for valid SNP indices (where window_idx != -1) 643 valid_snp_mask = (snp_to_window_indices != -1) 644 645 # Get the array of valid SNP indices 646 valid_snp_indices = np.where(valid_snp_mask)[0] 647 648 # Get the corresponding window indices for valid SNPs 649 valid_window_indices = snp_to_window_indices[valid_snp_indices] 650 651 # Assign lai_values to calldata_lai for all valid SNPs at once 652 calldata_lai[valid_snp_indices] = lai_reshaped[valid_window_indices] 653 654 return SNPObject( 655 calldata_lai=calldata_lai, 656 samples=self.samples, 657 variants_ref=variants_ref, 658 variants_alt=variants_alt, 659 variants_filter_pass=variants_filter_pass, 660 variants_chrom=variants_chrom, 661 variants_id=variants_id, 662 variants_pos=variants_pos, 663 variants_qual=variants_qual, 664 ancestry_map=self.ancestry_map 665 )
Convert self
into a snputils.snp.genobj.SNPObject
SNP-level Local Ancestry Information (LAI),
with optional support for SNP data.
If SNP positions (variants_pos
) and chromosomes (variants_chrom
) are not specified, the method generates
SNPs uniformly across the start and end positions of each genomic window. Otherwise, the provided SNP
coordinates are used to assign ancestry values based on their respective windows.
If an SNPObject
is provided, its attributes are used unless explicitly overridden by the function arguments.
Arguments:
- snpobject (SNPObject, optional): An existing
SNPObject
to extract SNP attributes from. - variants_chrom (array of shape (n_snps,), optional): An array containing the chromosome for each SNP.
- variants_pos (array of shape (n_snps,), optional): An array containing the chromosomal positions for each SNP.
- variants_ref (array of shape (n_snps,), optional): An array containing the reference allele for each SNP.
- variants_alt (array of shape (n_snps,), optional): An array containing the alternate allele for each SNP.
- variants_filter_pass (array of shape (n_snps,), optional): An array indicating whether each SNP passed control checks.
- variants_id (array of shape (n_snps,), optional): An array containing unique identifiers (IDs) for each SNP.
- variants_qual (array of shape (n_snps,), optional): An array containing the Phred-scaled quality score for each SNP.
Returns:
SNPObject: A
SNPObject
containing SNP-level ancestry data, along with optional metadata.
689 def save(self, file: Union[str, Path]) -> None: 690 """ 691 Save the data stored in `self` to a specified file. 692 If the file already exists, it will be overwritten. 693 694 The format of the saved file is determined by the file extension provided in the `file` 695 argument. 696 697 **Supported formats:** 698 699 - `.msp`: Text-based MSP format. 700 - `.msp.tsv`: Text-based MSP format with TSV extension. 701 - `.pkl`: Pickle format for saving `self` in serialized form. 702 703 Args: 704 file (str or pathlib.Path): 705 Path to the file where the data will be saved. The extension of the file determines the save format. 706 Supported extensions: `.msp`, `.msp.tsv`, `.pkl`. 707 """ 708 path = Path(file) 709 suffixes = [suffix.lower() for suffix in path.suffixes] 710 711 if suffixes[-2:] == ['.msp', '.tsv'] or suffixes[-1] == '.msp': 712 self.save_msp(file) 713 elif suffixes[-1] == '.pkl': 714 self.save_pickle(file) 715 else: 716 raise ValueError( 717 f"Unsupported file extension: {suffixes[-1]}" 718 "Supported extensions are: .msp, .msp.tsv, .pkl." 719 )
Save the data stored in self
to a specified file.
If the file already exists, it will be overwritten.
The format of the saved file is determined by the file extension provided in the file
argument.
Supported formats:
.msp
: Text-based MSP format..msp.tsv
: Text-based MSP format with TSV extension..pkl
: Pickle format for savingself
in serialized form.
Arguments:
- file (str or pathlib.Path): Path to the file where the data will be saved. The extension of the file determines the save format.
Supported extensions:
.msp
,.msp.tsv
,.pkl
.
721 def save_msp(self, file: Union[str, Path]) -> None: 722 """ 723 Save the data stored in `self` to a `.msp` file. 724 If the file already exists, it will be overwritten. 725 726 Args: 727 file (str or pathlib.Path): 728 Path to the file where the data will be saved. It should end with `.msp` or `.msp.tsv`. 729 If the provided path does not have one of these extensions, the `.msp` extension will be appended. 730 """ 731 from snputils.ancestry.io.local.write import MSPWriter 732 733 MSPWriter(self, file).write()
Save the data stored in self
to a .msp
file.
If the file already exists, it will be overwritten.
Arguments:
- file (str or pathlib.Path): Path to the file where the data will be saved. It should end with
.msp
or.msp.tsv
. If the provided path does not have one of these extensions, the.msp
extension will be appended.
735 def save_pickle(self, file: Union[str, Path]) -> None: 736 """ 737 Save `self` in serialized form to a `.pkl` file. 738 If the file already exists, it will be overwritten. 739 740 Args: 741 file (str or pathlib.Path): 742 Path to the file where the data will be saved. It should end with `.pkl`. 743 If the provided path does not have this extension, it will be appended. 744 """ 745 import pickle 746 with open(file, 'wb') as file: 747 pickle.dump(self, file)
Save self
in serialized form to a .pkl
file.
If the file already exists, it will be overwritten.
Arguments:
- file (str or pathlib.Path): Path to the file where the data will be saved. It should end with
.pkl
. If the provided path does not have this extension, it will be appended.