# Source code for snputils.ancestry.io.wide.read.admixture
import logging
import numpy as np
from pathlib import Path
from typing import Union, Optional
log = logging.getLogger(__name__)
from .base import WideBaseReader
from snputils.ancestry.genobj.wide import GlobalAncestryObject
[docs]
class AdmixtureReader(WideBaseReader):
    """
    Reader that parses ADMIXTURE output files and builds a
    `snputils.ancestry.genobj.GlobalAncestryObject` from them.
    """

    def __init__(
        self,
        Q_file: Union[str, Path],
        P_file: Optional[Union[str, Path]] = None,
        sample_file: Optional[Union[str, Path]] = None,
        snp_file: Optional[Union[str, Path]] = None,
        ancestry_file: Optional[Union[str, Path]] = None,
    ) -> None:
        """
        Store the locations of the ADMIXTURE inputs; nothing is read from disk here.

        Args:
            Q_file (str or pathlib.Path):
                Location of the Q matrix (per-sample ancestry proportions).
                Expected to end with .Q or .txt and to be delimited by a space (' ').
            P_file (str or pathlib.Path, optional):
                Location of the P/F matrix (per-ancestry SNP frequencies).
                Expected to end with .P or .txt and to be delimited by a space (' ').
                When None, no P matrix is loaded.
            sample_file (str or pathlib.Path, optional):
                Location of a single-column file of sample identifiers.
                Expected to end with .fam or .txt.
                When None, sample identifiers are not loaded.
            snp_file (str or pathlib.Path, optional):
                Location of a single-column file of SNP identifiers.
                Expected to end with .bim or .txt.
                When None, SNP identifiers are not loaded.
            ancestry_file (str or pathlib.Path, optional):
                Location of a single-column file with one ancestry label per sample.
                Expected to end with .map or .txt.
                When None, ancestries are not loaded.
        """
        # Q is mandatory; each optional input stays None when it was not supplied,
        # otherwise it is normalised to a pathlib.Path.
        self.__Q_file = Path(Q_file)
        self.__P_file = None if P_file is None else Path(P_file)
        self.__sample_file = None if sample_file is None else Path(sample_file)
        self.__snp_file = None if snp_file is None else Path(snp_file)
        self.__ancestry_file = None if ancestry_file is None else Path(ancestry_file)

    @property
    def Q_file(self) -> Path:
        """
        Retrieve Q_file.

        Returns:
            pathlib.Path:
                Location of the Q matrix (per-sample ancestry proportions).
                Expected to end with .Q or .txt and to be delimited by a space (' ').
        """
        return self.__Q_file

    @property
    def P_file(self) -> Optional[Path]:
        """
        Retrieve P_file.

        Returns:
            pathlib.Path or None:
                Location of the P/F matrix (per-ancestry SNP frequencies).
                Expected to end with .P or .txt and to be delimited by a space (' ').
                None means no P matrix is loaded.
        """
        return self.__P_file

    @property
    def sample_file(self) -> Optional[Path]:
        """
        Retrieve sample_file.

        Returns:
            pathlib.Path or None:
                Location of the single-column file of sample identifiers.
                Expected to end with .fam or .txt.
                None means sample identifiers are not loaded.
        """
        return self.__sample_file

    @property
    def snp_file(self) -> Optional[Path]:
        """
        Retrieve snp_file.

        Returns:
            pathlib.Path or None:
                Location of the single-column file of SNP identifiers.
                Expected to end with .bim or .txt.
                None means SNP identifiers are not loaded.
        """
        return self.__snp_file

    @property
    def ancestry_file(self) -> Optional[Path]:
        """
        Retrieve ancestry_file.

        Returns:
            pathlib.Path or None:
                Location of the single-column file with one ancestry label per sample.
                Expected to end with .map or .txt.
                None means ancestries are not loaded.
        """
        return self.__ancestry_file

    def read(self) -> 'GlobalAncestryObject':
        """
        Load the configured ADMIXTURE files and assemble a
        snputils.ancestry.genobj.GlobalAncestryObject.

        Expected ADMIXTURE files content:

        - **Q_file**:
            Text file holding the Q matrix of per-sample ancestry proportions.
            One row per sample, one column per ancestry.
        - **P_file**:
            Text file holding the P matrix of per-ancestry SNP frequencies.
            One row per SNP, one column per ancestry.

        Optional files (if provided):

        - **sample_file**: Single-column text file of sample identifiers, in order.
        - **snp_file**: Single-column text file of SNP identifiers, in order.
        - **ancestry_file**: Single-column text file with an ancestry label for each sample.

        Returns:
            GlobalAncestryObject:
                Object built from the Q matrix, the P matrix (None when no P_file
                was given), and the sample, SNP and ancestry values returned by
                the `_read_*` helpers.
        """
        q_path = self.Q_file
        log.info(f"Reading Q matrix from '{q_path}'...")
        # NOTE(review): the delimiter is an explicit single space, so runs of spaces
        # or tabs are presumably not collapsed into one separator -- confirm that
        # inputs are strictly single-space delimited.
        q_matrix = np.genfromtxt(q_path, delimiter=' ')

        # P is optional: it remains None unless a path was configured.
        p_matrix = None
        p_path = self.P_file
        if p_path is not None:
            log.info(f"Reading P matrix from '{p_path}'...")
            p_matrix = np.genfromtxt(p_path, delimiter=' ')

        # The _read_* helpers are not defined in this class; presumably they are
        # inherited from WideBaseReader. They are evaluated in the order
        # samples -> snps -> ancestries.
        return GlobalAncestryObject(
            q_matrix,
            p_matrix,
            samples=self._read_sample_ids(),
            snps=self._read_snps(),
            ancestries=self._read_ancestries(),
        )
# Register AdmixtureReader with WideBaseReader.
# NOTE(review): presumably this is ABCMeta.register (virtual-subclass registration);
# AdmixtureReader already inherits from WideBaseReader directly, so confirm whether
# this call is still required.
WideBaseReader.register(AdmixtureReader)