snputils.ancestry.io

View Source

1from .local import LAIReader, MSPReader, MSPWriter, AdmixtureMappingVCFWriter, read_lai, read_msp
2from .wide import AdmixtureReader, AdmixtureWriter, read_adm, read_admixture
3
4__all__ = ['read_adm', 'read_admixture', 'read_lai', 'read_msp',
5           'AdmixtureReader', 'AdmixtureWriter', 'LAIReader', 'MSPReader', 'MSPWriter', 'AdmixtureMappingVCFWriter']

def read_adm( Q_file: Union[str, pathlib._local.Path], P_file: Union[str, pathlib._local.Path], sample_file: Union[str, pathlib._local.Path, NoneType] = None, snp_file: Union[str, pathlib._local.Path, NoneType] = None, ancestry_file: Union[str, pathlib._local.Path, NoneType] = None) -> snputils.ancestry.genobj.GlobalAncestryObject: View Source

 8def read_admixture(
 9    Q_file: Union[str, Path],
10    P_file: Union[str, Path],
11    sample_file: Optional[Union[str, Path]] = None,
12    snp_file: Optional[Union[str, Path]] = None,
13    ancestry_file: Optional[Union[str, Path]] = None,
14) -> 'GlobalAncestryObject':
15    """
16    Read ADMIXTURE files into a `snputils.ancestry.genobj.GlobalAncestryObject`.
17
18    Args:
19        Q_file (str or pathlib.Path):
20            Path to the file containing the Q matrix (per-sample ancestry proportions).
21            It should end with .Q or .txt.
22            The file should use space (' ') as the delimiter.
23        P_file (str or pathlib.Path):
24            Path to the file containing the P/F matrix (per-ancestry SNP frequencies).
25            It should end with .P or .txt.
26            The file should use space (' ') as the delimiter.
27        sample_file (str or pathlib.Path, optional):
28            Path to the single-column file containing sample identifiers. 
29            It should end with .fam or .txt.
30            If None, sample identifiers are not loaded.
31        snp_file (str or pathlib.Path, optional):
32            Path to the single-column file containing SNP identifiers. 
33            It should end with .bim or .txt.
34            If None, SNP identifiers are not loaded.
35        ancestry_file (str or pathlib.Path, optional):
36            Path to the single-column file containing ancestry labels for each sample.
37            It should end with .map or .txt.
38            If None, ancestries are not loaded.
39
40    Returns:
41            **GlobalAncestryObject:** 
42                A GlobalAncestryObject instance.
43    """
44    from snputils.ancestry.io.wide.read.admixture import AdmixtureReader
45
46    return AdmixtureReader(
47        Q_file=Q_file,
48        P_file=P_file,
49        sample_id_file=sample_file,
50        snp_file=snp_file,
51        ancestry_file=ancestry_file
52    ).read()

Read ADMIXTURE files into a snputils.ancestry.genobj.GlobalAncestryObject.

Arguments:

Q_file (str or pathlib.Path): Path to the file containing the Q matrix (per-sample ancestry proportions). It should end with .Q or .txt. The file should use space (' ') as the delimiter.
P_file (str or pathlib.Path): Path to the file containing the P/F matrix (per-ancestry SNP frequencies). It should end with .P or .txt. The file should use space (' ') as the delimiter.
sample_file (str or pathlib.Path, optional): Path to the single-column file containing sample identifiers. It should end with .fam or .txt. If None, sample identifiers are not loaded.
snp_file (str or pathlib.Path, optional): Path to the single-column file containing SNP identifiers. It should end with .bim or .txt. If None, SNP identifiers are not loaded.
ancestry_file (str or pathlib.Path, optional): Path to the single-column file containing ancestry labels for each sample. It should end with .map or .txt. If None, ancestries are not loaded.

Returns:

GlobalAncestryObject: A GlobalAncestryObject instance.

def read_admixture( Q_file: Union[str, pathlib._local.Path], P_file: Union[str, pathlib._local.Path], sample_file: Union[str, pathlib._local.Path, NoneType] = None, snp_file: Union[str, pathlib._local.Path, NoneType] = None, ancestry_file: Union[str, pathlib._local.Path, NoneType] = None) -> snputils.ancestry.genobj.GlobalAncestryObject: View Source

 8def read_admixture(
 9    Q_file: Union[str, Path],
10    P_file: Union[str, Path],
11    sample_file: Optional[Union[str, Path]] = None,
12    snp_file: Optional[Union[str, Path]] = None,
13    ancestry_file: Optional[Union[str, Path]] = None,
14) -> 'GlobalAncestryObject':
15    """
16    Read ADMIXTURE files into a `snputils.ancestry.genobj.GlobalAncestryObject`.
17
18    Args:
19        Q_file (str or pathlib.Path):
20            Path to the file containing the Q matrix (per-sample ancestry proportions).
21            It should end with .Q or .txt.
22            The file should use space (' ') as the delimiter.
23        P_file (str or pathlib.Path):
24            Path to the file containing the P/F matrix (per-ancestry SNP frequencies).
25            It should end with .P or .txt.
26            The file should use space (' ') as the delimiter.
27        sample_file (str or pathlib.Path, optional):
28            Path to the single-column file containing sample identifiers. 
29            It should end with .fam or .txt.
30            If None, sample identifiers are not loaded.
31        snp_file (str or pathlib.Path, optional):
32            Path to the single-column file containing SNP identifiers. 
33            It should end with .bim or .txt.
34            If None, SNP identifiers are not loaded.
35        ancestry_file (str or pathlib.Path, optional):
36            Path to the single-column file containing ancestry labels for each sample.
37            It should end with .map or .txt.
38            If None, ancestries are not loaded.
39
40    Returns:
41            **GlobalAncestryObject:** 
42                A GlobalAncestryObject instance.
43    """
44    from snputils.ancestry.io.wide.read.admixture import AdmixtureReader
45
46    return AdmixtureReader(
47        Q_file=Q_file,
48        P_file=P_file,
49        sample_id_file=sample_file,
50        snp_file=snp_file,
51        ancestry_file=ancestry_file
52    ).read()

Read ADMIXTURE files into a snputils.ancestry.genobj.GlobalAncestryObject.

Arguments:

Q_file (str or pathlib.Path): Path to the file containing the Q matrix (per-sample ancestry proportions). It should end with .Q or .txt. The file should use space (' ') as the delimiter.
P_file (str or pathlib.Path): Path to the file containing the P/F matrix (per-ancestry SNP frequencies). It should end with .P or .txt. The file should use space (' ') as the delimiter.
sample_file (str or pathlib.Path, optional): Path to the single-column file containing sample identifiers. It should end with .fam or .txt. If None, sample identifiers are not loaded.
snp_file (str or pathlib.Path, optional): Path to the single-column file containing SNP identifiers. It should end with .bim or .txt. If None, SNP identifiers are not loaded.
ancestry_file (str or pathlib.Path, optional): Path to the single-column file containing ancestry labels for each sample. It should end with .map or .txt. If None, ancestries are not loaded.

Returns:

GlobalAncestryObject: A GlobalAncestryObject instance.

def read_lai( file: Union[str, pathlib._local.Path], **kwargs) -> snputils.ancestry.genobj.LocalAncestryObject: View Source

 8def read_lai(file: Union[str, Path], **kwargs) -> LocalAncestryObject:
 9    """
10    Automatically detect the local ancestry data file format from the file's extension and 
11    read it into a `snputils.ancestry.genobj.LocalAncestryObject`.
12
13    **Supported formats:**
14
15    - `.msp`: Text-based MSP format.
16    - `.msp.tsv`: Text-based MSP format with TSV extension.
17    
18    Args:
19        file (str or pathlib.Path): 
20            Path to the file to be read. It should end with `.msp` or `.msp.tsv`.
21        **kwargs: Additional arguments passed to the reader method.
22    """
23    from snputils.ancestry.io.local.read.auto import LAIReader
24
25    return LAIReader(file).read(**kwargs)

Automatically detect the local ancestry data file format from the file's extension and read it into a snputils.ancestry.genobj.LocalAncestryObject.

Supported formats:

.msp: Text-based MSP format.
.msp.tsv: Text-based MSP format with TSV extension.

Arguments:

file (str or pathlib.Path): Path to the file to be read. It should end with .msp or .msp.tsv.
**kwargs: Additional arguments passed to the reader method.

def read_msp( file: Union[str, pathlib._local.Path]) -> snputils.ancestry.genobj.LocalAncestryObject: View Source

28def read_msp(file: Union[str, Path]) -> 'LocalAncestryObject':
29    """
30    Read data from an `.msp` or `.msp.tsv` file and construct a `snputils.ancestry.genobj.LocalAncestryObject`.
31
32    Args:
33        file (str or pathlib.Path): 
34            Path to the file to be read. It should end with `.msp` or `.msp.tsv`.
35
36    Returns:
37        **LocalAncestryObject:**
38            A LocalAncestryObject instance.
39    """
40    from snputils.ancestry.io.local.read.msp import MSPReader
41
42    return MSPReader(file).read()

Read data from an .msp or .msp.tsv file and construct a snputils.ancestry.genobj.LocalAncestryObject.

Arguments:

file (str or pathlib.Path): Path to the file to be read. It should end with .msp or .msp.tsv.

Returns:

LocalAncestryObject: A LocalAncestryObject instance.

class AdmixtureReader(snputils.ancestry.io.wide.read.base.WideBaseReader): View Source

 13class AdmixtureReader(WideBaseReader):
 14    """
 15    A reader class for parsing ADMIXTURE files and constructing a `snputils.ancestry.genobj.GlobalAncestryObject`.
 16    """
 17    def __init__(
 18        self,
 19        Q_file: Union[str, Path],
 20        P_file: Union[str, Path],
 21        sample_file: Optional[Union[str, Path]] = None,
 22        snp_file: Optional[Union[str, Path]] = None,
 23        ancestry_file: Optional[Union[str, Path]] = None,
 24    ) -> None:
 25        """
 26        Args:
 27            Q_file (str or pathlib.Path):
 28                Path to the file containing the Q matrix (per-sample ancestry proportions).
 29                It should end with .Q or .txt.
 30                The file should use space (' ') as the delimiter.
 31            P_file (str or pathlib.Path):
 32                Path to the file containing the P/F matrix (per-ancestry SNP frequencies).
 33                It should end with .P or .txt.
 34                The file should use space (' ') as the delimiter.
 35            sample_file (str or pathlib.Path, optional):
 36                Path to the single-column file containing sample identifiers. 
 37                It should end with .fam or .txt.
 38                If None, sample identifiers are not loaded.
 39            snp_file (str or pathlib.Path, optional):
 40                Path to the single-column file containing SNP identifiers. 
 41                It should end with .bim or .txt.
 42                If None, SNP identifiers are not loaded.
 43            ancestry_file (str or pathlib.Path, optional):
 44                Path to the single-column file containing ancestry labels for each sample.
 45                It should end with .map or .txt.
 46                If None, ancestries are not loaded.
 47        """
 48        self.__Q_file = Path(Q_file)
 49        self.__P_file = Path(P_file)
 50        self.__sample_file = Path(sample_file) if sample_file is not None else None
 51        self.__snp_file = Path(snp_file) if snp_file is not None else None
 52        self.__ancestry_file = Path(ancestry_file) if ancestry_file is not None else None
 53
 54    @property
 55    def Q_file(self) -> Path:
 56        """
 57        Retrieve Q_file.
 58
 59        Returns:
 60            **pathlib.Path:** 
 61                Path to the file containing the Q matrix (per-sample ancestry proportions).
 62                It should end with .Q or .txt.
 63                The file should use space (' ') as the delimiter.
 64        """
 65        return self.__Q_file
 66
 67    @property
 68    def P_file(self) -> Path:
 69        """
 70        Retrieve P_file.
 71
 72        Returns:
 73            **pathlib.Path:** 
 74                Path to the file containing the P/F matrix (per-ancestry SNP frequencies).
 75                It should end with .P or .txt.
 76                The file should use space (' ') as the delimiter.
 77        """
 78        return self.__P_file
 79
 80    @property
 81    def sample_file(self) -> Optional[Path]:
 82        """
 83        Retrieve sample_file.
 84
 85        Returns:
 86            **pathlib.Path:** 
 87                Path to the single-column file containing sample identifiers. 
 88                It should end with .fam or .txt.
 89                If None, sample identifiers are not loaded.
 90        """
 91        return self.__sample_file
 92    
 93    @property
 94    def snp_file(self) -> Optional[Path]:
 95        """
 96        Retrieve snp_file.
 97
 98        Returns:
 99            **pathlib.Path:** 
100                Path to the single-column file containing SNP identifiers. 
101                It should end with .bim or .txt.
102                If None, SNP identifiers are not loaded.
103        """
104        return self.__snp_file
105
106    @property
107    def ancestry_file(self) -> Optional[Path]:
108        """
109        Retrieve ancestry_file.
110
111        Returns:
112            **pathlib.Path:** 
113                Path to the single-column file containing ancestry labels for each sample.
114                It should end with .map or .txt.
115                If None, ancestries are not loaded.
116        """
117        return self.__ancestry_file
118
119    def read(self) -> 'GlobalAncestryObject':
120        """
121        Read data from the provided ADMIXTURE files and construct a 
122        snputils.ancestry.genobj.GlobalAncestryObject instance.
123
124        **Expected ADMIXTURE files content:**
125
126        - **Q_file**: 
127            A text file containing the Q matrix with per-sample ancestry proportions. 
128             Each row corresponds to a sample, and each column corresponds to an ancestry.
129        - **P_file**: 
130            A text file containing the P matrix with per-ancestry SNP frequencies.
131            Each row corresponds to a SNP, and each column corresponds to an ancestry.
132
133        Optional files (if provided):
134        - **sample_file**: A single-column text file containing sample identifiers in order.
135        - **snp_file**: A single-column text file containing SNP identifiers in order.
136        - **ancestry_file**: A single-column text file containing ancestry labels for each sample.
137
138        Returns:
139            **GlobalAncestryObject:** 
140                A GlobalAncestryObject instance.
141        """
142        log.info(f"Reading Q matrix from '{self.Q_file}'...")
143        Q_mat = np.genfromtxt(self.Q_file, delimiter=' ')
144        log.info(f"Reading P matrix from '{self.P_file}'...")
145        P_mat = np.genfromtxt(self.P_file, delimiter=' ')
146
147        samples = self._read_sample_ids()
148        snps = self._read_snps()
149        ancestries = self._read_ancestries()
150
151        return GlobalAncestryObject(
152            Q_mat,
153            P_mat,
154            samples=samples,
155            snps=snps,
156            ancestries=ancestries
157        )

A reader class for parsing ADMIXTURE files and constructing a snputils.ancestry.genobj.GlobalAncestryObject.

AdmixtureReader( Q_file: Union[str, pathlib._local.Path], P_file: Union[str, pathlib._local.Path], sample_file: Union[str, pathlib._local.Path, NoneType] = None, snp_file: Union[str, pathlib._local.Path, NoneType] = None, ancestry_file: Union[str, pathlib._local.Path, NoneType] = None) View Source

17    def __init__(
18        self,
19        Q_file: Union[str, Path],
20        P_file: Union[str, Path],
21        sample_file: Optional[Union[str, Path]] = None,
22        snp_file: Optional[Union[str, Path]] = None,
23        ancestry_file: Optional[Union[str, Path]] = None,
24    ) -> None:
25        """
26        Args:
27            Q_file (str or pathlib.Path):
28                Path to the file containing the Q matrix (per-sample ancestry proportions).
29                It should end with .Q or .txt.
30                The file should use space (' ') as the delimiter.
31            P_file (str or pathlib.Path):
32                Path to the file containing the P/F matrix (per-ancestry SNP frequencies).
33                It should end with .P or .txt.
34                The file should use space (' ') as the delimiter.
35            sample_file (str or pathlib.Path, optional):
36                Path to the single-column file containing sample identifiers. 
37                It should end with .fam or .txt.
38                If None, sample identifiers are not loaded.
39            snp_file (str or pathlib.Path, optional):
40                Path to the single-column file containing SNP identifiers. 
41                It should end with .bim or .txt.
42                If None, SNP identifiers are not loaded.
43            ancestry_file (str or pathlib.Path, optional):
44                Path to the single-column file containing ancestry labels for each sample.
45                It should end with .map or .txt.
46                If None, ancestries are not loaded.
47        """
48        self.__Q_file = Path(Q_file)
49        self.__P_file = Path(P_file)
50        self.__sample_file = Path(sample_file) if sample_file is not None else None
51        self.__snp_file = Path(snp_file) if snp_file is not None else None
52        self.__ancestry_file = Path(ancestry_file) if ancestry_file is not None else None

Arguments:

Q_file (str or pathlib.Path): Path to the file containing the Q matrix (per-sample ancestry proportions). It should end with .Q or .txt. The file should use space (' ') as the delimiter.
P_file (str or pathlib.Path): Path to the file containing the P/F matrix (per-ancestry SNP frequencies). It should end with .P or .txt. The file should use space (' ') as the delimiter.
sample_file (str or pathlib.Path, optional): Path to the single-column file containing sample identifiers. It should end with .fam or .txt. If None, sample identifiers are not loaded.
snp_file (str or pathlib.Path, optional): Path to the single-column file containing SNP identifiers. It should end with .bim or .txt. If None, SNP identifiers are not loaded.
ancestry_file (str or pathlib.Path, optional): Path to the single-column file containing ancestry labels for each sample. It should end with .map or .txt. If None, ancestries are not loaded.

Q_file: pathlib._local.Path View Source

54    @property
55    def Q_file(self) -> Path:
56        """
57        Retrieve Q_file.
58
59        Returns:
60            **pathlib.Path:** 
61                Path to the file containing the Q matrix (per-sample ancestry proportions).
62                It should end with .Q or .txt.
63                The file should use space (' ') as the delimiter.
64        """
65        return self.__Q_file

Retrieve Q_file.

Returns:

pathlib.Path: Path to the file containing the Q matrix (per-sample ancestry proportions). It should end with .Q or .txt. The file should use space (' ') as the delimiter.

P_file: pathlib._local.Path View Source

67    @property
68    def P_file(self) -> Path:
69        """
70        Retrieve P_file.
71
72        Returns:
73            **pathlib.Path:** 
74                Path to the file containing the P/F matrix (per-ancestry SNP frequencies).
75                It should end with .P or .txt.
76                The file should use space (' ') as the delimiter.
77        """
78        return self.__P_file

Retrieve P_file.

Returns:

pathlib.Path: Path to the file containing the P/F matrix (per-ancestry SNP frequencies). It should end with .P or .txt. The file should use space (' ') as the delimiter.

sample_file: Optional[pathlib._local.Path] View Source

80    @property
81    def sample_file(self) -> Optional[Path]:
82        """
83        Retrieve sample_file.
84
85        Returns:
86            **pathlib.Path:** 
87                Path to the single-column file containing sample identifiers. 
88                It should end with .fam or .txt.
89                If None, sample identifiers are not loaded.
90        """
91        return self.__sample_file

Retrieve sample_file.

Returns:

pathlib.Path: Path to the single-column file containing sample identifiers. It should end with .fam or .txt. If None, sample identifiers are not loaded.

snp_file: Optional[pathlib._local.Path] View Source

 93    @property
 94    def snp_file(self) -> Optional[Path]:
 95        """
 96        Retrieve snp_file.
 97
 98        Returns:
 99            **pathlib.Path:** 
100                Path to the single-column file containing SNP identifiers. 
101                It should end with .bim or .txt.
102                If None, SNP identifiers are not loaded.
103        """
104        return self.__snp_file

Retrieve snp_file.

Returns:

pathlib.Path: Path to the single-column file containing SNP identifiers. It should end with .bim or .txt. If None, SNP identifiers are not loaded.

ancestry_file: Optional[pathlib._local.Path] View Source

106    @property
107    def ancestry_file(self) -> Optional[Path]:
108        """
109        Retrieve ancestry_file.
110
111        Returns:
112            **pathlib.Path:** 
113                Path to the single-column file containing ancestry labels for each sample.
114                It should end with .map or .txt.
115                If None, ancestries are not loaded.
116        """
117        return self.__ancestry_file

Retrieve ancestry_file.

Returns:

pathlib.Path: Path to the single-column file containing ancestry labels for each sample. It should end with .map or .txt. If None, ancestries are not loaded.

def read(self) -> snputils.ancestry.genobj.GlobalAncestryObject: View Source

119    def read(self) -> 'GlobalAncestryObject':
120        """
121        Read data from the provided ADMIXTURE files and construct a 
122        snputils.ancestry.genobj.GlobalAncestryObject instance.
123
124        **Expected ADMIXTURE files content:**
125
126        - **Q_file**: 
127            A text file containing the Q matrix with per-sample ancestry proportions. 
128             Each row corresponds to a sample, and each column corresponds to an ancestry.
129        - **P_file**: 
130            A text file containing the P matrix with per-ancestry SNP frequencies.
131            Each row corresponds to a SNP, and each column corresponds to an ancestry.
132
133        Optional files (if provided):
134        - **sample_file**: A single-column text file containing sample identifiers in order.
135        - **snp_file**: A single-column text file containing SNP identifiers in order.
136        - **ancestry_file**: A single-column text file containing ancestry labels for each sample.
137
138        Returns:
139            **GlobalAncestryObject:** 
140                A GlobalAncestryObject instance.
141        """
142        log.info(f"Reading Q matrix from '{self.Q_file}'...")
143        Q_mat = np.genfromtxt(self.Q_file, delimiter=' ')
144        log.info(f"Reading P matrix from '{self.P_file}'...")
145        P_mat = np.genfromtxt(self.P_file, delimiter=' ')
146
147        samples = self._read_sample_ids()
148        snps = self._read_snps()
149        ancestries = self._read_ancestries()
150
151        return GlobalAncestryObject(
152            Q_mat,
153            P_mat,
154            samples=samples,
155            snps=snps,
156            ancestries=ancestries
157        )

Read data from the provided ADMIXTURE files and construct a snputils.ancestry.genobj.GlobalAncestryObject instance.

Expected ADMIXTURE files content:

Q_file: A text file containing the Q matrix with per-sample ancestry proportions. Each row corresponds to a sample, and each column corresponds to an ancestry.
P_file: A text file containing the P matrix with per-ancestry SNP frequencies. Each row corresponds to a SNP, and each column corresponds to an ancestry.

Optional files (if provided):

sample_file: A single-column text file containing sample identifiers in order.
snp_file: A single-column text file containing SNP identifiers in order.
ancestry_file: A single-column text file containing ancestry labels for each sample.

Returns:

GlobalAncestryObject: A GlobalAncestryObject instance.

class LAIReader: View Source

 8class LAIReader:
 9    def __new__(
10        cls,
11        file: Union[str, Path]
12    ) -> object:
13        """
14        A factory class that automatically detects the local ancestry data file format from the 
15        file's extension and returns the corresponding reader object.
16
17        **Supported formats:**
18
19        - `.msp`: Text-based MSP format.
20        - `.msp.tsv`: Text-based MSP format with TSV extension.
21
22        Args:
23            file (str or pathlib.Path): 
24                Path to the file to be read. It should end with `.msp` or `.msp.tsv`.
25        
26        Returns:
27            **object:** A reader object corresponding to the file format (e.g., `MSPReader`).
28        """
29        file = Path(file)
30        suffixes = [suffix.lower() for suffix in file.suffixes]
31        if not suffixes:
32            raise ValueError("The file must have an extension. Supported extensions are: .msp, .msp.tsv.")
33
34        if suffixes[-2:] == ['.msp', '.tsv'] or suffixes[-1] == '.msp':
35            from snputils.ancestry.io.local.read.msp import MSPReader
36
37            return MSPReader(file)
38        else:
39            raise ValueError(
40                f"Unsupported file extension: {suffixes[-1]}. "
41                "Supported extensions are: .msp, .msp.tsv."
42            )

LAIReader(file: Union[str, pathlib._local.Path]) View Source

 9    def __new__(
10        cls,
11        file: Union[str, Path]
12    ) -> object:
13        """
14        A factory class that automatically detects the local ancestry data file format from the 
15        file's extension and returns the corresponding reader object.
16
17        **Supported formats:**
18
19        - `.msp`: Text-based MSP format.
20        - `.msp.tsv`: Text-based MSP format with TSV extension.
21
22        Args:
23            file (str or pathlib.Path): 
24                Path to the file to be read. It should end with `.msp` or `.msp.tsv`.
25        
26        Returns:
27            **object:** A reader object corresponding to the file format (e.g., `MSPReader`).
28        """
29        file = Path(file)
30        suffixes = [suffix.lower() for suffix in file.suffixes]
31        if not suffixes:
32            raise ValueError("The file must have an extension. Supported extensions are: .msp, .msp.tsv.")
33
34        if suffixes[-2:] == ['.msp', '.tsv'] or suffixes[-1] == '.msp':
35            from snputils.ancestry.io.local.read.msp import MSPReader
36
37            return MSPReader(file)
38        else:
39            raise ValueError(
40                f"Unsupported file extension: {suffixes[-1]}. "
41                "Supported extensions are: .msp, .msp.tsv."
42            )

A factory class that automatically detects the local ancestry data file format from the file's extension and returns the corresponding reader object.

Supported formats:

.msp: Text-based MSP format.
.msp.tsv: Text-based MSP format with TSV extension.

Arguments:

file (str or pathlib.Path): Path to the file to be read. It should end with .msp or .msp.tsv.

Returns:

object: A reader object corresponding to the file format (e.g., MSPReader).

class MSPReader(snputils.ancestry.io.local.read.base.LAIBaseReader): View Source

 15class MSPReader(LAIBaseReader):
 16    """
 17    A reader class for parsing Local Ancestry Inference (LAI) data from an `.msp` or `msp.tsv` file
 18    and constructing a `snputils.ancestry.genobj.LocalAncestryObject`.
 19    """
 20    def __init__(self, file: Union[str, Path]) -> None:
 21        """
 22        Args:
 23            file (str or pathlib.Path): 
 24                Path to the file to be read. It should end with `.msp` or `.msp.tsv`.
 25        """
 26        self.__file = Path(file)
 27
 28    @property
 29    def file(self) -> Path:
 30        """
 31        Retrieve `file`.
 32
 33        Returns:
 34            **pathlib.Path:** 
 35                Path to the file to be read. It should end with `.msp` or `.msp.tsv`.
 36        """
 37        return self.__file
 38
 39    def _get_samples(self, msp_df: pd.DataFrame, first_lai_col_indx: int) -> List[str]:
 40        """
 41        Extract unique sample identifiers from the pandas DataFrame.
 42
 43        Args:
 44            msp_df (pd.DataFrame): 
 45                The DataFrame representing the `.msp` data, including LAI columns.
 46            first_lai_col_indx (int): 
 47                Index of the first column containing LAI data.
 48
 49        Returns:
 50            **list:** List of unique sample identifiers.
 51        """
 52        # Get all columns starting from the first LAI data column
 53        query_samples_dub = msp_df.columns[first_lai_col_indx:]
 54
 55        # Select only one of the maternal/paternal samples by taking every second sample
 56        single_ind_idx = np.arange(0, len(query_samples_dub), 2)
 57        query_samples_sing = query_samples_dub[single_ind_idx]
 58
 59        # Remove the suffix from sample names to get clean identifiers
 60        query_samples = [qs[:-2] for qs in query_samples_sing]
 61
 62        return query_samples
 63
 64    def _get_ancestry_map_from_comment(self, comment: str) -> Dict[str, str]:
 65        """
 66        Construct an ancestry map from the comment line of the `.msp` file.
 67
 68        This method parses the comment string to create a mapping of ancestry numerical identifiers 
 69        to their corresponding ancestry names.
 70
 71        Args:
 72            comment (str): 
 73                The comment line containing ancestry mapping information.
 74
 75        Returns:
 76            dict: A dictionary mapping names to ancestry numbers (as strings).
 77        """
 78        # Split the comment to extract relevant ancestry mapping data
 79        comment = comment.split(' ')[-1].replace('\n', '').split('\t')
 80        ancestry_map = {}
 81
 82        # Populate the ancestry map with parsed values
 83        for elem in comment:
 84            x, y = elem.split('=')
 85            ancestry_map[y] = x
 86
 87        return ancestry_map
 88
 89    def _replace_nan_with_none(self, array: Optional[np.ndarray]) -> Optional[np.ndarray]:
 90        """
 91        Replace arrays that are fully NaN with `None`.
 92
 93        Args:
 94            array (np.ndarray): Array to check.
 95
 96        Returns:
 97            Optional[np.ndarray]: Returns `None` if the array is fully NaN, otherwise returns the original array.
 98        """
 99        if array is not None:
100            if array.size == 0:  # Check if the array is empty
101                return None
102            if np.issubdtype(array.dtype, np.number):  # Check for numeric types
103                if np.isnan(array).all():  # Fully NaN numeric array
104                    return None
105            elif array.dtype == np.object_ or np.issubdtype(array.dtype, np.str_):  # String or object types
106                if all(elem == '' or elem is None for elem in array):  # Empty or None strings
107                    return None
108        return array
109
110    def read(self) -> 'LocalAncestryObject':
111        """
112        Read data from the provided `.msp` or `msp.tsv` `file` and construct a 
113        `snputils.ancestry.genobj.LocalAncestryObject`.
114
115        **Expected MSP content:**
116
117        The `.msp` file should contain local ancestry assignments for each haplotype across genomic windows.
118        Each row should correspond to a genomic window and include the following columns:
119
120        - `#chm`: Chromosome numbers corresponding to each genomic window.
121        - `spos`: Start physical position for each window.
122        - `epos`: End physical position for each window.
123        - `sgpos`: Start centimorgan position for each window.
124        - `egpos`: End centimorgan position for each window.
125        - `n snps`: Number of SNPs in each genomic window.
126        - `SampleID.0`: Local ancestry for the first haplotype of the sample for each window.
127        - `SampleID.1`: Local ancestry for the second haplotype of the sample for each window.
128
129        Returns:
130            **LocalAncestryObject:**
131                A LocalAncestryObject instance.
132        """
133        log.info(f"Reading '{self.file}'...")
134
135        # Open the file and read the first two lines to identify the header and comment
136        with open(self.file) as f:
137            first_line = f.readline()
138            second_line = f.readline()
139
140        first_line_ = [h.replace('\n', '') for h in first_line.split("\t")]
141        second_line_ = [h.replace('\n', '') for h in second_line.split("\t")]
142
143        # Determine which line contains the #chm reference value for the header
144        if "#chm" in first_line_:
145            comment = None
146            header = first_line_
147        elif "#chm" in second_line_:
148            comment = first_line
149            header = second_line_
150        else:
151            raise ValueError(
152                f"Header not found. Expected '#chm' in the first two lines. "
153                f"First line: {first_line.strip()} | Second line: {second_line.strip()}"
154            )
155
156        # Ensure no duplicate columns exist in the header
157        if len(header) != len(set(header)):
158            raise ValueError("Duplicate columns detected in the header.")
159
160        # Read the main data into a DataFrame, skipping comment lines
161        msp_df = pd.read_csv(self.file, sep="\t", comment="#", names=header)
162
163        # Extract chromosomes data
164        chromosomes = msp_df['#chm'].astype(str).to_numpy()
165
166        # Extract physical positions (if available)
167        column_counter = 1
168        try:
169            physical_pos = msp_df[['spos', 'epos']].to_numpy()
170            column_counter += 2
171        except KeyError:
172            physical_pos = None
173            log.warning("Physical positions ('spos' and 'epos') not found.")
174        
175        # Extract centimorgan positions (if available)
176        try:
177            centimorgan_pos = msp_df[['sgpos', 'egpos']].to_numpy()
178            column_counter += 2
179        except KeyError:
180            centimorgan_pos = None
181            log.warning("Genetic (centimorgan) positions ('sgpos' and 'egpos') not found.")
182        
183        # Extract window sizes (if available)
184        try:
185            window_sizes = msp_df['n snps'].to_numpy()
186            column_counter += 1
187        except KeyError:
188            window_sizes = None
189            log.warning("Window sizes ('n snps') not found.")
190        
191        # Extract LAI data (haplotype-level)
192        lai = msp_df[msp_df.columns[column_counter:]].to_numpy()
193
194        # Extract haplotype identifiers
195        haplotypes = msp_df.columns[column_counter:].to_list()
196
197        # Extract haplotype identifiers and sample identifiers
198        samples = self._get_samples(msp_df, column_counter)
199
200        # Validate the number of samples matches the LAI data dimensions
201        n_samples = len(samples)
202        if n_samples != int(lai.shape[1] / 2):
203            raise ValueError(
204                "Mismatch between the number of sample identifiers and the expected number of samples in the LAI array. "
205                f"Expected {int(lai.shape[1] / 2)} samples (derived from LAI data); found {n_samples}."
206            )
207        
208        # Count number of unique ancestries in the LAI data
209        n_ancestries = len(np.unique(lai))
210
211        # Parse ancestry map from the comment (if available)
212        ancestry_map = None
213        if comment is not None:
214            ancestry_map = self._get_ancestry_map_from_comment(comment)
215            if len(ancestry_map) != n_ancestries:
216                warnings.warn(
217                    "Mismatch between the number of unique ancestries in the LAI data "
218                    f"({n_ancestries}) and the number of classes in the ancestry map "
219                    f"({len(ancestry_map)})."
220                )
221        else:
222            # Provide default ancestry mapping if no comment is provided
223            ancestry_map = None
224            warnings.warn(
225                "Ancestry map not found. It is recommended to provide an .msp file that contains the ancestry "
226                "map as a comment in the first line."
227            )
228
229        # Replace fully NaN attributes with None
230        window_sizes = self._replace_nan_with_none(window_sizes)
231        centimorgan_pos = self._replace_nan_with_none(centimorgan_pos)
232        chromosomes = self._replace_nan_with_none(chromosomes)
233        physical_pos = self._replace_nan_with_none(physical_pos)
234
235        return LocalAncestryObject(
236            haplotypes=haplotypes,
237            lai=lai,
238            samples=samples,
239            ancestry_map=ancestry_map,
240            window_sizes=window_sizes,
241            centimorgan_pos=centimorgan_pos,
242            chromosomes=chromosomes,
243            physical_pos=physical_pos
244        )

A reader class for parsing Local Ancestry Inference (LAI) data from an .msp or msp.tsv file and constructing a snputils.ancestry.genobj.LocalAncestryObject.

MSPReader(file: Union[str, pathlib._local.Path]) View Source

20    def __init__(self, file: Union[str, Path]) -> None:
21        """
22        Args:
23            file (str or pathlib.Path): 
24                Path to the file to be read. It should end with `.msp` or `.msp.tsv`.
25        """
26        self.__file = Path(file)

Arguments:

file (str or pathlib.Path): Path to the file to be read. It should end with .msp or .msp.tsv.

file: pathlib._local.Path View Source

28    @property
29    def file(self) -> Path:
30        """
31        Retrieve `file`.
32
33        Returns:
34            **pathlib.Path:** 
35                Path to the file to be read. It should end with `.msp` or `.msp.tsv`.
36        """
37        return self.__file

Retrieve file.

Returns:

pathlib.Path: Path to the file to be read. It should end with .msp or .msp.tsv.

def read(self) -> snputils.ancestry.genobj.LocalAncestryObject: View Source

110    def read(self) -> 'LocalAncestryObject':
111        """
112        Read data from the provided `.msp` or `msp.tsv` `file` and construct a 
113        `snputils.ancestry.genobj.LocalAncestryObject`.
114
115        **Expected MSP content:**
116
117        The `.msp` file should contain local ancestry assignments for each haplotype across genomic windows.
118        Each row should correspond to a genomic window and include the following columns:
119
120        - `#chm`: Chromosome numbers corresponding to each genomic window.
121        - `spos`: Start physical position for each window.
122        - `epos`: End physical position for each window.
123        - `sgpos`: Start centimorgan position for each window.
124        - `egpos`: End centimorgan position for each window.
125        - `n snps`: Number of SNPs in each genomic window.
126        - `SampleID.0`: Local ancestry for the first haplotype of the sample for each window.
127        - `SampleID.1`: Local ancestry for the second haplotype of the sample for each window.
128
129        Returns:
130            **LocalAncestryObject:**
131                A LocalAncestryObject instance.
132        """
133        log.info(f"Reading '{self.file}'...")
134
135        # Open the file and read the first two lines to identify the header and comment
136        with open(self.file) as f:
137            first_line = f.readline()
138            second_line = f.readline()
139
140        first_line_ = [h.replace('\n', '') for h in first_line.split("\t")]
141        second_line_ = [h.replace('\n', '') for h in second_line.split("\t")]
142
143        # Determine which line contains the #chm reference value for the header
144        if "#chm" in first_line_:
145            comment = None
146            header = first_line_
147        elif "#chm" in second_line_:
148            comment = first_line
149            header = second_line_
150        else:
151            raise ValueError(
152                f"Header not found. Expected '#chm' in the first two lines. "
153                f"First line: {first_line.strip()} | Second line: {second_line.strip()}"
154            )
155
156        # Ensure no duplicate columns exist in the header
157        if len(header) != len(set(header)):
158            raise ValueError("Duplicate columns detected in the header.")
159
160        # Read the main data into a DataFrame, skipping comment lines
161        msp_df = pd.read_csv(self.file, sep="\t", comment="#", names=header)
162
163        # Extract chromosomes data
164        chromosomes = msp_df['#chm'].astype(str).to_numpy()
165
166        # Extract physical positions (if available)
167        column_counter = 1
168        try:
169            physical_pos = msp_df[['spos', 'epos']].to_numpy()
170            column_counter += 2
171        except KeyError:
172            physical_pos = None
173            log.warning("Physical positions ('spos' and 'epos') not found.")
174        
175        # Extract centimorgan positions (if available)
176        try:
177            centimorgan_pos = msp_df[['sgpos', 'egpos']].to_numpy()
178            column_counter += 2
179        except KeyError:
180            centimorgan_pos = None
181            log.warning("Genetic (centimorgan) positions ('sgpos' and 'egpos') not found.")
182        
183        # Extract window sizes (if available)
184        try:
185            window_sizes = msp_df['n snps'].to_numpy()
186            column_counter += 1
187        except KeyError:
188            window_sizes = None
189            log.warning("Window sizes ('n snps') not found.")
190        
191        # Extract LAI data (haplotype-level)
192        lai = msp_df[msp_df.columns[column_counter:]].to_numpy()
193
194        # Extract haplotype identifiers
195        haplotypes = msp_df.columns[column_counter:].to_list()
196
197        # Extract haplotype identifiers and sample identifiers
198        samples = self._get_samples(msp_df, column_counter)
199
200        # Validate the number of samples matches the LAI data dimensions
201        n_samples = len(samples)
202        if n_samples != int(lai.shape[1] / 2):
203            raise ValueError(
204                "Mismatch between the number of sample identifiers and the expected number of samples in the LAI array. "
205                f"Expected {int(lai.shape[1] / 2)} samples (derived from LAI data); found {n_samples}."
206            )
207        
208        # Count number of unique ancestries in the LAI data
209        n_ancestries = len(np.unique(lai))
210
211        # Parse ancestry map from the comment (if available)
212        ancestry_map = None
213        if comment is not None:
214            ancestry_map = self._get_ancestry_map_from_comment(comment)
215            if len(ancestry_map) != n_ancestries:
216                warnings.warn(
217                    "Mismatch between the number of unique ancestries in the LAI data "
218                    f"({n_ancestries}) and the number of classes in the ancestry map "
219                    f"({len(ancestry_map)})."
220                )
221        else:
222            # Provide default ancestry mapping if no comment is provided
223            ancestry_map = None
224            warnings.warn(
225                "Ancestry map not found. It is recommended to provide an .msp file that contains the ancestry "
226                "map as a comment in the first line."
227            )
228
229        # Replace fully NaN attributes with None
230        window_sizes = self._replace_nan_with_none(window_sizes)
231        centimorgan_pos = self._replace_nan_with_none(centimorgan_pos)
232        chromosomes = self._replace_nan_with_none(chromosomes)
233        physical_pos = self._replace_nan_with_none(physical_pos)
234
235        return LocalAncestryObject(
236            haplotypes=haplotypes,
237            lai=lai,
238            samples=samples,
239            ancestry_map=ancestry_map,
240            window_sizes=window_sizes,
241            centimorgan_pos=centimorgan_pos,
242            chromosomes=chromosomes,
243            physical_pos=physical_pos
244        )

Read data from the provided .msp or msp.tsv file and construct a snputils.ancestry.genobj.LocalAncestryObject.

Expected MSP content:

The .msp file should contain local ancestry assignments for each haplotype across genomic windows. Each row should correspond to a genomic window and include the following columns:

#chm: Chromosome numbers corresponding to each genomic window.
spos: Start physical position for each window.
epos: End physical position for each window.
sgpos: Start centimorgan position for each window.
egpos: End centimorgan position for each window.
n snps: Number of SNPs in each genomic window.
SampleID.0: Local ancestry for the first haplotype of the sample for each window.
SampleID.1: Local ancestry for the second haplotype of the sample for each window.

Returns:

LocalAncestryObject: A LocalAncestryObject instance.

class MSPWriter(snputils.ancestry.io.local.write.base.LAIBaseWriter): View Source

 15class MSPWriter(LAIBaseWriter):
 16    """
 17    A writer class for exporting local ancestry data from a `snputils.ancestry.genobj.LocalAncestryObject` 
 18    into an `.msp` or `.msp.tsv` file.
 19    """
 20    def __init__(self, laiobj: LocalAncestryObject, file: Union[str, Path]) -> None:
 21        """
 22        Args:
 23            laiobj (LocalAncestryObject):
 24                A LocalAncestryObject instance.
 25            file (str or pathlib.Path): 
 26                Path to the file where the data will be saved. It should end with `.msp` or `.msp.tsv`. 
 27                If the provided path does not have one of these extensions, the `.msp` extension will be appended.
 28        """
 29        self.__laiobj = laiobj
 30        self.__file = Path(file)
 31
 32    @property
 33    def laiobj(self) -> LocalAncestryObject:
 34        """
 35        Retrieve `laiobj`. 
 36
 37        Returns:
 38            **LocalAncestryObject:** 
 39                A LocalAncestryObject instance.
 40        """
 41        return self.__laiobj
 42
 43    @property
 44    def file(self) -> Path:
 45        """
 46        Retrieve `file`.
 47
 48        Returns:
 49            **pathlib.Path:** 
 50                Path to the file where the data will be saved. It should end with `.msp` or `.msp.tsv`. 
 51                If the provided path does not have one of these extensions, the `.msp` extension will be appended.
 52        """
 53        return self.__file
 54
 55    @file.setter
 56    def file(self, x: Union[str, Path]):
 57        """
 58        Update `file`.
 59        """
 60        self.__file = Path(x)
 61    
 62    def write(self) -> None:
 63        """
 64        Write the data contained in the `laiobj` instance to the specified output `file`. 
 65        If the file already exists, it will be overwritten.
 66
 67        **Output MSP content:**
 68
 69        The output `.msp` file will contain local ancestry assignments for each haplotype across genomic windows.
 70        Each row corresponds to a genomic window and includes the following columns:
 71
 72        - `#chm`: Chromosome numbers corresponding to each genomic window.
 73        - `spos`: Start physical position for each window.
 74        - `epos`: End physical position for each window.
 75        - `sgpos`: Start centimorgan position for each window.
 76        - `egpos`: End centimorgan position for each window.
 77        - `n snps`: Number of SNPs in each genomic window.
 78        - `SampleID.0`: Local ancestry for the first haplotype of the sample for each window.
 79        - `SampleID.1`: Local ancestry for the second haplotype of the sample for each window.
 80        """
 81        log.info(f"LAI object contains: {self.laiobj.n_samples} samples, {self.laiobj.n_ancestries} ancestries.")
 82
 83        # Define the valid file extensions
 84        valid_extensions = ('.msp', '.msp.tsv')
 85
 86        # Append '.msp' extension if not already present
 87        if not self.file.name.endswith(valid_extensions):
 88            self.file = self.file.with_name(self.file.name + '.msp')
 89
 90        # Check if file already exists
 91        if self.file.exists():
 92            warnings.warn(f"File '{self.file}' already exists and will be overwritten.")
 93
 94        # Compute the number of windows and haplotypes
 95        n_windows = self.laiobj.n_windows
 96        n_haplotypes = self.laiobj.n_haplotypes
 97
 98        # Initialize attributes with NaN where they are None
 99        chromosomes = self.laiobj.chromosomes if self.laiobj.chromosomes is not None else np.full(n_windows, np.nan)
100        physical_pos = self.laiobj.physical_pos if self.laiobj.physical_pos is not None else np.full((n_windows, 2), np.nan)
101        centimorgan_pos = self.laiobj.centimorgan_pos if self.laiobj.centimorgan_pos is not None else np.full((n_windows, 2), np.nan)
102        window_sizes = self.laiobj.window_sizes if self.laiobj.window_sizes is not None else np.full(n_windows, np.nan)
103        
104        haplotypes = self.laiobj.haplotypes
105        if haplotypes is None:
106            # Generate haplotypes from samples or default identifiers
107            if self.laiobj.samples is not None:
108                haplotypes = [f"{sample}.{i}" for sample in self.laiobj.samples for i in range(2)]
109                warnings.warn(
110                    "Haplotype data is missing. Haplotypes have been automatically generated "
111                    "from the provided sample identifiers."
112                )
113            else:
114                haplotypes = [f"sample_{i//2}.{i%2}" for i in range(n_haplotypes)]
115                warnings.warn(
116                    "Haplotype data and sample identifiers are missing. Default haplotype identifiers have been generated "
117                    "as `sample_<index>.0` and `sample_<index>.1`."
118                )
119
120        # Prepare columns for the DataFrame
121        columns = ["spos", "epos", "sgpos", "egpos", "n snps"]
122        lai_dic = {
123            "#chm": chromosomes,
124            "spos": physical_pos[:, 0],
125            "epos": physical_pos[:, 1],
126            "sgpos": centimorgan_pos[:, 0],
127            "egpos": centimorgan_pos[:, 1],
128            "n snps": window_sizes,
129        }
130
131        # Populate the dictionary with haplotype data
132        for ilai, haplotype in enumerate(haplotypes):
133            lai_dic[haplotype] = self.laiobj.lai[:, ilai]
134            columns.append(haplotype)
135            
136        # Check if DataFrame is empty
137        if len(lai_dic["#chm"]) == 0:
138            raise ValueError("No data to write: all columns are empty or missing.")
139
140        # Create a DataFrame from the dictionary containing all data
141        lai_df = pd.DataFrame(lai_dic)
142
143        log.info(f"Writing MSP file to '{self.file}'...")
144
145        # Save the DataFrame to the .msp file in tab-separated format
146        lai_df.to_csv(self.file, sep="\t", index=False, header=False)
147        
148        # Construct the second line for the output file containing the column headers
149        second_line = "#chm" + "\t" + "\t".join(columns)
150        
151        # If an ancestry map is available, prepend it to the output file
152        if self.laiobj.ancestry_map is not None:
153            ancestries_codes = list(self.laiobj.ancestry_map.keys()) # Get corresponding codes
154            ancestries = list(self.laiobj.ancestry_map.values()) # Get ancestry names
155            
156            # Create the first line for the ancestry information, detailing subpopulation codes
157            first_line = "#Subpopulation order/codes: " + "\t".join(
158                f"{a}={ancestries_codes[ai]}" for ai, a in enumerate(ancestries)
159            )
160
161            # Open the file for reading and prepend the first line       
162            with open(self.__file, "r+") as f:
163                content = f.read()
164                f.seek(0,0)
165                f.write(first_line.rstrip('\r\n') + '\n' + second_line + '\n' + content)
166
167        log.info(f"Finished writing MSP file to '{self.file}'.")
168
169        return None

A writer class for exporting local ancestry data from a snputils.ancestry.genobj.LocalAncestryObject into an .msp or .msp.tsv file.

MSPWriter( laiobj: snputils.ancestry.genobj.LocalAncestryObject, file: Union[str, pathlib._local.Path]) View Source

20    def __init__(self, laiobj: LocalAncestryObject, file: Union[str, Path]) -> None:
21        """
22        Args:
23            laiobj (LocalAncestryObject):
24                A LocalAncestryObject instance.
25            file (str or pathlib.Path): 
26                Path to the file where the data will be saved. It should end with `.msp` or `.msp.tsv`. 
27                If the provided path does not have one of these extensions, the `.msp` extension will be appended.
28        """
29        self.__laiobj = laiobj
30        self.__file = Path(file)

Arguments:

laiobj (LocalAncestryObject): A LocalAncestryObject instance.
file (str or pathlib.Path): Path to the file where the data will be saved. It should end with .msp or .msp.tsv. If the provided path does not have one of these extensions, the .msp extension will be appended.

laiobj: snputils.ancestry.genobj.LocalAncestryObject View Source

32    @property
33    def laiobj(self) -> LocalAncestryObject:
34        """
35        Retrieve `laiobj`. 
36
37        Returns:
38            **LocalAncestryObject:** 
39                A LocalAncestryObject instance.
40        """
41        return self.__laiobj

Retrieve laiobj.

Returns:

LocalAncestryObject: A LocalAncestryObject instance.

file: pathlib._local.Path View Source

43    @property
44    def file(self) -> Path:
45        """
46        Retrieve `file`.
47
48        Returns:
49            **pathlib.Path:** 
50                Path to the file where the data will be saved. It should end with `.msp` or `.msp.tsv`. 
51                If the provided path does not have one of these extensions, the `.msp` extension will be appended.
52        """
53        return self.__file

Retrieve file.

Returns:

pathlib.Path: Path to the file where the data will be saved. It should end with .msp or .msp.tsv. If the provided path does not have one of these extensions, the .msp extension will be appended.

def write(self) -> None: View Source

 62    def write(self) -> None:
 63        """
 64        Write the data contained in the `laiobj` instance to the specified output `file`. 
 65        If the file already exists, it will be overwritten.
 66
 67        **Output MSP content:**
 68
 69        The output `.msp` file will contain local ancestry assignments for each haplotype across genomic windows.
 70        Each row corresponds to a genomic window and includes the following columns:
 71
 72        - `#chm`: Chromosome numbers corresponding to each genomic window.
 73        - `spos`: Start physical position for each window.
 74        - `epos`: End physical position for each window.
 75        - `sgpos`: Start centimorgan position for each window.
 76        - `egpos`: End centimorgan position for each window.
 77        - `n snps`: Number of SNPs in each genomic window.
 78        - `SampleID.0`: Local ancestry for the first haplotype of the sample for each window.
 79        - `SampleID.1`: Local ancestry for the second haplotype of the sample for each window.
 80        """
 81        log.info(f"LAI object contains: {self.laiobj.n_samples} samples, {self.laiobj.n_ancestries} ancestries.")
 82
 83        # Define the valid file extensions
 84        valid_extensions = ('.msp', '.msp.tsv')
 85
 86        # Append '.msp' extension if not already present
 87        if not self.file.name.endswith(valid_extensions):
 88            self.file = self.file.with_name(self.file.name + '.msp')
 89
 90        # Check if file already exists
 91        if self.file.exists():
 92            warnings.warn(f"File '{self.file}' already exists and will be overwritten.")
 93
 94        # Compute the number of windows and haplotypes
 95        n_windows = self.laiobj.n_windows
 96        n_haplotypes = self.laiobj.n_haplotypes
 97
 98        # Initialize attributes with NaN where they are None
 99        chromosomes = self.laiobj.chromosomes if self.laiobj.chromosomes is not None else np.full(n_windows, np.nan)
100        physical_pos = self.laiobj.physical_pos if self.laiobj.physical_pos is not None else np.full((n_windows, 2), np.nan)
101        centimorgan_pos = self.laiobj.centimorgan_pos if self.laiobj.centimorgan_pos is not None else np.full((n_windows, 2), np.nan)
102        window_sizes = self.laiobj.window_sizes if self.laiobj.window_sizes is not None else np.full(n_windows, np.nan)
103        
104        haplotypes = self.laiobj.haplotypes
105        if haplotypes is None:
106            # Generate haplotypes from samples or default identifiers
107            if self.laiobj.samples is not None:
108                haplotypes = [f"{sample}.{i}" for sample in self.laiobj.samples for i in range(2)]
109                warnings.warn(
110                    "Haplotype data is missing. Haplotypes have been automatically generated "
111                    "from the provided sample identifiers."
112                )
113            else:
114                haplotypes = [f"sample_{i//2}.{i%2}" for i in range(n_haplotypes)]
115                warnings.warn(
116                    "Haplotype data and sample identifiers are missing. Default haplotype identifiers have been generated "
117                    "as `sample_<index>.0` and `sample_<index>.1`."
118                )
119
120        # Prepare columns for the DataFrame
121        columns = ["spos", "epos", "sgpos", "egpos", "n snps"]
122        lai_dic = {
123            "#chm": chromosomes,
124            "spos": physical_pos[:, 0],
125            "epos": physical_pos[:, 1],
126            "sgpos": centimorgan_pos[:, 0],
127            "egpos": centimorgan_pos[:, 1],
128            "n snps": window_sizes,
129        }
130
131        # Populate the dictionary with haplotype data
132        for ilai, haplotype in enumerate(haplotypes):
133            lai_dic[haplotype] = self.laiobj.lai[:, ilai]
134            columns.append(haplotype)
135            
136        # Check if DataFrame is empty
137        if len(lai_dic["#chm"]) == 0:
138            raise ValueError("No data to write: all columns are empty or missing.")
139
140        # Create a DataFrame from the dictionary containing all data
141        lai_df = pd.DataFrame(lai_dic)
142
143        log.info(f"Writing MSP file to '{self.file}'...")
144
145        # Save the DataFrame to the .msp file in tab-separated format
146        lai_df.to_csv(self.file, sep="\t", index=False, header=False)
147        
148        # Construct the second line for the output file containing the column headers
149        second_line = "#chm" + "\t" + "\t".join(columns)
150        
151        # If an ancestry map is available, prepend it to the output file
152        if self.laiobj.ancestry_map is not None:
153            ancestries_codes = list(self.laiobj.ancestry_map.keys()) # Get corresponding codes
154            ancestries = list(self.laiobj.ancestry_map.values()) # Get ancestry names
155            
156            # Create the first line for the ancestry information, detailing subpopulation codes
157            first_line = "#Subpopulation order/codes: " + "\t".join(
158                f"{a}={ancestries_codes[ai]}" for ai, a in enumerate(ancestries)
159            )
160
161            # Open the file for reading and prepend the first line       
162            with open(self.__file, "r+") as f:
163                content = f.read()
164                f.seek(0,0)
165                f.write(first_line.rstrip('\r\n') + '\n' + second_line + '\n' + content)
166
167        log.info(f"Finished writing MSP file to '{self.file}'.")
168
169        return None

Write the data contained in the laiobj instance to the specified output file. If the file already exists, it will be overwritten.

Output MSP content:

The output .msp file will contain local ancestry assignments for each haplotype across genomic windows. Each row corresponds to a genomic window and includes the following columns:

#chm: Chromosome numbers corresponding to each genomic window.
spos: Start physical position for each window.
epos: End physical position for each window.
sgpos: Start centimorgan position for each window.
egpos: End centimorgan position for each window.
n snps: Number of SNPs in each genomic window.
SampleID.0: Local ancestry for the first haplotype of the sample for each window.
SampleID.1: Local ancestry for the second haplotype of the sample for each window.

class AdmixtureMappingVCFWriter: View Source

 15class AdmixtureMappingVCFWriter:
 16    """
 17    A writer class for converting and writing local ancestry data into ancestry-specific 
 18    VCF/BCF files for ADMIXTURE mapping.
 19    """
 20    def __init__(
 21            self, 
 22            laiobj: LocalAncestryObject, 
 23            file: Union[str, Path], 
 24            ancestry_map: Optional[Dict[str, str]] = None
 25        ):
 26        """
 27        Args:
 28            laiobj (LocalAncestryObject): 
 29                A LocalAncestryObject instance.
 30            file (str or pathlib.Path): 
 31                Path to the file where the data will be saved. It should end with `.vcf` or `.bcf`. 
 32                If the provided path does not have one of these extensions, the `.vcf` extension will be appended.
 33            ancestry_map (dict of str to str, optional): 
 34                A dictionary mapping ancestry codes to region names. If not explicitly 
 35                provided, it will default to the `ancestry_map` from `laiobj`.
 36        """
 37        self.__laiobj = laiobj
 38        self.__file = Path(file)
 39        self.__ancestry_map = ancestry_map
 40
 41    @property
 42    def laiobj(self) -> LocalAncestryObject:
 43        """
 44        Retrieve `laiobj`. 
 45
 46        Returns:
 47            **LocalAncestryObject:** 
 48                A LocalAncestryObject instance.
 49        """
 50        return self.__laiobj
 51
 52    @property
 53    def file(self) -> Path:
 54        """
 55        Retrieve `file`.
 56
 57        Returns:
 58            **pathlib.Path:** 
 59                Path to the file where the data will be saved. It should end with `.vcf` or `.bcf`. 
 60                If the provided path does not have one of these extensions, the `.vcf` extension will be appended.
 61        """
 62        return self.__file
 63
 64    @property
 65    def ancestry_map(self) -> Dict[str, str]:
 66        """
 67        Retrieve `ancestry_map`.
 68
 69        Returns:
 70            **dict of str to str:** 
 71                A dictionary mapping ancestry codes to region names. If not explicitly 
 72                provided, it will default to the `ancestry_map` from `laiobj`.
 73        """
 74        if self.__ancestry_map is not None:
 75            return self.__ancestry_map
 76        elif self.laiobj.ancestry_map is not None:
 77            return self.laiobj.ancestry_map
 78        else:
 79            raise ValueError(
 80                "Ancestry mapping is required but missing. Provide `ancestry_map` "
 81                "during initialization or ensure `laiobj.ancestry_map` is set."
 82            )
 83
 84    def write(self) -> None:
 85        """
 86        Write VCF or BCF files for each ancestry type defined in the ancestry map.
 87        If the file already exists, it will be overwritten.
 88
 89        **Output VCF/BCF content:**
 90        
 91        For each ancestry, this method converts LAI data to SNP alleles and writes it in a VCF-compatible format.
 92        SNPs are encoded as follows:
 93
 94        - `1`: Indicates positions that match the specified ancestry.
 95        - `0`: Indicates positions that do not match the specified ancestry.
 96
 97        The VCF/BCF files will contain the following fields:
 98
 99        - `CHROM`: Chromosome for each variant.
100        - `POS`: Chromosomal positions for each variant.
101        - `ID`: Unique identifier for each variant.
102        - `REF`: Reference allele for each variant.
103        - `ALT`: Alternate allele for each variant.
104        - `QUAL`: Phred-scaled quality score for each variant.
105        - `FILTER`: Status indicating whether each SNP passed control checks.
106        - `INFO`: Additional information field. Defaults to `'.'` indicating no extra metadata.
107        - `FORMAT`: Genotype format. Set to `'GT'`, representing the genotype as phased alleles.
108        - `<SampleID>`: One column per sample, containing the genotype data (`1|0`, `0|1`, etc.).
109
110        **Output files:**
111
112        - A separate VCF file is written for each ancestry type, with filenames formatted as:
113        `<filename>_<ancestry>.vcf` (e.g., `output_African.vcf`).
114        """
115        # Process the list of positions to include both the start and end coordinates for each window
116        # Iterate over each ancestry key in the ancestry mapping
117        for key in self.ancestry_map:
118            ancestry = int(key)
119            anc_string = self.ancestry_map[key]
120
121            # Define the output file format, ensuring it has the correct ancestry-specific suffix
122            file_extension = (".vcf", ".bcf")
123            
124            # Check if file has one of the specified extensions
125            if self.file.suffix not in file_extension:
126                # If file does not have the correct extension, default to ".vcf"
127                output_file = self.file.with_name(f"{self.file.stem}_{anc_string}.vcf")
128            else:
129                # If file has the correct extension, insert the ancestry string before the extension
130                output_file = self.file.with_name(f"{self.file.stem}_{anc_string}{self.file.suffix}")
131
132            # Check if file already exists
133            if output_file.exists():
134                warnings.warn(f"File '{output_file}' already exists and will be overwritten.")
135
136            # Format start and end positions for the VCF file
137            if self.laiobj.physical_pos is not None:
138                pos_list = [f"{val1}_{val2}" for val1, val2 in self.laiobj.physical_pos]
139            else:
140                pos_list = None
141
142            # Modify LAI data values to simulate a SNP file
143            # The positions in LAI corresponding to the current ancestry key are mapped to 1, and the rest to 0
144            match = (self.laiobj.lai == ancestry).astype(int)
145            match = match.reshape(len(self.laiobj.lai),int(len(self.laiobj.lai[0])/2), 2 )
146
147            # Set up VCF-related data
148            calldata_gt = match
149            samples = np.array(self.laiobj.samples)
150            variants_chrom = self.laiobj.chromosomes
151            variants_list = [str(i+1) for i in range(len(self.laiobj.lai))]
152            variants_id = np.array(variants_list)
153            variants_ref = np.full(calldata_gt.shape[0], 'A', dtype='U5')
154            variants_alt = np.full(calldata_gt.shape[0], 'T', dtype='U1')
155
156            # Create the SNPObject
157            variant_data_obj = SNPObject(
158                calldata_gt=calldata_gt,
159                samples=samples,
160                variants_chrom=variants_chrom,
161                variants_id=variants_id,
162                variants_ref = variants_ref,
163                variants_alt = variants_alt,
164                variants_pos = pos_list,
165            )
166
167            # Log the start of the VCF file writing process
168            log.info(f"Writing VCF file for ancestry '{anc_string}' to '{output_file}'...")
169
170            # Write the VCF file
171            vcf_writer = VCFWriter(variant_data_obj, output_file)
172            vcf_writer.write()
173
174            log.info(f"Finished writing VCF file for ancestry '{anc_string}' to '{output_file}'.")
175
176        return

A writer class for converting and writing local ancestry data into ancestry-specific VCF/BCF files for ADMIXTURE mapping.

AdmixtureMappingVCFWriter( laiobj: snputils.ancestry.genobj.LocalAncestryObject, file: Union[str, pathlib._local.Path], ancestry_map: Optional[Dict[str, str]] = None) View Source

20    def __init__(
21            self, 
22            laiobj: LocalAncestryObject, 
23            file: Union[str, Path], 
24            ancestry_map: Optional[Dict[str, str]] = None
25        ):
26        """
27        Args:
28            laiobj (LocalAncestryObject): 
29                A LocalAncestryObject instance.
30            file (str or pathlib.Path): 
31                Path to the file where the data will be saved. It should end with `.vcf` or `.bcf`. 
32                If the provided path does not have one of these extensions, the `.vcf` extension will be appended.
33            ancestry_map (dict of str to str, optional): 
34                A dictionary mapping ancestry codes to region names. If not explicitly 
35                provided, it will default to the `ancestry_map` from `laiobj`.
36        """
37        self.__laiobj = laiobj
38        self.__file = Path(file)
39        self.__ancestry_map = ancestry_map

Arguments:

laiobj (LocalAncestryObject): A LocalAncestryObject instance.
file (str or pathlib.Path): Path to the file where the data will be saved. It should end with .vcf or .bcf. If the provided path does not have one of these extensions, the .vcf extension will be appended.
ancestry_map (dict of str to str, optional): A dictionary mapping ancestry codes to region names. If not explicitly provided, it will default to the ancestry_map from laiobj.

laiobj: snputils.ancestry.genobj.LocalAncestryObject View Source

41    @property
42    def laiobj(self) -> LocalAncestryObject:
43        """
44        Retrieve `laiobj`. 
45
46        Returns:
47            **LocalAncestryObject:** 
48                A LocalAncestryObject instance.
49        """
50        return self.__laiobj

Retrieve laiobj.

Returns:

LocalAncestryObject: A LocalAncestryObject instance.

file: pathlib._local.Path View Source

52    @property
53    def file(self) -> Path:
54        """
55        Retrieve `file`.
56
57        Returns:
58            **pathlib.Path:** 
59                Path to the file where the data will be saved. It should end with `.vcf` or `.bcf`. 
60                If the provided path does not have one of these extensions, the `.vcf` extension will be appended.
61        """
62        return self.__file

Retrieve file.

Returns:

pathlib.Path: Path to the file where the data will be saved. It should end with .vcf or .bcf. If the provided path does not have one of these extensions, the .vcf extension will be appended.

ancestry_map: Dict[str, str] View Source

64    @property
65    def ancestry_map(self) -> Dict[str, str]:
66        """
67        Retrieve `ancestry_map`.
68
69        Returns:
70            **dict of str to str:** 
71                A dictionary mapping ancestry codes to region names. If not explicitly 
72                provided, it will default to the `ancestry_map` from `laiobj`.
73        """
74        if self.__ancestry_map is not None:
75            return self.__ancestry_map
76        elif self.laiobj.ancestry_map is not None:
77            return self.laiobj.ancestry_map
78        else:
79            raise ValueError(
80                "Ancestry mapping is required but missing. Provide `ancestry_map` "
81                "during initialization or ensure `laiobj.ancestry_map` is set."
82            )

Retrieve ancestry_map.

Returns:

dict of str to str: A dictionary mapping ancestry codes to region names. If not explicitly provided, it will default to the ancestry_map from laiobj.

def write(self) -> None: View Source

 84    def write(self) -> None:
 85        """
 86        Write VCF or BCF files for each ancestry type defined in the ancestry map.
 87        If the file already exists, it will be overwritten.
 88
 89        **Output VCF/BCF content:**
 90        
 91        For each ancestry, this method converts LAI data to SNP alleles and writes it in a VCF-compatible format.
 92        SNPs are encoded as follows:
 93
 94        - `1`: Indicates positions that match the specified ancestry.
 95        - `0`: Indicates positions that do not match the specified ancestry.
 96
 97        The VCF/BCF files will contain the following fields:
 98
 99        - `CHROM`: Chromosome for each variant.
100        - `POS`: Chromosomal positions for each variant.
101        - `ID`: Unique identifier for each variant.
102        - `REF`: Reference allele for each variant.
103        - `ALT`: Alternate allele for each variant.
104        - `QUAL`: Phred-scaled quality score for each variant.
105        - `FILTER`: Status indicating whether each SNP passed control checks.
106        - `INFO`: Additional information field. Defaults to `'.'` indicating no extra metadata.
107        - `FORMAT`: Genotype format. Set to `'GT'`, representing the genotype as phased alleles.
108        - `<SampleID>`: One column per sample, containing the genotype data (`1|0`, `0|1`, etc.).
109
110        **Output files:**
111
112        - A separate VCF file is written for each ancestry type, with filenames formatted as:
113        `<filename>_<ancestry>.vcf` (e.g., `output_African.vcf`).
114        """
115        # Process the list of positions to include both the start and end coordinates for each window
116        # Iterate over each ancestry key in the ancestry mapping
117        for key in self.ancestry_map:
118            ancestry = int(key)
119            anc_string = self.ancestry_map[key]
120
121            # Define the output file format, ensuring it has the correct ancestry-specific suffix
122            file_extension = (".vcf", ".bcf")
123            
124            # Check if file has one of the specified extensions
125            if self.file.suffix not in file_extension:
126                # If file does not have the correct extension, default to ".vcf"
127                output_file = self.file.with_name(f"{self.file.stem}_{anc_string}.vcf")
128            else:
129                # If file has the correct extension, insert the ancestry string before the extension
130                output_file = self.file.with_name(f"{self.file.stem}_{anc_string}{self.file.suffix}")
131
132            # Check if file already exists
133            if output_file.exists():
134                warnings.warn(f"File '{output_file}' already exists and will be overwritten.")
135
136            # Format start and end positions for the VCF file
137            if self.laiobj.physical_pos is not None:
138                pos_list = [f"{val1}_{val2}" for val1, val2 in self.laiobj.physical_pos]
139            else:
140                pos_list = None
141
142            # Modify LAI data values to simulate a SNP file
143            # The positions in LAI corresponding to the current ancestry key are mapped to 1, and the rest to 0
144            match = (self.laiobj.lai == ancestry).astype(int)
145            match = match.reshape(len(self.laiobj.lai),int(len(self.laiobj.lai[0])/2), 2 )
146
147            # Set up VCF-related data
148            calldata_gt = match
149            samples = np.array(self.laiobj.samples)
150            variants_chrom = self.laiobj.chromosomes
151            variants_list = [str(i+1) for i in range(len(self.laiobj.lai))]
152            variants_id = np.array(variants_list)
153            variants_ref = np.full(calldata_gt.shape[0], 'A', dtype='U5')
154            variants_alt = np.full(calldata_gt.shape[0], 'T', dtype='U1')
155
156            # Create the SNPObject
157            variant_data_obj = SNPObject(
158                calldata_gt=calldata_gt,
159                samples=samples,
160                variants_chrom=variants_chrom,
161                variants_id=variants_id,
162                variants_ref = variants_ref,
163                variants_alt = variants_alt,
164                variants_pos = pos_list,
165            )
166
167            # Log the start of the VCF file writing process
168            log.info(f"Writing VCF file for ancestry '{anc_string}' to '{output_file}'...")
169
170            # Write the VCF file
171            vcf_writer = VCFWriter(variant_data_obj, output_file)
172            vcf_writer.write()
173
174            log.info(f"Finished writing VCF file for ancestry '{anc_string}' to '{output_file}'.")
175
176        return

Write VCF or BCF files for each ancestry type defined in the ancestry map. If the file already exists, it will be overwritten.

Output VCF/BCF content:

For each ancestry, this method converts LAI data to SNP alleles and writes it in a VCF-compatible format. SNPs are encoded as follows:

1: Indicates positions that match the specified ancestry.
0: Indicates positions that do not match the specified ancestry.

The VCF/BCF files will contain the following fields:

CHROM: Chromosome for each variant.
POS: Chromosomal positions for each variant.
ID: Unique identifier for each variant.
REF: Reference allele for each variant.
ALT: Alternate allele for each variant.
QUAL: Phred-scaled quality score for each variant.
FILTER: Status indicating whether each SNP passed control checks.
INFO: Additional information field. Defaults to '.' indicating no extra metadata.
FORMAT: Genotype format. Set to 'GT', representing the genotype as phased alleles.
<SampleID>: One column per sample, containing the genotype data (1|0, 0|1, etc.).

Output files:

A separate VCF file is written for each ancestry type, with filenames formatted as: <filename>_<ancestry>.vcf (e.g., output_African.vcf).