snputils.ancestry.io

1from .local import LAIReader, MSPReader, MSPWriter, AdmixtureMappingVCFWriter, read_lai, read_msp
2from .wide import AdmixtureReader, AdmixtureWriter, read_adm, read_admixture
3
4__all__ = ['read_adm', 'read_admixture', 'read_lai', 'read_msp',
5           'AdmixtureReader', 'AdmixtureWriter', 'LAIReader', 'MSPReader', 'MSPWriter', 'AdmixtureMappingVCFWriter']
def read_adm( Q_file: Union[str, pathlib._local.Path], P_file: Union[str, pathlib._local.Path], sample_file: Union[str, pathlib._local.Path, NoneType] = None, snp_file: Union[str, pathlib._local.Path, NoneType] = None, ancestry_file: Union[str, pathlib._local.Path, NoneType] = None) -> snputils.ancestry.genobj.GlobalAncestryObject:
 8def read_admixture(
 9    Q_file: Union[str, Path],
10    P_file: Union[str, Path],
11    sample_file: Optional[Union[str, Path]] = None,
12    snp_file: Optional[Union[str, Path]] = None,
13    ancestry_file: Optional[Union[str, Path]] = None,
14) -> 'GlobalAncestryObject':
15    """
16    Read ADMIXTURE files into a `snputils.ancestry.genobj.GlobalAncestryObject`.
17
18    Args:
19        Q_file (str or pathlib.Path):
20            Path to the file containing the Q matrix (per-sample ancestry proportions).
21            It should end with .Q or .txt.
22            The file should use space (' ') as the delimiter.
23        P_file (str or pathlib.Path):
24            Path to the file containing the P/F matrix (per-ancestry SNP frequencies).
25            It should end with .P or .txt.
26            The file should use space (' ') as the delimiter.
27        sample_file (str or pathlib.Path, optional):
28            Path to the single-column file containing sample identifiers. 
29            It should end with .fam or .txt.
30            If None, sample identifiers are not loaded.
31        snp_file (str or pathlib.Path, optional):
32            Path to the single-column file containing SNP identifiers. 
33            It should end with .bim or .txt.
34            If None, SNP identifiers are not loaded.
35        ancestry_file (str or pathlib.Path, optional):
36            Path to the single-column file containing ancestry labels for each sample.
37            It should end with .map or .txt.
38            If None, ancestries are not loaded.
39
40    Returns:
41            **GlobalAncestryObject:** 
42                A GlobalAncestryObject instance.
43    """
44    from snputils.ancestry.io.wide.read.admixture import AdmixtureReader
45
46    return AdmixtureReader(
47        Q_file=Q_file,
48        P_file=P_file,
49        sample_id_file=sample_file,
50        snp_file=snp_file,
51        ancestry_file=ancestry_file
52    ).read()

Read ADMIXTURE files into a snputils.ancestry.genobj.GlobalAncestryObject.

Arguments:
  • Q_file (str or pathlib.Path): Path to the file containing the Q matrix (per-sample ancestry proportions). It should end with .Q or .txt. The file should use space (' ') as the delimiter.
  • P_file (str or pathlib.Path): Path to the file containing the P/F matrix (per-ancestry SNP frequencies). It should end with .P or .txt. The file should use space (' ') as the delimiter.
  • sample_file (str or pathlib.Path, optional): Path to the single-column file containing sample identifiers. It should end with .fam or .txt. If None, sample identifiers are not loaded.
  • snp_file (str or pathlib.Path, optional): Path to the single-column file containing SNP identifiers. It should end with .bim or .txt. If None, SNP identifiers are not loaded.
  • ancestry_file (str or pathlib.Path, optional): Path to the single-column file containing ancestry labels for each sample. It should end with .map or .txt. If None, ancestries are not loaded.
Returns:

GlobalAncestryObject: A GlobalAncestryObject instance.

def read_admixture( Q_file: Union[str, pathlib._local.Path], P_file: Union[str, pathlib._local.Path], sample_file: Union[str, pathlib._local.Path, NoneType] = None, snp_file: Union[str, pathlib._local.Path, NoneType] = None, ancestry_file: Union[str, pathlib._local.Path, NoneType] = None) -> snputils.ancestry.genobj.GlobalAncestryObject:
 8def read_admixture(
 9    Q_file: Union[str, Path],
10    P_file: Union[str, Path],
11    sample_file: Optional[Union[str, Path]] = None,
12    snp_file: Optional[Union[str, Path]] = None,
13    ancestry_file: Optional[Union[str, Path]] = None,
14) -> 'GlobalAncestryObject':
15    """
16    Read ADMIXTURE files into a `snputils.ancestry.genobj.GlobalAncestryObject`.
17
18    Args:
19        Q_file (str or pathlib.Path):
20            Path to the file containing the Q matrix (per-sample ancestry proportions).
21            It should end with .Q or .txt.
22            The file should use space (' ') as the delimiter.
23        P_file (str or pathlib.Path):
24            Path to the file containing the P/F matrix (per-ancestry SNP frequencies).
25            It should end with .P or .txt.
26            The file should use space (' ') as the delimiter.
27        sample_file (str or pathlib.Path, optional):
28            Path to the single-column file containing sample identifiers. 
29            It should end with .fam or .txt.
30            If None, sample identifiers are not loaded.
31        snp_file (str or pathlib.Path, optional):
32            Path to the single-column file containing SNP identifiers. 
33            It should end with .bim or .txt.
34            If None, SNP identifiers are not loaded.
35        ancestry_file (str or pathlib.Path, optional):
36            Path to the single-column file containing ancestry labels for each sample.
37            It should end with .map or .txt.
38            If None, ancestries are not loaded.
39
40    Returns:
41            **GlobalAncestryObject:** 
42                A GlobalAncestryObject instance.
43    """
44    from snputils.ancestry.io.wide.read.admixture import AdmixtureReader
45
46    return AdmixtureReader(
47        Q_file=Q_file,
48        P_file=P_file,
49        sample_id_file=sample_file,
50        snp_file=snp_file,
51        ancestry_file=ancestry_file
52    ).read()

Read ADMIXTURE files into a snputils.ancestry.genobj.GlobalAncestryObject.

Arguments:
  • Q_file (str or pathlib.Path): Path to the file containing the Q matrix (per-sample ancestry proportions). It should end with .Q or .txt. The file should use space (' ') as the delimiter.
  • P_file (str or pathlib.Path): Path to the file containing the P/F matrix (per-ancestry SNP frequencies). It should end with .P or .txt. The file should use space (' ') as the delimiter.
  • sample_file (str or pathlib.Path, optional): Path to the single-column file containing sample identifiers. It should end with .fam or .txt. If None, sample identifiers are not loaded.
  • snp_file (str or pathlib.Path, optional): Path to the single-column file containing SNP identifiers. It should end with .bim or .txt. If None, SNP identifiers are not loaded.
  • ancestry_file (str or pathlib.Path, optional): Path to the single-column file containing ancestry labels for each sample. It should end with .map or .txt. If None, ancestries are not loaded.
Returns:

GlobalAncestryObject: A GlobalAncestryObject instance.

def read_lai( file: Union[str, pathlib._local.Path], **kwargs) -> snputils.ancestry.genobj.LocalAncestryObject:
 8def read_lai(file: Union[str, Path], **kwargs) -> LocalAncestryObject:
 9    """
10    Automatically detect the local ancestry data file format from the file's extension and 
11    read it into a `snputils.ancestry.genobj.LocalAncestryObject`.
12
13    **Supported formats:**
14
15    - `.msp`: Text-based MSP format.
16    - `.msp.tsv`: Text-based MSP format with TSV extension.
17    
18    Args:
19        file (str or pathlib.Path): 
20            Path to the file to be read. It should end with `.msp` or `.msp.tsv`.
21        **kwargs: Additional arguments passed to the reader method.
22    """
23    from snputils.ancestry.io.local.read.auto import LAIReader
24
25    return LAIReader(file).read(**kwargs)

Automatically detect the local ancestry data file format from the file's extension and read it into a snputils.ancestry.genobj.LocalAncestryObject.

Supported formats:

  • .msp: Text-based MSP format.
  • .msp.tsv: Text-based MSP format with TSV extension.
Arguments:
  • file (str or pathlib.Path): Path to the file to be read. It should end with .msp or .msp.tsv.
  • **kwargs: Additional arguments passed to the reader method.
def read_msp( file: Union[str, pathlib._local.Path]) -> snputils.ancestry.genobj.LocalAncestryObject:
28def read_msp(file: Union[str, Path]) -> 'LocalAncestryObject':
29    """
30    Read data from an `.msp` or `.msp.tsv` file and construct a `snputils.ancestry.genobj.LocalAncestryObject`.
31
32    Args:
33        file (str or pathlib.Path): 
34            Path to the file to be read. It should end with `.msp` or `.msp.tsv`.
35
36    Returns:
37        **LocalAncestryObject:**
38            A LocalAncestryObject instance.
39    """
40    from snputils.ancestry.io.local.read.msp import MSPReader
41
42    return MSPReader(file).read()

Read data from an .msp or .msp.tsv file and construct a snputils.ancestry.genobj.LocalAncestryObject.

Arguments:
  • file (str or pathlib.Path): Path to the file to be read. It should end with .msp or .msp.tsv.
Returns:

LocalAncestryObject: A LocalAncestryObject instance.

class AdmixtureReader(snputils.ancestry.io.wide.read.base.WideBaseReader):
 13class AdmixtureReader(WideBaseReader):
 14    """
 15    A reader class for parsing ADMIXTURE files and constructing a `snputils.ancestry.genobj.GlobalAncestryObject`.
 16    """
 17    def __init__(
 18        self,
 19        Q_file: Union[str, Path],
 20        P_file: Union[str, Path],
 21        sample_file: Optional[Union[str, Path]] = None,
 22        snp_file: Optional[Union[str, Path]] = None,
 23        ancestry_file: Optional[Union[str, Path]] = None,
 24    ) -> None:
 25        """
 26        Args:
 27            Q_file (str or pathlib.Path):
 28                Path to the file containing the Q matrix (per-sample ancestry proportions).
 29                It should end with .Q or .txt.
 30                The file should use space (' ') as the delimiter.
 31            P_file (str or pathlib.Path):
 32                Path to the file containing the P/F matrix (per-ancestry SNP frequencies).
 33                It should end with .P or .txt.
 34                The file should use space (' ') as the delimiter.
 35            sample_file (str or pathlib.Path, optional):
 36                Path to the single-column file containing sample identifiers. 
 37                It should end with .fam or .txt.
 38                If None, sample identifiers are not loaded.
 39            snp_file (str or pathlib.Path, optional):
 40                Path to the single-column file containing SNP identifiers. 
 41                It should end with .bim or .txt.
 42                If None, SNP identifiers are not loaded.
 43            ancestry_file (str or pathlib.Path, optional):
 44                Path to the single-column file containing ancestry labels for each sample.
 45                It should end with .map or .txt.
 46                If None, ancestries are not loaded.
 47        """
 48        self.__Q_file = Path(Q_file)
 49        self.__P_file = Path(P_file)
 50        self.__sample_file = Path(sample_file) if sample_file is not None else None
 51        self.__snp_file = Path(snp_file) if snp_file is not None else None
 52        self.__ancestry_file = Path(ancestry_file) if ancestry_file is not None else None
 53
 54    @property
 55    def Q_file(self) -> Path:
 56        """
 57        Retrieve Q_file.
 58
 59        Returns:
 60            **pathlib.Path:** 
 61                Path to the file containing the Q matrix (per-sample ancestry proportions).
 62                It should end with .Q or .txt.
 63                The file should use space (' ') as the delimiter.
 64        """
 65        return self.__Q_file
 66
 67    @property
 68    def P_file(self) -> Path:
 69        """
 70        Retrieve P_file.
 71
 72        Returns:
 73            **pathlib.Path:** 
 74                Path to the file containing the P/F matrix (per-ancestry SNP frequencies).
 75                It should end with .P or .txt.
 76                The file should use space (' ') as the delimiter.
 77        """
 78        return self.__P_file
 79
 80    @property
 81    def sample_file(self) -> Optional[Path]:
 82        """
 83        Retrieve sample_file.
 84
 85        Returns:
 86            **pathlib.Path:** 
 87                Path to the single-column file containing sample identifiers. 
 88                It should end with .fam or .txt.
 89                If None, sample identifiers are not loaded.
 90        """
 91        return self.__sample_file
 92    
 93    @property
 94    def snp_file(self) -> Optional[Path]:
 95        """
 96        Retrieve snp_file.
 97
 98        Returns:
 99            **pathlib.Path:** 
100                Path to the single-column file containing SNP identifiers. 
101                It should end with .bim or .txt.
102                If None, SNP identifiers are not loaded.
103        """
104        return self.__snp_file
105
106    @property
107    def ancestry_file(self) -> Optional[Path]:
108        """
109        Retrieve ancestry_file.
110
111        Returns:
112            **pathlib.Path:** 
113                Path to the single-column file containing ancestry labels for each sample.
114                It should end with .map or .txt.
115                If None, ancestries are not loaded.
116        """
117        return self.__ancestry_file
118
119    def read(self) -> 'GlobalAncestryObject':
120        """
121        Read data from the provided ADMIXTURE files and construct a 
122        snputils.ancestry.genobj.GlobalAncestryObject instance.
123
124        **Expected ADMIXTURE files content:**
125
126        - **Q_file**: 
127            A text file containing the Q matrix with per-sample ancestry proportions. 
128             Each row corresponds to a sample, and each column corresponds to an ancestry.
129        - **P_file**: 
130            A text file containing the P matrix with per-ancestry SNP frequencies.
131            Each row corresponds to a SNP, and each column corresponds to an ancestry.
132
133        Optional files (if provided):
134        - **sample_file**: A single-column text file containing sample identifiers in order.
135        - **snp_file**: A single-column text file containing SNP identifiers in order.
136        - **ancestry_file**: A single-column text file containing ancestry labels for each sample.
137
138        Returns:
139            **GlobalAncestryObject:** 
140                A GlobalAncestryObject instance.
141        """
142        log.info(f"Reading Q matrix from '{self.Q_file}'...")
143        Q_mat = np.genfromtxt(self.Q_file, delimiter=' ')
144        log.info(f"Reading P matrix from '{self.P_file}'...")
145        P_mat = np.genfromtxt(self.P_file, delimiter=' ')
146
147        samples = self._read_sample_ids()
148        snps = self._read_snps()
149        ancestries = self._read_ancestries()
150
151        return GlobalAncestryObject(
152            Q_mat,
153            P_mat,
154            samples=samples,
155            snps=snps,
156            ancestries=ancestries
157        )

A reader class for parsing ADMIXTURE files and constructing a snputils.ancestry.genobj.GlobalAncestryObject.

AdmixtureReader( Q_file: Union[str, pathlib._local.Path], P_file: Union[str, pathlib._local.Path], sample_file: Union[str, pathlib._local.Path, NoneType] = None, snp_file: Union[str, pathlib._local.Path, NoneType] = None, ancestry_file: Union[str, pathlib._local.Path, NoneType] = None)
17    def __init__(
18        self,
19        Q_file: Union[str, Path],
20        P_file: Union[str, Path],
21        sample_file: Optional[Union[str, Path]] = None,
22        snp_file: Optional[Union[str, Path]] = None,
23        ancestry_file: Optional[Union[str, Path]] = None,
24    ) -> None:
25        """
26        Args:
27            Q_file (str or pathlib.Path):
28                Path to the file containing the Q matrix (per-sample ancestry proportions).
29                It should end with .Q or .txt.
30                The file should use space (' ') as the delimiter.
31            P_file (str or pathlib.Path):
32                Path to the file containing the P/F matrix (per-ancestry SNP frequencies).
33                It should end with .P or .txt.
34                The file should use space (' ') as the delimiter.
35            sample_file (str or pathlib.Path, optional):
36                Path to the single-column file containing sample identifiers. 
37                It should end with .fam or .txt.
38                If None, sample identifiers are not loaded.
39            snp_file (str or pathlib.Path, optional):
40                Path to the single-column file containing SNP identifiers. 
41                It should end with .bim or .txt.
42                If None, SNP identifiers are not loaded.
43            ancestry_file (str or pathlib.Path, optional):
44                Path to the single-column file containing ancestry labels for each sample.
45                It should end with .map or .txt.
46                If None, ancestries are not loaded.
47        """
48        self.__Q_file = Path(Q_file)
49        self.__P_file = Path(P_file)
50        self.__sample_file = Path(sample_file) if sample_file is not None else None
51        self.__snp_file = Path(snp_file) if snp_file is not None else None
52        self.__ancestry_file = Path(ancestry_file) if ancestry_file is not None else None
Arguments:
  • Q_file (str or pathlib.Path): Path to the file containing the Q matrix (per-sample ancestry proportions). It should end with .Q or .txt. The file should use space (' ') as the delimiter.
  • P_file (str or pathlib.Path): Path to the file containing the P/F matrix (per-ancestry SNP frequencies). It should end with .P or .txt. The file should use space (' ') as the delimiter.
  • sample_file (str or pathlib.Path, optional): Path to the single-column file containing sample identifiers. It should end with .fam or .txt. If None, sample identifiers are not loaded.
  • snp_file (str or pathlib.Path, optional): Path to the single-column file containing SNP identifiers. It should end with .bim or .txt. If None, SNP identifiers are not loaded.
  • ancestry_file (str or pathlib.Path, optional): Path to the single-column file containing ancestry labels for each sample. It should end with .map or .txt. If None, ancestries are not loaded.
Q_file: pathlib._local.Path
54    @property
55    def Q_file(self) -> Path:
56        """
57        Retrieve Q_file.
58
59        Returns:
60            **pathlib.Path:** 
61                Path to the file containing the Q matrix (per-sample ancestry proportions).
62                It should end with .Q or .txt.
63                The file should use space (' ') as the delimiter.
64        """
65        return self.__Q_file

Retrieve Q_file.

Returns:

pathlib.Path: Path to the file containing the Q matrix (per-sample ancestry proportions). It should end with .Q or .txt. The file should use space (' ') as the delimiter.

P_file: pathlib._local.Path
67    @property
68    def P_file(self) -> Path:
69        """
70        Retrieve P_file.
71
72        Returns:
73            **pathlib.Path:** 
74                Path to the file containing the P/F matrix (per-ancestry SNP frequencies).
75                It should end with .P or .txt.
76                The file should use space (' ') as the delimiter.
77        """
78        return self.__P_file

Retrieve P_file.

Returns:

pathlib.Path: Path to the file containing the P/F matrix (per-ancestry SNP frequencies). It should end with .P or .txt. The file should use space (' ') as the delimiter.

sample_file: Optional[pathlib._local.Path]
80    @property
81    def sample_file(self) -> Optional[Path]:
82        """
83        Retrieve sample_file.
84
85        Returns:
86            **pathlib.Path:** 
87                Path to the single-column file containing sample identifiers. 
88                It should end with .fam or .txt.
89                If None, sample identifiers are not loaded.
90        """
91        return self.__sample_file

Retrieve sample_file.

Returns:

pathlib.Path: Path to the single-column file containing sample identifiers. It should end with .fam or .txt. If None, sample identifiers are not loaded.

snp_file: Optional[pathlib._local.Path]
 93    @property
 94    def snp_file(self) -> Optional[Path]:
 95        """
 96        Retrieve snp_file.
 97
 98        Returns:
 99            **pathlib.Path:** 
100                Path to the single-column file containing SNP identifiers. 
101                It should end with .bim or .txt.
102                If None, SNP identifiers are not loaded.
103        """
104        return self.__snp_file

Retrieve snp_file.

Returns:

pathlib.Path: Path to the single-column file containing SNP identifiers. It should end with .bim or .txt. If None, SNP identifiers are not loaded.

ancestry_file: Optional[pathlib._local.Path]
106    @property
107    def ancestry_file(self) -> Optional[Path]:
108        """
109        Retrieve ancestry_file.
110
111        Returns:
112            **pathlib.Path:** 
113                Path to the single-column file containing ancestry labels for each sample.
114                It should end with .map or .txt.
115                If None, ancestries are not loaded.
116        """
117        return self.__ancestry_file

Retrieve ancestry_file.

Returns:

pathlib.Path: Path to the single-column file containing ancestry labels for each sample. It should end with .map or .txt. If None, ancestries are not loaded.

def read(self) -> snputils.ancestry.genobj.GlobalAncestryObject:
119    def read(self) -> 'GlobalAncestryObject':
120        """
121        Read data from the provided ADMIXTURE files and construct a 
122        snputils.ancestry.genobj.GlobalAncestryObject instance.
123
124        **Expected ADMIXTURE files content:**
125
126        - **Q_file**: 
127            A text file containing the Q matrix with per-sample ancestry proportions. 
128             Each row corresponds to a sample, and each column corresponds to an ancestry.
129        - **P_file**: 
130            A text file containing the P matrix with per-ancestry SNP frequencies.
131            Each row corresponds to a SNP, and each column corresponds to an ancestry.
132
133        Optional files (if provided):
134        - **sample_file**: A single-column text file containing sample identifiers in order.
135        - **snp_file**: A single-column text file containing SNP identifiers in order.
136        - **ancestry_file**: A single-column text file containing ancestry labels for each sample.
137
138        Returns:
139            **GlobalAncestryObject:** 
140                A GlobalAncestryObject instance.
141        """
142        log.info(f"Reading Q matrix from '{self.Q_file}'...")
143        Q_mat = np.genfromtxt(self.Q_file, delimiter=' ')
144        log.info(f"Reading P matrix from '{self.P_file}'...")
145        P_mat = np.genfromtxt(self.P_file, delimiter=' ')
146
147        samples = self._read_sample_ids()
148        snps = self._read_snps()
149        ancestries = self._read_ancestries()
150
151        return GlobalAncestryObject(
152            Q_mat,
153            P_mat,
154            samples=samples,
155            snps=snps,
156            ancestries=ancestries
157        )

Read data from the provided ADMIXTURE files and construct a snputils.ancestry.genobj.GlobalAncestryObject instance.

Expected ADMIXTURE files content:

  • Q_file: A text file containing the Q matrix with per-sample ancestry proportions. Each row corresponds to a sample, and each column corresponds to an ancestry.
  • P_file: A text file containing the P matrix with per-ancestry SNP frequencies. Each row corresponds to a SNP, and each column corresponds to an ancestry.

Optional files (if provided):

  • sample_file: A single-column text file containing sample identifiers in order.
  • snp_file: A single-column text file containing SNP identifiers in order.
  • ancestry_file: A single-column text file containing ancestry labels for each sample.
Returns:

GlobalAncestryObject: A GlobalAncestryObject instance.

class AdmixtureWriter(snputils.ancestry.io.wide.write.base.WideBaseWriter):
 13class AdmixtureWriter(WideBaseWriter):
 14    """
 15    A writer class for exporting global ancestry data from a 
 16    `snputils.ancestry.genobj.GlobalAncestryObject` into multiple ADMIXTURE files.
 17    """
 18    def __init__(
 19        self, 
 20        wideobj: GlobalAncestryObject, 
 21        file_prefix: Union[str, Path]
 22    ) -> None:
 23        """
 24        Args:
 25            wideobj (GlobalAncestryObject): 
 26                A GlobalAncestryObject instance.
 27            file_prefix (str or pathlib.Path): 
 28                Prefix for output file names, including directory path but excluding file extensions. 
 29                The prefix is used to generate specific file names for each output, with file-specific 
 30                suffixes appended as described above (e.g., `file_prefix.n_ancestries.Q` for the Q matrix file).
 31        """
 32        super(AdmixtureWriter, self).__init__(wideobj, file_prefix)
 33        self.__Q_file = self.file_prefix.with_suffix(f".{self.wideobj.n_ancestries}.Q")
 34        self.__P_file = self.file_prefix.with_suffix(f".{self.wideobj.n_ancestries}.P")
 35
 36        self.__sample_file = self.file_prefix.with_suffix(".sample_ids.txt") if self.wideobj.samples is not None else None
 37        self.__snp_file = self.file_prefix.with_suffix(".snp_ids.txt") if self.wideobj.snps is not None else None
 38        self.__ancestry_file = self.file_prefix.with_suffix(".map") if self.wideobj.ancestries is not None else None
 39
 40    @property
 41    def wideobj(self) -> GlobalAncestryObject:
 42        """
 43        Retrieve `wideobj`.
 44
 45        Returns:
 46            **GlobalAncestryObject:** A GlobalAncestryObject instance.
 47        """
 48        return self.__wideobj
 49
 50    @property
 51    def file_prefix(self) -> Path:
 52        """
 53        Retrieve `file_prefix`.
 54
 55        Returns:
 56            **pathlib.Path:** 
 57                Prefix for output file names, including directory path but excluding file extensions. 
 58                The prefix is used to generate specific file names for each output, with file-specific 
 59                suffixes appended as described above (e.g., `file_prefix.n_ancestries.Q` for the Q matrix file).
 60        """
 61        return self.__file_prefix
 62
 63    @property
 64    def Q_file(self) -> Path:
 65        """
 66        Retrieve `Q_file`.
 67
 68        Returns:
 69            **pathlib.Path:** 
 70                Path to the `.Q` file that will store the Q matrix (per-sample ancestry proportions).
 71        """
 72        return self.__Q_file
 73    
 74    @property
 75    def P_file(self) -> Path:
 76        """
 77        Retrieve `P_file`.
 78
 79        Returns:
 80            **pathlib.Path:** 
 81                Path to the `.P` file that will store the P/F matrix (per-ancestry SNP frequencies).
 82        """
 83        return self.__P_file
 84    
 85    @property
 86    def sample_file(self) -> Optional[Path]:
 87        """
 88        Retrieve `sample_file`.
 89
 90        Returns:
 91            **pathlib.Path:** 
 92                Path to the `.txt` the file that will store sample identifiers. 
 93                If None, sample identifiers are not saved.
 94        """
 95        return self.__sample_file
 96    
 97    @property
 98    def snp_file(self) -> Optional[Path]:
 99        """
100        Retrieve `snp_file`.
101
102        Returns:
103            **pathlib.Path:** 
104                Path to the `.txt` file that will store SNP identifiers. 
105                If None, SNP identifiers are not saved.
106        """
107        return self.__snp_file
108    
109    @property
110    def ancestry_file(self) -> Optional[Path]:
111        """
112        Retrieve `ancestry_file`.
113
114        Returns:
115            **pathlib.Path:** 
116                Path to the `.map` file that will store ancestry labels for each sample. 
117                If None, ancestries are not saved.
118        """
119        return self.__ancestry_file
120
121    def _write_Q(self):
122        log.info(f"Writing Q matrix to '{self.Q_file}'...")
123        np.savetxt(self.Q_file, self.wideobj.Q, delimiter=" ")
124        log.info(f"Finished writing Q matrix to '{self.Q_file}'.")
125
126    def _write_P(self):
127        log.info(f"Writing P matrix to '{self.P_file}'...")
128        np.savetxt(self.P_file, self.wideobj.P, delimiter=" ")
129        log.info(f"Finished writing P matrix to '{self.P_file}'.")
130
131    def _write_sample_ids(self):
132        if self.wideobj.samples is not None:
133            log.info(f"Writing sample IDs to '{self.sample_file}'...")
134            np.savetxt(self.sample_file, self.wideobj.samples, fmt="%s")
135            log.info(f"Finished writing sample IDs to '{self.sample_file}'.")
136
137    def _write_snps(self):
138        if self.wideobj.snps is not None:
139            log.info(f"Writing SNP IDs to '{self.snp_file}'...")
140            np.savetxt(self.snp_file, self.wideobj.snps, fmt="%s")
141            log.info(f"Finished writing SNP IDs to '{self.snp_file}'.")
142
143    def _write_ancestries(self):
144        if self.wideobj.ancestries is not None:
145            log.info(f"Writing ancestry information to '{self.ancestry_file}'...")
146            np.savetxt(self.ancestry_file, self.wideobj.ancestries, fmt="%s")
147            log.info(f"Finished writing ancestry information to '{self.ancestry_file}'.")
148
149    def write(self) -> None:
150        """
151        Write the data contained in the `wideobj` instance into the multiple ADMIXTURE files
152        with the specified `file_prefix`. If the files already exist, they will be overwritten.
153
154        **Output files:**
155
156        - `<file_prefix>.K.Q`: Q matrix file. The file uses space (' ') as the delimiter.
157        - `<file_prefix>.K.P`: P matrix file. The file uses space (' ') as the delimiter.
158        - `<file_prefix>.sample_ids.txt`: Sample IDs file (if sample IDs are available).
159        - `<file_prefix>.snp_ids.txt`: SNP IDs file (if SNP IDs are available).
160        - `<file_prefix>.map`: Ancestry file (if ancestries information is available).
161
162        where `K` is the total number of ancestries.
163        """
164        log.info(f"Preparing to write ADMIXTURE files with prefix '{self.file_prefix}'...")
165        
166        self.file_prefix.parent.mkdir(parents=True, exist_ok=True)
167        
168        self._write_Q()
169        self._write_P()
170        self._write_sample_ids()
171        self._write_snps()
172        self._write_ancestries()
173
174        log.info(f"Finished writing all ADMIXTURE files with prefix '{self.file_prefix}'.")

A writer class for exporting global ancestry data from a snputils.ancestry.genobj.GlobalAncestryObject into multiple ADMIXTURE files.

AdmixtureWriter( wideobj: snputils.ancestry.genobj.GlobalAncestryObject, file_prefix: Union[str, pathlib._local.Path])
18    def __init__(
19        self, 
20        wideobj: GlobalAncestryObject, 
21        file_prefix: Union[str, Path]
22    ) -> None:
23        """
24        Args:
25            wideobj (GlobalAncestryObject): 
26                A GlobalAncestryObject instance.
27            file_prefix (str or pathlib.Path): 
28                Prefix for output file names, including directory path but excluding file extensions. 
29                The prefix is used to generate specific file names for each output, with file-specific 
30                suffixes appended as described above (e.g., `file_prefix.n_ancestries.Q` for the Q matrix file).
31        """
32        super(AdmixtureWriter, self).__init__(wideobj, file_prefix)
33        self.__Q_file = self.file_prefix.with_suffix(f".{self.wideobj.n_ancestries}.Q")
34        self.__P_file = self.file_prefix.with_suffix(f".{self.wideobj.n_ancestries}.P")
35
36        self.__sample_file = self.file_prefix.with_suffix(".sample_ids.txt") if self.wideobj.samples is not None else None
37        self.__snp_file = self.file_prefix.with_suffix(".snp_ids.txt") if self.wideobj.snps is not None else None
38        self.__ancestry_file = self.file_prefix.with_suffix(".map") if self.wideobj.ancestries is not None else None
Arguments:
  • wideobj (GlobalAncestryObject): A GlobalAncestryObject instance.
  • file_prefix (str or pathlib.Path): Prefix for output file names, including directory path but excluding file extensions. The prefix is used to generate specific file names for each output, with file-specific suffixes appended as described above (e.g., file_prefix.n_ancestries.Q for the Q matrix file).
40    @property
41    def wideobj(self) -> GlobalAncestryObject:
42        """
43        Retrieve `wideobj`.
44
45        Returns:
46            **GlobalAncestryObject:** A GlobalAncestryObject instance.
47        """
48        return self.__wideobj

Retrieve wideobj.

Returns:

GlobalAncestryObject: A GlobalAncestryObject instance.

file_prefix: pathlib._local.Path
50    @property
51    def file_prefix(self) -> Path:
52        """
53        Retrieve `file_prefix`.
54
55        Returns:
56            **pathlib.Path:** 
57                Prefix for output file names, including directory path but excluding file extensions. 
58                The prefix is used to generate specific file names for each output, with file-specific 
59                suffixes appended as described above (e.g., `file_prefix.n_ancestries.Q` for the Q matrix file).
60        """
61        return self.__file_prefix

Retrieve file_prefix.

Returns:

pathlib.Path: Prefix for output file names, including directory path but excluding file extensions. The prefix is used to generate specific file names for each output, with file-specific suffixes appended as described above (e.g., file_prefix.n_ancestries.Q for the Q matrix file).

Q_file: pathlib._local.Path
63    @property
64    def Q_file(self) -> Path:
65        """
66        Retrieve `Q_file`.
67
68        Returns:
69            **pathlib.Path:** 
70                Path to the `.Q` file that will store the Q matrix (per-sample ancestry proportions).
71        """
72        return self.__Q_file

Retrieve Q_file.

Returns:

pathlib.Path: Path to the .Q file that will store the Q matrix (per-sample ancestry proportions).

P_file: pathlib._local.Path
74    @property
75    def P_file(self) -> Path:
76        """
77        Retrieve `P_file`.
78
79        Returns:
80            **pathlib.Path:** 
81                Path to the `.P` file that will store the P/F matrix (per-ancestry SNP frequencies).
82        """
83        return self.__P_file

Retrieve P_file.

Returns:

pathlib.Path: Path to the .P file that will store the P/F matrix (per-ancestry SNP frequencies).

sample_file: Optional[pathlib._local.Path]
85    @property
86    def sample_file(self) -> Optional[Path]:
87        """
88        Retrieve `sample_file`.
89
90        Returns:
91            **pathlib.Path:** 
92                Path to the `.txt` the file that will store sample identifiers. 
93                If None, sample identifiers are not saved.
94        """
95        return self.__sample_file

Retrieve sample_file.

Returns:

pathlib.Path: Path to the .txt the file that will store sample identifiers. If None, sample identifiers are not saved.

snp_file: Optional[pathlib._local.Path]
 97    @property
 98    def snp_file(self) -> Optional[Path]:
 99        """
100        Retrieve `snp_file`.
101
102        Returns:
103            **pathlib.Path:** 
104                Path to the `.txt` file that will store SNP identifiers. 
105                If None, SNP identifiers are not saved.
106        """
107        return self.__snp_file

Retrieve snp_file.

Returns:

pathlib.Path: Path to the .txt file that will store SNP identifiers. If None, SNP identifiers are not saved.

ancestry_file: Optional[pathlib._local.Path]
109    @property
110    def ancestry_file(self) -> Optional[Path]:
111        """
112        Retrieve `ancestry_file`.
113
114        Returns:
115            **pathlib.Path:** 
116                Path to the `.map` file that will store ancestry labels for each sample. 
117                If None, ancestries are not saved.
118        """
119        return self.__ancestry_file

Retrieve ancestry_file.

Returns:

pathlib.Path: Path to the .map file that will store ancestry labels for each sample. If None, ancestries are not saved.

def write(self) -> None:
149    def write(self) -> None:
150        """
151        Write the data contained in the `wideobj` instance into the multiple ADMIXTURE files
152        with the specified `file_prefix`. If the files already exist, they will be overwritten.
153
154        **Output files:**
155
156        - `<file_prefix>.K.Q`: Q matrix file. The file uses space (' ') as the delimiter.
157        - `<file_prefix>.K.P`: P matrix file. The file uses space (' ') as the delimiter.
158        - `<file_prefix>.sample_ids.txt`: Sample IDs file (if sample IDs are available).
159        - `<file_prefix>.snp_ids.txt`: SNP IDs file (if SNP IDs are available).
160        - `<file_prefix>.map`: Ancestry file (if ancestries information is available).
161
162        where `K` is the total number of ancestries.
163        """
164        log.info(f"Preparing to write ADMIXTURE files with prefix '{self.file_prefix}'...")
165        
166        self.file_prefix.parent.mkdir(parents=True, exist_ok=True)
167        
168        self._write_Q()
169        self._write_P()
170        self._write_sample_ids()
171        self._write_snps()
172        self._write_ancestries()
173
174        log.info(f"Finished writing all ADMIXTURE files with prefix '{self.file_prefix}'.")

Write the data contained in the wideobj instance into the multiple ADMIXTURE files with the specified file_prefix. If the files already exist, they will be overwritten.

Output files:

  • <file_prefix>.K.Q: Q matrix file. The file uses space (' ') as the delimiter.
  • <file_prefix>.K.P: P matrix file. The file uses space (' ') as the delimiter.
  • <file_prefix>.sample_ids.txt: Sample IDs file (if sample IDs are available).
  • <file_prefix>.snp_ids.txt: SNP IDs file (if SNP IDs are available).
  • <file_prefix>.map: Ancestry file (if ancestries information is available).

where K is the total number of ancestries.

class LAIReader:
 8class LAIReader:
 9    def __new__(
10        cls,
11        file: Union[str, Path]
12    ) -> object:
13        """
14        A factory class that automatically detects the local ancestry data file format from the 
15        file's extension and returns the corresponding reader object.
16
17        **Supported formats:**
18
19        - `.msp`: Text-based MSP format.
20        - `.msp.tsv`: Text-based MSP format with TSV extension.
21
22        Args:
23            file (str or pathlib.Path): 
24                Path to the file to be read. It should end with `.msp` or `.msp.tsv`.
25        
26        Returns:
27            **object:** A reader object corresponding to the file format (e.g., `MSPReader`).
28        """
29        file = Path(file)
30        suffixes = [suffix.lower() for suffix in file.suffixes]
31        if not suffixes:
32            raise ValueError("The file must have an extension. Supported extensions are: .msp, .msp.tsv.")
33
34        if suffixes[-2:] == ['.msp', '.tsv'] or suffixes[-1] == '.msp':
35            from snputils.ancestry.io.local.read.msp import MSPReader
36
37            return MSPReader(file)
38        else:
39            raise ValueError(
40                f"Unsupported file extension: {suffixes[-1]}. "
41                "Supported extensions are: .msp, .msp.tsv."
42            )
LAIReader(file: Union[str, pathlib._local.Path])
 9    def __new__(
10        cls,
11        file: Union[str, Path]
12    ) -> object:
13        """
14        A factory class that automatically detects the local ancestry data file format from the 
15        file's extension and returns the corresponding reader object.
16
17        **Supported formats:**
18
19        - `.msp`: Text-based MSP format.
20        - `.msp.tsv`: Text-based MSP format with TSV extension.
21
22        Args:
23            file (str or pathlib.Path): 
24                Path to the file to be read. It should end with `.msp` or `.msp.tsv`.
25        
26        Returns:
27            **object:** A reader object corresponding to the file format (e.g., `MSPReader`).
28        """
29        file = Path(file)
30        suffixes = [suffix.lower() for suffix in file.suffixes]
31        if not suffixes:
32            raise ValueError("The file must have an extension. Supported extensions are: .msp, .msp.tsv.")
33
34        if suffixes[-2:] == ['.msp', '.tsv'] or suffixes[-1] == '.msp':
35            from snputils.ancestry.io.local.read.msp import MSPReader
36
37            return MSPReader(file)
38        else:
39            raise ValueError(
40                f"Unsupported file extension: {suffixes[-1]}. "
41                "Supported extensions are: .msp, .msp.tsv."
42            )

A factory class that automatically detects the local ancestry data file format from the file's extension and returns the corresponding reader object.

Supported formats:

  • .msp: Text-based MSP format.
  • .msp.tsv: Text-based MSP format with TSV extension.
Arguments:
  • file (str or pathlib.Path): Path to the file to be read. It should end with .msp or .msp.tsv.
Returns:

object: A reader object corresponding to the file format (e.g., MSPReader).

class MSPReader(snputils.ancestry.io.local.read.base.LAIBaseReader):
 16class MSPReader(LAIBaseReader):
 17    """
 18    A reader class for parsing Local Ancestry Inference (LAI) data from an `.msp` or `msp.tsv` file
 19    and constructing a `snputils.ancestry.genobj.LocalAncestryObject`.
 20    """
 21    def __init__(self, file: Union[str, Path]) -> None:
 22        """
 23        Args:
 24            file (str or pathlib.Path): 
 25                Path to the file to be read. It should end with `.msp` or `.msp.tsv`.
 26        """
 27        self.__file = Path(file)
 28
 29    @property
 30    def file(self) -> Path:
 31        """
 32        Retrieve `file`.
 33
 34        Returns:
 35            **pathlib.Path:** 
 36                Path to the file to be read. It should end with `.msp` or `.msp.tsv`.
 37        """
 38        return self.__file
 39
 40    def _get_samples(self, msp_df: pd.DataFrame, first_lai_col_indx: int) -> List[str]:
 41        """
 42        Extract unique sample identifiers from the pandas DataFrame.
 43
 44        Args:
 45            msp_df (pd.DataFrame): 
 46                The DataFrame representing the `.msp` data, including LAI columns.
 47            first_lai_col_indx (int): 
 48                Index of the first column containing LAI data.
 49
 50        Returns:
 51            **list:** List of unique sample identifiers.
 52        """
 53        # Get all columns starting from the first LAI data column
 54        query_samples_dub = msp_df.columns[first_lai_col_indx:]
 55
 56        # Select only one of the maternal/paternal samples by taking every second sample
 57        single_ind_idx = np.arange(0, len(query_samples_dub), 2)
 58        query_samples_sing = query_samples_dub[single_ind_idx]
 59
 60        # Remove the suffix from sample names to get clean identifiers
 61        query_samples = [qs[:-2] for qs in query_samples_sing]
 62
 63        return query_samples
 64
 65    def _get_ancestry_map_from_comment(self, comment: str) -> Dict[str, str]:
 66        """
 67        Construct an ancestry map from the comment line of the `.msp` file.
 68
 69        This method parses the comment string to create a mapping of ancestry numerical identifiers 
 70        to their corresponding ancestry names (e.g., '0': 'African').
 71
 72        Args:
 73            comment (str): 
 74                The comment line containing ancestry mapping information.
 75
 76        Returns:
 77            dict: A dictionary mapping ancestry codes (as strings) to ancestry names.
 78        """
 79        comment = comment.strip()
 80
 81        # Remove everything before the colon, if present
 82        if ':' in comment:
 83            comment = comment.split(':', 1)[1].strip()
 84
 85        ancestry_map: Dict[str, str] = {}
 86
 87        # Split on tabs, spaces, commas, semicolons or any combination of them
 88        tokens = [tok.strip() for tok in re.split(r'[,\t; ]+', comment) if tok]
 89
 90        for tok in tokens:
 91            if '=' not in tok:
 92                continue  # Skip invalid pieces
 93
 94            left, right = (p.strip() for p in tok.split('=', 1))
 95
 96            # Detect whether format is "Pop=0" or "0=Pop"
 97            if left.isdigit() and not right.isdigit():
 98                ancestry_map[left] = right       # 0=Africa
 99            elif right.isdigit() and not left.isdigit():
100                ancestry_map[right] = left       # Africa=0
101            else:
102                # Fallback (if both sides are digits or both are pops, keep left as code)
103                ancestry_map[left] = right
104
105        return ancestry_map
106
107    def _replace_nan_with_none(self, array: Optional[np.ndarray]) -> Optional[np.ndarray]:
108        """
109        Replace arrays that are fully NaN with `None`.
110
111        Args:
112            array (np.ndarray): Array to check.
113
114        Returns:
115            Optional[np.ndarray]: Returns `None` if the array is fully NaN, otherwise returns the original array.
116        """
117        if array is not None:
118            if array.size == 0:  # Check if the array is empty
119                return None
120            if np.issubdtype(array.dtype, np.number):  # Check for numeric types
121                if np.isnan(array).all():  # Fully NaN numeric array
122                    return None
123            elif array.dtype == np.object_ or np.issubdtype(array.dtype, np.str_):  # String or object types
124                if np.all((array == '') | (array == None)):  # Empty or None strings
125                    return None
126        return array
127
128    def read(self) -> 'LocalAncestryObject':
129        """
130        Read data from the provided `.msp` or `msp.tsv` `file` and construct a 
131        `snputils.ancestry.genobj.LocalAncestryObject`.
132
133        **Expected MSP content:**
134
135        The `.msp` file should contain local ancestry assignments for each haplotype across genomic windows.
136        Each row should correspond to a genomic window and include the following columns:
137
138        - `#chm`: Chromosome numbers corresponding to each genomic window.
139        - `spos`: Start physical position for each window.
140        - `epos`: End physical position for each window.
141        - `sgpos`: Start centimorgan position for each window.
142        - `egpos`: End centimorgan position for each window.
143        - `n snps`: Number of SNPs in each genomic window.
144        - `SampleID.0`: Local ancestry for the first haplotype of the sample for each window.
145        - `SampleID.1`: Local ancestry for the second haplotype of the sample for each window.
146
147        Returns:
148            **LocalAncestryObject:**
149                A LocalAncestryObject instance.
150        """
151        log.info(f"Reading '{self.file}'...")
152
153        # Open the file and read the first two lines to identify the header and comment
154        with open(self.file) as f:
155            first_line = f.readline()
156            second_line = f.readline()
157
158        first_line_ = [h.strip() for h in first_line.split("\t")]
159        second_line_ = [h.strip() for h in second_line.split("\t")]
160
161        # Determine which line contains the #chm reference value for the header
162        if "#chm" in first_line_:
163            comment = None
164            header = first_line_
165        elif "#chm" in second_line_:
166            comment = first_line
167            header = second_line_
168        else:
169            raise ValueError(
170                f"Header not found. Expected '#chm' in the first two lines. "
171                f"First line: {first_line.strip()} | Second line: {second_line.strip()}"
172            )
173
174        # Ensure no duplicate columns exist in the header
175        if len(header) != len(set(header)):
176            raise ValueError("Duplicate columns detected in the header.")
177
178        # Read the main data into a DataFrame, skipping comment lines
179        msp_df = pd.read_csv(self.file, sep="\t", comment="#", names=header)
180
181        # Extract chromosomes data
182        chromosomes = msp_df['#chm'].astype(str).to_numpy()
183
184        # Extract physical positions (if available)
185        column_counter = 1
186        try:
187            physical_pos = msp_df[['spos', 'epos']].to_numpy()
188            column_counter += 2
189        except KeyError:
190            physical_pos = None
191            log.warning("Physical positions ('spos' and 'epos') not found.")
192        
193        # Extract centimorgan positions (if available)
194        try:
195            centimorgan_pos = msp_df[['sgpos', 'egpos']].to_numpy()
196            column_counter += 2
197        except KeyError:
198            centimorgan_pos = None
199            log.warning("Genetic (centimorgan) positions ('sgpos' and 'egpos') not found.")
200
201        # Extract window sizes (if available)
202        try:
203            window_sizes = msp_df['n snps'].to_numpy()
204            column_counter += 1
205        except KeyError:
206            window_sizes = None
207            log.warning("Window sizes ('n snps') not found.")
208        
209        # Extract LAI data (haplotype-level)
210        lai = msp_df.iloc[:, column_counter:].to_numpy(dtype=np.uint8, copy=False)
211
212        # Extract haplotype identifiers
213        haplotypes = msp_df.columns[column_counter:].to_list()
214
215        # Extract haplotype identifiers and sample identifiers
216        samples = self._get_samples(msp_df, column_counter)
217        del msp_df
218        gc.collect()
219
220        # Validate the number of samples matches the LAI data dimensions
221        n_samples = len(samples)
222        if n_samples != int(lai.shape[1] / 2):
223            raise ValueError(
224                "Mismatch between the number of sample identifiers and the expected number of samples in the LAI array. "
225                f"Expected {int(lai.shape[1] / 2)} samples (derived from LAI data); found {n_samples}."
226            )
227        
228        # Count number of unique ancestries in the LAI data
229        n_ancestries = len(np.unique(lai))
230
231        # Parse ancestry map from the comment (if available)
232        ancestry_map = None
233        if comment is not None:
234            ancestry_map = self._get_ancestry_map_from_comment(comment)
235            if len(ancestry_map) != n_ancestries:
236                warnings.warn(
237                    "Mismatch between the number of unique ancestries in the LAI data "
238                    f"({n_ancestries}) and the number of classes in the ancestry map "
239                    f"({len(ancestry_map)})."
240                )
241        else:
242            # Provide default ancestry mapping if no comment is provided
243            ancestry_map = None
244            warnings.warn(
245                "Ancestry map not found. It is recommended to provide an .msp file that contains the ancestry "
246                "map as a comment in the first line."
247            )
248
249        # Replace fully NaN attributes with None
250        window_sizes = self._replace_nan_with_none(window_sizes)
251        centimorgan_pos = self._replace_nan_with_none(centimorgan_pos)
252        chromosomes = self._replace_nan_with_none(chromosomes)
253        physical_pos = self._replace_nan_with_none(physical_pos)
254
255        return LocalAncestryObject(
256            haplotypes=haplotypes,
257            lai=lai,
258            samples=samples,
259            ancestry_map=ancestry_map,
260            window_sizes=window_sizes,
261            centimorgan_pos=centimorgan_pos,
262            chromosomes=chromosomes,
263            physical_pos=physical_pos
264        )

A reader class for parsing Local Ancestry Inference (LAI) data from an .msp or msp.tsv file and constructing a snputils.ancestry.genobj.LocalAncestryObject.

MSPReader(file: Union[str, pathlib._local.Path])
21    def __init__(self, file: Union[str, Path]) -> None:
22        """
23        Args:
24            file (str or pathlib.Path): 
25                Path to the file to be read. It should end with `.msp` or `.msp.tsv`.
26        """
27        self.__file = Path(file)
Arguments:
  • file (str or pathlib.Path): Path to the file to be read. It should end with .msp or .msp.tsv.
file: pathlib._local.Path
29    @property
30    def file(self) -> Path:
31        """
32        Retrieve `file`.
33
34        Returns:
35            **pathlib.Path:** 
36                Path to the file to be read. It should end with `.msp` or `.msp.tsv`.
37        """
38        return self.__file

Retrieve file.

Returns:

pathlib.Path: Path to the file to be read. It should end with .msp or .msp.tsv.

def read(self) -> snputils.ancestry.genobj.LocalAncestryObject:
128    def read(self) -> 'LocalAncestryObject':
129        """
130        Read data from the provided `.msp` or `msp.tsv` `file` and construct a 
131        `snputils.ancestry.genobj.LocalAncestryObject`.
132
133        **Expected MSP content:**
134
135        The `.msp` file should contain local ancestry assignments for each haplotype across genomic windows.
136        Each row should correspond to a genomic window and include the following columns:
137
138        - `#chm`: Chromosome numbers corresponding to each genomic window.
139        - `spos`: Start physical position for each window.
140        - `epos`: End physical position for each window.
141        - `sgpos`: Start centimorgan position for each window.
142        - `egpos`: End centimorgan position for each window.
143        - `n snps`: Number of SNPs in each genomic window.
144        - `SampleID.0`: Local ancestry for the first haplotype of the sample for each window.
145        - `SampleID.1`: Local ancestry for the second haplotype of the sample for each window.
146
147        Returns:
148            **LocalAncestryObject:**
149                A LocalAncestryObject instance.
150        """
151        log.info(f"Reading '{self.file}'...")
152
153        # Open the file and read the first two lines to identify the header and comment
154        with open(self.file) as f:
155            first_line = f.readline()
156            second_line = f.readline()
157
158        first_line_ = [h.strip() for h in first_line.split("\t")]
159        second_line_ = [h.strip() for h in second_line.split("\t")]
160
161        # Determine which line contains the #chm reference value for the header
162        if "#chm" in first_line_:
163            comment = None
164            header = first_line_
165        elif "#chm" in second_line_:
166            comment = first_line
167            header = second_line_
168        else:
169            raise ValueError(
170                f"Header not found. Expected '#chm' in the first two lines. "
171                f"First line: {first_line.strip()} | Second line: {second_line.strip()}"
172            )
173
174        # Ensure no duplicate columns exist in the header
175        if len(header) != len(set(header)):
176            raise ValueError("Duplicate columns detected in the header.")
177
178        # Read the main data into a DataFrame, skipping comment lines
179        msp_df = pd.read_csv(self.file, sep="\t", comment="#", names=header)
180
181        # Extract chromosomes data
182        chromosomes = msp_df['#chm'].astype(str).to_numpy()
183
184        # Extract physical positions (if available)
185        column_counter = 1
186        try:
187            physical_pos = msp_df[['spos', 'epos']].to_numpy()
188            column_counter += 2
189        except KeyError:
190            physical_pos = None
191            log.warning("Physical positions ('spos' and 'epos') not found.")
192        
193        # Extract centimorgan positions (if available)
194        try:
195            centimorgan_pos = msp_df[['sgpos', 'egpos']].to_numpy()
196            column_counter += 2
197        except KeyError:
198            centimorgan_pos = None
199            log.warning("Genetic (centimorgan) positions ('sgpos' and 'egpos') not found.")
200
201        # Extract window sizes (if available)
202        try:
203            window_sizes = msp_df['n snps'].to_numpy()
204            column_counter += 1
205        except KeyError:
206            window_sizes = None
207            log.warning("Window sizes ('n snps') not found.")
208        
209        # Extract LAI data (haplotype-level)
210        lai = msp_df.iloc[:, column_counter:].to_numpy(dtype=np.uint8, copy=False)
211
212        # Extract haplotype identifiers
213        haplotypes = msp_df.columns[column_counter:].to_list()
214
215        # Extract haplotype identifiers and sample identifiers
216        samples = self._get_samples(msp_df, column_counter)
217        del msp_df
218        gc.collect()
219
220        # Validate the number of samples matches the LAI data dimensions
221        n_samples = len(samples)
222        if n_samples != int(lai.shape[1] / 2):
223            raise ValueError(
224                "Mismatch between the number of sample identifiers and the expected number of samples in the LAI array. "
225                f"Expected {int(lai.shape[1] / 2)} samples (derived from LAI data); found {n_samples}."
226            )
227        
228        # Count number of unique ancestries in the LAI data
229        n_ancestries = len(np.unique(lai))
230
231        # Parse ancestry map from the comment (if available)
232        ancestry_map = None
233        if comment is not None:
234            ancestry_map = self._get_ancestry_map_from_comment(comment)
235            if len(ancestry_map) != n_ancestries:
236                warnings.warn(
237                    "Mismatch between the number of unique ancestries in the LAI data "
238                    f"({n_ancestries}) and the number of classes in the ancestry map "
239                    f"({len(ancestry_map)})."
240                )
241        else:
242            # Provide default ancestry mapping if no comment is provided
243            ancestry_map = None
244            warnings.warn(
245                "Ancestry map not found. It is recommended to provide an .msp file that contains the ancestry "
246                "map as a comment in the first line."
247            )
248
249        # Replace fully NaN attributes with None
250        window_sizes = self._replace_nan_with_none(window_sizes)
251        centimorgan_pos = self._replace_nan_with_none(centimorgan_pos)
252        chromosomes = self._replace_nan_with_none(chromosomes)
253        physical_pos = self._replace_nan_with_none(physical_pos)
254
255        return LocalAncestryObject(
256            haplotypes=haplotypes,
257            lai=lai,
258            samples=samples,
259            ancestry_map=ancestry_map,
260            window_sizes=window_sizes,
261            centimorgan_pos=centimorgan_pos,
262            chromosomes=chromosomes,
263            physical_pos=physical_pos
264        )

Read data from the provided .msp or msp.tsv file and construct a snputils.ancestry.genobj.LocalAncestryObject.

Expected MSP content:

The .msp file should contain local ancestry assignments for each haplotype across genomic windows. Each row should correspond to a genomic window and include the following columns:

  • #chm: Chromosome numbers corresponding to each genomic window.
  • spos: Start physical position for each window.
  • epos: End physical position for each window.
  • sgpos: Start centimorgan position for each window.
  • egpos: End centimorgan position for each window.
  • n snps: Number of SNPs in each genomic window.
  • SampleID.0: Local ancestry for the first haplotype of the sample for each window.
  • SampleID.1: Local ancestry for the second haplotype of the sample for each window.
Returns:

LocalAncestryObject: A LocalAncestryObject instance.

class MSPWriter(snputils.ancestry.io.local.write.base.LAIBaseWriter):
 15class MSPWriter(LAIBaseWriter):
 16    """
 17    A writer class for exporting local ancestry data from a `snputils.ancestry.genobj.LocalAncestryObject` 
 18    into an `.msp` or `.msp.tsv` file.
 19    """
 20    def __init__(self, laiobj: LocalAncestryObject, file: Union[str, Path]) -> None:
 21        """
 22        Args:
 23            laiobj (LocalAncestryObject):
 24                A LocalAncestryObject instance.
 25            file (str or pathlib.Path): 
 26                Path to the file where the data will be saved. It should end with `.msp` or `.msp.tsv`. 
 27                If the provided path does not have one of these extensions, the `.msp` extension will be appended.
 28        """
 29        self.__laiobj = laiobj
 30        self.__file = Path(file)
 31
 32    @property
 33    def laiobj(self) -> LocalAncestryObject:
 34        """
 35        Retrieve `laiobj`. 
 36
 37        Returns:
 38            **LocalAncestryObject:** 
 39                A LocalAncestryObject instance.
 40        """
 41        return self.__laiobj
 42
 43    @property
 44    def file(self) -> Path:
 45        """
 46        Retrieve `file`.
 47
 48        Returns:
 49            **pathlib.Path:** 
 50                Path to the file where the data will be saved. It should end with `.msp` or `.msp.tsv`. 
 51                If the provided path does not have one of these extensions, the `.msp` extension will be appended.
 52        """
 53        return self.__file
 54
 55    @file.setter
 56    def file(self, x: Union[str, Path]):
 57        """
 58        Update `file`.
 59        """
 60        self.__file = Path(x)
 61    
 62    def write(self) -> None:
 63        """
 64        Write the data contained in the `laiobj` instance to the specified output `file`. 
 65        If the file already exists, it will be overwritten.
 66
 67        **Output MSP content:**
 68
 69        The output `.msp` file will contain local ancestry assignments for each haplotype across genomic windows.
 70        Each row corresponds to a genomic window and includes the following columns:
 71
 72        - `#chm`: Chromosome numbers corresponding to each genomic window.
 73        - `spos`: Start physical position for each window.
 74        - `epos`: End physical position for each window.
 75        - `sgpos`: Start centimorgan position for each window.
 76        - `egpos`: End centimorgan position for each window.
 77        - `n snps`: Number of SNPs in each genomic window.
 78        - `SampleID.0`: Local ancestry for the first haplotype of the sample for each window.
 79        - `SampleID.1`: Local ancestry for the second haplotype of the sample for each window.
 80        """
 81        log.info(f"LAI object contains: {self.laiobj.n_samples} samples, {self.laiobj.n_ancestries} ancestries.")
 82
 83        # Define the valid file extensions
 84        valid_extensions = ('.msp', '.msp.tsv')
 85
 86        # Append '.msp' extension if not already present
 87        if not self.file.name.endswith(valid_extensions):
 88            self.file = self.file.with_name(self.file.name + '.msp')
 89
 90        # Check if file already exists
 91        if self.file.exists():
 92            warnings.warn(f"File '{self.file}' already exists and will be overwritten.")
 93
 94        # Compute the number of windows and haplotypes
 95        n_windows = self.laiobj.n_windows
 96        n_haplotypes = self.laiobj.n_haplotypes
 97
 98        # Initialize attributes with NaN where they are None
 99        chromosomes = self.laiobj.chromosomes if self.laiobj.chromosomes is not None else np.full(n_windows, np.nan)
100        physical_pos = self.laiobj.physical_pos if self.laiobj.physical_pos is not None else np.full((n_windows, 2), np.nan)
101        centimorgan_pos = self.laiobj.centimorgan_pos if self.laiobj.centimorgan_pos is not None else np.full((n_windows, 2), np.nan)
102        window_sizes = self.laiobj.window_sizes if self.laiobj.window_sizes is not None else np.full(n_windows, np.nan)
103        
104        haplotypes = self.laiobj.haplotypes
105        if haplotypes is None:
106            # Generate haplotypes from samples or default identifiers
107            if self.laiobj.samples is not None:
108                haplotypes = [f"{sample}.{i}" for sample in self.laiobj.samples for i in range(2)]
109                warnings.warn(
110                    "Haplotype data is missing. Haplotypes have been automatically generated "
111                    "from the provided sample identifiers."
112                )
113            else:
114                haplotypes = [f"sample_{i//2}.{i%2}" for i in range(n_haplotypes)]
115                warnings.warn(
116                    "Haplotype data and sample identifiers are missing. Default haplotype identifiers have been generated "
117                    "as `sample_<index>.0` and `sample_<index>.1`."
118                )
119
120        # Prepare columns for the DataFrame
121        columns = ["spos", "epos", "sgpos", "egpos", "n snps"]
122        lai_dic = {
123            "#chm": chromosomes,
124            "spos": physical_pos[:, 0],
125            "epos": physical_pos[:, 1],
126            "sgpos": centimorgan_pos[:, 0],
127            "egpos": centimorgan_pos[:, 1],
128            "n snps": window_sizes,
129        }
130
131        # Populate the dictionary with haplotype data
132        for ilai, haplotype in enumerate(haplotypes):
133            lai_dic[haplotype] = self.laiobj.lai[:, ilai]
134            columns.append(haplotype)
135            
136        # Check if DataFrame is empty
137        if len(lai_dic["#chm"]) == 0:
138            raise ValueError("No data to write: all columns are empty or missing.")
139
140        # Create a DataFrame from the dictionary containing all data
141        lai_df = pd.DataFrame(lai_dic)
142
143        log.info(f"Writing MSP file to '{self.file}'...")
144
145        # Save the DataFrame to the .msp file in tab-separated format
146        lai_df.to_csv(self.file, sep="\t", index=False, header=False)
147        
148        # Construct the second line for the output file containing the column headers
149        second_line = "#chm" + "\t" + "\t".join(columns)
150        
151        # If an ancestry map is available, prepend it to the output file
152        if self.laiobj.ancestry_map is not None:
153            ancestries_codes = list(self.laiobj.ancestry_map.keys()) # Get corresponding codes
154            ancestries = list(self.laiobj.ancestry_map.values()) # Get ancestry names
155            
156            # Create the first line for the ancestry information, detailing subpopulation codes
157            first_line = "#Subpopulation order/codes: " + "\t".join(
158                f"{a}={ancestries_codes[ai]}" for ai, a in enumerate(ancestries)
159            )
160
161            # Open the file for reading and prepend the first line       
162            with open(self.__file, "r+") as f:
163                content = f.read()
164                f.seek(0,0)
165                f.write(first_line.rstrip('\r\n') + '\n' + second_line + '\n' + content)
166
167        log.info(f"Finished writing MSP file to '{self.file}'.")
168
169        return None

A writer class for exporting local ancestry data from a snputils.ancestry.genobj.LocalAncestryObject into an .msp or .msp.tsv file.

MSPWriter( laiobj: snputils.ancestry.genobj.LocalAncestryObject, file: Union[str, pathlib._local.Path])
20    def __init__(self, laiobj: LocalAncestryObject, file: Union[str, Path]) -> None:
21        """
22        Args:
23            laiobj (LocalAncestryObject):
24                A LocalAncestryObject instance.
25            file (str or pathlib.Path): 
26                Path to the file where the data will be saved. It should end with `.msp` or `.msp.tsv`. 
27                If the provided path does not have one of these extensions, the `.msp` extension will be appended.
28        """
29        self.__laiobj = laiobj
30        self.__file = Path(file)
Arguments:
  • laiobj (LocalAncestryObject): A LocalAncestryObject instance.
  • file (str or pathlib.Path): Path to the file where the data will be saved. It should end with .msp or .msp.tsv. If the provided path does not have one of these extensions, the .msp extension will be appended.
32    @property
33    def laiobj(self) -> LocalAncestryObject:
34        """
35        Retrieve `laiobj`. 
36
37        Returns:
38            **LocalAncestryObject:** 
39                A LocalAncestryObject instance.
40        """
41        return self.__laiobj

Retrieve laiobj.

Returns:

LocalAncestryObject: A LocalAncestryObject instance.

file: pathlib._local.Path
43    @property
44    def file(self) -> Path:
45        """
46        Retrieve `file`.
47
48        Returns:
49            **pathlib.Path:** 
50                Path to the file where the data will be saved. It should end with `.msp` or `.msp.tsv`. 
51                If the provided path does not have one of these extensions, the `.msp` extension will be appended.
52        """
53        return self.__file

Retrieve file.

Returns:

pathlib.Path: Path to the file where the data will be saved. It should end with .msp or .msp.tsv. If the provided path does not have one of these extensions, the .msp extension will be appended.

def write(self) -> None:
 62    def write(self) -> None:
 63        """
 64        Write the data contained in the `laiobj` instance to the specified output `file`. 
 65        If the file already exists, it will be overwritten.
 66
 67        **Output MSP content:**
 68
 69        The output `.msp` file will contain local ancestry assignments for each haplotype across genomic windows.
 70        Each row corresponds to a genomic window and includes the following columns:
 71
 72        - `#chm`: Chromosome numbers corresponding to each genomic window.
 73        - `spos`: Start physical position for each window.
 74        - `epos`: End physical position for each window.
 75        - `sgpos`: Start centimorgan position for each window.
 76        - `egpos`: End centimorgan position for each window.
 77        - `n snps`: Number of SNPs in each genomic window.
 78        - `SampleID.0`: Local ancestry for the first haplotype of the sample for each window.
 79        - `SampleID.1`: Local ancestry for the second haplotype of the sample for each window.
 80        """
 81        log.info(f"LAI object contains: {self.laiobj.n_samples} samples, {self.laiobj.n_ancestries} ancestries.")
 82
 83        # Define the valid file extensions
 84        valid_extensions = ('.msp', '.msp.tsv')
 85
 86        # Append '.msp' extension if not already present
 87        if not self.file.name.endswith(valid_extensions):
 88            self.file = self.file.with_name(self.file.name + '.msp')
 89
 90        # Check if file already exists
 91        if self.file.exists():
 92            warnings.warn(f"File '{self.file}' already exists and will be overwritten.")
 93
 94        # Compute the number of windows and haplotypes
 95        n_windows = self.laiobj.n_windows
 96        n_haplotypes = self.laiobj.n_haplotypes
 97
 98        # Initialize attributes with NaN where they are None
 99        chromosomes = self.laiobj.chromosomes if self.laiobj.chromosomes is not None else np.full(n_windows, np.nan)
100        physical_pos = self.laiobj.physical_pos if self.laiobj.physical_pos is not None else np.full((n_windows, 2), np.nan)
101        centimorgan_pos = self.laiobj.centimorgan_pos if self.laiobj.centimorgan_pos is not None else np.full((n_windows, 2), np.nan)
102        window_sizes = self.laiobj.window_sizes if self.laiobj.window_sizes is not None else np.full(n_windows, np.nan)
103        
104        haplotypes = self.laiobj.haplotypes
105        if haplotypes is None:
106            # Generate haplotypes from samples or default identifiers
107            if self.laiobj.samples is not None:
108                haplotypes = [f"{sample}.{i}" for sample in self.laiobj.samples for i in range(2)]
109                warnings.warn(
110                    "Haplotype data is missing. Haplotypes have been automatically generated "
111                    "from the provided sample identifiers."
112                )
113            else:
114                haplotypes = [f"sample_{i//2}.{i%2}" for i in range(n_haplotypes)]
115                warnings.warn(
116                    "Haplotype data and sample identifiers are missing. Default haplotype identifiers have been generated "
117                    "as `sample_<index>.0` and `sample_<index>.1`."
118                )
119
120        # Prepare columns for the DataFrame
121        columns = ["spos", "epos", "sgpos", "egpos", "n snps"]
122        lai_dic = {
123            "#chm": chromosomes,
124            "spos": physical_pos[:, 0],
125            "epos": physical_pos[:, 1],
126            "sgpos": centimorgan_pos[:, 0],
127            "egpos": centimorgan_pos[:, 1],
128            "n snps": window_sizes,
129        }
130
131        # Populate the dictionary with haplotype data
132        for ilai, haplotype in enumerate(haplotypes):
133            lai_dic[haplotype] = self.laiobj.lai[:, ilai]
134            columns.append(haplotype)
135            
136        # Check if DataFrame is empty
137        if len(lai_dic["#chm"]) == 0:
138            raise ValueError("No data to write: all columns are empty or missing.")
139
140        # Create a DataFrame from the dictionary containing all data
141        lai_df = pd.DataFrame(lai_dic)
142
143        log.info(f"Writing MSP file to '{self.file}'...")
144
145        # Save the DataFrame to the .msp file in tab-separated format
146        lai_df.to_csv(self.file, sep="\t", index=False, header=False)
147        
148        # Construct the second line for the output file containing the column headers
149        second_line = "#chm" + "\t" + "\t".join(columns)
150        
151        # If an ancestry map is available, prepend it to the output file
152        if self.laiobj.ancestry_map is not None:
153            ancestries_codes = list(self.laiobj.ancestry_map.keys()) # Get corresponding codes
154            ancestries = list(self.laiobj.ancestry_map.values()) # Get ancestry names
155            
156            # Create the first line for the ancestry information, detailing subpopulation codes
157            first_line = "#Subpopulation order/codes: " + "\t".join(
158                f"{a}={ancestries_codes[ai]}" for ai, a in enumerate(ancestries)
159            )
160
161            # Open the file for reading and prepend the first line       
162            with open(self.__file, "r+") as f:
163                content = f.read()
164                f.seek(0,0)
165                f.write(first_line.rstrip('\r\n') + '\n' + second_line + '\n' + content)
166
167        log.info(f"Finished writing MSP file to '{self.file}'.")
168
169        return None

Write the data contained in the laiobj instance to the specified output file. If the file already exists, it will be overwritten.

Output MSP content:

The output .msp file will contain local ancestry assignments for each haplotype across genomic windows. Each row corresponds to a genomic window and includes the following columns:

  • #chm: Chromosome numbers corresponding to each genomic window.
  • spos: Start physical position for each window.
  • epos: End physical position for each window.
  • sgpos: Start centimorgan position for each window.
  • egpos: End centimorgan position for each window.
  • n snps: Number of SNPs in each genomic window.
  • SampleID.0: Local ancestry for the first haplotype of the sample for each window.
  • SampleID.1: Local ancestry for the second haplotype of the sample for each window.
class AdmixtureMappingVCFWriter:
 16class AdmixtureMappingVCFWriter:
 17    """
 18    A writer class for converting and writing local ancestry data into ancestry-specific 
 19    VCF/BCF files for ADMIXTURE mapping.
 20    """
 21    def __init__(
 22            self, 
 23            laiobj: LocalAncestryObject, 
 24            file: Union[str, Path], 
 25            ancestry_map: Optional[Dict[str, str]] = None
 26        ):
 27        """
 28        Args:
 29            laiobj (LocalAncestryObject): 
 30                A LocalAncestryObject instance.
 31            file (str or pathlib.Path): 
 32                Path to the file where the data will be saved. It should end with `.vcf` or `.bcf`. 
 33                If the provided path does not have one of these extensions, the `.vcf` extension will be appended.
 34            ancestry_map (dict of str to str, optional): 
 35                A dictionary mapping ancestry codes to region names. If not explicitly 
 36                provided, it will default to the `ancestry_map` from `laiobj`.
 37        """
 38        self.__laiobj = laiobj
 39        self.__file = Path(file)
 40        self.__ancestry_map = ancestry_map
 41
 42    @property
 43    def laiobj(self) -> LocalAncestryObject:
 44        """
 45        Retrieve `laiobj`. 
 46
 47        Returns:
 48            **LocalAncestryObject:** 
 49                A LocalAncestryObject instance.
 50        """
 51        return self.__laiobj
 52
 53    @property
 54    def file(self) -> Path:
 55        """
 56        Retrieve `file`.
 57
 58        Returns:
 59            **pathlib.Path:** 
 60                Path to the file where the data will be saved. It should end with `.vcf` or `.bcf`. 
 61                If the provided path does not have one of these extensions, the `.vcf` extension will be appended.
 62        """
 63        return self.__file
 64
 65    @property
 66    def ancestry_map(self) -> Dict[str, str]:
 67        """
 68        Retrieve `ancestry_map`.
 69
 70        Returns:
 71            **dict of str to str:** 
 72                A dictionary mapping ancestry codes to region names. If not explicitly 
 73                provided, it will default to the `ancestry_map` from `laiobj`.
 74        """
 75        if self.__ancestry_map is not None:
 76            return self.__ancestry_map
 77        elif self.laiobj.ancestry_map is not None:
 78            return self.laiobj.ancestry_map
 79        else:
 80            raise ValueError(
 81                "Ancestry mapping is required but missing. Provide `ancestry_map` "
 82                "during initialization or ensure `laiobj.ancestry_map` is set."
 83            )
 84
 85    def write(self) -> None:
 86        """
 87        Write VCF or BCF files for each ancestry type defined in the ancestry map.
 88        If the file already exists, it will be overwritten.
 89
 90        **Output VCF/BCF content:**
 91        
 92        For each ancestry, this method converts LAI data to SNP alleles and writes it in a VCF-compatible format.
 93        SNPs are encoded as follows:
 94
 95        - `1`: Indicates positions that match the specified ancestry.
 96        - `0`: Indicates positions that do not match the specified ancestry.
 97
 98        The VCF/BCF files will contain the following fields:
 99
100        - `CHROM`: Chromosome for each variant.
101        - `POS`: Chromosomal positions for each variant.
102        - `ID`: Unique identifier for each variant.
103        - `REF`: Reference allele for each variant.
104        - `ALT`: Alternate allele for each variant.
105        - `QUAL`: Phred-scaled quality score for each variant.
106        - `FILTER`: Status indicating whether each SNP passed control checks.
107        - `INFO`: Additional information field. Defaults to `'.'` indicating no extra metadata.
108        - `FORMAT`: Genotype format. Set to `'GT'`, representing the genotype as phased alleles.
109        - `<SampleID>`: One column per sample, containing the genotype data (`1|0`, `0|1`, etc.).
110
111        **Output files:**
112
113        - A separate VCF file is written for each ancestry type, with filenames formatted as:
114        `<filename>_<ancestry>.vcf` (e.g., `output_African.vcf`).
115        """
116        # Process the list of positions to include both the start and end coordinates for each window
117        # Iterate over each ancestry key in the ancestry mapping
118        for key in self.ancestry_map:
119            ancestry = int(key)
120            anc_string = self.ancestry_map[key]
121
122            # Define the output file format, ensuring it has the correct ancestry-specific suffix
123            file_extension = (".vcf", ".bcf")
124            
125            # Check if file has one of the specified extensions
126            if self.file.suffix not in file_extension:
127                # If file does not have the correct extension, default to ".vcf"
128                output_file = self.file.with_name(f"{self.file.stem}_{anc_string}.vcf")
129            else:
130                # If file has the correct extension, insert the ancestry string before the extension
131                output_file = self.file.with_name(f"{self.file.stem}_{anc_string}{self.file.suffix}")
132
133            # Check if file already exists
134            if output_file.exists():
135                warnings.warn(f"File '{output_file}' already exists and will be overwritten.")
136
137            # Format start and end positions for the VCF file
138            if self.laiobj.physical_pos is not None:
139                pos_list = [f"{val1}_{val2}" for val1, val2 in self.laiobj.physical_pos]
140            else:
141                pos_list = None
142
143            # Modify LAI data values to simulate a SNP file
144            # The positions in LAI corresponding to the current ancestry key are mapped to 1, and the rest to 0
145            
146            match = (self.laiobj.lai == ancestry)
147            match = match.view(np.int8)
148            match = match.reshape(len(self.laiobj.lai),int(len(self.laiobj.lai[0])/2), 2 )
149
150
151            # Set up VCF-related data
152            calldata_gt = match
153            del match
154            gc.collect()
155            samples = np.array(self.laiobj.samples)
156            variants_chrom = self.laiobj.chromosomes
157            variants_list = [str(i+1) for i in range(len(self.laiobj.lai))]
158            variants_id = np.array(variants_list)
159            variants_ref = np.full(calldata_gt.shape[0], 'A', dtype='U5')
160            variants_alt = np.full(calldata_gt.shape[0], 'T', dtype='U1')
161
162            # Create the SNPObject
163            variant_data_obj = SNPObject(
164                calldata_gt=calldata_gt,
165                samples=samples,
166                variants_chrom=variants_chrom,
167                variants_id=variants_id,
168                variants_ref = variants_ref,
169                variants_alt = variants_alt,
170                variants_pos = pos_list,
171            )
172
173            # Log the start of the VCF file writing process
174            log.info(f"Writing VCF file for ancestry '{anc_string}' to '{output_file}'...")
175
176            # Write the VCF file
177            vcf_writer = VCFWriter(variant_data_obj, output_file)
178            vcf_writer.write()
179
180            log.info(f"Finished writing VCF file for ancestry '{anc_string}' to '{output_file}'.")
181
182        return

A writer class for converting and writing local ancestry data into ancestry-specific VCF/BCF files for ADMIXTURE mapping.

AdmixtureMappingVCFWriter( laiobj: snputils.ancestry.genobj.LocalAncestryObject, file: Union[str, pathlib._local.Path], ancestry_map: Optional[Dict[str, str]] = None)
21    def __init__(
22            self, 
23            laiobj: LocalAncestryObject, 
24            file: Union[str, Path], 
25            ancestry_map: Optional[Dict[str, str]] = None
26        ):
27        """
28        Args:
29            laiobj (LocalAncestryObject): 
30                A LocalAncestryObject instance.
31            file (str or pathlib.Path): 
32                Path to the file where the data will be saved. It should end with `.vcf` or `.bcf`. 
33                If the provided path does not have one of these extensions, the `.vcf` extension will be appended.
34            ancestry_map (dict of str to str, optional): 
35                A dictionary mapping ancestry codes to region names. If not explicitly 
36                provided, it will default to the `ancestry_map` from `laiobj`.
37        """
38        self.__laiobj = laiobj
39        self.__file = Path(file)
40        self.__ancestry_map = ancestry_map
Arguments:
  • laiobj (LocalAncestryObject): A LocalAncestryObject instance.
  • file (str or pathlib.Path): Path to the file where the data will be saved. It should end with .vcf or .bcf. If the provided path does not have one of these extensions, the .vcf extension will be appended.
  • ancestry_map (dict of str to str, optional): A dictionary mapping ancestry codes to region names. If not explicitly provided, it will default to the ancestry_map from laiobj.
42    @property
43    def laiobj(self) -> LocalAncestryObject:
44        """
45        Retrieve `laiobj`. 
46
47        Returns:
48            **LocalAncestryObject:** 
49                A LocalAncestryObject instance.
50        """
51        return self.__laiobj

Retrieve laiobj.

Returns:

LocalAncestryObject: A LocalAncestryObject instance.

file: pathlib._local.Path
53    @property
54    def file(self) -> Path:
55        """
56        Retrieve `file`.
57
58        Returns:
59            **pathlib.Path:** 
60                Path to the file where the data will be saved. It should end with `.vcf` or `.bcf`. 
61                If the provided path does not have one of these extensions, the `.vcf` extension will be appended.
62        """
63        return self.__file

Retrieve file.

Returns:

pathlib.Path: Path to the file where the data will be saved. It should end with .vcf or .bcf. If the provided path does not have one of these extensions, the .vcf extension will be appended.

ancestry_map: Dict[str, str]
65    @property
66    def ancestry_map(self) -> Dict[str, str]:
67        """
68        Retrieve `ancestry_map`.
69
70        Returns:
71            **dict of str to str:** 
72                A dictionary mapping ancestry codes to region names. If not explicitly 
73                provided, it will default to the `ancestry_map` from `laiobj`.
74        """
75        if self.__ancestry_map is not None:
76            return self.__ancestry_map
77        elif self.laiobj.ancestry_map is not None:
78            return self.laiobj.ancestry_map
79        else:
80            raise ValueError(
81                "Ancestry mapping is required but missing. Provide `ancestry_map` "
82                "during initialization or ensure `laiobj.ancestry_map` is set."
83            )

Retrieve ancestry_map.

Returns:

dict of str to str: A dictionary mapping ancestry codes to region names. If not explicitly provided, it will default to the ancestry_map from laiobj.

def write(self) -> None:
 85    def write(self) -> None:
 86        """
 87        Write VCF or BCF files for each ancestry type defined in the ancestry map.
 88        If the file already exists, it will be overwritten.
 89
 90        **Output VCF/BCF content:**
 91        
 92        For each ancestry, this method converts LAI data to SNP alleles and writes it in a VCF-compatible format.
 93        SNPs are encoded as follows:
 94
 95        - `1`: Indicates positions that match the specified ancestry.
 96        - `0`: Indicates positions that do not match the specified ancestry.
 97
 98        The VCF/BCF files will contain the following fields:
 99
100        - `CHROM`: Chromosome for each variant.
101        - `POS`: Chromosomal positions for each variant.
102        - `ID`: Unique identifier for each variant.
103        - `REF`: Reference allele for each variant.
104        - `ALT`: Alternate allele for each variant.
105        - `QUAL`: Phred-scaled quality score for each variant.
106        - `FILTER`: Status indicating whether each SNP passed control checks.
107        - `INFO`: Additional information field. Defaults to `'.'` indicating no extra metadata.
108        - `FORMAT`: Genotype format. Set to `'GT'`, representing the genotype as phased alleles.
109        - `<SampleID>`: One column per sample, containing the genotype data (`1|0`, `0|1`, etc.).
110
111        **Output files:**
112
113        - A separate VCF file is written for each ancestry type, with filenames formatted as:
114        `<filename>_<ancestry>.vcf` (e.g., `output_African.vcf`).
115        """
116        # Process the list of positions to include both the start and end coordinates for each window
117        # Iterate over each ancestry key in the ancestry mapping
118        for key in self.ancestry_map:
119            ancestry = int(key)
120            anc_string = self.ancestry_map[key]
121
122            # Define the output file format, ensuring it has the correct ancestry-specific suffix
123            file_extension = (".vcf", ".bcf")
124            
125            # Check if file has one of the specified extensions
126            if self.file.suffix not in file_extension:
127                # If file does not have the correct extension, default to ".vcf"
128                output_file = self.file.with_name(f"{self.file.stem}_{anc_string}.vcf")
129            else:
130                # If file has the correct extension, insert the ancestry string before the extension
131                output_file = self.file.with_name(f"{self.file.stem}_{anc_string}{self.file.suffix}")
132
133            # Check if file already exists
134            if output_file.exists():
135                warnings.warn(f"File '{output_file}' already exists and will be overwritten.")
136
137            # Format start and end positions for the VCF file
138            if self.laiobj.physical_pos is not None:
139                pos_list = [f"{val1}_{val2}" for val1, val2 in self.laiobj.physical_pos]
140            else:
141                pos_list = None
142
143            # Modify LAI data values to simulate a SNP file
144            # The positions in LAI corresponding to the current ancestry key are mapped to 1, and the rest to 0
145            
146            match = (self.laiobj.lai == ancestry)
147            match = match.view(np.int8)
148            match = match.reshape(len(self.laiobj.lai),int(len(self.laiobj.lai[0])/2), 2 )
149
150
151            # Set up VCF-related data
152            calldata_gt = match
153            del match
154            gc.collect()
155            samples = np.array(self.laiobj.samples)
156            variants_chrom = self.laiobj.chromosomes
157            variants_list = [str(i+1) for i in range(len(self.laiobj.lai))]
158            variants_id = np.array(variants_list)
159            variants_ref = np.full(calldata_gt.shape[0], 'A', dtype='U5')
160            variants_alt = np.full(calldata_gt.shape[0], 'T', dtype='U1')
161
162            # Create the SNPObject
163            variant_data_obj = SNPObject(
164                calldata_gt=calldata_gt,
165                samples=samples,
166                variants_chrom=variants_chrom,
167                variants_id=variants_id,
168                variants_ref = variants_ref,
169                variants_alt = variants_alt,
170                variants_pos = pos_list,
171            )
172
173            # Log the start of the VCF file writing process
174            log.info(f"Writing VCF file for ancestry '{anc_string}' to '{output_file}'...")
175
176            # Write the VCF file
177            vcf_writer = VCFWriter(variant_data_obj, output_file)
178            vcf_writer.write()
179
180            log.info(f"Finished writing VCF file for ancestry '{anc_string}' to '{output_file}'.")
181
182        return

Write VCF or BCF files for each ancestry type defined in the ancestry map. If the file already exists, it will be overwritten.

Output VCF/BCF content:

For each ancestry, this method converts LAI data to SNP alleles and writes it in a VCF-compatible format. SNPs are encoded as follows:

  • 1: Indicates positions that match the specified ancestry.
  • 0: Indicates positions that do not match the specified ancestry.

The VCF/BCF files will contain the following fields:

  • CHROM: Chromosome for each variant.
  • POS: Chromosomal positions for each variant.
  • ID: Unique identifier for each variant.
  • REF: Reference allele for each variant.
  • ALT: Alternate allele for each variant.
  • QUAL: Phred-scaled quality score for each variant.
  • FILTER: Status indicating whether each SNP passed control checks.
  • INFO: Additional information field. Defaults to '.' indicating no extra metadata.
  • FORMAT: Genotype format. Set to 'GT', representing the genotype as phased alleles.
  • <SampleID>: One column per sample, containing the genotype data (1|0, 0|1, etc.).

Output files:

  • A separate VCF file is written for each ancestry type, with filenames formatted as: <filename>_<ancestry>.vcf (e.g., output_African.vcf).