snputils

View Source

 1from importlib import import_module
 2from importlib.metadata import PackageNotFoundError, version
 3from typing import Dict, Tuple
 4
# Expose the version of the installed "snputils" distribution; fall back to
# "unknown" when the distribution metadata cannot be found.
try:
    __version__ = version("snputils")
except PackageNotFoundError:
    __version__ = "unknown"

# Table of lazily exported names:
#     public name -> (relative submodule, attribute name inside that submodule)
# An empty attribute name means the submodule object itself is exported under
# the public name (see "viz"). Entries are resolved on first access by the
# module-level __getattr__ defined below.
_LAZY_ATTRS: Dict[str, Tuple[str, str]] = {
    # SNP containers, readers and writers
    "SNPObject": (".snp", "SNPObject"),
    "GRGObject": (".snp", "GRGObject"),
    "SNPReader": (".snp", "SNPReader"),
    "BEDReader": (".snp", "BEDReader"),
    "GRGReader": (".snp", "GRGReader"),
    "GRGWriter": (".snp", "GRGWriter"),
    "PGENReader": (".snp", "PGENReader"),
    "VCFReader": (".snp", "VCFReader"),
    "BEDWriter": (".snp", "BEDWriter"),
    "PGENWriter": (".snp", "PGENWriter"),
    "VCFWriter": (".snp", "VCFWriter"),
    "read_snp": (".snp", "read_snp"),
    "read_bed": (".snp", "read_bed"),
    "read_pgen": (".snp", "read_pgen"),
    "read_vcf": (".snp", "read_vcf"),
    "read_grg": (".snp", "read_grg"),
    # Ancestry containers, readers and writers
    "LocalAncestryObject": (".ancestry", "LocalAncestryObject"),
    "GlobalAncestryObject": (".ancestry", "GlobalAncestryObject"),
    "MSPReader": (".ancestry", "MSPReader"),
    "MSPWriter": (".ancestry", "MSPWriter"),
    "AdmixtureMappingVCFWriter": (".ancestry", "AdmixtureMappingVCFWriter"),
    "AdmixtureReader": (".ancestry", "AdmixtureReader"),
    "AdmixtureWriter": (".ancestry", "AdmixtureWriter"),
    "read_lai": (".ancestry", "read_lai"),
    "read_msp": (".ancestry", "read_msp"),
    "read_adm": (".ancestry", "read_adm"),
    "read_admixture": (".ancestry", "read_admixture"),
    # IBD container and readers
    "IBDObject": (".ibd", "IBDObject"),
    "read_ibd": (".ibd", "read_ibd"),
    "HapIBDReader": (".ibd", "HapIBDReader"),
    "AncIBDReader": (".ibd", "AncIBDReader"),
    "IBDReader": (".ibd", "IBDReader"),
    # Phenotype containers and readers
    "MultiPhenotypeObject": (".phenotype", "MultiPhenotypeObject"),
    "PhenotypeObject": (".phenotype", "PhenotypeObject"),
    "MultiPhenReader": (".phenotype", "MultiPhenReader"),
    "PhenotypeReader": (".phenotype", "PhenotypeReader"),
    # Datasets
    "load_dataset": (".datasets", "load_dataset"),
    # Whole submodule exported under a short alias (empty attribute name)
    "viz": (".visualization", ""),
}

# The public API is exactly the set of lazily exported names, in table order.
__all__ = list(_LAZY_ATTRS.keys())
52
53
def __getattr__(name):
    """
    Resolve a lazily exported attribute the first time it is accessed.

    The name is looked up in `_LAZY_ATTRS`; the referenced submodule is imported
    relative to this package, and either the submodule itself (empty attribute
    name) or the named attribute of it is returned. The resolved value is stored
    in the module globals so later accesses bypass this hook.

    Args:
        name: Attribute name requested on the package.

    Raises:
        AttributeError: If `name` is not one of the lazily exported names.
    """
    if name not in _LAZY_ATTRS:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    submodule_path, member = _LAZY_ATTRS[name]
    submodule = import_module(submodule_path, package=__name__)

    if member == "":
        # An empty member name exports the submodule object itself.
        resolved = submodule
    else:
        resolved = getattr(submodule, member)

    # Cache on the module so the next lookup is a plain global access.
    globals()[name] = resolved
    return resolved
64
65
def __dir__():
    """
    List the module's attribute names for `dir()`.

    Returns:
        The sorted union of the names currently in the module globals and the
        names declared in `__all__`, so lazily exported names are listed even
        before they have been resolved.
    """
    names = set(globals().keys())
    names.update(__all__)
    return sorted(names)

class SNPReader: View Source

 8class SNPReader:
 9    def __new__(cls,
10                filename: Union[str, pathlib.Path],
11                vcf_backend: str = 'polars') -> SNPReader:
12        """
13        Automatically detect the SNP file format from the file extension, and return its corresponding reader.
14
15        Args:
16            filename: Filename of the file to read.
17            vcf_backend: Backend to use for reading the VCF file. Options are 'polars' or 'scikit-allel'. Default is 'polars'.
18
19        Raises:
20            ValueError: If the filename does not have an extension or the extension is not supported.
21        """
22        filename = pathlib.Path(filename)
23        suffixes = filename.suffixes
24        if not suffixes:
25            raise ValueError("The filename should have an extension when using SNPReader.")
26
27        extension = suffixes[-2] if suffixes[-1].lower() in (".zst", ".gz") else suffixes[-1]
28        extension = extension.lower()
29
30        if extension == ".vcf":
31            if vcf_backend == 'polars':
32                from snputils.snp.io.read.vcf import VCFReaderPolars
33
34                return VCFReaderPolars(filename)
35            elif vcf_backend == 'scikit-allel':
36                from snputils.snp.io.read.vcf import VCFReader
37
38                return VCFReader(filename)
39            else:
40                raise ValueError(f"VCF backend not supported: {vcf_backend}")
41        elif extension in (".bed", ".bim", ".fam"):
42            from snputils.snp.io.read.bed import BEDReader
43
44            return BEDReader(filename)
45        elif extension in (".pgen", ".pvar", ".psam", ".pvar.zst"):
46            from snputils.snp.io.read.pgen import PGENReader
47
48            return PGENReader(filename)
49        else:
50            raise ValueError(f"File format not supported: {filename}")

SNPReader(filename: str | pathlib.Path, vcf_backend: str = 'polars') View Source

 9    def __new__(cls,
10                filename: Union[str, pathlib.Path],
11                vcf_backend: str = 'polars') -> SNPReader:
12        """
13        Automatically detect the SNP file format from the file extension, and return its corresponding reader.
14
15        Args:
16            filename: Filename of the file to read.
17            vcf_backend: Backend to use for reading the VCF file. Options are 'polars' or 'scikit-allel'. Default is 'polars'.
18
19        Raises:
20            ValueError: If the filename does not have an extension or the extension is not supported.
21        """
22        filename = pathlib.Path(filename)
23        suffixes = filename.suffixes
24        if not suffixes:
25            raise ValueError("The filename should have an extension when using SNPReader.")
26
27        extension = suffixes[-2] if suffixes[-1].lower() in (".zst", ".gz") else suffixes[-1]
28        extension = extension.lower()
29
30        if extension == ".vcf":
31            if vcf_backend == 'polars':
32                from snputils.snp.io.read.vcf import VCFReaderPolars
33
34                return VCFReaderPolars(filename)
35            elif vcf_backend == 'scikit-allel':
36                from snputils.snp.io.read.vcf import VCFReader
37
38                return VCFReader(filename)
39            else:
40                raise ValueError(f"VCF backend not supported: {vcf_backend}")
41        elif extension in (".bed", ".bim", ".fam"):
42            from snputils.snp.io.read.bed import BEDReader
43
44            return BEDReader(filename)
45        elif extension in (".pgen", ".pvar", ".psam", ".pvar.zst"):
46            from snputils.snp.io.read.pgen import PGENReader
47
48            return PGENReader(filename)
49        else:
50            raise ValueError(f"File format not supported: {filename}")

Automatically detect the SNP file format from the file extension, and return its corresponding reader.

Arguments:

filename: Filename of the file to read.
vcf_backend: Backend to use for reading the VCF file. Options are 'polars' or 'scikit-allel'. Default is 'polars'.

Raises:

ValueError: If the filename does not have an extension or the extension is not supported.

@SNPBaseReader.register

class BEDReader(snputils.snp.io.read.base.SNPBaseReader): View Source

 16@SNPBaseReader.register
 17class BEDReader(SNPBaseReader):
 18    def read(
 19        self,
 20        fields: Optional[List[str]] = None,
 21        exclude_fields: Optional[List[str]] = None,
 22        sample_ids: Optional[np.ndarray] = None,
 23        sample_idxs: Optional[np.ndarray] = None,
 24        variant_ids: Optional[np.ndarray] = None,
 25        variant_idxs: Optional[np.ndarray] = None,
 26        sum_strands: bool = False,
 27        separator: Optional[str] = None,
 28    ) -> SNPObject:
 29        """
 30        Read a bed fileset (bed, bim, fam) into a SNPObject.
 31
 32        Args:
 33            fields (str, None, or list of str, optional): Fields to extract data for that should be included in the returned SNPObject.
 34                Available fields are 'GT', 'IID', 'REF', 'ALT', '#CHROM', 'ID', 'POS'.
 35                To extract all fields, set fields to None. Defaults to None.
 36            exclude_fields (str, None, or list of str, optional): Fields to exclude from the returned SNPObject.
 37                Available fields are 'GT', 'IID', 'REF', 'ALT', '#CHROM', 'ID', 'POS'.
 38                To exclude no fields, set exclude_fields to None. Defaults to None.
 39            sample_ids: List of sample IDs to read. If None and sample_idxs is None, all samples are read.
 40            sample_idxs: List of sample indices to read. If None and sample_ids is None, all samples are read.
 41            variant_ids: List of variant IDs to read. If None and variant_idxs is None, all variants are read.
 42            variant_idxs: List of variant indices to read. If None and variant_ids is None, all variants are read.
 43            sum_strands: If True, maternal and paternal strands are combined into a single `int8` array with values `{0, 1, 2`}. 
 44                If False, strands are stored separately as an `int8` array with values `{0, 1}` for each strand. 
 45                Note: With the pgenlib backend, `False` uses `~8×` more RAM, though `calldata_gt` is only `2×` larger.
 46            separator: Separator used in the pvar file. If None, the separator is automatically detected.
 47                If the automatic detection fails, please specify the separator manually.
 48
 49        Returns:
 50            **SNPObject**: 
 51                A SNPObject instance.
 52        """
 53        assert (
 54            sample_idxs is None or sample_ids is None
 55        ), "Only one of sample_idxs and sample_ids can be specified"
 56        assert (
 57            variant_idxs is None or variant_ids is None
 58        ), "Only one of variant_idxs and variant_ids can be specified"
 59
 60        if isinstance(fields, str):
 61            fields = [fields]
 62        if isinstance(exclude_fields, str):
 63            exclude_fields = [exclude_fields]
 64
 65        fields = fields or ["GT", "IID", "REF", "ALT", "#CHROM", "ID", "POS"]
 66        exclude_fields = exclude_fields or []
 67        fields = [field for field in fields if field not in exclude_fields]
 68        only_read_bed = fields == ["GT"] and variant_idxs is None and sample_idxs is None
 69
 70        filename_noext = str(self.filename)
 71        if filename_noext[-4:].lower() in (".bed", ".bim", ".fam"):
 72            filename_noext = filename_noext[:-4]
 73
 74        if only_read_bed:
 75            with open(filename_noext + '.fam', 'r') as f:
 76                file_num_samples = sum(1 for _ in f)  # Get sample count from fam file
 77            file_num_variants = None  # Not needed
 78        else:
 79            log.info(f"Reading {filename_noext}.bim")
 80
 81            if separator is None:
 82                with open(filename_noext + ".bim", "r") as file:
 83                    separator = csv.Sniffer().sniff(file.readline()).delimiter
 84
 85            bim = pl.read_csv(
 86                filename_noext + ".bim",
 87                separator=separator,
 88                has_header=False,
 89                new_columns=["#CHROM", "ID", "CM", "POS", "ALT", "REF"],
 90                schema_overrides={
 91                    "#CHROM": pl.String,
 92                    "ID": pl.String,
 93                    "CM": pl.Float64,
 94                    "POS": pl.Int64,
 95                    "ALT": pl.String,
 96                    "REF": pl.String
 97                },
 98                null_values=["NA"]
 99            ).with_row_index()
100            file_num_variants = bim.height
101
102            if variant_ids is not None:
103                variant_id_values = [str(v) for v in np.atleast_1d(variant_ids)]
104                variant_id_or_pos = (
105                    pl.col("ID").is_in(variant_id_values)
106                    | pl.concat_str(
107                        [pl.col("#CHROM"), pl.lit(":"), pl.col("POS").cast(pl.String)]
108                    ).is_in(variant_id_values)
109                )
110                variant_idxs = (
111                    bim.filter(variant_id_or_pos)
112                    .select("index")
113                    .to_series()
114                    .to_numpy()
115                )
116
117            if variant_idxs is None:
118                num_variants = file_num_variants
119                variant_idxs = np.arange(num_variants, dtype=np.uint32)
120            else:
121                requested_variant_idxs = np.asarray(variant_idxs, dtype=np.uint32).ravel()
122                bim = bim.filter(pl.col("index").is_in(requested_variant_idxs))
123                variant_idxs = bim.select("index").to_series().to_numpy()
124                variant_idxs = np.asarray(variant_idxs, dtype=np.uint32)
125                num_variants = np.size(variant_idxs)
126
127            log.info(f"Reading {filename_noext}.fam")
128
129            fam = pl.read_csv(
130                filename_noext + ".fam",
131                separator=separator,
132                has_header=False,
133                new_columns=["Family ID", "IID", "Father ID",
134                             "Mother ID", "Sex code", "Phenotype value"],
135                schema_overrides={
136                    "Family ID": pl.String,
137                    "IID": pl.String,
138                    "Father ID": pl.String,
139                    "Mother ID": pl.String,
140                    "Sex code": pl.String,
141                },
142                null_values=["NA"]
143            ).with_row_index()
144            file_num_samples = fam.height
145
146            if sample_ids is not None:
147                sample_idxs = fam.filter(pl.col("IID").is_in(sample_ids)).select("index").to_series().to_numpy()
148
149            if sample_idxs is None:
150                num_samples = file_num_samples
151            else:
152                num_samples = np.size(sample_idxs)
153                sample_idxs = np.array(sample_idxs, dtype=np.uint32)
154                fam = fam.filter(pl.col("index").is_in(sample_idxs))
155
156        if "GT" in fields:
157            log.info(f"Reading {filename_noext}.bed")
158            pgen_reader = pg.PgenReader(
159                str.encode(filename_noext + ".bed"),
160                raw_sample_ct=file_num_samples,
161                variant_ct=file_num_variants,
162                sample_subset=sample_idxs,
163            )
164
165            if only_read_bed:
166                num_samples = pgen_reader.get_raw_sample_ct()
167                num_variants = pgen_reader.get_variant_ct()
168                variant_idxs = np.arange(num_variants, dtype=np.uint32)
169
170            # required arrays: variant_idxs + sample_idxs + genotypes
171            if not sum_strands:
172                required_ram = (num_samples + num_variants + num_variants * 2 * num_samples) * 4
173            else:
174                required_ram = (num_samples + num_variants) * 4 + num_variants * num_samples
175            log.info(f">{required_ram / 1024**3:.2f} GiB of RAM are required to process {num_samples} samples with {num_variants} variants each")
176
177            if not sum_strands:
178                genotypes = np.empty((num_variants, 2 * num_samples), dtype=np.int32)  # cannot use int8 because of pgenlib
179                pgen_reader.read_alleles_list(variant_idxs, genotypes)
180                genotypes = genotypes.astype(np.int8).reshape((num_variants, num_samples, 2))
181            else:
182                genotypes = np.empty((num_variants, num_samples), dtype=np.int8)
183                pgen_reader.read_list(variant_idxs, genotypes)
184            pgen_reader.close()
185        else:
186            genotypes = None
187
188        log.info("Constructing SNPObject")
189
190        snpobj = SNPObject(
191            calldata_gt=genotypes if "GT" in fields else None,
192            samples=fam.get_column("IID").to_numpy() if "IID" in fields and "IID" in fam.columns else None,
193            **{f'variants_{k.lower()}': bim.get_column(v).to_numpy() if v in fields and v in bim.columns else None
194               for k, v in {'ref': 'REF', 'alt': 'ALT', 'chrom': '#CHROM', 'id': 'ID', 'pos': 'POS'}.items()}
195        )
196
197        log.info("Finished constructing SNPObject")
198        return snpobj
199
200    def _resolve_variant_idxs_for_iter(
201        self,
202        *,
203        variant_ids: Optional[np.ndarray],
204        variant_idxs: Optional[np.ndarray],
205        separator: Optional[str],
206    ) -> np.ndarray:
207        """
208        Resolve variant selectors to canonical file-order row indices.
209        """
210        filename_noext = str(self.filename)
211        if filename_noext[-4:].lower() in (".bed", ".bim", ".fam"):
212            filename_noext = filename_noext[:-4]
213
214        local_separator = separator
215        if local_separator is None:
216            with open(filename_noext + ".bim", "r") as file:
217                local_separator = csv.Sniffer().sniff(file.readline()).delimiter
218
219        bim = pl.read_csv(
220            filename_noext + ".bim",
221            separator=local_separator,
222            has_header=False,
223            new_columns=["#CHROM", "ID", "CM", "POS", "ALT", "REF"],
224            schema_overrides={
225                "#CHROM": pl.String,
226                "ID": pl.String,
227                "CM": pl.Float64,
228                "POS": pl.Int64,
229                "ALT": pl.String,
230                "REF": pl.String,
231            },
232            null_values=["NA"],
233        ).with_row_index()
234
235        if variant_ids is not None:
236            variant_id_values = [str(v) for v in np.atleast_1d(variant_ids)]
237            variant_id_or_pos = (
238                pl.col("ID").is_in(variant_id_values)
239                | pl.concat_str([pl.col("#CHROM"), pl.lit(":"), pl.col("POS").cast(pl.String)]).is_in(
240                    variant_id_values
241                )
242            )
243            resolved = (
244                bim.filter(variant_id_or_pos)
245                .select("index")
246                .to_series()
247                .to_numpy()
248            )
249            return np.asarray(resolved, dtype=np.uint32)
250
251        if variant_idxs is not None:
252            requested = np.asarray(variant_idxs, dtype=np.uint32).ravel()
253            resolved = (
254                bim.filter(pl.col("index").is_in(requested))
255                .select("index")
256                .to_series()
257                .to_numpy()
258            )
259            return np.asarray(resolved, dtype=np.uint32)
260
261        return np.arange(bim.height, dtype=np.uint32)
262
263    def iter_read(
264        self,
265        fields: Optional[List[str]] = None,
266        exclude_fields: Optional[List[str]] = None,
267        sample_ids: Optional[np.ndarray] = None,
268        sample_idxs: Optional[np.ndarray] = None,
269        variant_ids: Optional[np.ndarray] = None,
270        variant_idxs: Optional[np.ndarray] = None,
271        sum_strands: bool = False,
272        separator: Optional[str] = None,
273        chunk_size: int = 10_000,
274    ) -> Iterator[SNPObject]:
275        """
276        Stream the BED fileset in variant chunks.
277
278        This yields a sequence of SNPObject chunks along the SNP axis.
279        """
280        if chunk_size < 1:
281            raise ValueError("chunk_size must be >= 1.")
282        if sample_idxs is not None and sample_ids is not None:
283            raise ValueError("Only one of sample_idxs and sample_ids can be specified.")
284        if variant_idxs is not None and variant_ids is not None:
285            raise ValueError("Only one of variant_idxs and variant_ids can be specified.")
286
287        selectors = self._resolve_variant_idxs_for_iter(
288            variant_ids=variant_ids,
289            variant_idxs=variant_idxs,
290            separator=separator,
291        )
292
293        n_selectors = int(selectors.size)
294        for start in range(0, n_selectors, int(chunk_size)):
295            stop = min(start + int(chunk_size), n_selectors)
296            selector_chunk = np.asarray(selectors[start:stop], dtype=np.uint32)
297            yield self.read(
298                fields=fields,
299                exclude_fields=exclude_fields,
300                sample_ids=sample_ids,
301                sample_idxs=sample_idxs,
302                variant_idxs=selector_chunk,
303                sum_strands=sum_strands,
304                separator=separator,
305            )

Abstract class for SNP readers.

Attributes:

_filename: The path to the file storing SNP data.

 18    def read(
 19        self,
 20        fields: Optional[List[str]] = None,
 21        exclude_fields: Optional[List[str]] = None,
 22        sample_ids: Optional[np.ndarray] = None,
 23        sample_idxs: Optional[np.ndarray] = None,
 24        variant_ids: Optional[np.ndarray] = None,
 25        variant_idxs: Optional[np.ndarray] = None,
 26        sum_strands: bool = False,
 27        separator: Optional[str] = None,
 28    ) -> SNPObject:
 29        """
 30        Read a bed fileset (bed, bim, fam) into a SNPObject.
 31
 32        Args:
 33            fields (str, None, or list of str, optional): Fields to extract data for that should be included in the returned SNPObject.
 34                Available fields are 'GT', 'IID', 'REF', 'ALT', '#CHROM', 'ID', 'POS'.
 35                To extract all fields, set fields to None. Defaults to None.
 36            exclude_fields (str, None, or list of str, optional): Fields to exclude from the returned SNPObject.
 37                Available fields are 'GT', 'IID', 'REF', 'ALT', '#CHROM', 'ID', 'POS'.
 38                To exclude no fields, set exclude_fields to None. Defaults to None.
 39            sample_ids: List of sample IDs to read. If None and sample_idxs is None, all samples are read.
 40            sample_idxs: List of sample indices to read. If None and sample_ids is None, all samples are read.
 41            variant_ids: List of variant IDs to read. If None and variant_idxs is None, all variants are read.
 42            variant_idxs: List of variant indices to read. If None and variant_ids is None, all variants are read.
 43            sum_strands: If True, maternal and paternal strands are combined into a single `int8` array with values `{0, 1, 2`}. 
 44                If False, strands are stored separately as an `int8` array with values `{0, 1}` for each strand. 
 45                Note: With the pgenlib backend, `False` uses `~8×` more RAM, though `calldata_gt` is only `2×` larger.
 46            separator: Separator used in the pvar file. If None, the separator is automatically detected.
 47                If the automatic detection fails, please specify the separator manually.
 48
 49        Returns:
 50            **SNPObject**: 
 51                A SNPObject instance.
 52        """
 53        assert (
 54            sample_idxs is None or sample_ids is None
 55        ), "Only one of sample_idxs and sample_ids can be specified"
 56        assert (
 57            variant_idxs is None or variant_ids is None
 58        ), "Only one of variant_idxs and variant_ids can be specified"
 59
 60        if isinstance(fields, str):
 61            fields = [fields]
 62        if isinstance(exclude_fields, str):
 63            exclude_fields = [exclude_fields]
 64
 65        fields = fields or ["GT", "IID", "REF", "ALT", "#CHROM", "ID", "POS"]
 66        exclude_fields = exclude_fields or []
 67        fields = [field for field in fields if field not in exclude_fields]
 68        only_read_bed = fields == ["GT"] and variant_idxs is None and sample_idxs is None
 69
 70        filename_noext = str(self.filename)
 71        if filename_noext[-4:].lower() in (".bed", ".bim", ".fam"):
 72            filename_noext = filename_noext[:-4]
 73
 74        if only_read_bed:
 75            with open(filename_noext + '.fam', 'r') as f:
 76                file_num_samples = sum(1 for _ in f)  # Get sample count from fam file
 77            file_num_variants = None  # Not needed
 78        else:
 79            log.info(f"Reading {filename_noext}.bim")
 80
 81            if separator is None:
 82                with open(filename_noext + ".bim", "r") as file:
 83                    separator = csv.Sniffer().sniff(file.readline()).delimiter
 84
 85            bim = pl.read_csv(
 86                filename_noext + ".bim",
 87                separator=separator,
 88                has_header=False,
 89                new_columns=["#CHROM", "ID", "CM", "POS", "ALT", "REF"],
 90                schema_overrides={
 91                    "#CHROM": pl.String,
 92                    "ID": pl.String,
 93                    "CM": pl.Float64,
 94                    "POS": pl.Int64,
 95                    "ALT": pl.String,
 96                    "REF": pl.String
 97                },
 98                null_values=["NA"]
 99            ).with_row_index()
100            file_num_variants = bim.height
101
102            if variant_ids is not None:
103                variant_id_values = [str(v) for v in np.atleast_1d(variant_ids)]
104                variant_id_or_pos = (
105                    pl.col("ID").is_in(variant_id_values)
106                    | pl.concat_str(
107                        [pl.col("#CHROM"), pl.lit(":"), pl.col("POS").cast(pl.String)]
108                    ).is_in(variant_id_values)
109                )
110                variant_idxs = (
111                    bim.filter(variant_id_or_pos)
112                    .select("index")
113                    .to_series()
114                    .to_numpy()
115                )
116
117            if variant_idxs is None:
118                num_variants = file_num_variants
119                variant_idxs = np.arange(num_variants, dtype=np.uint32)
120            else:
121                requested_variant_idxs = np.asarray(variant_idxs, dtype=np.uint32).ravel()
122                bim = bim.filter(pl.col("index").is_in(requested_variant_idxs))
123                variant_idxs = bim.select("index").to_series().to_numpy()
124                variant_idxs = np.asarray(variant_idxs, dtype=np.uint32)
125                num_variants = np.size(variant_idxs)
126
127            log.info(f"Reading {filename_noext}.fam")
128
129            fam = pl.read_csv(
130                filename_noext + ".fam",
131                separator=separator,
132                has_header=False,
133                new_columns=["Family ID", "IID", "Father ID",
134                             "Mother ID", "Sex code", "Phenotype value"],
135                schema_overrides={
136                    "Family ID": pl.String,
137                    "IID": pl.String,
138                    "Father ID": pl.String,
139                    "Mother ID": pl.String,
140                    "Sex code": pl.String,
141                },
142                null_values=["NA"]
143            ).with_row_index()
144            file_num_samples = fam.height
145
146            if sample_ids is not None:
147                sample_idxs = fam.filter(pl.col("IID").is_in(sample_ids)).select("index").to_series().to_numpy()
148
149            if sample_idxs is None:
150                num_samples = file_num_samples
151            else:
152                num_samples = np.size(sample_idxs)
153                sample_idxs = np.array(sample_idxs, dtype=np.uint32)
154                fam = fam.filter(pl.col("index").is_in(sample_idxs))
155
156        if "GT" in fields:
157            log.info(f"Reading {filename_noext}.bed")
158            pgen_reader = pg.PgenReader(
159                str.encode(filename_noext + ".bed"),
160                raw_sample_ct=file_num_samples,
161                variant_ct=file_num_variants,
162                sample_subset=sample_idxs,
163            )
164
165            if only_read_bed:
166                num_samples = pgen_reader.get_raw_sample_ct()
167                num_variants = pgen_reader.get_variant_ct()
168                variant_idxs = np.arange(num_variants, dtype=np.uint32)
169
170            # required arrays: variant_idxs + sample_idxs + genotypes
171            if not sum_strands:
172                required_ram = (num_samples + num_variants + num_variants * 2 * num_samples) * 4
173            else:
174                required_ram = (num_samples + num_variants) * 4 + num_variants * num_samples
175            log.info(f">{required_ram / 1024**3:.2f} GiB of RAM are required to process {num_samples} samples with {num_variants} variants each")
176
177            if not sum_strands:
178                genotypes = np.empty((num_variants, 2 * num_samples), dtype=np.int32)  # cannot use int8 because of pgenlib
179                pgen_reader.read_alleles_list(variant_idxs, genotypes)
180                genotypes = genotypes.astype(np.int8).reshape((num_variants, num_samples, 2))
181            else:
182                genotypes = np.empty((num_variants, num_samples), dtype=np.int8)
183                pgen_reader.read_list(variant_idxs, genotypes)
184            pgen_reader.close()
185        else:
186            genotypes = None
187
188        log.info("Constructing SNPObject")
189
190        snpobj = SNPObject(
191            calldata_gt=genotypes if "GT" in fields else None,
192            samples=fam.get_column("IID").to_numpy() if "IID" in fields and "IID" in fam.columns else None,
193            **{f'variants_{k.lower()}': bim.get_column(v).to_numpy() if v in fields and v in bim.columns else None
194               for k, v in {'ref': 'REF', 'alt': 'ALT', 'chrom': '#CHROM', 'id': 'ID', 'pos': 'POS'}.items()}
195        )
196
197        log.info("Finished constructing SNPObject")
198        return snpobj

Read a bed fileset (bed, bim, fam) into a SNPObject.

Arguments:

fields (str, None, or list of str, optional): Fields to extract data for that should be included in the returned SNPObject. Available fields are 'GT', 'IID', 'REF', 'ALT', '#CHROM', 'ID', 'POS'. To extract all fields, set fields to None. Defaults to None.
exclude_fields (str, None, or list of str, optional): Fields to exclude from the returned SNPObject. Available fields are 'GT', 'IID', 'REF', 'ALT', '#CHROM', 'ID', 'POS'. To exclude no fields, set exclude_fields to None. Defaults to None.
sample_ids: List of sample IDs to read. If None and sample_idxs is None, all samples are read.
sample_idxs: List of sample indices to read. If None and sample_ids is None, all samples are read.
variant_ids: List of variant IDs to read. If None and variant_idxs is None, all variants are read.
variant_idxs: List of variant indices to read. If None and variant_ids is None, all variants are read.
sum_strands: If True, maternal and paternal strands are combined into a single int8 array with values {0, 1, 2}. If False, strands are stored separately as an int8 array with values {0, 1} for each strand. Note: With the pgenlib backend, False uses ~8× more RAM, though calldata_gt is only 2× larger.
separator: Separator used in the bim and fam files. If None, the separator is automatically detected from the first line of the bim file. If the automatic detection fails, please specify the separator manually.

Returns:

SNPObject: A SNPObject instance.

def iter_read( self, fields: List[str] | None = None, exclude_fields: List[str] | None = None, sample_ids: numpy.ndarray | None = None, sample_idxs: numpy.ndarray | None = None, variant_ids: numpy.ndarray | None = None, variant_idxs: numpy.ndarray | None = None, sum_strands: bool = False, separator: str | None = None, chunk_size: int = 10000) -> Iterator[SNPObject]: View Source

263    def iter_read(
264        self,
265        fields: Optional[List[str]] = None,
266        exclude_fields: Optional[List[str]] = None,
267        sample_ids: Optional[np.ndarray] = None,
268        sample_idxs: Optional[np.ndarray] = None,
269        variant_ids: Optional[np.ndarray] = None,
270        variant_idxs: Optional[np.ndarray] = None,
271        sum_strands: bool = False,
272        separator: Optional[str] = None,
273        chunk_size: int = 10_000,
274    ) -> Iterator[SNPObject]:
275        """
276        Stream the BED fileset in variant chunks.
277
278        This yields a sequence of SNPObject chunks along the SNP axis.
279        """
280        if chunk_size < 1:
281            raise ValueError("chunk_size must be >= 1.")
282        if sample_idxs is not None and sample_ids is not None:
283            raise ValueError("Only one of sample_idxs and sample_ids can be specified.")
284        if variant_idxs is not None and variant_ids is not None:
285            raise ValueError("Only one of variant_idxs and variant_ids can be specified.")
286
287        selectors = self._resolve_variant_idxs_for_iter(
288            variant_ids=variant_ids,
289            variant_idxs=variant_idxs,
290            separator=separator,
291        )
292
293        n_selectors = int(selectors.size)
294        for start in range(0, n_selectors, int(chunk_size)):
295            stop = min(start + int(chunk_size), n_selectors)
296            selector_chunk = np.asarray(selectors[start:stop], dtype=np.uint32)
297            yield self.read(
298                fields=fields,
299                exclude_fields=exclude_fields,
300                sample_ids=sample_ids,
301                sample_idxs=sample_idxs,
302                variant_idxs=selector_chunk,
303                sum_strands=sum_strands,
304                separator=separator,
305            )

Stream the BED fileset in variant chunks.

This yields a sequence of SNPObject chunks along the SNP axis.

@SNPBaseReader.register

class GRGReader(snputils.snp.io.read.base.SNPBaseReader): View Source

 9@SNPBaseReader.register
10class GRGReader(SNPBaseReader):
11    def read(self,
12             mutable: Optional[bool] = None,
13             load_up_edges: Optional[bool] = None,
14             binary_mutations: Optional[bool] = None) -> GRGObject:
15        """
16        Read in a GRG or TSKit File
17        """
18        file = str(pathlib.Path(self.filename).resolve())
19        extension = pathlib.Path(file).suffix.lower()
20        edges = load_up_edges if load_up_edges is not None else True
21        binmuts = binary_mutations if binary_mutations is not None else False
22
23        if extension == ".trees":
24            return GRGObject(calldata_gt=pyg.grg_from_trees(file, binmuts), filename=file, mutable=True)
25        if mutable:
26            return GRGObject(calldata_gt=pyg.load_mutable_grg(file), filename=file, mutable=True)
27
28        return GRGObject(calldata_gt=pyg.load_immutable_grg(file, edges), filename=file, mutable=False)

Abstract class for SNP readers.

Attributes:

_filename: The path to the file storing SNP data.

def read( self, mutable: bool | None = None, load_up_edges: bool | None = None, binary_mutations: bool | None = None) -> GRGObject: View Source

11    def read(self,
12             mutable: Optional[bool] = None,
13             load_up_edges: Optional[bool] = None,
14             binary_mutations: Optional[bool] = None) -> GRGObject:
15        """
16        Read in a GRG or TSKit File
17        """
18        file = str(pathlib.Path(self.filename).resolve())
19        extension = pathlib.Path(file).suffix.lower()
20        edges = load_up_edges if load_up_edges is not None else True
21        binmuts = binary_mutations if binary_mutations is not None else False
22
23        if extension == ".trees":
24            return GRGObject(calldata_gt=pyg.grg_from_trees(file, binmuts), filename=file, mutable=True)
25        if mutable:
26            return GRGObject(calldata_gt=pyg.load_mutable_grg(file), filename=file, mutable=True)
27
28        return GRGObject(calldata_gt=pyg.load_immutable_grg(file, edges), filename=file, mutable=False)

Read a GRG or tskit (`.trees`) file into a GRGObject.

class GRGWriter: View Source

 9class GRGWriter:
10    def __init__(self, grgobj: Union[pyg.GRG, pyg.MutableGRG], filename: str):
11        self.grgobj = grgobj
12        self.mutability = False if isinstance(self.grgobj, pyg.GRG) else True
13        self.filename = filename
14    
15    def write(self, allow_simplify : Optional[bool]                         = None, 
16                    subset         : Optional[bool]                         = None,
17                    direction      : Optional[pyg.TraversalDirection]       = None,
18                    seed_list      : Optional[List[int]]                    = None,
19                    bp_range       : Optional[Tuple[int, int]]              = None):
20        """
21        """
22
23        if subset:
24            if direction is None:
25                raise ValueError("If subset is True, 'direction' must be provided.")
26            if seed_list is None:
27                raise ValueError("If subset is True, 'seed_list' must be provided.")
28            _bp_range = (0,0) if bp_range is None else bp_range
29            pyg.save_subset(self.grgobj, self.filename, direction, seed_list, _bp_range) 
30        else:
31            _allow_simplify = True if allow_simplify is None else allow_simplify 
32            pyg.save_grg(self.grgobj, self.filename, _allow_simplify)

GRGWriter(grgobj: _grgl.GRG | _grgl.MutableGRG, filename: str) View Source

10    def __init__(self, grgobj: Union[pyg.GRG, pyg.MutableGRG], filename: str):
11        self.grgobj = grgobj
12        self.mutability = False if isinstance(self.grgobj, pyg.GRG) else True
13        self.filename = filename

grgobj

mutability

filename

15    def write(self, allow_simplify : Optional[bool]                         = None, 
16                    subset         : Optional[bool]                         = None,
17                    direction      : Optional[pyg.TraversalDirection]       = None,
18                    seed_list      : Optional[List[int]]                    = None,
19                    bp_range       : Optional[Tuple[int, int]]              = None):
20        """
21        """
22
23        if subset:
24            if direction is None:
25                raise ValueError("If subset is True, 'direction' must be provided.")
26            if seed_list is None:
27                raise ValueError("If subset is True, 'seed_list' must be provided.")
28            _bp_range = (0,0) if bp_range is None else bp_range
29            pyg.save_subset(self.grgobj, self.filename, direction, seed_list, _bp_range) 
30        else:
31            _allow_simplify = True if allow_simplify is None else allow_simplify 
32            pyg.save_grg(self.grgobj, self.filename, _allow_simplify)

@SNPBaseReader.register

class PGENReader(snputils.snp.io.read.base.SNPBaseReader): View Source

 24@SNPBaseReader.register
 25class PGENReader(SNPBaseReader):
 26    def read(
 27        self,
 28        fields: Optional[List[str]] = None,
 29        exclude_fields: Optional[List[str]] = None,
 30        sample_ids: Optional[np.ndarray] = None,
 31        sample_idxs: Optional[np.ndarray] = None,
 32        variant_ids: Optional[np.ndarray] = None,
 33        variant_idxs: Optional[np.ndarray] = None,
 34        sum_strands: bool = False,
 35        separator: str = None,
 36    ) -> SNPObject:
 37        """
 38        Read a pgen fileset (pgen, psam, pvar) into a SNPObject.
 39
 40        Args:
 41            fields (str, None, or list of str, optional): Fields to extract data for that should be included in the returned SNPObject.
 42                Available fields are 'GT', 'IID', 'REF', 'ALT', '#CHROM', 'ID', 'POS', 'FILTER', 'QUAL'.
 43                To extract all fields, set fields to None. Defaults to None.
 44            exclude_fields (str, None, or list of str, optional): Fields to exclude from the returned SNPObject.
 45                Available fields are 'GT', 'IID', 'REF', 'ALT', '#CHROM', 'ID', 'POS', 'FILTER', 'QUAL'.
 46                To exclude no fields, set exclude_fields to None. Defaults to None.
 47            sample_ids: List of sample IDs to read. If None and sample_idxs is None, all samples are read.
 48            sample_idxs: List of sample indices to read. If None and sample_ids is None, all samples are read.
 49            variant_ids: List of variant IDs to read. If None and variant_idxs is None, all variants are read.
 50            variant_idxs: List of variant indices to read. If None and variant_ids is None, all variants are read.
 51            sum_strands: If True, maternal and paternal strands are combined into a single `int8` array with values `{0, 1, 2`}. 
 52                If False, strands are stored separately as an `int8` array with values `{0, 1}` for each strand. 
 53                Note: With the pgenlib backend, `False` uses `~8×` more RAM, though `calldata_gt` is only `2×` larger.
 54            separator: Separator used in the pvar file. If None, the separator is automatically detected.
 55                If the automatic detection fails, please specify the separator manually.
 56
 57        Returns:
 58            **SNPObject**: 
 59                A SNPObject instance.
 60        """
 61        assert (
 62            sample_idxs is None or sample_ids is None
 63        ), "Only one of sample_idxs and sample_ids can be specified"
 64        assert (
 65            variant_idxs is None or variant_ids is None
 66        ), "Only one of variant_idxs and variant_ids can be specified"
 67
 68        if isinstance(fields, str):
 69            fields = [fields]
 70        if isinstance(exclude_fields, str):
 71            exclude_fields = [exclude_fields]
 72
 73        fields = fields or ["GT", "IID", "REF", "ALT", "#CHROM", "ID", "POS", "FILTER", "QUAL"]
 74        exclude_fields = exclude_fields or []
 75        fields = [field for field in fields if field not in exclude_fields]
 76        only_read_pgen = fields == ["GT"] and variant_idxs is None and sample_idxs is None
 77
 78        filename_noext = str(self.filename)
 79        for ext in [".pgen", ".pvar", ".pvar.zst", ".psam"]:
 80            if filename_noext.endswith(ext):
 81                filename_noext = filename_noext[:-len(ext)]
 82                break
 83
 84        if only_read_pgen:
 85            file_num_samples = None  # Not needed for pgen
 86            file_num_variants = None  # Not needed
 87        else:
 88            pvar_extensions = [".pvar", ".pvar.zst"]
 89            pvar_filename = None
 90            for ext in pvar_extensions:
 91                possible_pvar = filename_noext + ext
 92                if os.path.exists(possible_pvar):
 93                    pvar_filename = possible_pvar
 94                    break
 95            if pvar_filename is None:
 96                raise FileNotFoundError(f"No .pvar or .pvar.zst file found for {filename_noext}")
 97
 98            log.info(f"Reading {pvar_filename}")
 99
100            pvar_has_header = True
101            pvar_header_line_num = 0
102            with _open_textfile(pvar_filename) as file:
103                for line_num, line in enumerate(file):
104                    if line.startswith("##"):  # Metadata
105                        continue
106                    else:
107                        if separator is None:
108                            separator = csv.Sniffer().sniff(file.readline()).delimiter
109                        if line.startswith("#CHROM"):  # Header
110                            pvar_header_line_num = line_num
111                            header = line.strip().split()
112                            break
113                        elif not line.startswith("#"):  # If no header, look at line 1
114                            pvar_has_header = False
115                            cols_in_pvar = len(line.strip().split(separator))
116                            if cols_in_pvar == 5:
117                                header = ["#CHROM", "ID", "POS", "ALT", "REF"]
118                            elif cols_in_pvar == 6:
119                                header = ["#CHROM", "ID", "CM", "POS", "ALT", "REF"]
120                            else:
121                                raise ValueError(
122                                    f"{pvar_filename} is not a valid pvar file."
123                                )
124                            break
125
126            pvar_reading_args = {
127                'separator': separator,
128                'skip_rows': pvar_header_line_num,
129                'has_header': pvar_has_header,
130                'new_columns': None if pvar_has_header else header,
131                'schema_overrides': {
132                    "#CHROM": pl.String,
133                    "POS": pl.UInt32,
134                    "ID": pl.String,
135                    "REF": pl.String,
136                    "ALT": pl.String,
137                },
138                'null_values': ["NA"],
139            }
140            if pvar_filename.endswith('.zst'):
141                pvar = pl.read_csv(pvar_filename, **pvar_reading_args).lazy()
142            else:
143                pvar = pl.scan_csv(pvar_filename, **pvar_reading_args)
144
145            # We need to map requested IDs to row positions before reading genotypes.
146            variant_meta = pvar.select(["ID", "#CHROM", "POS"]).with_row_index().collect()
147            file_num_variants = variant_meta.height
148
149            if variant_ids is not None:
150                variant_id_values = [str(v) for v in np.atleast_1d(variant_ids)]
151                variant_id_or_pos = (
152                    pl.col("ID").is_in(variant_id_values)
153                    | pl.concat_str(
154                        [pl.col("#CHROM"), pl.lit(":"), pl.col("POS").cast(pl.String)]
155                    ).is_in(variant_id_values)
156                )
157                variant_idxs = (
158                    variant_meta.filter(variant_id_or_pos)
159                    .select("index")
160                    .to_series()
161                    .to_numpy()
162                )
163
164            if variant_idxs is None:
165                num_variants = file_num_variants
166                variant_idxs = np.arange(num_variants, dtype=np.uint32)
167                pvar = pvar.collect()
168            else:
169                pvar = (
170                    pvar.with_row_index()
171                    .filter(pl.col("index").is_in(np.asarray(variant_idxs, dtype=np.uint32).ravel()))
172                    .collect()
173                )
174                variant_idxs = pvar.select("index").to_series().to_numpy()
175                variant_idxs = np.asarray(variant_idxs, dtype=np.uint32)
176                num_variants = np.size(variant_idxs)
177                pvar = pvar.drop("index")
178
179            log.info(f"Reading {filename_noext}.psam")
180
181            with open(filename_noext + ".psam") as file:
182                first_line = file.readline().strip()
183                psam_has_header = first_line.startswith(("#FID", "FID", "#IID", "IID"))
184
185            psam = pl.read_csv(
186                filename_noext + ".psam",
187                separator=separator,
188                has_header=psam_has_header,
189                new_columns=None if psam_has_header else ["FID", "IID", "PAT", "MAT", "SEX", "PHENO1"],
190                null_values=["NA"],
191            ).with_row_index()
192            if "#IID" in psam.columns:
193                psam = psam.rename({"#IID": "IID"})
194            if "#FID" in psam.columns:
195                psam = psam.rename({"#FID": "FID"})
196
197            file_num_samples = psam.height
198
199            if sample_ids is not None:
200                psam = psam.filter(pl.col("IID").is_in(sample_ids))
201                sample_idxs = psam.select("index").to_series().to_numpy()
202                num_samples = np.size(sample_idxs)
203            elif sample_idxs is not None:
204                num_samples = np.size(sample_idxs)
205                sample_idxs = np.array(sample_idxs, dtype=np.uint32)
206                psam = psam.filter(pl.col("index").is_in(sample_idxs))
207            else:
208                num_samples = file_num_samples
209
210        if "GT" in fields:
211            log.info(f"Reading {filename_noext}.pgen")
212            pgen_reader = pg.PgenReader(
213                str.encode(filename_noext + ".pgen"),
214                raw_sample_ct=file_num_samples,
215                variant_ct=file_num_variants,
216                sample_subset=sample_idxs,
217            )
218
219            if only_read_pgen:
220                num_samples = pgen_reader.get_raw_sample_ct()
221                num_variants = pgen_reader.get_variant_ct()
222                variant_idxs = np.arange(num_variants, dtype=np.uint32)
223
224            # required arrays: variant_idxs + sample_idxs + genotypes
225            if not sum_strands:
226                required_ram = (num_samples + num_variants + num_variants * 2 * num_samples) * 4
227            else:
228                required_ram = (num_samples + num_variants) * 4 + num_variants * num_samples
229            log.info(f">{required_ram / 1024**3:.2f} GiB of RAM are required to process {num_samples} samples with {num_variants} variants each")
230
231            if not sum_strands:
232                genotypes = np.empty((num_variants, 2 * num_samples), dtype=np.int32)  # cannot use int8 because of pgenlib
233                pgen_reader.read_alleles_list(variant_idxs, genotypes)
234                genotypes = genotypes.astype(np.int8).reshape((num_variants, num_samples, 2))
235            else:
236                genotypes = np.empty((num_variants, num_samples), dtype=np.int8)
237                pgen_reader.read_list(variant_idxs, genotypes)
238            pgen_reader.close()
239        else:
240            genotypes = None
241
242        log.info("Constructing SNPObject")
243
244        snpobj = SNPObject(
245            calldata_gt=genotypes if "GT" in fields else None,
246            samples=psam.get_column("IID").to_numpy() if "IID" in fields and "IID" in psam.columns else None,
247            **{f'variants_{k.lower()}': pvar.get_column(v).to_numpy() if v in fields and v in pvar.columns else None
248               for k, v in {'ref': 'REF', 'alt': 'ALT', 'chrom': '#CHROM', 'id': 'ID', 'pos': 'POS', 'filter_pass': 'FILTER', 'qual': 'QUAL'}.items()}
249        )
250
251        log.info("Finished constructing SNPObject")
252        return snpobj
253
254    def _resolve_variant_idxs_for_iter(
255        self,
256        *,
257        variant_ids: Optional[np.ndarray],
258        variant_idxs: Optional[np.ndarray],
259        separator: str = None,
260    ) -> np.ndarray:
261        """
262        Resolve variant selectors to canonical file-order row indices.
263        """
264        filename_noext = str(self.filename)
265        for ext in [".pgen", ".pvar", ".pvar.zst", ".psam"]:
266            if filename_noext.endswith(ext):
267                filename_noext = filename_noext[:-len(ext)]
268                break
269
270        pvar_filename = None
271        for ext in [".pvar", ".pvar.zst"]:
272            candidate = filename_noext + ext
273            if os.path.exists(candidate):
274                pvar_filename = candidate
275                break
276        if pvar_filename is None:
277            raise FileNotFoundError(f"No .pvar or .pvar.zst file found for {filename_noext}")
278
279        local_separator = separator
280
281        pvar_has_header = True
282        pvar_header_line_num = 0
283        with _open_textfile(pvar_filename) as file:
284            for line_num, line in enumerate(file):
285                if line.startswith("##"):
286                    continue
287                if local_separator is None:
288                    local_separator = csv.Sniffer().sniff(file.readline()).delimiter
289                if line.startswith("#CHROM"):
290                    pvar_header_line_num = line_num
291                    header = line.strip().split()
292                    break
293                if not line.startswith("#"):
294                    pvar_has_header = False
295                    cols_in_pvar = len(line.strip().split(local_separator))
296                    if cols_in_pvar == 5:
297                        header = ["#CHROM", "ID", "POS", "ALT", "REF"]
298                    elif cols_in_pvar == 6:
299                        header = ["#CHROM", "ID", "CM", "POS", "ALT", "REF"]
300                    else:
301                        raise ValueError(f"{pvar_filename} is not a valid pvar file.")
302                    break
303
304        pvar_reading_args = {
305            "separator": local_separator,
306            "skip_rows": pvar_header_line_num,
307            "has_header": pvar_has_header,
308            "new_columns": None if pvar_has_header else header,
309            "schema_overrides": {
310                "#CHROM": pl.String,
311                "POS": pl.UInt32,
312                "ID": pl.String,
313                "REF": pl.String,
314                "ALT": pl.String,
315            },
316            "null_values": ["NA"],
317        }
318        if pvar_filename.endswith(".zst"):
319            pvar = pl.read_csv(pvar_filename, **pvar_reading_args)
320        else:
321            pvar = pl.scan_csv(pvar_filename, **pvar_reading_args).collect()
322
323        variant_meta = pvar.select(["ID", "#CHROM", "POS"]).with_row_index()
324
325        if variant_ids is not None:
326            variant_id_values = [str(v) for v in np.atleast_1d(variant_ids)]
327            variant_id_or_pos = (
328                pl.col("ID").is_in(variant_id_values)
329                | pl.concat_str([pl.col("#CHROM"), pl.lit(":"), pl.col("POS").cast(pl.String)]).is_in(
330                    variant_id_values
331                )
332            )
333            resolved = (
334                variant_meta.filter(variant_id_or_pos)
335                .select("index")
336                .to_series()
337                .to_numpy()
338            )
339            return np.asarray(resolved, dtype=np.uint32)
340
341        if variant_idxs is not None:
342            requested = np.asarray(variant_idxs, dtype=np.uint32).ravel()
343            resolved = (
344                variant_meta.filter(pl.col("index").is_in(requested))
345                .select("index")
346                .to_series()
347                .to_numpy()
348            )
349            return np.asarray(resolved, dtype=np.uint32)
350
351        return np.arange(variant_meta.height, dtype=np.uint32)
352
353    def iter_read(
354        self,
355        fields: Optional[List[str]] = None,
356        exclude_fields: Optional[List[str]] = None,
357        sample_ids: Optional[np.ndarray] = None,
358        sample_idxs: Optional[np.ndarray] = None,
359        variant_ids: Optional[np.ndarray] = None,
360        variant_idxs: Optional[np.ndarray] = None,
361        sum_strands: bool = False,
362        separator: str = None,
363        chunk_size: int = 10_000,
364    ) -> Iterator[SNPObject]:
365        """
366        Stream the PGEN fileset in variant chunks.
367
368        This yields a sequence of SNPObject chunks along the SNP axis.
369        """
370        if chunk_size < 1:
371            raise ValueError("chunk_size must be >= 1.")
372        if sample_idxs is not None and sample_ids is not None:
373            raise ValueError("Only one of sample_idxs and sample_ids can be specified.")
374        if variant_idxs is not None and variant_ids is not None:
375            raise ValueError("Only one of variant_idxs and variant_ids can be specified.")
376
377        selectors = self._resolve_variant_idxs_for_iter(
378            variant_ids=variant_ids,
379            variant_idxs=variant_idxs,
380            separator=separator,
381        )
382
383        n_selectors = int(selectors.size)
384        for start in range(0, n_selectors, int(chunk_size)):
385            stop = min(start + int(chunk_size), n_selectors)
386            selector_chunk = np.asarray(selectors[start:stop], dtype=np.uint32)
387            yield self.read(
388                fields=fields,
389                exclude_fields=exclude_fields,
390                sample_ids=sample_ids,
391                sample_idxs=sample_idxs,
392                variant_idxs=selector_chunk,
393                sum_strands=sum_strands,
394                separator=separator,
395            )

Abstract class for SNP readers.

Attributes:

_filename: The path to the file storing SNP data.

 26    def read(
 27        self,
 28        fields: Optional[List[str]] = None,
 29        exclude_fields: Optional[List[str]] = None,
 30        sample_ids: Optional[np.ndarray] = None,
 31        sample_idxs: Optional[np.ndarray] = None,
 32        variant_ids: Optional[np.ndarray] = None,
 33        variant_idxs: Optional[np.ndarray] = None,
 34        sum_strands: bool = False,
 35        separator: str = None,
 36    ) -> SNPObject:
 37        """
 38        Read a pgen fileset (pgen, psam, pvar) into a SNPObject.
 39
 40        Args:
 41            fields (str, None, or list of str, optional): Fields to extract data for that should be included in the returned SNPObject.
 42                Available fields are 'GT', 'IID', 'REF', 'ALT', '#CHROM', 'ID', 'POS', 'FILTER', 'QUAL'.
 43                To extract all fields, set fields to None. Defaults to None.
 44            exclude_fields (str, None, or list of str, optional): Fields to exclude from the returned SNPObject.
 45                Available fields are 'GT', 'IID', 'REF', 'ALT', '#CHROM', 'ID', 'POS', 'FILTER', 'QUAL'.
 46                To exclude no fields, set exclude_fields to None. Defaults to None.
 47            sample_ids: List of sample IDs to read. If None and sample_idxs is None, all samples are read.
 48            sample_idxs: List of sample indices to read. If None and sample_ids is None, all samples are read.
 49            variant_ids: List of variant IDs to read. If None and variant_idxs is None, all variants are read.
 50            variant_idxs: List of variant indices to read. If None and variant_ids is None, all variants are read.
 51            sum_strands: If True, maternal and paternal strands are combined into a single `int8` array with values `{0, 1, 2`}. 
 52                If False, strands are stored separately as an `int8` array with values `{0, 1}` for each strand. 
 53                Note: With the pgenlib backend, `False` uses `~8×` more RAM, though `calldata_gt` is only `2×` larger.
 54            separator: Separator used in the pvar file. If None, the separator is automatically detected.
 55                If the automatic detection fails, please specify the separator manually.
 56
 57        Returns:
 58            **SNPObject**: 
 59                A SNPObject instance.
 60        """
 61        assert (
 62            sample_idxs is None or sample_ids is None
 63        ), "Only one of sample_idxs and sample_ids can be specified"
 64        assert (
 65            variant_idxs is None or variant_ids is None
 66        ), "Only one of variant_idxs and variant_ids can be specified"
 67
 68        if isinstance(fields, str):
 69            fields = [fields]
 70        if isinstance(exclude_fields, str):
 71            exclude_fields = [exclude_fields]
 72
 73        fields = fields or ["GT", "IID", "REF", "ALT", "#CHROM", "ID", "POS", "FILTER", "QUAL"]
 74        exclude_fields = exclude_fields or []
 75        fields = [field for field in fields if field not in exclude_fields]
 76        only_read_pgen = fields == ["GT"] and variant_idxs is None and sample_idxs is None
 77
 78        filename_noext = str(self.filename)
 79        for ext in [".pgen", ".pvar", ".pvar.zst", ".psam"]:
 80            if filename_noext.endswith(ext):
 81                filename_noext = filename_noext[:-len(ext)]
 82                break
 83
 84        if only_read_pgen:
 85            file_num_samples = None  # Not needed for pgen
 86            file_num_variants = None  # Not needed
 87        else:
 88            pvar_extensions = [".pvar", ".pvar.zst"]
 89            pvar_filename = None
 90            for ext in pvar_extensions:
 91                possible_pvar = filename_noext + ext
 92                if os.path.exists(possible_pvar):
 93                    pvar_filename = possible_pvar
 94                    break
 95            if pvar_filename is None:
 96                raise FileNotFoundError(f"No .pvar or .pvar.zst file found for {filename_noext}")
 97
 98            log.info(f"Reading {pvar_filename}")
 99
100            pvar_has_header = True
101            pvar_header_line_num = 0
102            with _open_textfile(pvar_filename) as file:
103                for line_num, line in enumerate(file):
104                    if line.startswith("##"):  # Metadata
105                        continue
106                    else:
107                        if separator is None:
108                            separator = csv.Sniffer().sniff(file.readline()).delimiter
109                        if line.startswith("#CHROM"):  # Header
110                            pvar_header_line_num = line_num
111                            header = line.strip().split()
112                            break
113                        elif not line.startswith("#"):  # If no header, look at line 1
114                            pvar_has_header = False
115                            cols_in_pvar = len(line.strip().split(separator))
116                            if cols_in_pvar == 5:
117                                header = ["#CHROM", "ID", "POS", "ALT", "REF"]
118                            elif cols_in_pvar == 6:
119                                header = ["#CHROM", "ID", "CM", "POS", "ALT", "REF"]
120                            else:
121                                raise ValueError(
122                                    f"{pvar_filename} is not a valid pvar file."
123                                )
124                            break
125
126            pvar_reading_args = {
127                'separator': separator,
128                'skip_rows': pvar_header_line_num,
129                'has_header': pvar_has_header,
130                'new_columns': None if pvar_has_header else header,
131                'schema_overrides': {
132                    "#CHROM": pl.String,
133                    "POS": pl.UInt32,
134                    "ID": pl.String,
135                    "REF": pl.String,
136                    "ALT": pl.String,
137                },
138                'null_values': ["NA"],
139            }
140            if pvar_filename.endswith('.zst'):
141                pvar = pl.read_csv(pvar_filename, **pvar_reading_args).lazy()
142            else:
143                pvar = pl.scan_csv(pvar_filename, **pvar_reading_args)
144
145            # We need to map requested IDs to row positions before reading genotypes.
146            variant_meta = pvar.select(["ID", "#CHROM", "POS"]).with_row_index().collect()
147            file_num_variants = variant_meta.height
148
149            if variant_ids is not None:
150                variant_id_values = [str(v) for v in np.atleast_1d(variant_ids)]
151                variant_id_or_pos = (
152                    pl.col("ID").is_in(variant_id_values)
153                    | pl.concat_str(
154                        [pl.col("#CHROM"), pl.lit(":"), pl.col("POS").cast(pl.String)]
155                    ).is_in(variant_id_values)
156                )
157                variant_idxs = (
158                    variant_meta.filter(variant_id_or_pos)
159                    .select("index")
160                    .to_series()
161                    .to_numpy()
162                )
163
164            if variant_idxs is None:
165                num_variants = file_num_variants
166                variant_idxs = np.arange(num_variants, dtype=np.uint32)
167                pvar = pvar.collect()
168            else:
169                pvar = (
170                    pvar.with_row_index()
171                    .filter(pl.col("index").is_in(np.asarray(variant_idxs, dtype=np.uint32).ravel()))
172                    .collect()
173                )
174                variant_idxs = pvar.select("index").to_series().to_numpy()
175                variant_idxs = np.asarray(variant_idxs, dtype=np.uint32)
176                num_variants = np.size(variant_idxs)
177                pvar = pvar.drop("index")
178
179            log.info(f"Reading {filename_noext}.psam")
180
181            with open(filename_noext + ".psam") as file:
182                first_line = file.readline().strip()
183                psam_has_header = first_line.startswith(("#FID", "FID", "#IID", "IID"))
184
185            psam = pl.read_csv(
186                filename_noext + ".psam",
187                separator=separator,
188                has_header=psam_has_header,
189                new_columns=None if psam_has_header else ["FID", "IID", "PAT", "MAT", "SEX", "PHENO1"],
190                null_values=["NA"],
191            ).with_row_index()
192            if "#IID" in psam.columns:
193                psam = psam.rename({"#IID": "IID"})
194            if "#FID" in psam.columns:
195                psam = psam.rename({"#FID": "FID"})
196
197            file_num_samples = psam.height
198
199            if sample_ids is not None:
200                psam = psam.filter(pl.col("IID").is_in(sample_ids))
201                sample_idxs = psam.select("index").to_series().to_numpy()
202                num_samples = np.size(sample_idxs)
203            elif sample_idxs is not None:
204                num_samples = np.size(sample_idxs)
205                sample_idxs = np.array(sample_idxs, dtype=np.uint32)
206                psam = psam.filter(pl.col("index").is_in(sample_idxs))
207            else:
208                num_samples = file_num_samples
209
210        if "GT" in fields:
211            log.info(f"Reading {filename_noext}.pgen")
212            pgen_reader = pg.PgenReader(
213                str.encode(filename_noext + ".pgen"),
214                raw_sample_ct=file_num_samples,
215                variant_ct=file_num_variants,
216                sample_subset=sample_idxs,
217            )
218
219            if only_read_pgen:
220                num_samples = pgen_reader.get_raw_sample_ct()
221                num_variants = pgen_reader.get_variant_ct()
222                variant_idxs = np.arange(num_variants, dtype=np.uint32)
223
224            # required arrays: variant_idxs + sample_idxs + genotypes
225            if not sum_strands:
226                required_ram = (num_samples + num_variants + num_variants * 2 * num_samples) * 4
227            else:
228                required_ram = (num_samples + num_variants) * 4 + num_variants * num_samples
229            log.info(f">{required_ram / 1024**3:.2f} GiB of RAM are required to process {num_samples} samples with {num_variants} variants each")
230
231            if not sum_strands:
232                genotypes = np.empty((num_variants, 2 * num_samples), dtype=np.int32)  # cannot use int8 because of pgenlib
233                pgen_reader.read_alleles_list(variant_idxs, genotypes)
234                genotypes = genotypes.astype(np.int8).reshape((num_variants, num_samples, 2))
235            else:
236                genotypes = np.empty((num_variants, num_samples), dtype=np.int8)
237                pgen_reader.read_list(variant_idxs, genotypes)
238            pgen_reader.close()
239        else:
240            genotypes = None
241
242        log.info("Constructing SNPObject")
243
244        snpobj = SNPObject(
245            calldata_gt=genotypes if "GT" in fields else None,
246            samples=psam.get_column("IID").to_numpy() if "IID" in fields and "IID" in psam.columns else None,
247            **{f'variants_{k.lower()}': pvar.get_column(v).to_numpy() if v in fields and v in pvar.columns else None
248               for k, v in {'ref': 'REF', 'alt': 'ALT', 'chrom': '#CHROM', 'id': 'ID', 'pos': 'POS', 'filter_pass': 'FILTER', 'qual': 'QUAL'}.items()}
249        )
250
251        log.info("Finished constructing SNPObject")
252        return snpobj

Read a pgen fileset (pgen, psam, pvar) into a SNPObject.

Arguments:

fields (str, None, or list of str, optional): Fields to extract data for that should be included in the returned SNPObject. Available fields are 'GT', 'IID', 'REF', 'ALT', '#CHROM', 'ID', 'POS', 'FILTER', 'QUAL'. To extract all fields, set fields to None. Defaults to None.
exclude_fields (str, None, or list of str, optional): Fields to exclude from the returned SNPObject. Available fields are 'GT', 'IID', 'REF', 'ALT', '#CHROM', 'ID', 'POS', 'FILTER', 'QUAL'. To exclude no fields, set exclude_fields to None. Defaults to None.
sample_ids: List of sample IDs to read. If None and sample_idxs is None, all samples are read.
sample_idxs: List of sample indices to read. If None and sample_ids is None, all samples are read.
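variant_ids: List of variant IDs to read. Each entry is matched against either the variant's ID column or its CHROM:POS string (e.g. '1:12345'). If None and variant_idxs is None, all variants are read.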
variant_idxs: List of variant indices to read. If None and variant_ids is None, all variants are read.
sum_strands: If True, maternal and paternal strands are combined into a single int8 array with values {0, 1, 2}. If False, strands are stored separately as an int8 array with values {0, 1} for each strand. Note: With the pgenlib backend, False uses ~8× more RAM, though calldata_gt is only 2× larger.
separator: Separator used in the pvar file (the same separator is then used to read the psam file). If None, the separator is automatically detected. If the automatic detection fails, please specify the separator manually.

Returns:

SNPObject: A SNPObject instance.

def iter_read( self, fields: List[str] | None = None, exclude_fields: List[str] | None = None, sample_ids: numpy.ndarray | None = None, sample_idxs: numpy.ndarray | None = None, variant_ids: numpy.ndarray | None = None, variant_idxs: numpy.ndarray | None = None, sum_strands: bool = False, separator: str = None, chunk_size: int = 10000) -> Iterator[SNPObject]: View Source

353    def iter_read(
354        self,
355        fields: Optional[List[str]] = None,
356        exclude_fields: Optional[List[str]] = None,
357        sample_ids: Optional[np.ndarray] = None,
358        sample_idxs: Optional[np.ndarray] = None,
359        variant_ids: Optional[np.ndarray] = None,
360        variant_idxs: Optional[np.ndarray] = None,
361        sum_strands: bool = False,
362        separator: str = None,
363        chunk_size: int = 10_000,
364    ) -> Iterator[SNPObject]:
365        """
366        Stream the PGEN fileset in variant chunks.
367
368        This yields a sequence of SNPObject chunks along the SNP axis.
369        """
370        if chunk_size < 1:
371            raise ValueError("chunk_size must be >= 1.")
372        if sample_idxs is not None and sample_ids is not None:
373            raise ValueError("Only one of sample_idxs and sample_ids can be specified.")
374        if variant_idxs is not None and variant_ids is not None:
375            raise ValueError("Only one of variant_idxs and variant_ids can be specified.")
376
377        selectors = self._resolve_variant_idxs_for_iter(
378            variant_ids=variant_ids,
379            variant_idxs=variant_idxs,
380            separator=separator,
381        )
382
383        n_selectors = int(selectors.size)
384        for start in range(0, n_selectors, int(chunk_size)):
385            stop = min(start + int(chunk_size), n_selectors)
386            selector_chunk = np.asarray(selectors[start:stop], dtype=np.uint32)
387            yield self.read(
388                fields=fields,
389                exclude_fields=exclude_fields,
390                sample_ids=sample_ids,
391                sample_idxs=sample_idxs,
392                variant_idxs=selector_chunk,
393                sum_strands=sum_strands,
394                separator=separator,
395            )

Stream the PGEN fileset in variant chunks.

This yields a sequence of SNPObject chunks along the SNP axis.

@SNPBaseReader.register

class VCFReader(snputils.snp.io.read.base.SNPBaseReader): View Source

 21@SNPBaseReader.register
 22class VCFReader(SNPBaseReader):
 23    def __init__(self, filename: Union[str, pathlib.Path]):
 24        super().__init__(filename)
 25        self._igd_path: Optional[pathlib.Path] = None
 26        self._grg_path: Optional[pathlib.Path] = None
 27        self.debug : bool = False
 28    def read(
 29        self,
 30        fields: Optional[List[str]] = None,
 31        exclude_fields: Optional[List[str]] = None,
 32        rename_fields: Optional[dict] = None,
 33        fills: Optional[dict] = None,
 34        region: Optional[str] = None,
 35        samples: Optional[List[str]] = None,
 36        sum_strands: bool = False,
 37    ) -> SNPObject:
 38        """
 39        Read a vcf file into a SNPObject.
 40
 41        Args:
 42            fields: Fields to extract data for. e.g., ['variants/CHROM', 'variants/POS',
 43                'calldata/GT']. If you are feeling lazy, you can drop the 'variants/'
 44                and 'calldata/' prefixes, in which case the fields will be matched
 45                against fields declared in the VCF header, with variants taking priority
 46                over calldata if a field with the same ID exists both in INFO and FORMAT
 47                headers. I.e., ['CHROM', 'POS', 'DP', 'GT'] will work, although watch out
 48                for fields like 'DP' which can be both INFO and FORMAT. To extract all
 49                fields, provide just the string '*'. To extract all variants fields
 50                (including all INFO fields) provide 'variants/*'. To extract all
 51                calldata fields (i.e., defined in FORMAT headers) provide 'calldata/*'.
 52            exclude_fields: Fields to exclude. E.g., for use in combination with fields='*'.
 53            rename_fields: Fields to be renamed. Should be a dictionary mapping old to new names.
 54            fills: Override the fill value used for empty values. Should be a dictionary
 55                mapping field names to fill values.
 56            region: Genomic region to extract variants for. If provided, should be a
 57                tabix-style region string, which can be either just a chromosome name
 58                (e.g., '2L'), or a chromosome name followed by 1-based beginning and
 59                end coordinates (e.g., '2L:100000-200000'). Note that only variants
 60                whose start position (POS) is within the requested range will be included.
 61                This is slightly different from the default tabix behaviour, where a
 62                variant (e.g., deletion) may be included if its position (POS) occurs
 63                before the requested region but its reference allele overlaps the
 64                region - such a variant will not be included in the data returned
 65                by this function.
 66            samples: Selection of samples to extract calldata for. If provided, should be
 67                a list of strings giving sample identifiers. May also be a list of
 68                integers giving indices of selected samples.
 69            sum_strands: If True, maternal and paternal strands are combined into a single `int8` array with values `{0, 1, 2}`. 
 70                If False, strands are stored separately as an `int8` array with values `{0, 1}` for each strand.
 71
 72        Returns:
 73            **SNPObject**: 
 74                A SNPObject instance.
 75        """
 76        log.info(f"Reading {self.filename}")
 77
 78        vcf_dict = allel.read_vcf(
 79            str(self.filename),
 80            fields=fields,
 81            exclude_fields=exclude_fields,
 82            rename_fields=rename_fields,
 83            fills=fills,
 84            region=region,
 85            samples=samples,
 86            alt_number=1,
 87        )
 88        assert vcf_dict is not None  # suppress Flake8 warning
 89
 90        genotypes = vcf_dict["calldata/GT"].astype(np.int8)
 91        if sum_strands:
 92            genotypes = genotypes.sum(axis=2, dtype=np.int8)
 93
 94        snpobj = SNPObject(
 95            calldata_gt=genotypes,
 96            samples=vcf_dict["samples"],
 97            variants_ref=vcf_dict["variants/REF"],
 98            variants_alt=vcf_dict["variants/ALT"],
 99            variants_chrom=vcf_dict["variants/CHROM"],
100            variants_filter_pass=vcf_dict["variants/FILTER_PASS"],
101            variants_id=vcf_dict["variants/ID"],
102            variants_pos=vcf_dict["variants/POS"],
103            variants_qual=vcf_dict["variants/QUAL"],
104        )
105
106        log.info(f"Finished reading {self.filename}")
107        return snpobj
108    def to_igd(self,
109                igd_file : Optional[str] = None,
110                logfile_out : Optional[str] = None,
111                logfile_err : Optional[str] = None) -> None:
112        """
113        Convert the current VCF input file to IGD via `grg convert`.
114
115        Args:
116            igd_file: Output IGD file path. Defaults to `<vcf_stem>.igd`.
117            logfile_out: The file to log standard output to. If None (default), no output will be logged (i.e., piped to dev null).
118            logfile_err: The file to log standard error to. If None (default), no error will be logged (i.e., piped to dev null).
119
120        """
121
122        if not exists(self.filename):
123            raise FileNotFoundError(f"File {self.filename} does not exist")
124
125        lf_o  : Union[int, TextIOWrapper] = subprocess.DEVNULL if logfile_out is None else open(logfile_out, "a")
126        lf_e  : Union[int, TextIOWrapper] = subprocess.DEVNULL if logfile_err is None else open(logfile_err, "a")
127        name, _ext1 = splitext(str(self.filename))
128        name, _ext2 = splitext(name)
129        if igd_file is None:
130            self._igd_path = pathlib.Path(name + ".igd")
131        else:
132            self._igd_path = pathlib.Path(igd_file)
133
134        try:
135            subprocess.run(
136                ["grg", "convert", abspath(str(self.filename)), abspath(str(self._igd_path))],
137                stdout=lf_o,
138                stderr=lf_e,
139                check=True,
140            )
141        finally:
142            if not isinstance(lf_o, int):
143                lf_o.close()
144            if not isinstance(lf_e, int):
145                lf_e.close()
146            
147    def to_grg(self,
148               range: Optional[str] = None,
149               parts: Optional[int] = None,
150               jobs: Optional[int] = None,
151               trees: Optional[int] = None,
152               binmuts: Optional[bool] = None,
153               no_file_cleanup: Optional[bool] = None,
154               maf_flip: Optional[bool] = None,
155               population_ids: Optional[str] = None,
156               mutation_batch_size: Optional[int] = None,
157               igd_file: Optional[str] = None,
158               out_file: Optional[str] = None,
159               verbose: Optional[bool] = None,
160               no_merge: Optional[bool] = None,
161               force: Optional[bool] = None,
162               logfile_out: Optional[str] = None,
163               logfile_err: Optional[str] = None
164               ) -> None:
165        """
166        Convert VCF input to a GRG file via `grg construct`.
167
168        If `igd_file` exists, it is used as construct input. If it does not
169        exist, it is first created via `to_igd` and then used for construction.
170        """
171        input_file = pathlib.Path(self.filename).resolve()
172        if igd_file is not None:
173            candidate_igd = pathlib.Path(igd_file)
174            if candidate_igd.exists():
175                self._igd_path = candidate_igd.resolve()
176            else:
177                self.to_igd(igd_file, logfile_out, logfile_err)
178            input_file = pathlib.Path(self._igd_path).resolve()
179
180        if out_file is not None:
181            self._grg_path = pathlib.Path(out_file)
182        else:
183            default_stem = splitext(str(input_file))[0]
184            if default_stem.endswith(".vcf"):
185                default_stem = splitext(default_stem)[0]
186            self._grg_path = pathlib.Path(default_stem + ".grg")
187
188        lf_o: Union[int, TextIOWrapper] = subprocess.DEVNULL if logfile_out is None else open(logfile_out, "a")
189        lf_e: Union[int, TextIOWrapper] = subprocess.DEVNULL if logfile_err is None else open(logfile_err, "a")
190        args = ["grg", "construct"]
191        args += self._setarg(range, "-r", None)
192        args += self._setarg(parts, "-p", 50)
193        args += self._setarg(jobs, "-j", multiprocessing.cpu_count())
194        args += self._setarg(trees, "-t", 16)
195        args += self._setarg(binmuts, "--binary-muts", None)
196        args += self._setarg(no_file_cleanup, "--no-file-cleanup", None)
197        args += self._setarg(maf_flip, "--maf-flip", None)
198        args += self._setarg(population_ids, "--population-ids", None)
199        args += self._setarg(mutation_batch_size, "--mutation-batch-size", None)
200        args += self._setarg(str(self._grg_path), "--out-file", None)
201        args += self._setarg(verbose, "--verbose", None)
202        args += self._setarg(no_merge, "--no-merge", None)
203        args += self._setarg(force, "--force", None)
204        args += [str(input_file)]
205        log.debug("Running grg construct command: %s", args)
206        try:
207            subprocess.run(args, stdout=lf_o, stderr=lf_e, check=True)
208        finally:
209            if not isinstance(lf_o, int):
210                lf_o.close()
211            if not isinstance(lf_e, int):
212                lf_e.close()
213
214    def _setarg(self, x: Optional[Any], flag: str, default_arg: Optional[Any] = None) -> List[str]:
215        if isinstance(x, bool):
216            return [flag] if x else []
217        if x is None and default_arg is not None:
218            return [flag, f"{default_arg}"] 
219        elif x is not None:
220            return [flag, f"{x}"]
221        else:
222            return []

Abstract class for SNP readers.

Attributes:

_filename: The path to the file storing SNP data.

VCFReader(filename: str | pathlib.Path) View Source

23    def __init__(self, filename: Union[str, pathlib.Path]):
24        super().__init__(filename)
25        self._igd_path: Optional[pathlib.Path] = None
26        self._grg_path: Optional[pathlib.Path] = None
27        self.debug : bool = False

Initialize the SNPBaseReader.

Arguments:

filename: The path to the file storing SNP data.

debug: bool

 28    def read(
 29        self,
 30        fields: Optional[List[str]] = None,
 31        exclude_fields: Optional[List[str]] = None,
 32        rename_fields: Optional[dict] = None,
 33        fills: Optional[dict] = None,
 34        region: Optional[str] = None,
 35        samples: Optional[List[str]] = None,
 36        sum_strands: bool = False,
 37    ) -> SNPObject:
 38        """
 39        Read a vcf file into a SNPObject.
 40
 41        Args:
 42            fields: Fields to extract data for. e.g., ['variants/CHROM', 'variants/POS',
 43                'calldata/GT']. If you are feeling lazy, you can drop the 'variants/'
 44                and 'calldata/' prefixes, in which case the fields will be matched
 45                against fields declared in the VCF header, with variants taking priority
 46                over calldata if a field with the same ID exists both in INFO and FORMAT
 47                headers. I.e., ['CHROM', 'POS', 'DP', 'GT'] will work, although watch out
 48                for fields like 'DP' which can be both INFO and FORMAT. To extract all
 49                fields, provide just the string '*'. To extract all variants fields
 50                (including all INFO fields) provide 'variants/*'. To extract all
 51                calldata fields (i.e., defined in FORMAT headers) provide 'calldata/*'.
 52            exclude_fields: Fields to exclude. E.g., for use in combination with fields='*'.
 53            rename_fields: Fields to be renamed. Should be a dictionary mapping old to new names.
 54            fills: Override the fill value used for empty values. Should be a dictionary
 55                mapping field names to fill values.
 56            region: Genomic region to extract variants for. If provided, should be a
 57                tabix-style region string, which can be either just a chromosome name
 58                (e.g., '2L'), or a chromosome name followed by 1-based beginning and
 59                end coordinates (e.g., '2L:100000-200000'). Note that only variants
 60                whose start position (POS) is within the requested range will be included.
 61                This is slightly different from the default tabix behaviour, where a
 62                variant (e.g., deletion) may be included if its position (POS) occurs
 63                before the requested region but its reference allele overlaps the
 64                region - such a variant will not be included in the data returned
 65                by this function.
 66            samples: Selection of samples to extract calldata for. If provided, should be
 67                a list of strings giving sample identifiers. May also be a list of
 68                integers giving indices of selected samples.
 69            sum_strands: If True, maternal and paternal strands are combined into a single `int8` array with values `{0, 1, 2}`. 
 70                If False, strands are stored separately as an `int8` array with values `{0, 1}` for each strand.
 71
 72        Returns:
 73            **SNPObject**: 
 74                A SNPObject instance.
 75        """
 76        log.info(f"Reading {self.filename}")
 77
 78        vcf_dict = allel.read_vcf(
 79            str(self.filename),
 80            fields=fields,
 81            exclude_fields=exclude_fields,
 82            rename_fields=rename_fields,
 83            fills=fills,
 84            region=region,
 85            samples=samples,
 86            alt_number=1,
 87        )
 88        assert vcf_dict is not None  # suppress Flake8 warning
 89
 90        genotypes = vcf_dict["calldata/GT"].astype(np.int8)
 91        if sum_strands:
 92            genotypes = genotypes.sum(axis=2, dtype=np.int8)
 93
 94        snpobj = SNPObject(
 95            calldata_gt=genotypes,
 96            samples=vcf_dict["samples"],
 97            variants_ref=vcf_dict["variants/REF"],
 98            variants_alt=vcf_dict["variants/ALT"],
 99            variants_chrom=vcf_dict["variants/CHROM"],
100            variants_filter_pass=vcf_dict["variants/FILTER_PASS"],
101            variants_id=vcf_dict["variants/ID"],
102            variants_pos=vcf_dict["variants/POS"],
103            variants_qual=vcf_dict["variants/QUAL"],
104        )
105
106        log.info(f"Finished reading {self.filename}")
107        return snpobj

Read a vcf file into a SNPObject.

Arguments:

fields: Fields to extract data for. e.g., ['variants/CHROM', 'variants/POS', 'calldata/GT']. If you are feeling lazy, you can drop the 'variants/' and 'calldata/' prefixes, in which case the fields will be matched against fields declared in the VCF header, with variants taking priority over calldata if a field with the same ID exists both in INFO and FORMAT headers. I.e., ['CHROM', 'POS', 'DP', 'GT'] will work, although watch out for fields like 'DP' which can be both INFO and FORMAT. To extract all fields, provide just the string '*'. To extract all variants fields (including all INFO fields) provide 'variants/*'. To extract all calldata fields (i.e., defined in FORMAT headers) provide 'calldata/*'.
exclude_fields: Fields to exclude. E.g., for use in combination with fields='*'.
rename_fields: Fields to be renamed. Should be a dictionary mapping old to new names.
fills: Override the fill value used for empty values. Should be a dictionary mapping field names to fill values.
region: Genomic region to extract variants for. If provided, should be a tabix-style region string, which can be either just a chromosome name (e.g., '2L'), or a chromosome name followed by 1-based beginning and end coordinates (e.g., '2L:100000-200000'). Note that only variants whose start position (POS) is within the requested range will be included. This is slightly different from the default tabix behaviour, where a variant (e.g., deletion) may be included if its position (POS) occurs before the requested region but its reference allele overlaps the region - such a variant will not be included in the data returned by this function.
samples: Selection of samples to extract calldata for. If provided, should be a list of strings giving sample identifiers. May also be a list of integers giving indices of selected samples.
sum_strands: If True, maternal and paternal strands are combined into a single int8 array with values {0, 1, 2}. If False, strands are stored separately as an int8 array with values {0, 1} for each strand.

Returns:

SNPObject: A SNPObject instance.

def to_igd( self, igd_file: str | None = None, logfile_out: str | None = None, logfile_err: str | None = None) -> None: View Source

108    def to_igd(self,
109                igd_file : Optional[str] = None,
110                logfile_out : Optional[str] = None,
111                logfile_err : Optional[str] = None) -> None:
112        """
113        Convert the current VCF input file to IGD via `grg convert`.
114
115        Args:
116            igd_file: Output IGD file path. Defaults to `<vcf_stem>.igd`.
117            logfile_out: The file to log standard output to. If None (default), no output will be logged (i.e., piped to dev null).
118            logfile_err: The file to log standard error to. If None (default), no error will be logged (i.e., piped to dev null).
119
120        """
121
122        if not exists(self.filename):
123            raise FileNotFoundError(f"File {self.filename} does not exist")
124
125        lf_o  : Union[int, TextIOWrapper] = subprocess.DEVNULL if logfile_out is None else open(logfile_out, "a")
126        lf_e  : Union[int, TextIOWrapper] = subprocess.DEVNULL if logfile_err is None else open(logfile_err, "a")
127        name, _ext1 = splitext(str(self.filename))
128        name, _ext2 = splitext(name)
129        if igd_file is None:
130            self._igd_path = pathlib.Path(name + ".igd")
131        else:
132            self._igd_path = pathlib.Path(igd_file)
133
134        try:
135            subprocess.run(
136                ["grg", "convert", abspath(str(self.filename)), abspath(str(self._igd_path))],
137                stdout=lf_o,
138                stderr=lf_e,
139                check=True,
140            )
141        finally:
142            if not isinstance(lf_o, int):
143                lf_o.close()
144            if not isinstance(lf_e, int):
145                lf_e.close()

Convert the current VCF input file to IGD via grg convert.

Arguments:

igd_file: Output IGD file path. Defaults to <vcf_stem>.igd.
logfile_out: The file to log standard output to. If None (default), no output will be logged (i.e., piped to dev null).
logfile_err: The file to log standard error to. If None (default), no error will be logged (i.e., piped to dev null).

147    def to_grg(self,
148               range: Optional[str] = None,
149               parts: Optional[int] = None,
150               jobs: Optional[int] = None,
151               trees: Optional[int] = None,
152               binmuts: Optional[bool] = None,
153               no_file_cleanup: Optional[bool] = None,
154               maf_flip: Optional[bool] = None,
155               population_ids: Optional[str] = None,
156               mutation_batch_size: Optional[int] = None,
157               igd_file: Optional[str] = None,
158               out_file: Optional[str] = None,
159               verbose: Optional[bool] = None,
160               no_merge: Optional[bool] = None,
161               force: Optional[bool] = None,
162               logfile_out: Optional[str] = None,
163               logfile_err: Optional[str] = None
164               ) -> None:
165        """
166        Convert VCF input to a GRG file via `grg construct`.
167
168        If `igd_file` exists, it is used as construct input. If it does not
169        exist, it is first created via `to_igd` and then used for construction.
170        """
171        input_file = pathlib.Path(self.filename).resolve()
172        if igd_file is not None:
173            candidate_igd = pathlib.Path(igd_file)
174            if candidate_igd.exists():
175                self._igd_path = candidate_igd.resolve()
176            else:
177                self.to_igd(igd_file, logfile_out, logfile_err)
178            input_file = pathlib.Path(self._igd_path).resolve()
179
180        if out_file is not None:
181            self._grg_path = pathlib.Path(out_file)
182        else:
183            default_stem = splitext(str(input_file))[0]
184            if default_stem.endswith(".vcf"):
185                default_stem = splitext(default_stem)[0]
186            self._grg_path = pathlib.Path(default_stem + ".grg")
187
188        lf_o: Union[int, TextIOWrapper] = subprocess.DEVNULL if logfile_out is None else open(logfile_out, "a")
189        lf_e: Union[int, TextIOWrapper] = subprocess.DEVNULL if logfile_err is None else open(logfile_err, "a")
190        args = ["grg", "construct"]
191        args += self._setarg(range, "-r", None)
192        args += self._setarg(parts, "-p", 50)
193        args += self._setarg(jobs, "-j", multiprocessing.cpu_count())
194        args += self._setarg(trees, "-t", 16)
195        args += self._setarg(binmuts, "--binary-muts", None)
196        args += self._setarg(no_file_cleanup, "--no-file-cleanup", None)
197        args += self._setarg(maf_flip, "--maf-flip", None)
198        args += self._setarg(population_ids, "--population-ids", None)
199        args += self._setarg(mutation_batch_size, "--mutation-batch-size", None)
200        args += self._setarg(str(self._grg_path), "--out-file", None)
201        args += self._setarg(verbose, "--verbose", None)
202        args += self._setarg(no_merge, "--no-merge", None)
203        args += self._setarg(force, "--force", None)
204        args += [str(input_file)]
205        log.debug("Running grg construct command: %s", args)
206        try:
207            subprocess.run(args, stdout=lf_o, stderr=lf_e, check=True)
208        finally:
209            if not isinstance(lf_o, int):
210                lf_o.close()
211            if not isinstance(lf_e, int):
212                lf_e.close()

Convert VCF input to a GRG file via grg construct.

If igd_file exists, it is used as construct input. If it does not exist, it is first created via to_igd and then used for construction.

class BEDWriter: View Source

 14class BEDWriter:
 15    """Writes an object in bed/bim/fam formats in the specified output path.
 16
 17    Args:
 18        snpobj: The SNPObject to be written.
 19        file: The output file path.
 20
 21    """
 22
 23    def __init__(self, snpobj: SNPObject, filename: str):
 24        self.__snpobj = snpobj.copy()
 25        self.__filename = Path(filename)
 26
 27    def write(
 28            self,
 29            rename_missing_values: bool = True, 
 30            before: Union[int, float, str] = -1, 
 31            after: Union[int, float, str] = '.'
 32        ):
 33        """
 34        Writes the SNPObject to bed/bim/fam formats.
 35
 36        Args:
 37            rename_missing_values (bool, optional):
 38                If True, renames potential missing values in `snpobj.calldata_gt` before writing. 
 39                Defaults to True.
 40            before (int, float, or str, default=-1): 
 41                The current representation of missing values in `calldata_gt`. Common values might be -1, '.', or NaN.
 42                Default is -1.
 43            after (int, float, or str, default='.'): 
 44                The value that will replace `before`. Default is '.'.
 45        """
 46        # Save .bed file
 47        if self.__filename.suffix != '.bed':
 48            self.__filename = self.__filename.with_suffix('.bed')
 49
 50        log.info(f"Writing .bed file: {self.__filename}")
 51
 52        # Optionally rename potential missing values in `snpobj.calldata_gt` before writing
 53        if rename_missing_values:
 54            self.__snpobj.rename_missings(before=before, after=after, inplace=True)
 55
 56        # If the input matrix has three dimensions, it indicates that the data is divided into two strands.
 57        if len(self.__snpobj.calldata_gt.shape) == 3:
 58            # Sum the two strands
 59            self.__snpobj.calldata_gt = self.__snpobj.calldata_gt.transpose(1, 0, 2).sum(axis=2, dtype=np.int8)
 60
 61        # Infer the number of samples and variants from the matrix
 62        samples, variants = self.__snpobj.calldata_gt.shape
 63
 64        # Define the PgenWriter to save the data
 65        data_save = pg.PgenWriter(filename=str(self.__filename).encode('utf-8'),
 66                                  sample_ct=samples,
 67                                  variant_ct=variants,
 68                                  nonref_flags=True,
 69                                  hardcall_phase_present=False,
 70                                  dosage_present=True,
 71                                  dosage_phase_present=False)
 72
 73        # Fill the data_save object with the matrix of individuals x variants
 74        for snp_i in range(0, variants):
 75            data_save.append_biallelic(np.ascontiguousarray(self.__snpobj.calldata_gt[:, snp_i]))
 76
 77        # Save the .bed file
 78        data_save.close()
 79
 80        log.info(f"Finished writing .bed file: {self.__filename}")
 81
 82        # Remove .bed from the file name
 83        if self.__filename.suffix == '.bed':
 84            self.__filename = self.__filename.with_suffix('')
 85
 86        # Save .fam file
 87        log.info(f"Writing .fam file: {self.__filename}")
 88
 89        # Fill .fam file
 90        fam_file = pd.DataFrame(columns=['fid', 'iid', 'father', 'mother', 'gender', 'trait'])
 91        fam_file['iid'] = self.__snpobj.samples
 92        fam_file['fid'] = self.__snpobj.samples
 93
 94        # Save .fam file
 95        fam_file.to_csv(self.__filename.with_suffix('.fam'), sep='\t', index=False, header=False)
 96        log.info(f"Finished writing .fam file: {self.__filename}")
 97
 98        # Save .bim file
 99        log.info(f"Writing .bim file: {self.__filename}")
100
101        # Fill .bim file
102        bim_file = pd.DataFrame(columns=['chrom', 'snp', 'cm', 'pos', 'a0', 'a1'])
103        bim_file['chrom'] = self.__snpobj.variants_chrom
104        bim_file['snp'] = self.__snpobj.variants_id
105        bim_file['cm'] = 0  # TODO: read, save and write too if available?
106        log.warning("The .bim file is being saved with 0 cM values.")
107        bim_file['pos'] = self.__snpobj.variants_pos
108        bim_file['a0'] = self.__snpobj.variants_alt
109        bim_file['a1'] = self.__snpobj.variants_ref
110
111        # Save .bim file
112        bim_file.to_csv(self.__filename.with_suffix('.bim'), sep='\t', index=False, header=False)
113        log.info(f"Finished writing .bim file: {self.__filename}")

Writes an object in bed/bim/fam formats in the specified output path.

Arguments:

snpobj: The SNPObject to be written.
filename: The output file path.

BEDWriter(snpobj: SNPObject, filename: str) View Source

    def __init__(self, snpobj: SNPObject, filename: str):
        """Store a copy of the genotype object and the output path.

        Args:
            snpobj: The SNPObject to be written; `snpobj.copy()` is stored.
            filename: The output file path; it is converted to a `Path`.
        """
        self.__snpobj = snpobj.copy()
        self.__filename = Path(filename)

def write( self, rename_missing_values: bool = True, before: int | float | str = -1, after: int | float | str = '.'): View Source

 27    def write(
 28            self,
 29            rename_missing_values: bool = True, 
 30            before: Union[int, float, str] = -1, 
 31            after: Union[int, float, str] = '.'
 32        ):
 33        """
 34        Writes the SNPObject to bed/bim/fam formats.
 35
 36        Args:
 37            rename_missing_values (bool, optional):
 38                If True, renames potential missing values in `snpobj.calldata_gt` before writing. 
 39                Defaults to True.
 40            before (int, float, or str, default=-1): 
 41                The current representation of missing values in `calldata_gt`. Common values might be -1, '.', or NaN.
 42                Default is -1.
 43            after (int, float, or str, default='.'): 
 44                The value that will replace `before`. Default is '.'.
 45        """
 46        # Save .bed file
 47        if self.__filename.suffix != '.bed':
 48            self.__filename = self.__filename.with_suffix('.bed')
 49
 50        log.info(f"Writing .bed file: {self.__filename}")
 51
 52        # Optionally rename potential missing values in `snpobj.calldata_gt` before writing
 53        if rename_missing_values:
 54            self.__snpobj.rename_missings(before=before, after=after, inplace=True)
 55
 56        # If the input matrix has three dimensions, it indicates that the data is divided into two strands.
 57        if len(self.__snpobj.calldata_gt.shape) == 3:
 58            # Sum the two strands
 59            self.__snpobj.calldata_gt = self.__snpobj.calldata_gt.transpose(1, 0, 2).sum(axis=2, dtype=np.int8)
 60
 61        # Infer the number of samples and variants from the matrix
 62        samples, variants = self.__snpobj.calldata_gt.shape
 63
 64        # Define the PgenWriter to save the data
 65        data_save = pg.PgenWriter(filename=str(self.__filename).encode('utf-8'),
 66                                  sample_ct=samples,
 67                                  variant_ct=variants,
 68                                  nonref_flags=True,
 69                                  hardcall_phase_present=False,
 70                                  dosage_present=True,
 71                                  dosage_phase_present=False)
 72
 73        # Fill the data_save object with the matrix of individuals x variants
 74        for snp_i in range(0, variants):
 75            data_save.append_biallelic(np.ascontiguousarray(self.__snpobj.calldata_gt[:, snp_i]))
 76
 77        # Save the .bed file
 78        data_save.close()
 79
 80        log.info(f"Finished writing .bed file: {self.__filename}")
 81
 82        # Remove .bed from the file name
 83        if self.__filename.suffix == '.bed':
 84            self.__filename = self.__filename.with_suffix('')
 85
 86        # Save .fam file
 87        log.info(f"Writing .fam file: {self.__filename}")
 88
 89        # Fill .fam file
 90        fam_file = pd.DataFrame(columns=['fid', 'iid', 'father', 'mother', 'gender', 'trait'])
 91        fam_file['iid'] = self.__snpobj.samples
 92        fam_file['fid'] = self.__snpobj.samples
 93
 94        # Save .fam file
 95        fam_file.to_csv(self.__filename.with_suffix('.fam'), sep='\t', index=False, header=False)
 96        log.info(f"Finished writing .fam file: {self.__filename}")
 97
 98        # Save .bim file
 99        log.info(f"Writing .bim file: {self.__filename}")
100
101        # Fill .bim file
102        bim_file = pd.DataFrame(columns=['chrom', 'snp', 'cm', 'pos', 'a0', 'a1'])
103        bim_file['chrom'] = self.__snpobj.variants_chrom
104        bim_file['snp'] = self.__snpobj.variants_id
105        bim_file['cm'] = 0  # TODO: read, save and write too if available?
106        log.warning("The .bim file is being saved with 0 cM values.")
107        bim_file['pos'] = self.__snpobj.variants_pos
108        bim_file['a0'] = self.__snpobj.variants_alt
109        bim_file['a1'] = self.__snpobj.variants_ref
110
111        # Save .bim file
112        bim_file.to_csv(self.__filename.with_suffix('.bim'), sep='\t', index=False, header=False)
113        log.info(f"Finished writing .bim file: {self.__filename}")

Writes the SNPObject to bed/bim/fam formats.

Arguments:

rename_missing_values (bool, optional): If True, renames potential missing values in snpobj.calldata_gt before writing. Defaults to True.
before (int, float, or str, default=-1): The current representation of missing values in calldata_gt. Common values might be -1, '.', or NaN. Default is -1.
after (int, float, or str, default='.'): The value that will replace before. Default is '.'.

class PGENWriter: View Source

 15class PGENWriter:
 16    """
 17    Writes a genotype object in PGEN format (.pgen, .psam, and .pvar files) in the specified output path.
 18    """
 19
 20    def __init__(self, snpobj: SNPObject, filename: str):
 21        """
 22        Initializes the PGENWriter instance.
 23
 24        Args:
 25            snpobj (SNPObject): The SNPObject containing genotype data to be written.
 26            filename (str): Base path for the output files (excluding extension).
 27        """
 28        self.__snpobj = snpobj
 29        self.__filename = Path(filename)
 30
 31    def write(
 32            self, 
 33            vzs: bool = False,
 34            rename_missing_values: bool = True, 
 35            before: Union[int, float, str] = -1, 
 36            after: Union[int, float, str] = '.'
 37        ):
 38        """
 39        Writes the SNPObject data to .pgen, .psam, and .pvar files.
 40
 41        Args:
 42            vzs (bool, optional): 
 43                If True, compresses the .pvar file using zstd and saves it as .pvar.zst. Defaults to False.
 44            rename_missing_values (bool, optional):
 45                If True, renames potential missing values in `snpobj.calldata_gt` before writing. 
 46                Defaults to True.
 47            before (int, float, or str, default=-1): 
 48                The current representation of missing values in `calldata_gt`. Common values might be -1, '.', or NaN.
 49                Default is -1.
 50            after (int, float, or str, default='.'): 
 51                The value that will replace `before`. Default is '.'.
 52        """
 53        file_extensions = (".pgen", ".psam", ".pvar", ".pvar.zst")
 54        if self.__filename.suffix in file_extensions:
 55            self.__filename = self.__filename.with_suffix('')
 56
 57        # Optionally rename potential missing values in `snpobj.calldata_gt` before writing
 58        if rename_missing_values:
 59            self.__snpobj.rename_missings(before=before, after=after, inplace=True)
 60
 61        self.write_pvar(vzs=vzs)
 62        self.write_psam()
 63        self.write_pgen()
 64
 65    def write_pvar(self, vzs: bool = False):
 66        """
 67        Writes variant data to the .pvar file.
 68
 69        Args:
 70            vzs (bool, optional): If True, compresses the .pvar file using zstd and saves it as .pvar.zst. Defaults to False.
 71        """
 72        output_filename = f"{self.__filename}.pvar"
 73        if vzs:
 74            output_filename += ".zst"
 75            log.info(f"Writing to {output_filename} (compressed)")
 76        else:
 77            log.info(f"Writing to {output_filename}")
 78
 79        df = pl.DataFrame(
 80            {
 81                "#CHROM": self.__snpobj.variants_chrom,
 82                "POS": self.__snpobj.variants_pos,
 83                "ID": self.__snpobj.variants_id,
 84                "REF": self.__snpobj.variants_ref,
 85                "ALT": self.__snpobj.variants_alt,
 86                "FILTER": self.__snpobj.variants_filter_pass,
 87                # TODO: add INFO column to SNPObject and write it to the .pvar file? (if not it's lost)
 88            }
 89        )
 90        # TODO: add header to the .pvar file, if not it's lost
 91
 92        # Write the DataFrame to a CSV string
 93        csv_data = df.write_csv(None, separator="\t")
 94
 95        if vzs:
 96            # Compress the CSV data using zstd
 97            cctx = zstd.ZstdCompressor()
 98            compressed_data = cctx.compress(csv_data.encode('utf-8'))
 99            with open(output_filename, 'wb') as f:
100                f.write(compressed_data)
101        else:
102            with open(output_filename, 'w') as f:
103                f.write(csv_data)
104
105    def write_psam(self):
106        """
107        Writes sample metadata to the .psam file.
108        """
109        log.info(f"Writing {self.__filename}.psam")
110        df = pl.DataFrame(
111            {
112                "#IID": self.__snpobj.samples,
113                "SEX": "NA",  # Add SEX as nan for now
114                # TODO: add SEX as Optional column to SNPObject and write it to the .psam file (if not it's lost)
115            }
116        )
117        df.write_csv(f"{self.__filename}.psam", separator="\t")
118
119    def write_pgen(self):
120        """
121        Writes the genotype data to a .pgen file.
122        """
123        log.info(f"Writing to {self.__filename}.pgen")
124        summed_strands = False if self.__snpobj.calldata_gt.ndim == 3 else True
125        if not summed_strands:
126            num_variants, num_samples, num_alleles = self.__snpobj.calldata_gt.shape
127            # Flatten the genotype matrix for pgenlib
128            flat_genotypes = self.__snpobj.calldata_gt.reshape(
129                num_variants, num_samples * num_alleles
130            )
131            with pg.PgenWriter(
132                filename=f"{self.__filename}.pgen".encode('utf-8'),
133                sample_ct=num_samples,
134                variant_ct=num_variants,
135                hardcall_phase_present=True,
136            ) as writer:
137                for variant_index in range(num_variants):
138                    writer.append_alleles(
139                        flat_genotypes[variant_index].astype(np.int32), all_phased=True
140                    )
141        else:
142            num_variants, num_samples = self.__snpobj.calldata_gt.shape
143            # Transpose to (samples, variants)
144            genotypes = self.__snpobj.calldata_gt.T  # Shape is (samples, variants)
145            with pg.PgenWriter(
146                filename=f"{self.__filename}.pgen".encode('utf-8'),
147                sample_ct=num_samples,
148                variant_ct=num_variants,
149                hardcall_phase_present=False,
150            ) as writer:
151                for variant_index in range(num_variants):
152                    variant_genotypes = genotypes[:, variant_index].astype(np.int8)
153                    # Map missing genotypes to -9 if necessary
154                    variant_genotypes[variant_genotypes == -1] = -9
155                    writer.append_biallelic(np.ascontiguousarray(variant_genotypes))

Writes a genotype object in PGEN format (.pgen, .psam, and .pvar files) in the specified output path.

PGENWriter(snpobj: SNPObject, filename: str) View Source

    def __init__(self, snpobj: SNPObject, filename: str):
        """
        Initializes the PGENWriter instance.

        Args:
            snpobj (SNPObject): The SNPObject containing genotype data to be written.
                It is stored as-is (no copy is made here).
            filename (str): Base path for the output files (excluding extension).
                It is converted to a `Path`.
        """
        self.__snpobj = snpobj
        self.__filename = Path(filename)

Initializes the PGENWriter instance.

Arguments:

snpobj (SNPObject): The SNPObject containing genotype data to be written.
filename (str): Base path for the output files (excluding extension).

def write( self, vzs: bool = False, rename_missing_values: bool = True, before: int | float | str = -1, after: int | float | str = '.'): View Source

31    def write(
32            self, 
33            vzs: bool = False,
34            rename_missing_values: bool = True, 
35            before: Union[int, float, str] = -1, 
36            after: Union[int, float, str] = '.'
37        ):
38        """
39        Writes the SNPObject data to .pgen, .psam, and .pvar files.
40
41        Args:
42            vzs (bool, optional): 
43                If True, compresses the .pvar file using zstd and saves it as .pvar.zst. Defaults to False.
44            rename_missing_values (bool, optional):
45                If True, renames potential missing values in `snpobj.calldata_gt` before writing. 
46                Defaults to True.
47            before (int, float, or str, default=-1): 
48                The current representation of missing values in `calldata_gt`. Common values might be -1, '.', or NaN.
49                Default is -1.
50            after (int, float, or str, default='.'): 
51                The value that will replace `before`. Default is '.'.
52        """
53        file_extensions = (".pgen", ".psam", ".pvar", ".pvar.zst")
54        if self.__filename.suffix in file_extensions:
55            self.__filename = self.__filename.with_suffix('')
56
57        # Optionally rename potential missing values in `snpobj.calldata_gt` before writing
58        if rename_missing_values:
59            self.__snpobj.rename_missings(before=before, after=after, inplace=True)
60
61        self.write_pvar(vzs=vzs)
62        self.write_psam()
63        self.write_pgen()

Writes the SNPObject data to .pgen, .psam, and .pvar files.

Arguments:

vzs (bool, optional): If True, compresses the .pvar file using zstd and saves it as .pvar.zst. Defaults to False.
rename_missing_values (bool, optional): If True, renames potential missing values in snpobj.calldata_gt before writing. Defaults to True.
before (int, float, or str, default=-1): The current representation of missing values in calldata_gt. Common values might be -1, '.', or NaN. Default is -1.
after (int, float, or str, default='.'): The value that will replace before. Default is '.'.

def write_pvar(self, vzs: bool = False): View Source

 65    def write_pvar(self, vzs: bool = False):
 66        """
 67        Writes variant data to the .pvar file.
 68
 69        Args:
 70            vzs (bool, optional): If True, compresses the .pvar file using zstd and saves it as .pvar.zst. Defaults to False.
 71        """
 72        output_filename = f"{self.__filename}.pvar"
 73        if vzs:
 74            output_filename += ".zst"
 75            log.info(f"Writing to {output_filename} (compressed)")
 76        else:
 77            log.info(f"Writing to {output_filename}")
 78
 79        df = pl.DataFrame(
 80            {
 81                "#CHROM": self.__snpobj.variants_chrom,
 82                "POS": self.__snpobj.variants_pos,
 83                "ID": self.__snpobj.variants_id,
 84                "REF": self.__snpobj.variants_ref,
 85                "ALT": self.__snpobj.variants_alt,
 86                "FILTER": self.__snpobj.variants_filter_pass,
 87                # TODO: add INFO column to SNPObject and write it to the .pvar file? (if not it's lost)
 88            }
 89        )
 90        # TODO: add header to the .pvar file, if not it's lost
 91
 92        # Write the DataFrame to a CSV string
 93        csv_data = df.write_csv(None, separator="\t")
 94
 95        if vzs:
 96            # Compress the CSV data using zstd
 97            cctx = zstd.ZstdCompressor()
 98            compressed_data = cctx.compress(csv_data.encode('utf-8'))
 99            with open(output_filename, 'wb') as f:
100                f.write(compressed_data)
101        else:
102            with open(output_filename, 'w') as f:
103                f.write(csv_data)

Writes variant data to the .pvar file.

Arguments:

vzs (bool, optional): If True, compresses the .pvar file using zstd and saves it as .pvar.zst. Defaults to False.

def write_psam(self): View Source

105    def write_psam(self):
106        """
107        Writes sample metadata to the .psam file.
108        """
109        log.info(f"Writing {self.__filename}.psam")
110        df = pl.DataFrame(
111            {
112                "#IID": self.__snpobj.samples,
113                "SEX": "NA",  # Add SEX as nan for now
114                # TODO: add SEX as Optional column to SNPObject and write it to the .psam file (if not it's lost)
115            }
116        )
117        df.write_csv(f"{self.__filename}.psam", separator="\t")

Writes sample metadata to the .psam file.

def write_pgen(self): View Source

119    def write_pgen(self):
120        """
121        Writes the genotype data to a .pgen file.
122        """
123        log.info(f"Writing to {self.__filename}.pgen")
124        summed_strands = False if self.__snpobj.calldata_gt.ndim == 3 else True
125        if not summed_strands:
126            num_variants, num_samples, num_alleles = self.__snpobj.calldata_gt.shape
127            # Flatten the genotype matrix for pgenlib
128            flat_genotypes = self.__snpobj.calldata_gt.reshape(
129                num_variants, num_samples * num_alleles
130            )
131            with pg.PgenWriter(
132                filename=f"{self.__filename}.pgen".encode('utf-8'),
133                sample_ct=num_samples,
134                variant_ct=num_variants,
135                hardcall_phase_present=True,
136            ) as writer:
137                for variant_index in range(num_variants):
138                    writer.append_alleles(
139                        flat_genotypes[variant_index].astype(np.int32), all_phased=True
140                    )
141        else:
142            num_variants, num_samples = self.__snpobj.calldata_gt.shape
143            # Transpose to (samples, variants)
144            genotypes = self.__snpobj.calldata_gt.T  # Shape is (samples, variants)
145            with pg.PgenWriter(
146                filename=f"{self.__filename}.pgen".encode('utf-8'),
147                sample_ct=num_samples,
148                variant_ct=num_variants,
149                hardcall_phase_present=False,
150            ) as writer:
151                for variant_index in range(num_variants):
152                    variant_genotypes = genotypes[:, variant_index].astype(np.int8)
153                    # Map missing genotypes to -9 if necessary
154                    variant_genotypes[variant_genotypes == -1] = -9
155                    writer.append_biallelic(np.ascontiguousarray(variant_genotypes))

Writes the genotype data to a .pgen file.

class VCFWriter: View Source

 14class VCFWriter:
 15    """
 16    A writer class for exporting SNP data from a `snputils.snp.genobj.SNPObject` 
 17    into an `.vcf` file.
 18    """
 19    def __init__(self, snpobj: SNPObject, filename: str, n_jobs: int = -1, phased: bool = False):
 20        """
 21        Args:
 22            snpobj (SNPObject):
 23                A SNPObject instance.
 24            file (str or pathlib.Path): 
 25                Path to the file where the data will be saved. It should end with `.vcf`. 
 26                If the provided path does not have this extension, the `.vcf` extension will be appended.
 27            n_jobs: 
 28                Number of jobs to run in parallel. 
 29                - `None`: use 1 job unless within a `joblib.parallel_backend` context.  
 30                - `-1`: use all available processors.  
 31                - Any other integer: use the specified number of jobs.
 32            phased: 
 33                If True, genotype data is written in "maternal|paternal" format.  
 34                If False, genotype data is written in "maternal/paternal" format.
 35        """
 36        self.__snpobj = snpobj
 37        self.__filename = Path(filename)
 38        self.__n_jobs = n_jobs
 39        self.__phased = phased
 40
 41    def write(
 42            self,
 43            chrom_partition: bool = False,
 44            rename_missing_values: bool = True,
 45            before: Union[int, float, str] = -1,
 46            after: Union[int, float, str] = '.',
 47            variants_info: Optional[Sequence[str]] = None,
 48        ):
 49        """
 50        Writes the SNP data to VCF file(s).
 51
 52        Args:
 53            chrom_partition (bool, optional):
 54                If True, individual VCF files are generated for each chromosome.
 55                If False, a single VCF file containing data for all chromosomes is created. Defaults to False.
 56            rename_missing_values (bool, optional):
 57                If True, renames potential missing values in `snpobj.calldata_gt` before writing.
 58                Defaults to True.
 59            before (int, float, or str, default=-1):
 60                The current representation of missing values in `calldata_gt`. Common values might be -1, '.', or NaN.
 61                Default is -1.
 62            after (int, float, or str, default='.'):
 63                The value that will replace `before`. Default is '.'.
 64            variants_info (sequence of str, optional):
 65                Per-variant INFO column values (e.g. ``["END=2000", "END=3000"]``). Length must match variant count.
 66                When provided, a ##INFO header line for END is written if any value contains ``END=``.
 67        """
 68        self.__chrom_partition = chrom_partition
 69
 70        file_extensions = (".vcf", ".bcf")
 71        if self.__filename.suffix in file_extensions:
 72            self.__file_extension = self.__filename.suffix
 73            self.__filename = self.__filename.with_suffix('')
 74        else:
 75            self.__file_extension = ".vcf"
 76
 77        # Optionally rename potential missing values in `snpobj.calldata_gt` before writing
 78        if rename_missing_values:
 79            self.__snpobj.rename_missings(before=before, after=after, inplace=True)
 80
 81        data = self.__snpobj
 82
 83        if self.__chrom_partition:
 84            chroms = data.unique_chrom
 85
 86            for chrom in chroms:
 87                data_chrom = data.filter_variants(chrom=chrom, inplace=False)
 88                if variants_info is not None:
 89                    mask = data.variants_chrom == chrom
 90                    info_chrom = [variants_info[i] for i in np.where(mask)[0]]
 91                else:
 92                    info_chrom = None
 93                log.debug(f'Storing chromosome {chrom}')
 94                self._write_chromosome_data(chrom, data_chrom, info_chrom)
 95        else:
 96            self._write_chromosome_data("All", data, variants_info)
 97
 98    def _write_chromosome_data(
 99        self, chrom, data_chrom, variants_info: Optional[Sequence[str]] = None
100    ):
101        """
102        Writes the SNP data for a specific chromosome to a VCF file.
103
104        Args:
105            chrom: The chromosome name.
106            data_chrom: The SNPObject instance containing the data for the chromosome.
107            variants_info: Optional per-variant INFO strings; length must match variant count.
108        """
109        npy3 = data_chrom.calldata_gt
110        n_windows, n_samples, _ = npy3.shape
111
112        if chrom == "All":
113            file = self.__filename.with_suffix(self.__file_extension)
114        else:
115            file = self.__filename.parent / f"{self.__filename.stem}_{chrom}{self.__file_extension}"
116
117        out = open(file, "w")
118        out.write("##fileformat=VCFv4.1\n")
119        out.write('##FORMAT=<ID=GT,Number=1,Type=String,Description="Phased Genotype">\n')
120        if variants_info is not None and any("END=" in s for s in variants_info):
121            out.write('##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the segment">\n')
122        for c in set(data_chrom.variants_chrom):
123            out.write(f"##contig=<ID={c}>\n")
124        cols = ["#CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT"] + list(data_chrom.samples)
125        out.write("\t".join(cols) + "\n")
126
127        sep = "|" if self.__phased else "/"
128        for i in range(n_windows):
129            chrom_val = data_chrom.variants_chrom[i]
130            pos = data_chrom.variants_pos[i]
131            vid = data_chrom.variants_id[i]
132            ref = data_chrom.variants_ref[i]
133            alt = data_chrom.variants_alt[i]
134            info_str = variants_info[i] if variants_info is not None else "."
135            row = npy3[i]
136            genotypes = [
137                f"{row[s,0]}{sep}{row[s,1]}"
138                for s in range(n_samples)
139            ]
140            line = "\t".join([
141                str(chrom_val), str(pos), vid, ref, alt,
142                ".", "PASS", info_str, "GT", *genotypes
143            ])
144            out.write(line + "\n")
145        out.close()

A writer class for exporting SNP data from a snputils.snp.genobj.SNPObject into a .vcf file.

VCFWriter( snpobj: SNPObject, filename: str, n_jobs: int = -1, phased: bool = False) View Source

    def __init__(self, snpobj: SNPObject, filename: str, n_jobs: int = -1, phased: bool = False):
        """
        Store the writer's inputs; nothing is written until `write` is called.

        Args:
            snpobj (SNPObject):
                A SNPObject instance. It is stored as-is (no copy is made here).
            filename (str or pathlib.Path):
                Path to the file where the data will be saved. A path ending in `.vcf` or `.bcf`
                keeps that extension; for any other path the `.vcf` extension is used.
            n_jobs:
                Number of jobs to run in parallel.
                - `None`: use 1 job unless within a `joblib.parallel_backend` context.
                - `-1`: use all available processors.
                - Any other integer: use the specified number of jobs.
            phased:
                If True, genotype data is written in "maternal|paternal" format.
                If False, genotype data is written in "maternal/paternal" format.
        """
        self.__snpobj = snpobj
        self.__filename = Path(filename)
        self.__n_jobs = n_jobs
        self.__phased = phased

Arguments:

snpobj (SNPObject): A SNPObject instance.
filename (str or pathlib.Path): Path to the file where the data will be saved. It should end with .vcf. If the provided path does not have this extension, the .vcf extension will be appended.
n_jobs: Number of jobs to run in parallel.
- None: use 1 job unless within a joblib.parallel_backend context.
- -1: use all available processors.
- Any other integer: use the specified number of jobs.
phased: If True, genotype data is written in "maternal|paternal" format.
If False, genotype data is written in "maternal/paternal" format.

41    def write(
42            self,
43            chrom_partition: bool = False,
44            rename_missing_values: bool = True,
45            before: Union[int, float, str] = -1,
46            after: Union[int, float, str] = '.',
47            variants_info: Optional[Sequence[str]] = None,
48        ):
49        """
50        Writes the SNP data to VCF file(s).
51
52        Args:
53            chrom_partition (bool, optional):
54                If True, individual VCF files are generated for each chromosome.
55                If False, a single VCF file containing data for all chromosomes is created. Defaults to False.
56            rename_missing_values (bool, optional):
57                If True, renames potential missing values in `snpobj.calldata_gt` before writing.
58                Defaults to True.
59            before (int, float, or str, default=-1):
60                The current representation of missing values in `calldata_gt`. Common values might be -1, '.', or NaN.
61                Default is -1.
62            after (int, float, or str, default='.'):
63                The value that will replace `before`. Default is '.'.
64            variants_info (sequence of str, optional):
65                Per-variant INFO column values (e.g. ``["END=2000", "END=3000"]``). Length must match variant count.
66                When provided, a ##INFO header line for END is written if any value contains ``END=``.
67        """
68        self.__chrom_partition = chrom_partition
69
70        file_extensions = (".vcf", ".bcf")
71        if self.__filename.suffix in file_extensions:
72            self.__file_extension = self.__filename.suffix
73            self.__filename = self.__filename.with_suffix('')
74        else:
75            self.__file_extension = ".vcf"
76
77        # Optionally rename potential missing values in `snpobj.calldata_gt` before writing
78        if rename_missing_values:
79            self.__snpobj.rename_missings(before=before, after=after, inplace=True)
80
81        data = self.__snpobj
82
83        if self.__chrom_partition:
84            chroms = data.unique_chrom
85
86            for chrom in chroms:
87                data_chrom = data.filter_variants(chrom=chrom, inplace=False)
88                if variants_info is not None:
89                    mask = data.variants_chrom == chrom
90                    info_chrom = [variants_info[i] for i in np.where(mask)[0]]
91                else:
92                    info_chrom = None
93                log.debug(f'Storing chromosome {chrom}')
94                self._write_chromosome_data(chrom, data_chrom, info_chrom)
95        else:
96            self._write_chromosome_data("All", data, variants_info)

Writes the SNP data to VCF file(s).

Arguments:

chrom_partition (bool, optional): If True, individual VCF files are generated for each chromosome. If False, a single VCF file containing data for all chromosomes is created. Defaults to False.
rename_missing_values (bool, optional): If True, renames potential missing values in snpobj.calldata_gt before writing. Defaults to True.
before (int, float, or str, default=-1): The current representation of missing values in calldata_gt. Common values might be -1, '.', or NaN. Default is -1.
after (int, float, or str, default='.'): The value that will replace before. Default is '.'.
variants_info (sequence of str, optional): Per-variant INFO column values (e.g. ["END=2000", "END=3000"]). Length must match variant count. When provided, a ##INFO header line for END is written if any value contains END=.

def read_snp( filename: str | pathlib.Path, **kwargs) -> SNPObject: View Source

def read_snp(filename: Union[str, pathlib.Path], **kwargs) -> SNPObject:
    """
    Read a genotype file into a SNPObject, letting `SNPReader` pick the format.

    `SNPReader` is imported inside the function, so the reader module is only
    loaded when this function is actually called.

    Args:
        filename: Path of the file to read.
        **kwargs: Extra keyword arguments forwarded unchanged to the reader's `read` method.

    Returns:
        Whatever `SNPReader(filename).read(**kwargs)` returns.

    Raises:
        ValueError: If the filename does not have an extension or the extension is not supported.
    """
    from snputils.snp.io.read.auto import SNPReader

    auto_reader = SNPReader(filename)
    return auto_reader.read(**kwargs)

Automatically detect the file format and read it into a SNPObject.

Arguments:

filename: Filename of the file to read.
**kwargs: Additional arguments passed to the reader method.

Raises:

ValueError: If the filename does not have an extension or the extension is not supported.

def read_bed( filename: str | pathlib.Path, **kwargs) -> SNPObject: View Source

def read_bed(filename: Union[str, pathlib.Path], **kwargs) -> SNPObject:
    """
    Load a BED fileset and return it as a SNPObject.

    `BEDReader` is imported inside the function, so the reader module is only
    loaded when this function is actually called.

    Args:
        filename: Path of the BED fileset to read.
        **kwargs: Extra keyword arguments forwarded unchanged to the reader's `read` method.
            See :class:`snputils.snp.io.read.bed.BEDReader` for possible parameters.

    Returns:
        Whatever `BEDReader(filename).read(**kwargs)` returns.
    """
    from snputils.snp.io.read.bed import BEDReader

    bed_reader = BEDReader(filename)
    return bed_reader.read(**kwargs)

Read a BED fileset into a SNPObject.

Arguments:

filename: Filename of the BED fileset to read.
**kwargs: Additional arguments passed to the reader method. See snputils.snp.io.read.bed.BEDReader for possible parameters.

def read_pgen( filename: str | pathlib.Path, **kwargs) -> SNPObject: View Source

def read_pgen(filename: Union[str, pathlib.Path], **kwargs) -> SNPObject:
    """
    Load a PGEN fileset and return it as a SNPObject.

    `PGENReader` is imported inside the function, so the reader module is only
    loaded when this function is actually called.

    Args:
        filename: Path of the PGEN fileset to read.
        **kwargs: Extra keyword arguments forwarded unchanged to the reader's `read` method.
            See :class:`snputils.snp.io.read.pgen.PGENReader` for possible parameters.

    Returns:
        Whatever `PGENReader(filename).read(**kwargs)` returns.
    """
    from snputils.snp.io.read.pgen import PGENReader

    pgen_reader = PGENReader(filename)
    return pgen_reader.read(**kwargs)

Read a PGEN fileset into a SNPObject.

Arguments:

filename: Filename of the PGEN fileset to read.
**kwargs: Additional arguments passed to the reader method. See snputils.snp.io.read.pgen.PGENReader for possible parameters.

def read_vcf( filename: str | pathlib.Path, backend: str = 'polars', **kwargs) -> SNPObject: View Source

def read_vcf(filename: Union[str, pathlib.Path],
             backend: str = 'polars',
             **kwargs) -> SNPObject:
    """
    Read a VCF file into a SNPObject.

    Args:
        filename: Filename of the VCF file to read.
        backend: Backend to use for reading the VCF file. Options are 'polars' or 'scikit-allel'.
            Any value other than 'polars' selects the scikit-allel reader.
        **kwargs: Additional arguments passed to the reader method. See :class:`snputils.snp.io.read.vcf.VCFReader` for possible parameters.

    Returns:
        The object returned by the selected reader's `read(**kwargs)`.
    """
    import logging

    from snputils.snp.io.read.vcf import VCFReader, VCFReaderPolars

    # Report the chosen backend through logging rather than print(): a library
    # function should not write to stdout unconditionally, and callers can now
    # silence or redirect the message through the logging configuration.
    logger = logging.getLogger(__name__)

    if backend == 'polars':
        logger.info("Reading %s with polars backend", filename)
        return VCFReaderPolars(filename).read(**kwargs)

    logger.info("Reading %s with scikit-allel backend", filename)
    return VCFReader(filename).read(**kwargs)

Read a VCF file into a SNPObject.

Arguments:

filename: Filename of the VCF file to read.
backend: Backend to use for reading the VCF file. Options are 'polars' or 'scikit-allel'.
**kwargs: Additional arguments passed to the reader method. See snputils.snp.io.read.vcf.VCFReader for possible parameters.

def read_grg( filename: str | pathlib.Path, **kwargs) -> GRGObject: View Source

def read_grg(filename: Union[str, pathlib.Path], **kwargs) -> "GRGObject":
    """
    Load a GRG file and return it as a GRGObject.

    Args:
        filename: Path of the GRG file to read.
        **kwargs: Extra keyword arguments forwarded unchanged to the reader's `read` method.

    Returns:
        Whatever `GRGReader(filename).read(**kwargs)` returns.

    Raises:
        ImportError: If importing the GRG reader fails because the module
            named 'pygrgl' cannot be found.
        ModuleNotFoundError: Re-raised unchanged when any other module is missing.
    """
    try:
        from snputils.snp.io.read.grg import GRGReader
    except ModuleNotFoundError as import_error:
        # Only the missing optional dependency gets the friendlier message;
        # every other missing module propagates untouched.
        if import_error.name != "pygrgl":
            raise
        raise ImportError(
            "GRG support requires the optional dependency 'pygrgl'. "
            "Install it with: pip install pygrgl"
        ) from import_error

    grg_reader = GRGReader(filename)
    return grg_reader.read(**kwargs)

Read a GRG file into a GRGObject.

Arguments:

filename: Filename of the GRG file to read.
**kwargs: Additional arguments passed to the reader method.

class MSPReader(snputils.ancestry.io.local.read.base.LAIBaseReader): View Source

 30class MSPReader(LAIBaseReader):
 31    """
 32    A reader class for parsing Local Ancestry Inference (LAI) data from an `.msp` or `msp.tsv` file
 33    and constructing a `snputils.ancestry.genobj.LocalAncestryObject`.
 34    """
 35    def __init__(self, file: Union[str, Path]) -> None:
 36        """
 37        Args:
 38            file (str or pathlib.Path): 
 39                Path to the file to be read. It should end with `.msp` or `.msp.tsv`.
 40        """
 41        self.__file = Path(file)
 42
 43    @property
 44    def file(self) -> Path:
 45        """
 46        Retrieve `file`.
 47
 48        Returns:
 49            **pathlib.Path:** 
 50                Path to the file to be read. It should end with `.msp` or `.msp.tsv`.
 51        """
 52        return self.__file
 53
 54    def _get_samples(self, msp_df: pd.DataFrame, first_lai_col_indx: int) -> List[str]:
 55        """
 56        Extract unique sample identifiers from the pandas DataFrame.
 57
 58        Args:
 59            msp_df (pd.DataFrame): 
 60                The DataFrame representing the `.msp` data, including LAI columns.
 61            first_lai_col_indx (int): 
 62                Index of the first column containing LAI data.
 63
 64        Returns:
 65            **list:** List of unique sample identifiers.
 66        """
 67        # Get all columns starting from the first LAI data column
 68        query_samples_dub = msp_df.columns[first_lai_col_indx:]
 69
 70        # Select only one of the maternal/paternal samples by taking every second sample
 71        single_ind_idx = np.arange(0, len(query_samples_dub), 2)
 72        query_samples_sing = query_samples_dub[single_ind_idx]
 73
 74        # Remove the suffix from sample names to get clean identifiers
 75        query_samples = [qs[:-2] for qs in query_samples_sing]
 76
 77        return query_samples
 78
 79    def _get_samples_from_haplotypes(self, haplotypes: List[str]) -> List[str]:
 80        query_samples_dub = np.asarray(haplotypes, dtype=object)
 81        single_ind_idx = np.arange(0, len(query_samples_dub), 2)
 82        query_samples_sing = query_samples_dub[single_ind_idx]
 83        return [str(qs)[:-2] for qs in query_samples_sing]
 84
 85    def _parse_header_and_comment(self) -> tuple[Optional[str], List[str]]:
 86        with open(self.file) as f:
 87            first_line = f.readline()
 88            second_line = f.readline()
 89
 90        first_line_ = [h.strip() for h in first_line.split("\t")]
 91        second_line_ = [h.strip() for h in second_line.split("\t")]
 92
 93        if "#chm" in first_line_:
 94            return None, first_line_
 95        if "#chm" in second_line_:
 96            return first_line, second_line_
 97
 98        raise ValueError(
 99            f"Header not found. Expected '#chm' in the first two lines. "
100            f"First line: {first_line.strip()} | Second line: {second_line.strip()}"
101        )
102
103    def _get_first_lai_col_indx(self, header: List[str]) -> int:
104        column_counter = 1
105        if "spos" in header and "epos" in header:
106            column_counter += 2
107        if "sgpos" in header and "egpos" in header:
108            column_counter += 2
109        if "n snps" in header:
110            column_counter += 1
111        return column_counter
112
113    def read_metadata(self) -> MSPMetadata:
114        comment, header = self._parse_header_and_comment()
115
116        if len(header) != len(set(header)):
117            raise ValueError("Duplicate columns detected in the header.")
118
119        first_lai_col_indx = self._get_first_lai_col_indx(header)
120        haplotypes = header[first_lai_col_indx:]
121        samples = self._get_samples_from_haplotypes(haplotypes)
122        ancestry_map = self._get_ancestry_map_from_comment(comment) if comment is not None else None
123
124        return MSPMetadata(
125            header=header,
126            comment=comment,
127            first_lai_col_indx=first_lai_col_indx,
128            haplotypes=haplotypes,
129            samples=samples,
130            ancestry_map=ancestry_map,
131            has_physical_pos=("spos" in header and "epos" in header),
132            has_centimorgan_pos=("sgpos" in header and "egpos" in header),
133            has_window_sizes=("n snps" in header),
134        )
135
136    def iter_windows(
137        self,
138        chunk_size: int = 1024,
139        sample_indices: Optional[np.ndarray] = None,
140    ) -> Iterator[Dict[str, np.ndarray]]:
141        metadata = self.read_metadata()
142
143        if chunk_size < 1:
144            raise ValueError("chunk_size must be >= 1.")
145
146        header = metadata.header
147        first_lai_col_indx = metadata.first_lai_col_indx
148        column_index = {name: i for i, name in enumerate(header)}
149        chrom_col_idx = column_index["#chm"]
150
151        spos_col_idx: Optional[int] = None
152        epos_col_idx: Optional[int] = None
153        if metadata.has_physical_pos:
154            spos_col_idx = column_index["spos"]
155            epos_col_idx = column_index["epos"]
156
157        if sample_indices is None:
158            hap_col_indices = list(range(first_lai_col_indx, len(header)))
159        else:
160            sample_indices = np.asarray(sample_indices, dtype=np.int64)
161            if sample_indices.size == 0:
162                raise ValueError("sample_indices cannot be empty.")
163            if np.any(sample_indices < 0) or np.any(sample_indices >= len(metadata.samples)):
164                raise ValueError("sample_indices contain out-of-bounds sample indexes.")
165
166            hap_indices = np.empty(sample_indices.size * 2, dtype=np.int64)
167            hap_indices[0::2] = 2 * sample_indices
168            hap_indices[1::2] = 2 * sample_indices + 1
169            hap_col_indices = (first_lai_col_indx + hap_indices).astype(np.int64).tolist()
170
171        n_selected_haps = len(hap_col_indices)
172        n_total_haps = len(metadata.haplotypes)
173        all_haps_selected = (
174            n_selected_haps == n_total_haps
175            and n_selected_haps > 0
176            and hap_col_indices[0] == first_lai_col_indx
177            and hap_col_indices[-1] == (len(header) - 1)
178        )
179
180        # Pre-compute relative indices for the sample-subset path so the
181        # inner loop can use np.fromstring (C-level) + numpy fancy indexing
182        # instead of a Python for-loop over potentially millions of columns.
183        if not all_haps_selected:
184            _relative_hap_idx = np.array(hap_col_indices, dtype=np.intp) - first_lai_col_indx
185        else:
186            _relative_hap_idx = None
187
188        row_in_chunk = 0
189        window_start = 0
190        chromosomes_chunk = np.empty(int(chunk_size), dtype=object)
191        lai_chunk = np.empty((int(chunk_size), n_selected_haps), dtype=np.uint8)
192        physical_pos_chunk = (
193            np.empty((int(chunk_size), 2), dtype=np.int64)
194            if metadata.has_physical_pos
195            else None
196        )
197
198        with open(self.file, "r", encoding="utf-8") as handle:
199            for line_no, raw_line in enumerate(handle, start=1):
200                if not raw_line:
201                    continue
202                if raw_line.startswith("#"):
203                    continue
204
205                line = raw_line.rstrip("\n")
206                if not line:
207                    continue
208
209                # Both paths split only at the metadata/haplotype boundary,
210                # then use np.fromstring (C parser) for the haplotype tail.
211                fields = line.split("\t", first_lai_col_indx)
212                if len(fields) != (first_lai_col_indx + 1):
213                    raise ValueError(
214                        f"Malformed MSP row at line {line_no}: expected {first_lai_col_indx + 1} "
215                        f"prefix segments when parsing haplotypes."
216                    )
217
218                chromosomes_chunk[row_in_chunk] = fields[chrom_col_idx]
219                if physical_pos_chunk is not None and spos_col_idx is not None and epos_col_idx is not None:
220                    physical_pos_chunk[row_in_chunk, 0] = int(fields[spos_col_idx])
221                    physical_pos_chunk[row_in_chunk, 1] = int(fields[epos_col_idx])
222
223                lai_row = np.fromstring(fields[first_lai_col_indx], sep="\t", dtype=np.uint8)
224
225                if all_haps_selected:
226                    if lai_row.size != n_selected_haps:
227                        raise ValueError(
228                            f"Malformed MSP haplotype row at line {line_no}: expected "
229                            f"{n_selected_haps} haplotype values, got {lai_row.size}."
230                        )
231                    lai_chunk[row_in_chunk, :] = lai_row
232                else:
233                    if lai_row.size < n_total_haps:
234                        raise ValueError(
235                            f"Malformed MSP haplotype row at line {line_no}: expected at least "
236                            f"{n_total_haps} haplotype values, got {lai_row.size}."
237                        )
238                    lai_chunk[row_in_chunk, :] = lai_row[_relative_hap_idx]
239
240                row_in_chunk += 1
241                if row_in_chunk == chunk_size:
242                    window_indexes = np.arange(window_start, window_start + row_in_chunk, dtype=np.int64)
243                    yield {
244                        "window_indexes": window_indexes,
245                        "chromosomes": chromosomes_chunk,
246                        "physical_pos": physical_pos_chunk,
247                        "lai": lai_chunk,
248                    }
249
250                    window_start += row_in_chunk
251                    row_in_chunk = 0
252                    chromosomes_chunk = np.empty(int(chunk_size), dtype=object)
253                    lai_chunk = np.empty((int(chunk_size), n_selected_haps), dtype=np.uint8)
254                    if metadata.has_physical_pos:
255                        physical_pos_chunk = np.empty((int(chunk_size), 2), dtype=np.int64)
256                    else:
257                        physical_pos_chunk = None
258
259        if row_in_chunk > 0:
260            window_indexes = np.arange(window_start, window_start + row_in_chunk, dtype=np.int64)
261            yield {
262                "window_indexes": window_indexes,
263                "chromosomes": chromosomes_chunk[:row_in_chunk],
264                "physical_pos": (
265                    physical_pos_chunk[:row_in_chunk]
266                    if physical_pos_chunk is not None
267                    else None
268                ),
269                "lai": lai_chunk[:row_in_chunk],
270            }
271
272    def _get_ancestry_map_from_comment(self, comment: str) -> Dict[str, str]:
273        """
274        Construct an ancestry map from the comment line of the `.msp` file.
275
276        This method parses the comment string to create a mapping of ancestry numerical identifiers 
277        to their corresponding ancestry names (e.g., '0': 'African').
278
279        Args:
280            comment (str): 
281                The comment line containing ancestry mapping information.
282
283        Returns:
284            dict: A dictionary mapping ancestry codes (as strings) to ancestry names.
285        """
286        comment = comment.strip()
287
288        # Remove everything before the colon, if present
289        if ':' in comment:
290            comment = comment.split(':', 1)[1].strip()
291
292        ancestry_map: Dict[str, str] = {}
293
294        # Split on tabs, spaces, commas, semicolons or any combination of them
295        tokens = [tok.strip() for tok in re.split(r'[,\t; ]+', comment) if tok]
296
297        for tok in tokens:
298            if '=' not in tok:
299                continue  # Skip invalid pieces
300
301            left, right = (p.strip() for p in tok.split('=', 1))
302
303            # Detect whether format is "Pop=0" or "0=Pop"
304            if left.isdigit() and not right.isdigit():
305                ancestry_map[left] = right       # 0=Africa
306            elif right.isdigit() and not left.isdigit():
307                ancestry_map[right] = left       # Africa=0
308            else:
309                # Fallback (if both sides are digits or both are pops, keep left as code)
310                ancestry_map[left] = right
311
312        return ancestry_map
313
314    def _replace_nan_with_none(self, array: Optional[np.ndarray]) -> Optional[np.ndarray]:
315        """
316        Replace arrays that are fully NaN with `None`.
317
318        Args:
319            array (np.ndarray): Array to check.
320
321        Returns:
322            Optional[np.ndarray]: Returns `None` if the array is fully NaN, otherwise returns the original array.
323        """
324        if array is not None:
325            if array.size == 0:  # Check if the array is empty
326                return None
327            if np.issubdtype(array.dtype, np.number):  # Check for numeric types
328                if np.isnan(array).all():  # Fully NaN numeric array
329                    return None
330            elif array.dtype == np.object_ or np.issubdtype(array.dtype, np.str_):  # String or object types
331                if np.all((array == '') | (array == None)):  # Empty or None strings
332                    return None
333        return array
334
335    def read(self) -> 'LocalAncestryObject':
336        """
337        Read data from the provided `.msp` or `msp.tsv` `file` and construct a 
338        `snputils.ancestry.genobj.LocalAncestryObject`.
339
340        **Expected MSP content:**
341
342        The `.msp` file should contain local ancestry assignments for each haplotype across genomic windows.
343        Each row should correspond to a genomic window and include the following columns:
344
345        - `#chm`: Chromosome numbers corresponding to each genomic window.
346        - `spos`: Start physical position for each window.
347        - `epos`: End physical position for each window.
348        - `sgpos`: Start centimorgan position for each window.
349        - `egpos`: End centimorgan position for each window.
350        - `n snps`: Number of SNPs in each genomic window.
351        - `SampleID.0`: Local ancestry for the first haplotype of the sample for each window.
352        - `SampleID.1`: Local ancestry for the second haplotype of the sample for each window.
353
354        Returns:
355            **LocalAncestryObject:**
356                A LocalAncestryObject instance.
357        """
358        log.info(f"Reading '{self.file}'...")
359        metadata = self.read_metadata()
360        comment = metadata.comment
361        header = metadata.header
362
363        # Read the main data into a DataFrame, skipping comment lines
364        msp_df = pd.read_csv(self.file, sep="\t", comment="#", names=header)
365
366        # Extract chromosomes data
367        chromosomes = msp_df['#chm'].astype(str).to_numpy()
368
369        # Extract physical positions (if available)
370        column_counter = metadata.first_lai_col_indx
371        if metadata.has_physical_pos:
372            physical_pos = msp_df[['spos', 'epos']].to_numpy()
373        else:
374            physical_pos = None
375            log.warning("Physical positions ('spos' and 'epos') not found.")
376        
377        # Extract centimorgan positions (if available)
378        if metadata.has_centimorgan_pos:
379            centimorgan_pos = msp_df[['sgpos', 'egpos']].to_numpy()
380        else:
381            centimorgan_pos = None
382            log.warning("Genetic (centimorgan) positions ('sgpos' and 'egpos') not found.")
383
384        # Extract window sizes (if available)
385        if metadata.has_window_sizes:
386            window_sizes = msp_df['n snps'].to_numpy()
387        else:
388            window_sizes = None
389            log.warning("Window sizes ('n snps') not found.")
390        
391        # Extract LAI data (haplotype-level)
392        lai = msp_df.iloc[:, column_counter:].to_numpy(dtype=np.uint8, copy=False)
393
394        # Extract haplotype identifiers
395        haplotypes = metadata.haplotypes
396
397        # Extract haplotype identifiers and sample identifiers
398        samples = metadata.samples
399        del msp_df
400        gc.collect()
401
402        # Validate the number of samples matches the LAI data dimensions
403        n_samples = len(samples)
404        if n_samples != int(lai.shape[1] / 2):
405            raise ValueError(
406                "Mismatch between the number of sample identifiers and the expected number of samples in the LAI array. "
407                f"Expected {int(lai.shape[1] / 2)} samples (derived from LAI data); found {n_samples}."
408            )
409        
410        # Count number of unique ancestries in the LAI data
411        n_ancestries = len(np.unique(lai))
412
413        # Parse ancestry map from the comment (if available)
414        ancestry_map = None
415        if comment is not None:
416            ancestry_map = metadata.ancestry_map
417            if len(ancestry_map) != n_ancestries:
418                warnings.warn(
419                    "Mismatch between the number of unique ancestries in the LAI data "
420                    f"({n_ancestries}) and the number of classes in the ancestry map "
421                    f"({len(ancestry_map)})."
422                )
423        else:
424            # Provide default ancestry mapping if no comment is provided
425            ancestry_map = None
426            warnings.warn(
427                "Ancestry map not found. It is recommended to provide an .msp file that contains the ancestry "
428                "map as a comment in the first line."
429            )
430
431        # Replace fully NaN attributes with None
432        window_sizes = self._replace_nan_with_none(window_sizes)
433        centimorgan_pos = self._replace_nan_with_none(centimorgan_pos)
434        chromosomes = self._replace_nan_with_none(chromosomes)
435        physical_pos = self._replace_nan_with_none(physical_pos)
436
437        return LocalAncestryObject(
438            haplotypes=haplotypes,
439            lai=lai,
440            samples=samples,
441            ancestry_map=ancestry_map,
442            window_sizes=window_sizes,
443            centimorgan_pos=centimorgan_pos,
444            chromosomes=chromosomes,
445            physical_pos=physical_pos
446        )

A reader class for parsing Local Ancestry Inference (LAI) data from an .msp or .msp.tsv file and constructing a snputils.ancestry.genobj.LocalAncestryObject.

MSPReader(file: str | pathlib.Path) View Source

35    def __init__(self, file: Union[str, Path]) -> None:
36        """
37        Args:
38            file (str or pathlib.Path): 
39                Path to the file to be read. It should end with `.msp` or `.msp.tsv`.
40        """
41        self.__file = Path(file)

Arguments:

file (str or pathlib.Path): Path to the file to be read. It should end with .msp or .msp.tsv.

file: pathlib.Path View Source

43    @property
44    def file(self) -> Path:
45        """
46        Retrieve `file`.
47
48        Returns:
49            **pathlib.Path:** 
50                Path to the file to be read. It should end with `.msp` or `.msp.tsv`.
51        """
52        return self.__file

Retrieve file.

Returns:

pathlib.Path: Path to the file to be read. It should end with .msp or .msp.tsv.

def read_metadata(self) -> snputils.ancestry.io.local.read.msp.MSPMetadata: View Source

113    def read_metadata(self) -> MSPMetadata:
114        comment, header = self._parse_header_and_comment()
115
116        if len(header) != len(set(header)):
117            raise ValueError("Duplicate columns detected in the header.")
118
119        first_lai_col_indx = self._get_first_lai_col_indx(header)
120        haplotypes = header[first_lai_col_indx:]
121        samples = self._get_samples_from_haplotypes(haplotypes)
122        ancestry_map = self._get_ancestry_map_from_comment(comment) if comment is not None else None
123
124        return MSPMetadata(
125            header=header,
126            comment=comment,
127            first_lai_col_indx=first_lai_col_indx,
128            haplotypes=haplotypes,
129            samples=samples,
130            ancestry_map=ancestry_map,
131            has_physical_pos=("spos" in header and "epos" in header),
132            has_centimorgan_pos=("sgpos" in header and "egpos" in header),
133            has_window_sizes=("n snps" in header),
134        )

def iter_windows( self, chunk_size: int = 1024, sample_indices: numpy.ndarray | None = None) -> Iterator[Dict[str, numpy.ndarray]]: View Source

136    def iter_windows(
137        self,
138        chunk_size: int = 1024,
139        sample_indices: Optional[np.ndarray] = None,
140    ) -> Iterator[Dict[str, np.ndarray]]:
141        metadata = self.read_metadata()
142
143        if chunk_size < 1:
144            raise ValueError("chunk_size must be >= 1.")
145
146        header = metadata.header
147        first_lai_col_indx = metadata.first_lai_col_indx
148        column_index = {name: i for i, name in enumerate(header)}
149        chrom_col_idx = column_index["#chm"]
150
151        spos_col_idx: Optional[int] = None
152        epos_col_idx: Optional[int] = None
153        if metadata.has_physical_pos:
154            spos_col_idx = column_index["spos"]
155            epos_col_idx = column_index["epos"]
156
157        if sample_indices is None:
158            hap_col_indices = list(range(first_lai_col_indx, len(header)))
159        else:
160            sample_indices = np.asarray(sample_indices, dtype=np.int64)
161            if sample_indices.size == 0:
162                raise ValueError("sample_indices cannot be empty.")
163            if np.any(sample_indices < 0) or np.any(sample_indices >= len(metadata.samples)):
164                raise ValueError("sample_indices contain out-of-bounds sample indexes.")
165
166            hap_indices = np.empty(sample_indices.size * 2, dtype=np.int64)
167            hap_indices[0::2] = 2 * sample_indices
168            hap_indices[1::2] = 2 * sample_indices + 1
169            hap_col_indices = (first_lai_col_indx + hap_indices).astype(np.int64).tolist()
170
171        n_selected_haps = len(hap_col_indices)
172        n_total_haps = len(metadata.haplotypes)
173        all_haps_selected = (
174            n_selected_haps == n_total_haps
175            and n_selected_haps > 0
176            and hap_col_indices[0] == first_lai_col_indx
177            and hap_col_indices[-1] == (len(header) - 1)
178        )
179
180        # Pre-compute relative indices for the sample-subset path so the
181        # inner loop can use np.fromstring (C-level) + numpy fancy indexing
182        # instead of a Python for-loop over potentially millions of columns.
183        if not all_haps_selected:
184            _relative_hap_idx = np.array(hap_col_indices, dtype=np.intp) - first_lai_col_indx
185        else:
186            _relative_hap_idx = None
187
188        row_in_chunk = 0
189        window_start = 0
190        chromosomes_chunk = np.empty(int(chunk_size), dtype=object)
191        lai_chunk = np.empty((int(chunk_size), n_selected_haps), dtype=np.uint8)
192        physical_pos_chunk = (
193            np.empty((int(chunk_size), 2), dtype=np.int64)
194            if metadata.has_physical_pos
195            else None
196        )
197
198        with open(self.file, "r", encoding="utf-8") as handle:
199            for line_no, raw_line in enumerate(handle, start=1):
200                if not raw_line:
201                    continue
202                if raw_line.startswith("#"):
203                    continue
204
205                line = raw_line.rstrip("\n")
206                if not line:
207                    continue
208
209                # Both paths split only at the metadata/haplotype boundary,
210                # then use np.fromstring (C parser) for the haplotype tail.
211                fields = line.split("\t", first_lai_col_indx)
212                if len(fields) != (first_lai_col_indx + 1):
213                    raise ValueError(
214                        f"Malformed MSP row at line {line_no}: expected {first_lai_col_indx + 1} "
215                        f"prefix segments when parsing haplotypes."
216                    )
217
218                chromosomes_chunk[row_in_chunk] = fields[chrom_col_idx]
219                if physical_pos_chunk is not None and spos_col_idx is not None and epos_col_idx is not None:
220                    physical_pos_chunk[row_in_chunk, 0] = int(fields[spos_col_idx])
221                    physical_pos_chunk[row_in_chunk, 1] = int(fields[epos_col_idx])
222
223                lai_row = np.fromstring(fields[first_lai_col_indx], sep="\t", dtype=np.uint8)
224
225                if all_haps_selected:
226                    if lai_row.size != n_selected_haps:
227                        raise ValueError(
228                            f"Malformed MSP haplotype row at line {line_no}: expected "
229                            f"{n_selected_haps} haplotype values, got {lai_row.size}."
230                        )
231                    lai_chunk[row_in_chunk, :] = lai_row
232                else:
233                    if lai_row.size < n_total_haps:
234                        raise ValueError(
235                            f"Malformed MSP haplotype row at line {line_no}: expected at least "
236                            f"{n_total_haps} haplotype values, got {lai_row.size}."
237                        )
238                    lai_chunk[row_in_chunk, :] = lai_row[_relative_hap_idx]
239
240                row_in_chunk += 1
241                if row_in_chunk == chunk_size:
242                    window_indexes = np.arange(window_start, window_start + row_in_chunk, dtype=np.int64)
243                    yield {
244                        "window_indexes": window_indexes,
245                        "chromosomes": chromosomes_chunk,
246                        "physical_pos": physical_pos_chunk,
247                        "lai": lai_chunk,
248                    }
249
250                    window_start += row_in_chunk
251                    row_in_chunk = 0
252                    chromosomes_chunk = np.empty(int(chunk_size), dtype=object)
253                    lai_chunk = np.empty((int(chunk_size), n_selected_haps), dtype=np.uint8)
254                    if metadata.has_physical_pos:
255                        physical_pos_chunk = np.empty((int(chunk_size), 2), dtype=np.int64)
256                    else:
257                        physical_pos_chunk = None
258
259        if row_in_chunk > 0:
260            window_indexes = np.arange(window_start, window_start + row_in_chunk, dtype=np.int64)
261            yield {
262                "window_indexes": window_indexes,
263                "chromosomes": chromosomes_chunk[:row_in_chunk],
264                "physical_pos": (
265                    physical_pos_chunk[:row_in_chunk]
266                    if physical_pos_chunk is not None
267                    else None
268                ),
269                "lai": lai_chunk[:row_in_chunk],
270            }

def read(self) -> LocalAncestryObject: View Source

335    def read(self) -> 'LocalAncestryObject':
336        """
337        Read data from the provided `.msp` or `msp.tsv` `file` and construct a 
338        `snputils.ancestry.genobj.LocalAncestryObject`.
339
340        **Expected MSP content:**
341
342        The `.msp` file should contain local ancestry assignments for each haplotype across genomic windows.
343        Each row should correspond to a genomic window and include the following columns:
344
345        - `#chm`: Chromosome numbers corresponding to each genomic window.
346        - `spos`: Start physical position for each window.
347        - `epos`: End physical position for each window.
348        - `sgpos`: Start centimorgan position for each window.
349        - `egpos`: End centimorgan position for each window.
350        - `n snps`: Number of SNPs in each genomic window.
351        - `SampleID.0`: Local ancestry for the first haplotype of the sample for each window.
352        - `SampleID.1`: Local ancestry for the second haplotype of the sample for each window.
353
354        Returns:
355            **LocalAncestryObject:**
356                A LocalAncestryObject instance.
357        """
358        log.info(f"Reading '{self.file}'...")
359        metadata = self.read_metadata()
360        comment = metadata.comment
361        header = metadata.header
362
363        # Read the main data into a DataFrame, skipping comment lines
364        msp_df = pd.read_csv(self.file, sep="\t", comment="#", names=header)
365
366        # Extract chromosomes data
367        chromosomes = msp_df['#chm'].astype(str).to_numpy()
368
369        # Extract physical positions (if available)
370        column_counter = metadata.first_lai_col_indx
371        if metadata.has_physical_pos:
372            physical_pos = msp_df[['spos', 'epos']].to_numpy()
373        else:
374            physical_pos = None
375            log.warning("Physical positions ('spos' and 'epos') not found.")
376        
377        # Extract centimorgan positions (if available)
378        if metadata.has_centimorgan_pos:
379            centimorgan_pos = msp_df[['sgpos', 'egpos']].to_numpy()
380        else:
381            centimorgan_pos = None
382            log.warning("Genetic (centimorgan) positions ('sgpos' and 'egpos') not found.")
383
384        # Extract window sizes (if available)
385        if metadata.has_window_sizes:
386            window_sizes = msp_df['n snps'].to_numpy()
387        else:
388            window_sizes = None
389            log.warning("Window sizes ('n snps') not found.")
390        
391        # Extract LAI data (haplotype-level)
392        lai = msp_df.iloc[:, column_counter:].to_numpy(dtype=np.uint8, copy=False)
393
394        # Extract haplotype identifiers
395        haplotypes = metadata.haplotypes
396
397        # Extract haplotype identifiers and sample identifiers
398        samples = metadata.samples
399        del msp_df
400        gc.collect()
401
402        # Validate the number of samples matches the LAI data dimensions
403        n_samples = len(samples)
404        if n_samples != int(lai.shape[1] / 2):
405            raise ValueError(
406                "Mismatch between the number of sample identifiers and the expected number of samples in the LAI array. "
407                f"Expected {int(lai.shape[1] / 2)} samples (derived from LAI data); found {n_samples}."
408            )
409        
410        # Count number of unique ancestries in the LAI data
411        n_ancestries = len(np.unique(lai))
412
413        # Parse ancestry map from the comment (if available)
414        ancestry_map = None
415        if comment is not None:
416            ancestry_map = metadata.ancestry_map
417            if len(ancestry_map) != n_ancestries:
418                warnings.warn(
419                    "Mismatch between the number of unique ancestries in the LAI data "
420                    f"({n_ancestries}) and the number of classes in the ancestry map "
421                    f"({len(ancestry_map)})."
422                )
423        else:
424            # Provide default ancestry mapping if no comment is provided
425            ancestry_map = None
426            warnings.warn(
427                "Ancestry map not found. It is recommended to provide an .msp file that contains the ancestry "
428                "map as a comment in the first line."
429            )
430
431        # Replace fully NaN attributes with None
432        window_sizes = self._replace_nan_with_none(window_sizes)
433        centimorgan_pos = self._replace_nan_with_none(centimorgan_pos)
434        chromosomes = self._replace_nan_with_none(chromosomes)
435        physical_pos = self._replace_nan_with_none(physical_pos)
436
437        return LocalAncestryObject(
438            haplotypes=haplotypes,
439            lai=lai,
440            samples=samples,
441            ancestry_map=ancestry_map,
442            window_sizes=window_sizes,
443            centimorgan_pos=centimorgan_pos,
444            chromosomes=chromosomes,
445            physical_pos=physical_pos
446        )

Read data from the provided .msp or .msp.tsv file and construct a snputils.ancestry.genobj.LocalAncestryObject.

Expected MSP content:

The .msp file should contain local ancestry assignments for each haplotype across genomic windows. Each row should correspond to a genomic window and include the following columns:

#chm: Chromosome numbers corresponding to each genomic window.
spos: Start physical position for each window.
epos: End physical position for each window.
sgpos: Start centimorgan position for each window.
egpos: End centimorgan position for each window.
n snps: Number of SNPs in each genomic window.
SampleID.0: Local ancestry for the first haplotype of the sample for each window.
SampleID.1: Local ancestry for the second haplotype of the sample for each window.

Returns:

LocalAncestryObject: A LocalAncestryObject instance.

class MSPWriter(snputils.ancestry.io.local.write.base.LAIBaseWriter): View Source

 15class MSPWriter(LAIBaseWriter):
 16    """
 17    A writer class for exporting local ancestry data from a `snputils.ancestry.genobj.LocalAncestryObject` 
 18    into an `.msp` or `.msp.tsv` file.
 19    """
 20    def __init__(self, laiobj: LocalAncestryObject, file: Union[str, Path]) -> None:
 21        """
 22        Args:
 23            laiobj (LocalAncestryObject):
 24                A LocalAncestryObject instance.
 25            file (str or pathlib.Path): 
 26                Path to the file where the data will be saved. It should end with `.msp` or `.msp.tsv`. 
 27                If the provided path does not have one of these extensions, the `.msp` extension will be appended.
 28        """
 29        self.__laiobj = laiobj
 30        self.__file = Path(file)
 31
 32    @property
 33    def laiobj(self) -> LocalAncestryObject:
 34        """
 35        Retrieve `laiobj`. 
 36
 37        Returns:
 38            **LocalAncestryObject:** 
 39                A LocalAncestryObject instance.
 40        """
 41        return self.__laiobj
 42
 43    @property
 44    def file(self) -> Path:
 45        """
 46        Retrieve `file`.
 47
 48        Returns:
 49            **pathlib.Path:** 
 50                Path to the file where the data will be saved. It should end with `.msp` or `.msp.tsv`. 
 51                If the provided path does not have one of these extensions, the `.msp` extension will be appended.
 52        """
 53        return self.__file
 54
 55    @file.setter
 56    def file(self, x: Union[str, Path]):
 57        """
 58        Update `file`.
 59        """
 60        self.__file = Path(x)
 61    
 62    def write(self) -> None:
 63        """
 64        Write the data contained in the `laiobj` instance to the specified output `file`. 
 65        If the file already exists, it will be overwritten.
 66
 67        **Output MSP content:**
 68
 69        The output `.msp` file will contain local ancestry assignments for each haplotype across genomic windows.
 70        Each row corresponds to a genomic window and includes the following columns:
 71
 72        - `#chm`: Chromosome numbers corresponding to each genomic window.
 73        - `spos`: Start physical position for each window.
 74        - `epos`: End physical position for each window.
 75        - `sgpos`: Start centimorgan position for each window.
 76        - `egpos`: End centimorgan position for each window.
 77        - `n snps`: Number of SNPs in each genomic window.
 78        - `SampleID.0`: Local ancestry for the first haplotype of the sample for each window.
 79        - `SampleID.1`: Local ancestry for the second haplotype of the sample for each window.
 80        """
 81        log.info(f"LAI object contains: {self.laiobj.n_samples} samples, {self.laiobj.n_ancestries} ancestries.")
 82
 83        # Define the valid file extensions
 84        valid_extensions = ('.msp', '.msp.tsv')
 85
 86        # Append '.msp' extension if not already present
 87        if not self.file.name.endswith(valid_extensions):
 88            self.file = self.file.with_name(self.file.name + '.msp')
 89
 90        # Check if file already exists
 91        if self.file.exists():
 92            warnings.warn(f"File '{self.file}' already exists and will be overwritten.")
 93
 94        # Compute the number of windows and haplotypes
 95        n_windows = self.laiobj.n_windows
 96        n_haplotypes = self.laiobj.n_haplotypes
 97
 98        # Initialize attributes with NaN where they are None
 99        chromosomes = self.laiobj.chromosomes if self.laiobj.chromosomes is not None else np.full(n_windows, np.nan)
100        physical_pos = self.laiobj.physical_pos if self.laiobj.physical_pos is not None else np.full((n_windows, 2), np.nan)
101        centimorgan_pos = self.laiobj.centimorgan_pos if self.laiobj.centimorgan_pos is not None else np.full((n_windows, 2), np.nan)
102        window_sizes = self.laiobj.window_sizes if self.laiobj.window_sizes is not None else np.full(n_windows, np.nan)
103        
104        haplotypes = self.laiobj.haplotypes
105        if haplotypes is None:
106            # Generate haplotypes from samples or default identifiers
107            if self.laiobj.samples is not None:
108                haplotypes = [f"{sample}.{i}" for sample in self.laiobj.samples for i in range(2)]
109                warnings.warn(
110                    "Haplotype data is missing. Haplotypes have been automatically generated "
111                    "from the provided sample identifiers."
112                )
113            else:
114                haplotypes = [f"sample_{i//2}.{i%2}" for i in range(n_haplotypes)]
115                warnings.warn(
116                    "Haplotype data and sample identifiers are missing. Default haplotype identifiers have been generated "
117                    "as `sample_<index>.0` and `sample_<index>.1`."
118                )
119
120        # Prepare columns for the DataFrame
121        columns = ["spos", "epos", "sgpos", "egpos", "n snps"]
122        lai_dic = {
123            "#chm": chromosomes,
124            "spos": physical_pos[:, 0],
125            "epos": physical_pos[:, 1],
126            "sgpos": centimorgan_pos[:, 0],
127            "egpos": centimorgan_pos[:, 1],
128            "n snps": window_sizes,
129        }
130
131        # Populate the dictionary with haplotype data
132        for ilai, haplotype in enumerate(haplotypes):
133            lai_dic[haplotype] = self.laiobj.lai[:, ilai]
134            columns.append(haplotype)
135            
136        # Check if DataFrame is empty
137        if len(lai_dic["#chm"]) == 0:
138            raise ValueError("No data to write: all columns are empty or missing.")
139
140        # Create a DataFrame from the dictionary containing all data
141        lai_df = pd.DataFrame(lai_dic)
142
143        log.info(f"Writing MSP file to '{self.file}'...")
144
145        # Save the DataFrame to the .msp file in tab-separated format
146        lai_df.to_csv(self.file, sep="\t", index=False, header=False)
147        
148        # Construct the second line for the output file containing the column headers
149        second_line = "#chm" + "\t" + "\t".join(columns)
150        
151        # If an ancestry map is available, prepend it to the output file
152        if self.laiobj.ancestry_map is not None:
153            ancestries_codes = list(self.laiobj.ancestry_map.keys()) # Get corresponding codes
154            ancestries = list(self.laiobj.ancestry_map.values()) # Get ancestry names
155            
156            # Create the first line for the ancestry information, detailing subpopulation codes
157            first_line = "#Subpopulation order/codes: " + "\t".join(
158                f"{a}={ancestries_codes[ai]}" for ai, a in enumerate(ancestries)
159            )
160
161            # Open the file for reading and prepend the first line       
162            with open(self.__file, "r+") as f:
163                content = f.read()
164                f.seek(0,0)
165                f.write(first_line.rstrip('\r\n') + '\n' + second_line + '\n' + content)
166
167        log.info(f"Finished writing MSP file to '{self.file}'.")
168
169        return None

A writer class for exporting local ancestry data from a snputils.ancestry.genobj.LocalAncestryObject into an .msp or .msp.tsv file.

MSPWriter( laiobj: LocalAncestryObject, file: str | pathlib.Path) View Source

20    def __init__(self, laiobj: LocalAncestryObject, file: Union[str, Path]) -> None:
21        """
22        Args:
23            laiobj (LocalAncestryObject):
24                A LocalAncestryObject instance.
25            file (str or pathlib.Path): 
26                Path to the file where the data will be saved. It should end with `.msp` or `.msp.tsv`. 
27                If the provided path does not have one of these extensions, the `.msp` extension will be appended.
28        """
29        self.__laiobj = laiobj
30        self.__file = Path(file)

Arguments:

laiobj (LocalAncestryObject): A LocalAncestryObject instance.
file (str or pathlib.Path): Path to the file where the data will be saved. It should end with .msp or .msp.tsv. If the provided path does not have one of these extensions, the .msp extension will be appended.

laiobj: LocalAncestryObject View Source

32    @property
33    def laiobj(self) -> LocalAncestryObject:
34        """
35        Retrieve `laiobj`. 
36
37        Returns:
38            **LocalAncestryObject:** 
39                A LocalAncestryObject instance.
40        """
41        return self.__laiobj

Retrieve laiobj.

Returns:

LocalAncestryObject: A LocalAncestryObject instance.

file: pathlib.Path View Source

43    @property
44    def file(self) -> Path:
45        """
46        Retrieve `file`.
47
48        Returns:
49            **pathlib.Path:** 
50                Path to the file where the data will be saved. It should end with `.msp` or `.msp.tsv`. 
51                If the provided path does not have one of these extensions, the `.msp` extension will be appended.
52        """
53        return self.__file

Retrieve file.

Returns:

pathlib.Path: Path to the file where the data will be saved. It should end with .msp or .msp.tsv. If the provided path does not have one of these extensions, the .msp extension will be appended.

def write(self) -> None: View Source

 62    def write(self) -> None:
 63        """
 64        Write the data contained in the `laiobj` instance to the specified output `file`. 
 65        If the file already exists, it will be overwritten.
 66
 67        **Output MSP content:**
 68
 69        The output `.msp` file will contain local ancestry assignments for each haplotype across genomic windows.
 70        Each row corresponds to a genomic window and includes the following columns:
 71
 72        - `#chm`: Chromosome numbers corresponding to each genomic window.
 73        - `spos`: Start physical position for each window.
 74        - `epos`: End physical position for each window.
 75        - `sgpos`: Start centimorgan position for each window.
 76        - `egpos`: End centimorgan position for each window.
 77        - `n snps`: Number of SNPs in each genomic window.
 78        - `SampleID.0`: Local ancestry for the first haplotype of the sample for each window.
 79        - `SampleID.1`: Local ancestry for the second haplotype of the sample for each window.
 80        """
 81        log.info(f"LAI object contains: {self.laiobj.n_samples} samples, {self.laiobj.n_ancestries} ancestries.")
 82
 83        # Define the valid file extensions
 84        valid_extensions = ('.msp', '.msp.tsv')
 85
 86        # Append '.msp' extension if not already present
 87        if not self.file.name.endswith(valid_extensions):
 88            self.file = self.file.with_name(self.file.name + '.msp')
 89
 90        # Check if file already exists
 91        if self.file.exists():
 92            warnings.warn(f"File '{self.file}' already exists and will be overwritten.")
 93
 94        # Compute the number of windows and haplotypes
 95        n_windows = self.laiobj.n_windows
 96        n_haplotypes = self.laiobj.n_haplotypes
 97
 98        # Initialize attributes with NaN where they are None
 99        chromosomes = self.laiobj.chromosomes if self.laiobj.chromosomes is not None else np.full(n_windows, np.nan)
100        physical_pos = self.laiobj.physical_pos if self.laiobj.physical_pos is not None else np.full((n_windows, 2), np.nan)
101        centimorgan_pos = self.laiobj.centimorgan_pos if self.laiobj.centimorgan_pos is not None else np.full((n_windows, 2), np.nan)
102        window_sizes = self.laiobj.window_sizes if self.laiobj.window_sizes is not None else np.full(n_windows, np.nan)
103        
104        haplotypes = self.laiobj.haplotypes
105        if haplotypes is None:
106            # Generate haplotypes from samples or default identifiers
107            if self.laiobj.samples is not None:
108                haplotypes = [f"{sample}.{i}" for sample in self.laiobj.samples for i in range(2)]
109                warnings.warn(
110                    "Haplotype data is missing. Haplotypes have been automatically generated "
111                    "from the provided sample identifiers."
112                )
113            else:
114                haplotypes = [f"sample_{i//2}.{i%2}" for i in range(n_haplotypes)]
115                warnings.warn(
116                    "Haplotype data and sample identifiers are missing. Default haplotype identifiers have been generated "
117                    "as `sample_<index>.0` and `sample_<index>.1`."
118                )
119
120        # Prepare columns for the DataFrame
121        columns = ["spos", "epos", "sgpos", "egpos", "n snps"]
122        lai_dic = {
123            "#chm": chromosomes,
124            "spos": physical_pos[:, 0],
125            "epos": physical_pos[:, 1],
126            "sgpos": centimorgan_pos[:, 0],
127            "egpos": centimorgan_pos[:, 1],
128            "n snps": window_sizes,
129        }
130
131        # Populate the dictionary with haplotype data
132        for ilai, haplotype in enumerate(haplotypes):
133            lai_dic[haplotype] = self.laiobj.lai[:, ilai]
134            columns.append(haplotype)
135            
136        # Check if DataFrame is empty
137        if len(lai_dic["#chm"]) == 0:
138            raise ValueError("No data to write: all columns are empty or missing.")
139
140        # Create a DataFrame from the dictionary containing all data
141        lai_df = pd.DataFrame(lai_dic)
142
143        log.info(f"Writing MSP file to '{self.file}'...")
144
145        # Save the DataFrame to the .msp file in tab-separated format
146        lai_df.to_csv(self.file, sep="\t", index=False, header=False)
147        
148        # Construct the second line for the output file containing the column headers
149        second_line = "#chm" + "\t" + "\t".join(columns)
150        
151        # If an ancestry map is available, prepend it to the output file
152        if self.laiobj.ancestry_map is not None:
153            ancestries_codes = list(self.laiobj.ancestry_map.keys()) # Get corresponding codes
154            ancestries = list(self.laiobj.ancestry_map.values()) # Get ancestry names
155            
156            # Create the first line for the ancestry information, detailing subpopulation codes
157            first_line = "#Subpopulation order/codes: " + "\t".join(
158                f"{a}={ancestries_codes[ai]}" for ai, a in enumerate(ancestries)
159            )
160
161            # Open the file for reading and prepend the first line       
162            with open(self.__file, "r+") as f:
163                content = f.read()
164                f.seek(0,0)
165                f.write(first_line.rstrip('\r\n') + '\n' + second_line + '\n' + content)
166
167        log.info(f"Finished writing MSP file to '{self.file}'.")
168
169        return None

Write the data contained in the laiobj instance to the specified output file. If the file already exists, it will be overwritten.

Output MSP content:

The output .msp file will contain local ancestry assignments for each haplotype across genomic windows. Each row corresponds to a genomic window and includes the following columns:

#chm: Chromosome numbers corresponding to each genomic window.
spos: Start physical position for each window.
epos: End physical position for each window.
sgpos: Start centimorgan position for each window.
egpos: End centimorgan position for each window.
n snps: Number of SNPs in each genomic window.
SampleID.0: Local ancestry for the first haplotype of the sample for each window.
SampleID.1: Local ancestry for the second haplotype of the sample for each window.

class AdmixtureMappingVCFWriter: View Source

 16class AdmixtureMappingVCFWriter:
 17    """
 18    A writer class for converting and writing local ancestry data into ancestry-specific 
 19    VCF/BCF files for ADMIXTURE mapping.
 20    """
 21    def __init__(
 22            self, 
 23            laiobj: LocalAncestryObject, 
 24            file: Union[str, Path], 
 25            ancestry_map: Optional[Dict[str, str]] = None
 26        ):
 27        """
 28        Args:
 29            laiobj (LocalAncestryObject): 
 30                A LocalAncestryObject instance.
 31            file (str or pathlib.Path): 
 32                Path to the file where the data will be saved. It should end with `.vcf` or `.bcf`. 
 33                If the provided path does not have one of these extensions, the `.vcf` extension will be appended.
 34            ancestry_map (dict of str to str, optional): 
 35                A dictionary mapping ancestry codes to region names. If not explicitly 
 36                provided, it will default to the `ancestry_map` from `laiobj`.
 37        """
 38        self.__laiobj = laiobj
 39        self.__file = Path(file)
 40        self.__ancestry_map = ancestry_map
 41
 42    @property
 43    def laiobj(self) -> LocalAncestryObject:
 44        """
 45        Retrieve `laiobj`. 
 46
 47        Returns:
 48            **LocalAncestryObject:** 
 49                A LocalAncestryObject instance.
 50        """
 51        return self.__laiobj
 52
 53    @property
 54    def file(self) -> Path:
 55        """
 56        Retrieve `file`.
 57
 58        Returns:
 59            **pathlib.Path:** 
 60                Path to the file where the data will be saved. It should end with `.vcf` or `.bcf`. 
 61                If the provided path does not have one of these extensions, the `.vcf` extension will be appended.
 62        """
 63        return self.__file
 64
 65    @property
 66    def ancestry_map(self) -> Dict[str, str]:
 67        """
 68        Retrieve `ancestry_map`.
 69
 70        Returns:
 71            **dict of str to str:** 
 72                A dictionary mapping ancestry codes to region names. If not explicitly 
 73                provided, it will default to the `ancestry_map` from `laiobj`.
 74        """
 75        if self.__ancestry_map is not None:
 76            return self.__ancestry_map
 77        elif self.laiobj.ancestry_map is not None:
 78            return self.laiobj.ancestry_map
 79        else:
 80            raise ValueError(
 81                "Ancestry mapping is required but missing. Provide `ancestry_map` "
 82                "during initialization or ensure `laiobj.ancestry_map` is set."
 83            )
 84
 85    def write(self) -> None:
 86        """
 87        Write VCF or BCF files for each ancestry type defined in the ancestry map.
 88        If the file already exists, it will be overwritten.
 89
 90        **Output VCF/BCF content:**
 91        
 92        For each ancestry, this method converts LAI data to SNP alleles and writes it in a VCF-compatible format.
 93        SNPs are encoded as follows:
 94
 95        - `1`: Indicates positions that match the specified ancestry.
 96        - `0`: Indicates positions that do not match the specified ancestry.
 97
 98        The VCF/BCF files will contain the following fields:
 99
100        - `CHROM`: Chromosome for each variant.
101        - `POS`: Chromosomal positions for each variant.
102        - `ID`: Unique identifier for each variant.
103        - `REF`: Reference allele for each variant.
104        - `ALT`: Alternate allele for each variant.
105        - `QUAL`: Phred-scaled quality score for each variant.
106        - `FILTER`: Status indicating whether each SNP passed control checks.
107        - `INFO`: When physical positions are available, contains `END=<end_pos>` for the segment end; otherwise `'.'`.
108        - `FORMAT`: Genotype format. Set to `'GT'`, representing the genotype as phased alleles.
109        - `<SampleID>`: One column per sample, containing the genotype data (`1|0`, `0|1`, etc.).
110
111        **Output files:**
112
113        - A separate VCF file is written for each ancestry type, with filenames formatted as:
114        `<filename>_<ancestry>.vcf` (e.g., `output_African.vcf`).
115        """
116        # Process the list of positions to include both the start and end coordinates for each window
117        # Iterate over each ancestry key in the ancestry mapping
118        for key in self.ancestry_map:
119            ancestry = int(key)
120            anc_string = self.ancestry_map[key]
121
122            # Define the output file format, ensuring it has the correct ancestry-specific suffix
123            file_extension = (".vcf", ".bcf")
124            
125            # Check if file has one of the specified extensions
126            if self.file.suffix not in file_extension:
127                # If file does not have the correct extension, default to ".vcf"
128                output_file = self.file.with_name(f"{self.file.stem}_{anc_string}.vcf")
129            else:
130                # If file has the correct extension, insert the ancestry string before the extension
131                output_file = self.file.with_name(f"{self.file.stem}_{anc_string}{self.file.suffix}")
132
133            # Check if file already exists
134            if output_file.exists():
135                warnings.warn(f"File '{output_file}' already exists and will be overwritten.")
136
137            if self.laiobj.physical_pos is not None:
138                pos_list = np.array([val1 for val1, _ in self.laiobj.physical_pos], dtype=np.int64)
139                variants_info = [f"END={val2}" for _, val2 in self.laiobj.physical_pos]
140            else:
141                pos_list = None
142                variants_info = None
143
144            # Modify LAI data values to simulate a SNP file
145            # The positions in LAI corresponding to the current ancestry key are mapped to 1, and the rest to 0
146            
147            match = (self.laiobj.lai == ancestry)
148            match = match.view(np.int8)
149            match = match.reshape(len(self.laiobj.lai),int(len(self.laiobj.lai[0])/2), 2 )
150
151
152            # Set up VCF-related data
153            calldata_gt = match
154            del match
155            gc.collect()
156            samples = np.array(self.laiobj.samples)
157            variants_chrom = self.laiobj.chromosomes
158            variants_list = [str(i+1) for i in range(len(self.laiobj.lai))]
159            variants_id = np.array(variants_list)
160            variants_ref = np.full(calldata_gt.shape[0], 'A', dtype='U5')
161            variants_alt = np.full(calldata_gt.shape[0], 'T', dtype='U1')
162
163            # Create the SNPObject
164            variant_data_obj = SNPObject(
165                calldata_gt=calldata_gt,
166                samples=samples,
167                variants_chrom=variants_chrom,
168                variants_id=variants_id,
169                variants_ref = variants_ref,
170                variants_alt = variants_alt,
171                variants_pos = pos_list,
172            )
173
174            # Log the start of the VCF file writing process
175            log.info(f"Writing VCF file for ancestry '{anc_string}' to '{output_file}'...")
176
177            vcf_writer = VCFWriter(variant_data_obj, output_file)
178            vcf_writer.write(variants_info=variants_info)
179
180            log.info(f"Finished writing VCF file for ancestry '{anc_string}' to '{output_file}'.")
181
182        return

A writer class for converting and writing local ancestry data into ancestry-specific VCF/BCF files for ADMIXTURE mapping.

AdmixtureMappingVCFWriter( laiobj: LocalAncestryObject, file: str | pathlib.Path, ancestry_map: Dict[str, str] | None = None) View Source

21    def __init__(
22            self, 
23            laiobj: LocalAncestryObject, 
24            file: Union[str, Path], 
25            ancestry_map: Optional[Dict[str, str]] = None
26        ):
27        """
28        Args:
29            laiobj (LocalAncestryObject): 
30                A LocalAncestryObject instance.
31            file (str or pathlib.Path): 
32                Path to the file where the data will be saved. It should end with `.vcf` or `.bcf`. 
33                If the provided path does not have one of these extensions, the `.vcf` extension will be appended.
34            ancestry_map (dict of str to str, optional): 
35                A dictionary mapping ancestry codes to region names. If not explicitly 
36                provided, it will default to the `ancestry_map` from `laiobj`.
37        """
38        self.__laiobj = laiobj
39        self.__file = Path(file)
40        self.__ancestry_map = ancestry_map

Arguments:

laiobj (LocalAncestryObject): A LocalAncestryObject instance.
file (str or pathlib.Path): Path to the file where the data will be saved. It should end with .vcf or .bcf. If the provided path does not have one of these extensions, the .vcf extension will be appended.
ancestry_map (dict of str to str, optional): A dictionary mapping ancestry codes to region names. If not explicitly provided, it will default to the ancestry_map from laiobj.

laiobj: LocalAncestryObject View Source

42    @property
43    def laiobj(self) -> LocalAncestryObject:
44        """
45        Retrieve `laiobj`. 
46
47        Returns:
48            **LocalAncestryObject:** 
49                A LocalAncestryObject instance.
50        """
51        return self.__laiobj

Retrieve laiobj.

Returns:

LocalAncestryObject: A LocalAncestryObject instance.

file: pathlib.Path View Source

53    @property
54    def file(self) -> Path:
55        """
56        Retrieve `file`.
57
58        Returns:
59            **pathlib.Path:** 
60                Path to the file where the data will be saved. It should end with `.vcf` or `.bcf`. 
61                If the provided path does not have one of these extensions, the `.vcf` extension will be appended.
62        """
63        return self.__file

Retrieve file.

Returns:

pathlib.Path: Path to the file where the data will be saved. It should end with .vcf or .bcf. If the provided path does not have one of these extensions, the .vcf extension will be appended.

ancestry_map: Dict[str, str] View Source

65    @property
66    def ancestry_map(self) -> Dict[str, str]:
67        """
68        Retrieve `ancestry_map`.
69
70        Returns:
71            **dict of str to str:** 
72                A dictionary mapping ancestry codes to region names. If not explicitly 
73                provided, it will default to the `ancestry_map` from `laiobj`.
74        """
75        if self.__ancestry_map is not None:
76            return self.__ancestry_map
77        elif self.laiobj.ancestry_map is not None:
78            return self.laiobj.ancestry_map
79        else:
80            raise ValueError(
81                "Ancestry mapping is required but missing. Provide `ancestry_map` "
82                "during initialization or ensure `laiobj.ancestry_map` is set."
83            )

Retrieve ancestry_map.

Returns:

dict of str to str: A dictionary mapping ancestry codes to region names. If not explicitly provided, it will default to the ancestry_map from laiobj.

def write(self) -> None: View Source

 85    def write(self) -> None:
 86        """
 87        Write VCF or BCF files for each ancestry type defined in the ancestry map.
 88        If the file already exists, it will be overwritten.
 89
 90        **Output VCF/BCF content:**
 91        
 92        For each ancestry, this method converts LAI data to SNP alleles and writes it in a VCF-compatible format.
 93        SNPs are encoded as follows:
 94
 95        - `1`: Indicates positions that match the specified ancestry.
 96        - `0`: Indicates positions that do not match the specified ancestry.
 97
 98        The VCF/BCF files will contain the following fields:
 99
100        - `CHROM`: Chromosome for each variant.
101        - `POS`: Chromosomal positions for each variant.
102        - `ID`: Unique identifier for each variant.
103        - `REF`: Reference allele for each variant.
104        - `ALT`: Alternate allele for each variant.
105        - `QUAL`: Phred-scaled quality score for each variant.
106        - `FILTER`: Status indicating whether each SNP passed control checks.
107        - `INFO`: When physical positions are available, contains `END=<end_pos>` for the segment end; otherwise `'.'`.
108        - `FORMAT`: Genotype format. Set to `'GT'`, representing the genotype as phased alleles.
109        - `<SampleID>`: One column per sample, containing the genotype data (`1|0`, `0|1`, etc.).
110
111        **Output files:**
112
113        - A separate VCF file is written for each ancestry type, with filenames formatted as:
114        `<filename>_<ancestry>.vcf` (e.g., `output_African.vcf`).
115        """
116        # Process the list of positions to include both the start and end coordinates for each window
117        # Iterate over each ancestry key in the ancestry mapping
118        for key in self.ancestry_map:
119            ancestry = int(key)
120            anc_string = self.ancestry_map[key]
121
122            # Define the output file format, ensuring it has the correct ancestry-specific suffix
123            file_extension = (".vcf", ".bcf")
124            
125            # Check if file has one of the specified extensions
126            if self.file.suffix not in file_extension:
127                # If file does not have the correct extension, default to ".vcf"
128                output_file = self.file.with_name(f"{self.file.stem}_{anc_string}.vcf")
129            else:
130                # If file has the correct extension, insert the ancestry string before the extension
131                output_file = self.file.with_name(f"{self.file.stem}_{anc_string}{self.file.suffix}")
132
133            # Check if file already exists
134            if output_file.exists():
135                warnings.warn(f"File '{output_file}' already exists and will be overwritten.")
136
137            if self.laiobj.physical_pos is not None:
138                pos_list = np.array([val1 for val1, _ in self.laiobj.physical_pos], dtype=np.int64)
139                variants_info = [f"END={val2}" for _, val2 in self.laiobj.physical_pos]
140            else:
141                pos_list = None
142                variants_info = None
143
144            # Modify LAI data values to simulate a SNP file
145            # The positions in LAI corresponding to the current ancestry key are mapped to 1, and the rest to 0
146            
147            match = (self.laiobj.lai == ancestry)
148            match = match.view(np.int8)
149            match = match.reshape(len(self.laiobj.lai),int(len(self.laiobj.lai[0])/2), 2 )
150
151
152            # Set up VCF-related data
153            calldata_gt = match
154            del match
155            gc.collect()
156            samples = np.array(self.laiobj.samples)
157            variants_chrom = self.laiobj.chromosomes
158            variants_list = [str(i+1) for i in range(len(self.laiobj.lai))]
159            variants_id = np.array(variants_list)
160            variants_ref = np.full(calldata_gt.shape[0], 'A', dtype='U5')
161            variants_alt = np.full(calldata_gt.shape[0], 'T', dtype='U1')
162
163            # Create the SNPObject
164            variant_data_obj = SNPObject(
165                calldata_gt=calldata_gt,
166                samples=samples,
167                variants_chrom=variants_chrom,
168                variants_id=variants_id,
169                variants_ref = variants_ref,
170                variants_alt = variants_alt,
171                variants_pos = pos_list,
172            )
173
174            # Log the start of the VCF file writing process
175            log.info(f"Writing VCF file for ancestry '{anc_string}' to '{output_file}'...")
176
177            vcf_writer = VCFWriter(variant_data_obj, output_file)
178            vcf_writer.write(variants_info=variants_info)
179
180            log.info(f"Finished writing VCF file for ancestry '{anc_string}' to '{output_file}'.")
181
182        return

Write VCF or BCF files for each ancestry type defined in the ancestry map. If an output file already exists, a warning is issued and the file is overwritten.

Output VCF/BCF content:

For each ancestry, this method converts LAI data to SNP alleles and writes it in a VCF-compatible format. SNPs are encoded as follows:

1: Indicates positions that match the specified ancestry.
0: Indicates positions that do not match the specified ancestry.

The VCF/BCF files will contain the following fields:

CHROM: Chromosome for each variant.
POS: Chromosomal positions for each variant.
ID: Unique identifier for each variant.
REF: Reference allele for each variant.
ALT: Alternate allele for each variant.
QUAL: Phred-scaled quality score for each variant.
FILTER: Status indicating whether each SNP passed control checks.
INFO: When physical positions are available, contains END=<end_pos> for the segment end; otherwise '.'.
FORMAT: Genotype format. Set to 'GT', representing the genotype as phased alleles.
<SampleID>: One column per sample, containing the genotype data (1|0, 0|1, etc.).

Output files:

A separate VCF or BCF file is written for each ancestry type, with filenames formatted as: <filename>_<ancestry>.vcf (e.g., output_African.vcf), or <filename>_<ancestry>.bcf when the target path ends with .bcf.

class AdmixtureReader(snputils.ancestry.io.wide.read.base.WideBaseReader): View Source

 13class AdmixtureReader(WideBaseReader):
 14    """
 15    A reader class for parsing ADMIXTURE files and constructing a `snputils.ancestry.genobj.GlobalAncestryObject`.
 16    """
 17    def __init__(
 18        self,
 19        Q_file: Union[str, Path],
 20        P_file: Optional[Union[str, Path]] = None,
 21        sample_file: Optional[Union[str, Path]] = None,
 22        snp_file: Optional[Union[str, Path]] = None,
 23        ancestry_file: Optional[Union[str, Path]] = None,
 24    ) -> None:
 25        """
 26        Args:
 27            Q_file (str or pathlib.Path):
 28                Path to the file containing the Q matrix (per-sample ancestry proportions).
 29                It should end with .Q or .txt.
 30                The file should use space (' ') as the delimiter.
 31            P_file (str or pathlib.Path, optional):
 32                Path to the file containing the P/F matrix (per-ancestry SNP frequencies).
 33                It should end with .P or .txt.
 34                The file should use space (' ') as the delimiter. If None, P is not loaded.
 35            sample_file (str or pathlib.Path, optional):
 36                Path to the single-column file containing sample identifiers. 
 37                It should end with .fam or .txt.
 38                If None, sample identifiers are not loaded.
 39            snp_file (str or pathlib.Path, optional):
 40                Path to the single-column file containing SNP identifiers. 
 41                It should end with .bim or .txt.
 42                If None, SNP identifiers are not loaded.
 43            ancestry_file (str or pathlib.Path, optional):
 44                Path to the single-column file containing ancestry labels for each sample.
 45                It should end with .map or .txt.
 46                If None, ancestries are not loaded.
 47        """
 48        self.__Q_file = Path(Q_file)
 49        self.__P_file = Path(P_file) if P_file is not None else None
 50        self.__sample_file = Path(sample_file) if sample_file is not None else None
 51        self.__snp_file = Path(snp_file) if snp_file is not None else None
 52        self.__ancestry_file = Path(ancestry_file) if ancestry_file is not None else None
 53
 54    @property
 55    def Q_file(self) -> Path:
 56        """
 57        Retrieve Q_file.
 58
 59        Returns:
 60            **pathlib.Path:** 
 61                Path to the file containing the Q matrix (per-sample ancestry proportions).
 62                It should end with .Q or .txt.
 63                The file should use space (' ') as the delimiter.
 64        """
 65        return self.__Q_file
 66
 67    @property
 68    def P_file(self) -> Optional[Path]:
 69        """
 70        Retrieve P_file.
 71
 72        Returns:
 73            **pathlib.Path or None:** 
 74                Path to the file containing the P/F matrix (per-ancestry SNP frequencies).
 75                It should end with .P or .txt.
 76                The file should use space (' ') as the delimiter. If None, P is not loaded.
 77        """
 78        return self.__P_file
 79
 80    @property
 81    def sample_file(self) -> Optional[Path]:
 82        """
 83        Retrieve sample_file.
 84
 85        Returns:
 86            **pathlib.Path:** 
 87                Path to the single-column file containing sample identifiers. 
 88                It should end with .fam or .txt.
 89                If None, sample identifiers are not loaded.
 90        """
 91        return self.__sample_file
 92    
 93    @property
 94    def snp_file(self) -> Optional[Path]:
 95        """
 96        Retrieve snp_file.
 97
 98        Returns:
 99            **pathlib.Path:** 
100                Path to the single-column file containing SNP identifiers. 
101                It should end with .bim or .txt.
102                If None, SNP identifiers are not loaded.
103        """
104        return self.__snp_file
105
106    @property
107    def ancestry_file(self) -> Optional[Path]:
108        """
109        Retrieve ancestry_file.
110
111        Returns:
112            **pathlib.Path:** 
113                Path to the single-column file containing ancestry labels for each sample.
114                It should end with .map or .txt.
115                If None, ancestries are not loaded.
116        """
117        return self.__ancestry_file
118
119    def read(self) -> 'GlobalAncestryObject':
120        """
121        Read data from the provided ADMIXTURE files and construct a 
122        snputils.ancestry.genobj.GlobalAncestryObject instance.
123
124        **Expected ADMIXTURE files content:**
125
126        - **Q_file**: 
127            A text file containing the Q matrix with per-sample ancestry proportions. 
128             Each row corresponds to a sample, and each column corresponds to an ancestry.
129        - **P_file**: 
130            A text file containing the P matrix with per-ancestry SNP frequencies.
131            Each row corresponds to a SNP, and each column corresponds to an ancestry.
132
133        Optional files (if provided):
134        - **sample_file**: A single-column text file containing sample identifiers in order.
135        - **snp_file**: A single-column text file containing SNP identifiers in order.
136        - **ancestry_file**: A single-column text file containing ancestry labels for each sample.
137
138        Returns:
139            **GlobalAncestryObject:** 
140                A GlobalAncestryObject instance.
141        """
142        log.info(f"Reading Q matrix from '{self.Q_file}'...")
143        Q_mat = np.genfromtxt(self.Q_file, delimiter=' ')
144        if self.P_file is not None:
145            log.info(f"Reading P matrix from '{self.P_file}'...")
146            P_mat = np.genfromtxt(self.P_file, delimiter=' ')
147        else:
148            P_mat = None
149
150        samples = self._read_sample_ids()
151        snps = self._read_snps()
152        ancestries = self._read_ancestries()
153
154        return GlobalAncestryObject(
155            Q_mat,
156            P_mat,
157            samples=samples,
158            snps=snps,
159            ancestries=ancestries
160        )

A reader class for parsing ADMIXTURE files and constructing a snputils.ancestry.genobj.GlobalAncestryObject.

17    def __init__(
18        self,
19        Q_file: Union[str, Path],
20        P_file: Optional[Union[str, Path]] = None,
21        sample_file: Optional[Union[str, Path]] = None,
22        snp_file: Optional[Union[str, Path]] = None,
23        ancestry_file: Optional[Union[str, Path]] = None,
24    ) -> None:
25        """
26        Args:
27            Q_file (str or pathlib.Path):
28                Path to the file containing the Q matrix (per-sample ancestry proportions).
29                It should end with .Q or .txt.
30                The file should use space (' ') as the delimiter.
31            P_file (str or pathlib.Path, optional):
32                Path to the file containing the P/F matrix (per-ancestry SNP frequencies).
33                It should end with .P or .txt.
34                The file should use space (' ') as the delimiter. If None, P is not loaded.
35            sample_file (str or pathlib.Path, optional):
36                Path to the single-column file containing sample identifiers. 
37                It should end with .fam or .txt.
38                If None, sample identifiers are not loaded.
39            snp_file (str or pathlib.Path, optional):
40                Path to the single-column file containing SNP identifiers. 
41                It should end with .bim or .txt.
42                If None, SNP identifiers are not loaded.
43            ancestry_file (str or pathlib.Path, optional):
44                Path to the single-column file containing ancestry labels for each sample.
45                It should end with .map or .txt.
46                If None, ancestries are not loaded.
47        """
48        self.__Q_file = Path(Q_file)
49        self.__P_file = Path(P_file) if P_file is not None else None
50        self.__sample_file = Path(sample_file) if sample_file is not None else None
51        self.__snp_file = Path(snp_file) if snp_file is not None else None
52        self.__ancestry_file = Path(ancestry_file) if ancestry_file is not None else None

Arguments:

Q_file (str or pathlib.Path): Path to the file containing the Q matrix (per-sample ancestry proportions). It should end with .Q or .txt. The file should use space (' ') as the delimiter.
P_file (str or pathlib.Path, optional): Path to the file containing the P/F matrix (per-ancestry SNP frequencies). It should end with .P or .txt. The file should use space (' ') as the delimiter. If None, P is not loaded.
sample_file (str or pathlib.Path, optional): Path to the single-column file containing sample identifiers. It should end with .fam or .txt. If None, sample identifiers are not loaded.
snp_file (str or pathlib.Path, optional): Path to the single-column file containing SNP identifiers. It should end with .bim or .txt. If None, SNP identifiers are not loaded.
ancestry_file (str or pathlib.Path, optional): Path to the single-column file containing ancestry labels for each sample. It should end with .map or .txt. If None, ancestries are not loaded.

Q_file: pathlib.Path View Source

54    @property
55    def Q_file(self) -> Path:
56        """
57        Retrieve Q_file.
58
59        Returns:
60            **pathlib.Path:** 
61                Path to the file containing the Q matrix (per-sample ancestry proportions).
62                It should end with .Q or .txt.
63                The file should use space (' ') as the delimiter.
64        """
65        return self.__Q_file

Retrieve Q_file.

Returns:

pathlib.Path: Path to the file containing the Q matrix (per-sample ancestry proportions). It should end with .Q or .txt. The file should use space (' ') as the delimiter.

P_file: pathlib.Path | None View Source

67    @property
68    def P_file(self) -> Optional[Path]:
69        """
70        Retrieve P_file.
71
72        Returns:
73            **pathlib.Path or None:** 
74                Path to the file containing the P/F matrix (per-ancestry SNP frequencies).
75                It should end with .P or .txt.
76                The file should use space (' ') as the delimiter. If None, P is not loaded.
77        """
78        return self.__P_file

Retrieve P_file.

Returns:

pathlib.Path or None: Path to the file containing the P/F matrix (per-ancestry SNP frequencies). It should end with .P or .txt. The file should use space (' ') as the delimiter. If None, P is not loaded.

sample_file: pathlib.Path | None View Source

80    @property
81    def sample_file(self) -> Optional[Path]:
82        """
83        Retrieve sample_file.
84
85        Returns:
86            **pathlib.Path:** 
87                Path to the single-column file containing sample identifiers. 
88                It should end with .fam or .txt.
89                If None, sample identifiers are not loaded.
90        """
91        return self.__sample_file

Retrieve sample_file.

Returns:

pathlib.Path or None: Path to the single-column file containing sample identifiers. It should end with .fam or .txt. If None, sample identifiers are not loaded.

snp_file: pathlib.Path | None View Source

 93    @property
 94    def snp_file(self) -> Optional[Path]:
 95        """
 96        Retrieve snp_file.
 97
 98        Returns:
 99            **pathlib.Path:** 
100                Path to the single-column file containing SNP identifiers. 
101                It should end with .bim or .txt.
102                If None, SNP identifiers are not loaded.
103        """
104        return self.__snp_file

Retrieve snp_file.

Returns:

pathlib.Path or None: Path to the single-column file containing SNP identifiers. It should end with .bim or .txt. If None, SNP identifiers are not loaded.

ancestry_file: pathlib.Path | None View Source

106    @property
107    def ancestry_file(self) -> Optional[Path]:
108        """
109        Retrieve ancestry_file.
110
111        Returns:
112            **pathlib.Path:** 
113                Path to the single-column file containing ancestry labels for each sample.
114                It should end with .map or .txt.
115                If None, ancestries are not loaded.
116        """
117        return self.__ancestry_file

Retrieve ancestry_file.

Returns:

pathlib.Path or None: Path to the single-column file containing ancestry labels for each sample. It should end with .map or .txt. If None, ancestries are not loaded.

def read(self) -> GlobalAncestryObject: View Source

119    def read(self) -> 'GlobalAncestryObject':
120        """
121        Read data from the provided ADMIXTURE files and construct a 
122        snputils.ancestry.genobj.GlobalAncestryObject instance.
123
124        **Expected ADMIXTURE files content:**
125
126        - **Q_file**: 
127            A text file containing the Q matrix with per-sample ancestry proportions. 
128             Each row corresponds to a sample, and each column corresponds to an ancestry.
129        - **P_file**: 
130            A text file containing the P matrix with per-ancestry SNP frequencies.
131            Each row corresponds to a SNP, and each column corresponds to an ancestry.
132
133        Optional files (if provided):
134        - **sample_file**: A single-column text file containing sample identifiers in order.
135        - **snp_file**: A single-column text file containing SNP identifiers in order.
136        - **ancestry_file**: A single-column text file containing ancestry labels for each sample.
137
138        Returns:
139            **GlobalAncestryObject:** 
140                A GlobalAncestryObject instance.
141        """
142        log.info(f"Reading Q matrix from '{self.Q_file}'...")
143        Q_mat = np.genfromtxt(self.Q_file, delimiter=' ')
144        if self.P_file is not None:
145            log.info(f"Reading P matrix from '{self.P_file}'...")
146            P_mat = np.genfromtxt(self.P_file, delimiter=' ')
147        else:
148            P_mat = None
149
150        samples = self._read_sample_ids()
151        snps = self._read_snps()
152        ancestries = self._read_ancestries()
153
154        return GlobalAncestryObject(
155            Q_mat,
156            P_mat,
157            samples=samples,
158            snps=snps,
159            ancestries=ancestries
160        )

Read data from the provided ADMIXTURE files and construct a snputils.ancestry.genobj.GlobalAncestryObject instance.

Expected ADMIXTURE files content:

Q_file: A text file containing the Q matrix with per-sample ancestry proportions. Each row corresponds to a sample, and each column corresponds to an ancestry.
P_file: A text file containing the P matrix with per-ancestry SNP frequencies. Each row corresponds to a SNP, and each column corresponds to an ancestry.

Optional files (if provided):

sample_file: A single-column text file containing sample identifiers in order.
snp_file: A single-column text file containing SNP identifiers in order.
ancestry_file: A single-column text file containing ancestry labels for each sample.

Returns:

GlobalAncestryObject: A GlobalAncestryObject instance.

def read_lai( file: str | pathlib.Path, **kwargs) -> LocalAncestryObject: View Source

 8def read_lai(file: Union[str, Path], **kwargs) -> LocalAncestryObject:
 9    """
10    Automatically detect the local ancestry data file format from the file's extension and 
11    read it into a `snputils.ancestry.genobj.LocalAncestryObject`.
12
13    **Supported formats:**
14
15    - `.msp`: Text-based MSP format.
16    - `.msp.tsv`: Text-based MSP format with TSV extension.
17    
18    Args:
19        file (str or pathlib.Path): 
20            Path to the file to be read. It should end with `.msp` or `.msp.tsv`.
21        **kwargs: Additional arguments passed to the reader method.
22    """
23    from snputils.ancestry.io.local.read.auto import LAIReader
24
25    return LAIReader(file).read(**kwargs)

Automatically detect the local ancestry data file format from the file's extension and read it into a snputils.ancestry.genobj.LocalAncestryObject.

Supported formats:

.msp: Text-based MSP format.
.msp.tsv: Text-based MSP format with TSV extension.

Arguments:

file (str or pathlib.Path): Path to the file to be read. It should end with .msp or .msp.tsv.
**kwargs: Additional arguments passed to the reader method.

def read_msp( file: str | pathlib.Path) -> LocalAncestryObject: View Source

28def read_msp(file: Union[str, Path]) -> 'LocalAncestryObject':
29    """
30    Read data from an `.msp` or `.msp.tsv` file and construct a `snputils.ancestry.genobj.LocalAncestryObject`.
31
32    Args:
33        file (str or pathlib.Path): 
34            Path to the file to be read. It should end with `.msp` or `.msp.tsv`.
35
36    Returns:
37        **LocalAncestryObject:**
38            A LocalAncestryObject instance.
39    """
40    from snputils.ancestry.io.local.read.msp import MSPReader
41
42    return MSPReader(file).read()

Read data from an .msp or .msp.tsv file and construct a snputils.ancestry.genobj.LocalAncestryObject.

Arguments:

file (str or pathlib.Path): Path to the file to be read. It should end with .msp or .msp.tsv.

Returns:

LocalAncestryObject: A LocalAncestryObject instance.

 8def read_admixture(
 9    Q_file: Union[str, Path],
10    P_file: Optional[Union[str, Path]] = None,
11    sample_file: Optional[Union[str, Path]] = None,
12    snp_file: Optional[Union[str, Path]] = None,
13    ancestry_file: Optional[Union[str, Path]] = None,
14) -> 'GlobalAncestryObject':
15    """
16    Read ADMIXTURE files into a `snputils.ancestry.genobj.GlobalAncestryObject`.
17
18    Args:
19        Q_file (str or pathlib.Path):
20            Path to the file containing the Q matrix (per-sample ancestry proportions).
21            It should end with .Q or .txt.
22            The file should use space (' ') as the delimiter.
23        P_file (str or pathlib.Path, optional):
24            Path to the file containing the P/F matrix (per-ancestry SNP frequencies).
25            It should end with .P or .txt.
26            The file should use space (' ') as the delimiter. If None, P is not loaded.
27        sample_file (str or pathlib.Path, optional):
28            Path to the single-column file containing sample identifiers. 
29            It should end with .fam or .txt.
30            If None, sample identifiers are not loaded.
31        snp_file (str or pathlib.Path, optional):
32            Path to the single-column file containing SNP identifiers. 
33            It should end with .bim or .txt.
34            If None, SNP identifiers are not loaded.
35        ancestry_file (str or pathlib.Path, optional):
36            Path to the single-column file containing ancestry labels for each sample.
37            It should end with .map or .txt.
38            If None, ancestries are not loaded.
39
40    Returns:
41            **GlobalAncestryObject:** 
42                A GlobalAncestryObject instance.
43    """
44    from snputils.ancestry.io.wide.read.admixture import AdmixtureReader
45
46    return AdmixtureReader(
47        Q_file=Q_file,
48        P_file=P_file,
49        sample_file=sample_file,
50        snp_file=snp_file,
51        ancestry_file=ancestry_file
52    ).read()

Read ADMIXTURE files into a snputils.ancestry.genobj.GlobalAncestryObject.

Arguments:

Q_file (str or pathlib.Path): Path to the file containing the Q matrix (per-sample ancestry proportions). It should end with .Q or .txt. The file should use space (' ') as the delimiter.
P_file (str or pathlib.Path, optional): Path to the file containing the P/F matrix (per-ancestry SNP frequencies). It should end with .P or .txt. The file should use space (' ') as the delimiter. If None, P is not loaded.
sample_file (str or pathlib.Path, optional): Path to the single-column file containing sample identifiers. It should end with .fam or .txt. If None, sample identifiers are not loaded.
snp_file (str or pathlib.Path, optional): Path to the single-column file containing SNP identifiers. It should end with .bim or .txt. If None, SNP identifiers are not loaded.
ancestry_file (str or pathlib.Path, optional): Path to the single-column file containing ancestry labels for each sample. It should end with .map or .txt. If None, ancestries are not loaded.

Returns:

GlobalAncestryObject: A GlobalAncestryObject instance.

 8def read_admixture(
 9    Q_file: Union[str, Path],
10    P_file: Optional[Union[str, Path]] = None,
11    sample_file: Optional[Union[str, Path]] = None,
12    snp_file: Optional[Union[str, Path]] = None,
13    ancestry_file: Optional[Union[str, Path]] = None,
14) -> 'GlobalAncestryObject':
15    """
16    Read ADMIXTURE files into a `snputils.ancestry.genobj.GlobalAncestryObject`.
17
18    Args:
19        Q_file (str or pathlib.Path):
20            Path to the file containing the Q matrix (per-sample ancestry proportions).
21            It should end with .Q or .txt.
22            The file should use space (' ') as the delimiter.
23        P_file (str or pathlib.Path, optional):
24            Path to the file containing the P/F matrix (per-ancestry SNP frequencies).
25            It should end with .P or .txt.
26            The file should use space (' ') as the delimiter. If None, P is not loaded.
27        sample_file (str or pathlib.Path, optional):
28            Path to the single-column file containing sample identifiers. 
29            It should end with .fam or .txt.
30            If None, sample identifiers are not loaded.
31        snp_file (str or pathlib.Path, optional):
32            Path to the single-column file containing SNP identifiers. 
33            It should end with .bim or .txt.
34            If None, SNP identifiers are not loaded.
35        ancestry_file (str or pathlib.Path, optional):
36            Path to the single-column file containing ancestry labels for each sample.
37            It should end with .map or .txt.
38            If None, ancestries are not loaded.
39
40    Returns:
41            **GlobalAncestryObject:** 
42                A GlobalAncestryObject instance.
43    """
44    from snputils.ancestry.io.wide.read.admixture import AdmixtureReader
45
46    return AdmixtureReader(
47        Q_file=Q_file,
48        P_file=P_file,
49        sample_file=sample_file,
50        snp_file=snp_file,
51        ancestry_file=ancestry_file
52    ).read()

Read ADMIXTURE files into a snputils.ancestry.genobj.GlobalAncestryObject.

Arguments:

Q_file (str or pathlib.Path): Path to the file containing the Q matrix (per-sample ancestry proportions). It should end with .Q or .txt. The file should use space (' ') as the delimiter.
P_file (str or pathlib.Path, optional): Path to the file containing the P/F matrix (per-ancestry SNP frequencies). It should end with .P or .txt. The file should use space (' ') as the delimiter. If None, P is not loaded.
sample_file (str or pathlib.Path, optional): Path to the single-column file containing sample identifiers. It should end with .fam or .txt. If None, sample identifiers are not loaded.
snp_file (str or pathlib.Path, optional): Path to the single-column file containing SNP identifiers. It should end with .bim or .txt. If None, SNP identifiers are not loaded.
ancestry_file (str or pathlib.Path, optional): Path to the single-column file containing ancestry labels for each sample. It should end with .map or .txt. If None, ancestries are not loaded.

Returns:

GlobalAncestryObject: A GlobalAncestryObject instance.

def read_ibd( file: str | pathlib.Path, **kwargs) -> IBDObject: View Source

 8def read_ibd(file: Union[str, Path], **kwargs) -> IBDObject:
 9    """
10    Automatically detect the IBD data file format from the file's extension and read it into an `IBDObject`.
11
12    Supported formats:
13    - Hap-IBD (no standard extension; defaults to tab-delimited columns without header).
14    - ancIBD (template only).
15
16    Args:
17        file (str or pathlib.Path): Path to the file to be read.
18        **kwargs: Additional arguments passed to the reader method.
19    """
20    from snputils.ibd.io.read.auto import IBDReader
21
22    return IBDReader(file).read(**kwargs)

Automatically detect the IBD data file format from the file's extension and read it into an IBDObject.

Supported formats:

Hap-IBD (no standard extension; defaults to tab-delimited columns without header).
ancIBD (template only).

Arguments:

file (str or pathlib.Path): Path to the file to be read.
**kwargs: Additional arguments passed to the reader method.

class HapIBDReader(snputils.ibd.io.read.base.IBDBaseReader): View Source

 18class HapIBDReader(IBDBaseReader):
 19    """
 20    Reads an IBD file in Hap-IBD format and processes it into an `IBDObject`.
 21    """
 22
 23    def read(self, separator: Optional[str] = None) -> IBDObject:
 24        """
 25        Read a Hap-IBD file into an `IBDObject`.
 26
 27        The Hap-IBD format is a delimited text without a header with columns:
 28        sample_id_1, haplotype_id_1, sample_id_2, haplotype_id_2, chromosome, start, end, length_cm
 29
 30        Notes:
 31        - Haplotype identifiers are 1-based and take values in {1, 2}.
 32
 33        Args:
 34            separator (str, optional): Field delimiter. If None, whitespace (any number of spaces or tabs) is assumed.
 35
 36        Returns:
 37            **IBDObject**: An IBDObject instance.
 38        """
 39        log.info(f"Reading {self.file}")
 40
 41        # Column names for Hap-IBD files (no header present in input)
 42        col_names = [
 43            'sample_id_1', 'haplotype_id_1', 'sample_id_2', 'haplotype_id_2',
 44            'chrom', 'start', 'end', 'length_cm'
 45        ]
 46
 47        # Detect gzip by extension
 48        is_gz = str(self.file).endswith('.gz')
 49
 50        # If separator is None, treat as whitespace-delimited (any spaces or tabs)
 51        if separator is None:
 52            # Polars doesn't support regex separators; normalize whitespace to single tabs before parsing
 53            if is_gz:
 54                with gzip.open(self.file, 'rt') as f:
 55                    lines = [re.sub(r"\s+", "\t", line.strip()) for line in f if line.strip()]
 56            else:
 57                with open(self.file, 'r') as f:
 58                    lines = [re.sub(r"\s+", "\t", line.strip()) for line in f if line.strip()]
 59
 60            data = StringIO("\n".join(lines))
 61            df = pl.read_csv(
 62                source=data,
 63                has_header=False,
 64                separator='\t',
 65                new_columns=col_names,
 66                schema_overrides={
 67                    'sample_id_1': pl.Utf8,
 68                    'haplotype_id_1': pl.Int8,
 69                    'sample_id_2': pl.Utf8,
 70                    'haplotype_id_2': pl.Int8,
 71                    'chrom': pl.Utf8,
 72                    'start': pl.Int64,
 73                    'end': pl.Int64,
 74                    'length_cm': pl.Float64,
 75                },
 76            )
 77        else:
 78            df = pl.read_csv(
 79                source=str(self.file),
 80                has_header=False,
 81                separator=separator,
 82                new_columns=col_names,
 83                schema_overrides={
 84                    'sample_id_1': pl.Utf8,
 85                    'haplotype_id_1': pl.Int8,
 86                    'sample_id_2': pl.Utf8,
 87                    'haplotype_id_2': pl.Int8,
 88                    'chrom': pl.Utf8,
 89                    'start': pl.Int64,
 90                    'end': pl.Int64,
 91                    'length_cm': pl.Float64,
 92                },
 93            )
 94
 95        ibdobj = IBDObject(
 96            sample_id_1=df['sample_id_1'].to_numpy(),
 97            haplotype_id_1=df['haplotype_id_1'].to_numpy(),
 98            sample_id_2=df['sample_id_2'].to_numpy(),
 99            haplotype_id_2=df['haplotype_id_2'].to_numpy(),
100            chrom=df['chrom'].to_numpy(),
101            start=df['start'].to_numpy(),
102            end=df['end'].to_numpy(),
103            length_cm=df['length_cm'].to_numpy(),
104            segment_type=np.array(["IBD1"] * df.height),  # hap-IBD does not distinguish; treat as IBD1
105        )
106
107        log.info(f"Finished reading {self.file}")
108
109        return ibdobj

Reads an IBD file in Hap-IBD format and processes it into an IBDObject.

def read( self, separator: str | None = None) -> IBDObject: View Source

 23    def read(self, separator: Optional[str] = None) -> IBDObject:
 24        """
 25        Read a Hap-IBD file into an `IBDObject`.
 26
 27        The Hap-IBD format is a delimited text without a header with columns:
 28        sample_id_1, haplotype_id_1, sample_id_2, haplotype_id_2, chromosome, start, end, length_cm
 29
 30        Notes:
 31        - Haplotype identifiers are 1-based and take values in {1, 2}.
 32
 33        Args:
 34            separator (str, optional): Field delimiter. If None, whitespace (any number of spaces or tabs) is assumed.
 35
 36        Returns:
 37            **IBDObject**: An IBDObject instance.
 38        """
 39        log.info(f"Reading {self.file}")
 40
 41        # Column names for Hap-IBD files (no header present in input)
 42        col_names = [
 43            'sample_id_1', 'haplotype_id_1', 'sample_id_2', 'haplotype_id_2',
 44            'chrom', 'start', 'end', 'length_cm'
 45        ]
 46
 47        # Detect gzip by extension
 48        is_gz = str(self.file).endswith('.gz')
 49
 50        # If separator is None, treat as whitespace-delimited (any spaces or tabs)
 51        if separator is None:
 52            # Polars doesn't support regex separators; normalize whitespace to single tabs before parsing
 53            if is_gz:
 54                with gzip.open(self.file, 'rt') as f:
 55                    lines = [re.sub(r"\s+", "\t", line.strip()) for line in f if line.strip()]
 56            else:
 57                with open(self.file, 'r') as f:
 58                    lines = [re.sub(r"\s+", "\t", line.strip()) for line in f if line.strip()]
 59
 60            data = StringIO("\n".join(lines))
 61            df = pl.read_csv(
 62                source=data,
 63                has_header=False,
 64                separator='\t',
 65                new_columns=col_names,
 66                schema_overrides={
 67                    'sample_id_1': pl.Utf8,
 68                    'haplotype_id_1': pl.Int8,
 69                    'sample_id_2': pl.Utf8,
 70                    'haplotype_id_2': pl.Int8,
 71                    'chrom': pl.Utf8,
 72                    'start': pl.Int64,
 73                    'end': pl.Int64,
 74                    'length_cm': pl.Float64,
 75                },
 76            )
 77        else:
 78            df = pl.read_csv(
 79                source=str(self.file),
 80                has_header=False,
 81                separator=separator,
 82                new_columns=col_names,
 83                schema_overrides={
 84                    'sample_id_1': pl.Utf8,
 85                    'haplotype_id_1': pl.Int8,
 86                    'sample_id_2': pl.Utf8,
 87                    'haplotype_id_2': pl.Int8,
 88                    'chrom': pl.Utf8,
 89                    'start': pl.Int64,
 90                    'end': pl.Int64,
 91                    'length_cm': pl.Float64,
 92                },
 93            )
 94
 95        ibdobj = IBDObject(
 96            sample_id_1=df['sample_id_1'].to_numpy(),
 97            haplotype_id_1=df['haplotype_id_1'].to_numpy(),
 98            sample_id_2=df['sample_id_2'].to_numpy(),
 99            haplotype_id_2=df['haplotype_id_2'].to_numpy(),
100            chrom=df['chrom'].to_numpy(),
101            start=df['start'].to_numpy(),
102            end=df['end'].to_numpy(),
103            length_cm=df['length_cm'].to_numpy(),
104            segment_type=np.array(["IBD1"] * df.height),  # hap-IBD does not distinguish; treat as IBD1
105        )
106
107        log.info(f"Finished reading {self.file}")
108
109        return ibdobj

Read a Hap-IBD file into an IBDObject.

The Hap-IBD format is a delimited text without a header with columns: sample_id_1, haplotype_id_1, sample_id_2, haplotype_id_2, chromosome, start, end, length_cm

Notes:

Haplotype identifiers are 1-based and take values in {1, 2}.

Arguments:

separator (str, optional): Field delimiter. If None, whitespace (any number of spaces or tabs) is assumed.

Returns:

IBDObject: An IBDObject instance.

class AncIBDReader(snputils.ibd.io.read.base.IBDBaseReader): View Source

 17class AncIBDReader(IBDBaseReader):
 18    """
 19    Reads IBD data from ancIBD outputs (TSV), accepting a file (`ch_all.tsv` or `ch*.tsv`) or a directory.
 20    """
 21
 22    def read(
 23        self,
 24        path: Optional[Union[str, Path]] = None,
 25        include_segment_types: Optional[Sequence[str]] = ("IBD1", "IBD2"),
 26    ) -> IBDObject:
 27        """
 28        Read ancIBD outputs and convert to `IBDObject`.
 29
 30        Inputs accepted:
 31        - A single TSV (optionally gzipped), e.g. `ch_all.tsv[.gz]` or `ch{CHR}.tsv[.gz]`.
 32        - A directory containing per-chromosome TSVs or `ch_all.tsv`.
 33
 34        Column schema (tab-separated with header):
 35        iid1, iid2, ch, Start, End, length, StartM, EndM, lengthM, StartBP, EndBP, segment_type
 36
 37        Notes:
 38        - Haplotype indices are not provided by ancIBD; set to -1.
 39        - Positions in IBDObject use base-pair StartBP/EndBP.
 40        - Length uses centiMorgan as `lengthM * 100`.
 41
 42        Args:
 43            path (str or Path, optional): Override input path. Defaults to `self.file`.
 44            include_segment_types (sequence of str, optional): Filter by `segment_type` (e.g., IBD1, IBD2). None to disable.
 45
 46        Returns:
 47            **IBDObject**: An IBDObject instance.
 48        """
 49        p = Path(path) if path is not None else Path(self.file)
 50        log.info(f"Reading ancIBD from {p}")
 51
 52        files: list[Path]
 53        if p.is_dir():
 54            # Prefer combined file if present, else gather per-chromosome files
 55            combined = p / "ch_all.tsv"
 56            combined_gz = p / "ch_all.tsv.gz"
 57            if combined.exists():
 58                files = [combined]
 59            elif combined_gz.exists():
 60                files = [combined_gz]
 61            else:
 62                files = sorted(list(p.glob("ch*.tsv")) + list(p.glob("ch*.tsv.gz")))
 63                if not files:
 64                    raise FileNotFoundError("No ancIBD output files found in directory.")
 65        else:
 66            files = [p]
 67
 68        frames = []
 69        schema_overrides = {
 70            "iid1": pl.Utf8,
 71            "iid2": pl.Utf8,
 72            "ch": pl.Utf8,
 73            "Start": pl.Int64,
 74            "End": pl.Int64,
 75            "length": pl.Int64,  # marker span; not used
 76            "StartM": pl.Float64,
 77            "EndM": pl.Float64,
 78            "lengthM": pl.Float64,
 79            "StartBP": pl.Int64,
 80            "EndBP": pl.Int64,
 81            "segment_type": pl.Utf8,
 82        }
 83
 84        for f in files:
 85            frame = pl.read_csv(str(f), separator="\t", has_header=True, schema_overrides=schema_overrides)
 86            frames.append(frame)
 87
 88        df = pl.concat(frames, how="vertical") if len(frames) > 1 else frames[0]
 89
 90        if include_segment_types is not None:
 91            df = df.filter(pl.col("segment_type").is_in(list(include_segment_types)))
 92
 93        # Map columns to IBDObject schema
 94        sample_id_1 = df["iid1"].to_numpy()
 95        sample_id_2 = df["iid2"].to_numpy()
 96        chrom = df["ch"].to_numpy()
 97        start_bp = df["StartBP"].to_numpy()
 98        end_bp = df["EndBP"].to_numpy()
 99        length_cm = (df["lengthM"] * 100.0).to_numpy()
100
101        # ancIBD doesn't include haplotype indices; set to -1
102        hap1 = np.full(sample_id_1.shape[0], -1, dtype=np.int8)
103        hap2 = np.full(sample_id_2.shape[0], -1, dtype=np.int8)
104
105        ibdobj = IBDObject(
106            sample_id_1=sample_id_1,
107            haplotype_id_1=hap1,
108            sample_id_2=sample_id_2,
109            haplotype_id_2=hap2,
110            chrom=chrom,
111            start=start_bp,
112            end=end_bp,
113            length_cm=length_cm,
114            segment_type=df["segment_type"].to_numpy(),
115        )
116
117        log.info(f"Finished reading ancIBD from {p}")
118        return ibdobj

Reads IBD data from ancIBD outputs (TSV), accepting a file (ch_all.tsv or ch*.tsv) or a directory.

def read( self, path: str | pathlib.Path | None = None, include_segment_types: Sequence[str] | None = ('IBD1', 'IBD2')) -> IBDObject: View Source

 22    def read(
 23        self,
 24        path: Optional[Union[str, Path]] = None,
 25        include_segment_types: Optional[Sequence[str]] = ("IBD1", "IBD2"),
 26    ) -> IBDObject:
 27        """
 28        Read ancIBD outputs and convert to `IBDObject`.
 29
 30        Inputs accepted:
 31        - A single TSV (optionally gzipped), e.g. `ch_all.tsv[.gz]` or `ch{CHR}.tsv[.gz]`.
 32        - A directory containing per-chromosome TSVs or `ch_all.tsv`.
 33
 34        Column schema (tab-separated with header):
 35        iid1, iid2, ch, Start, End, length, StartM, EndM, lengthM, StartBP, EndBP, segment_type
 36
 37        Notes:
 38        - Haplotype indices are not provided by ancIBD; set to -1.
 39        - Positions in IBDObject use base-pair StartBP/EndBP.
 40        - Length uses centiMorgan as `lengthM * 100`.
 41
 42        Args:
 43            path (str or Path, optional): Override input path. Defaults to `self.file`.
 44            include_segment_types (sequence of str, optional): Filter by `segment_type` (e.g., IBD1, IBD2). None to disable.
 45
 46        Returns:
 47            **IBDObject**: An IBDObject instance.
 48        """
 49        p = Path(path) if path is not None else Path(self.file)
 50        log.info(f"Reading ancIBD from {p}")
 51
 52        files: list[Path]
 53        if p.is_dir():
 54            # Prefer combined file if present, else gather per-chromosome files
 55            combined = p / "ch_all.tsv"
 56            combined_gz = p / "ch_all.tsv.gz"
 57            if combined.exists():
 58                files = [combined]
 59            elif combined_gz.exists():
 60                files = [combined_gz]
 61            else:
 62                files = sorted(list(p.glob("ch*.tsv")) + list(p.glob("ch*.tsv.gz")))
 63                if not files:
 64                    raise FileNotFoundError("No ancIBD output files found in directory.")
 65        else:
 66            files = [p]
 67
 68        frames = []
 69        schema_overrides = {
 70            "iid1": pl.Utf8,
 71            "iid2": pl.Utf8,
 72            "ch": pl.Utf8,
 73            "Start": pl.Int64,
 74            "End": pl.Int64,
 75            "length": pl.Int64,  # marker span; not used
 76            "StartM": pl.Float64,
 77            "EndM": pl.Float64,
 78            "lengthM": pl.Float64,
 79            "StartBP": pl.Int64,
 80            "EndBP": pl.Int64,
 81            "segment_type": pl.Utf8,
 82        }
 83
 84        for f in files:
 85            frame = pl.read_csv(str(f), separator="\t", has_header=True, schema_overrides=schema_overrides)
 86            frames.append(frame)
 87
 88        df = pl.concat(frames, how="vertical") if len(frames) > 1 else frames[0]
 89
 90        if include_segment_types is not None:
 91            df = df.filter(pl.col("segment_type").is_in(list(include_segment_types)))
 92
 93        # Map columns to IBDObject schema
 94        sample_id_1 = df["iid1"].to_numpy()
 95        sample_id_2 = df["iid2"].to_numpy()
 96        chrom = df["ch"].to_numpy()
 97        start_bp = df["StartBP"].to_numpy()
 98        end_bp = df["EndBP"].to_numpy()
 99        length_cm = (df["lengthM"] * 100.0).to_numpy()
100
101        # ancIBD doesn't include haplotype indices; set to -1
102        hap1 = np.full(sample_id_1.shape[0], -1, dtype=np.int8)
103        hap2 = np.full(sample_id_2.shape[0], -1, dtype=np.int8)
104
105        ibdobj = IBDObject(
106            sample_id_1=sample_id_1,
107            haplotype_id_1=hap1,
108            sample_id_2=sample_id_2,
109            haplotype_id_2=hap2,
110            chrom=chrom,
111            start=start_bp,
112            end=end_bp,
113            length_cm=length_cm,
114            segment_type=df["segment_type"].to_numpy(),
115        )
116
117        log.info(f"Finished reading ancIBD from {p}")
118        return ibdobj

Read ancIBD outputs and convert to IBDObject.

Inputs accepted:

A single TSV (optionally gzipped), e.g. ch_all.tsv[.gz] or ch{CHR}.tsv[.gz].
A directory containing per-chromosome TSVs or ch_all.tsv.

Column schema (tab-separated with header): iid1, iid2, ch, Start, End, length, StartM, EndM, lengthM, StartBP, EndBP, segment_type

Notes:

Haplotype indices are not provided by ancIBD; set to -1.
Positions in IBDObject use base-pair StartBP/EndBP.
Length uses centiMorgan as lengthM * 100.

Arguments:

path (str or Path, optional): Override input path. Defaults to self.file.
include_segment_types (sequence of str, optional): Filter by segment_type (e.g., IBD1, IBD2). None to disable.

Returns:

IBDObject: An IBDObject instance.

class IBDReader: View Source

 8class IBDReader:
 9    def __new__(
10        cls,
11        file: Union[str, Path]
12    ) -> object:
13        """
14        A factory class that attempts to detect the IBD file format and returns the corresponding reader.
15
16        Supported detections:
17        - Hap-IBD: *.ibd or *.ibd.gz (headerless, 8 columns)
18        - ancIBD: directories with `ch_all.tsv`/`ch*.tsv` or files *.tsv / *.tsv.gz with ancIBD schema
19        """
20        file = Path(file)
21        suffixes = [s.lower() for s in file.suffixes]
22
23        # Directory-based detection for ancIBD
24        if file.is_dir():
25            if (file / 'ch_all.tsv').exists() or (file / 'ch_all.tsv.gz').exists():
26                from snputils.ibd.io.read.anc_ibd import AncIBDReader
27                return AncIBDReader(file)
28            has_chr_files = list(file.glob('ch*.tsv')) or list(file.glob('ch*.tsv.gz'))
29            if has_chr_files:
30                from snputils.ibd.io.read.anc_ibd import AncIBDReader
31                return AncIBDReader(file)
32            # Fallback to HapIBD if nothing matches
33            from snputils.ibd.io.read.hap_ibd import HapIBDReader
34            return HapIBDReader(file)
35
36        # File-based detection
37        if suffixes[-2:] == ['.ibd', '.gz'] or suffixes[-1:] == ['.ibd']:
38            from snputils.ibd.io.read.hap_ibd import HapIBDReader
39            return HapIBDReader(file)
40        if suffixes[-2:] == ['.tsv', '.gz'] or suffixes[-1:] == ['.tsv']:
41            from snputils.ibd.io.read.anc_ibd import AncIBDReader
42            return AncIBDReader(file)
43
44        # Default to HapIBDReader (most tools use .ibd[.gz])
45        from snputils.ibd.io.read.hap_ibd import HapIBDReader
46        return HapIBDReader(file)

IBDReader(file: str | pathlib.Path) View Source

 9    def __new__(
10        cls,
11        file: Union[str, Path]
12    ) -> object:
13        """
14        A factory class that attempts to detect the IBD file format and returns the corresponding reader.
15
16        Supported detections:
17        - Hap-IBD: *.ibd or *.ibd.gz (headerless, 8 columns)
18        - ancIBD: directories with `ch_all.tsv`/`ch*.tsv` or files *.tsv / *.tsv.gz with ancIBD schema
19        """
20        file = Path(file)
21        suffixes = [s.lower() for s in file.suffixes]
22
23        # Directory-based detection for ancIBD
24        if file.is_dir():
25            if (file / 'ch_all.tsv').exists() or (file / 'ch_all.tsv.gz').exists():
26                from snputils.ibd.io.read.anc_ibd import AncIBDReader
27                return AncIBDReader(file)
28            has_chr_files = list(file.glob('ch*.tsv')) or list(file.glob('ch*.tsv.gz'))
29            if has_chr_files:
30                from snputils.ibd.io.read.anc_ibd import AncIBDReader
31                return AncIBDReader(file)
32            # Fallback to HapIBD if nothing matches
33            from snputils.ibd.io.read.hap_ibd import HapIBDReader
34            return HapIBDReader(file)
35
36        # File-based detection
37        if suffixes[-2:] == ['.ibd', '.gz'] or suffixes[-1:] == ['.ibd']:
38            from snputils.ibd.io.read.hap_ibd import HapIBDReader
39            return HapIBDReader(file)
40        if suffixes[-2:] == ['.tsv', '.gz'] or suffixes[-1:] == ['.tsv']:
41            from snputils.ibd.io.read.anc_ibd import AncIBDReader
42            return AncIBDReader(file)
43
44        # Default to HapIBDReader (most tools use .ibd[.gz])
45        from snputils.ibd.io.read.hap_ibd import HapIBDReader
46        return HapIBDReader(file)

A factory class that attempts to detect the IBD file format and returns the corresponding reader.

Supported detections:

Hap-IBD: *.ibd or *.ibd.gz (headerless, 8 columns)
ancIBD: directories with ch_all.tsv/ch*.tsv or files *.tsv / *.tsv.gz with ancIBD schema

class MultiPhenotypeObject: View Source

  9class MultiPhenotypeObject():
 10    """
 11    A class for multi-phenotype data.
 12
 13    This class serves as a container for phenotype data, allowing for
 14    operations such as filtering samples and accessing phenotype information.
 15    It uses a DataFrame to store the data, with the first column reserved for the sample identifers.
 16    """
 17    def __init__(
 18        self,
 19        phen_df: pd.DataFrame
 20    ) -> None:
 21        """
 22        Args:
 23            phen_df (pd.DataFrame): 
 24                A Pandas DataFrame containing phenotype data, with the first column 
 25                representing sample identifiers.
 26        """
 27        self.__phen_df = phen_df
 28
 29    def __getitem__(self, key):
 30        """
 31        To access an attribute of the class using the square bracket notation,
 32        similar to a dictionary.
 33        """
 34        try:
 35            return getattr(self, key)
 36        except:
 37            raise KeyError(f'Invalid key: {key}')
 38
 39    def __setitem__(self, key, value):
 40        """
 41        To set an attribute of the class using the square bracket notation,
 42        similar to a dictionary.
 43        """
 44        try:
 45            setattr(self, key, value)
 46        except AttributeError:
 47            raise KeyError(f'Invalid key: {key}')
 48
 49    @property
 50    def phen_df(self) -> pd.DataFrame:
 51        """
 52        Retrieve `phen_df`.
 53
 54        Returns:
 55            pd.DataFrame: 
 56                A Pandas DataFrame containing phenotype data, with the first column 
 57                representing sample identifiers.
 58        """
 59        return self.__phen_df
 60    
 61    @phen_df.setter
 62    def phen_df(self, x: pd.DataFrame):
 63        """
 64        Update `phen_df`.
 65        """
 66        self.__phen_df = x
 67    
 68    @property
 69    def n_samples(self) -> int:
 70        """
 71        Retrieve `n_samples`.
 72
 73        Returns:
 74            int: The total number of samples.
 75        """
 76        return len(self.phen_df)
 77
 78    def copy(self):
 79        """
 80        Create and return a copy of the current `MultiPhenotypeObject` instance.
 81
 82        Returns:
 83            MultiPhenotypeObject: A new instance of the current object.
 84        """
 85        return copy.copy(self)
 86    
 87    def filter_samples(
 88            self, 
 89            samples: Optional[Union[str, Sequence[str], np.ndarray]] = None, 
 90            indexes: Optional[Union[int, Sequence[int], np.ndarray]] = None, 
 91            include: bool = True, 
 92            reorder: bool = False, 
 93            inplace: bool = False
 94        ) -> Optional['MultiPhenotypeObject']:
 95        """
 96        Filter samples in the `MultiPhenotypeObject` based on sample names or indexes.
 97
 98        This method allows you to include or exclude specific samples by their names,
 99        indexes, or both. When both samples and indexes are provided, the union of
100        the specified samples is used. Negative indexes are supported and follow NumPy's indexing 
101        conventions. Set `reorder=True` to match the ordering of the provided `samples` and/or
102        `indexes` lists when including.
103
104        Args:
105            samples (str or array_like of str, optional): 
106                 Names of the samples to include or exclude. Can be a single sample name or a
107                 sequence of sample names. Default is None.
108            indexes (int or array_like of int, optional):
109                Indexes of the samples to include or exclude. Can be a single index or a sequence
110                of indexes. Negative indexes are supported. Default is None.
111            include (bool, default=True): 
112                If True, includes only the specified samples. If False, excludes the specified
113                samples. Default is True.
114            inplace (bool, default=False): 
115                If True, modifies the object in place. If False, returns a new
116                `MultiPhenotypeObject` with the samples filtered. Default is False.
117
118        Returns:
119            Optional[MultiPhenotypeObject]: Returns a new MultiPhenotypeObject with the specified samples 
120            filtered if `inplace=False`. If `inplace=True`, modifies the object in place and returns None.
121        """
122        # Ensure at least one of samples or indexes is provided
123        if samples is None and indexes is None:
124            raise ValueError("At least one of 'samples' or 'indexes' must be provided.")
125
126        n_samples = self.n_samples
127
128        # Create mask based on sample names
129        if samples is not None:
130            samples = np.asarray(samples).ravel()
131            # Extract sample names from the DataFrame
132            sample_names = self.__phen_df.iloc[:, 0].values
133            # Create mask for samples belonging to specified names
134            mask_samples = np.isin(sample_names, samples)
135        else:
136            mask_samples = np.zeros(n_samples, dtype=bool)
137
138        # Create mask based on sample indexes
139        if indexes is not None:
140            indexes = np.asarray(indexes).ravel()
141            # Adjust negative indexes
142            indexes = np.mod(indexes, n_samples)
143            if np.any((indexes < 0) | (indexes >= n_samples)):
144                raise IndexError("One or more sample indexes are out of bounds.")
145            # Create mask for samples at specified indexes
146            mask_indexes = np.zeros(n_samples, dtype=bool)
147            mask_indexes[indexes] = True
148        else:
149            mask_indexes = np.zeros(n_samples, dtype=bool)
150
151        # Combine masks using logical OR (union of samples)
152        mask_combined = mask_samples | mask_indexes
153
154        if not include:
155            # Invert mask if excluding samples
156            mask_combined = ~mask_combined
157
158        # If requested, compute an ordering of selected rows that follows the provided lists
159        ordered_indices = None
160        if include and reorder:
161            sel_indices = np.where(mask_combined)[0]
162            sample_names = self.__phen_df.iloc[:, 0].values
163            ordered_list = []
164            added = np.zeros(n_samples, dtype=bool)
165
166            # Respect the order provided in `samples` (supports duplicate sample names)
167            if samples is not None:
168                for s in samples:
169                    matches = np.where(sample_names == s)[0]
170                    for idx in matches:
171                        if mask_combined[idx] and not added[idx]:
172                            ordered_list.append(int(idx))
173                            added[idx] = True
174
175            # Then respect the order in `indexes`
176            if indexes is not None:
177                adj_idx = np.mod(np.atleast_1d(indexes), n_samples)
178                for idx in adj_idx:
179                    if mask_combined[idx] and not added[idx]:
180                        ordered_list.append(int(idx))
181                        added[idx] = True
182
183            # Finally, append any remaining selected rows in their original order
184            for idx in sel_indices:
185                if not added[idx]:
186                    ordered_list.append(int(idx))
187
188            ordered_indices = np.asarray(ordered_list, dtype=int)
189
190        # Filter the phenotype DataFrame
191        if inplace:
192            if ordered_indices is not None:
193                self['phen_df'] = self['phen_df'].iloc[ordered_indices].reset_index(drop=True)
194            else:
195                self['phen_df'] = self['phen_df'][mask_combined].reset_index(drop=True)
196            return None
197        else:
198            phen_obj = self.copy()
199            if ordered_indices is not None:
200                phen_obj['phen_df'] = phen_obj['phen_df'].iloc[ordered_indices].reset_index(drop=True)
201            else:
202                phen_obj['phen_df'] = phen_obj['phen_df'][mask_combined].reset_index(drop=True)
203            return phen_obj

A class for multi-phenotype data.

This class serves as a container for phenotype data, allowing for operations such as filtering samples and accessing phenotype information. It uses a DataFrame to store the data, with the first column reserved for the sample identifiers.

MultiPhenotypeObject(phen_df: pandas.DataFrame) View Source

    def __init__(
        self,
        phen_df: pd.DataFrame
    ) -> None:
        """
        Store the phenotype table on the new instance.

        Args:
            phen_df (pd.DataFrame): 
                A Pandas DataFrame containing phenotype data, with the first column 
                representing sample identifiers.
        """
        # Stored by reference (no copy is made) in a name-mangled private attribute.
        self.__phen_df = phen_df

Arguments:

phen_df (pd.DataFrame): A Pandas DataFrame containing phenotype data, with the first column representing sample identifiers.

phen_df: pandas.DataFrame View Source

    @property
    def phen_df(self) -> pd.DataFrame:
        """
        Retrieve `phen_df`.

        Returns:
            pd.DataFrame: 
                A Pandas DataFrame containing phenotype data, with the first column 
                representing sample identifiers.
        """
        # The stored DataFrame itself is returned, not a copy.
        return self.__phen_df

Retrieve phen_df.

Returns:

pd.DataFrame: A Pandas DataFrame containing phenotype data, with the first column representing sample identifiers.

n_samples: int View Source

    @property
    def n_samples(self) -> int:
        """
        Retrieve `n_samples`.

        Returns:
            int: The total number of samples.
        """
        # Taken as the length of `phen_df` (its number of rows for a DataFrame).
        return len(self.phen_df)

Retrieve n_samples.

Returns:

int: The total number of samples.

def copy(self): View Source

    def copy(self):
        """
        Create and return a copy of the current `MultiPhenotypeObject` instance.

        The copy is produced with `copy.copy`, i.e. it is a shallow copy.
        NOTE(review): unless the class customizes copying, the new instance
        presumably references the same underlying DataFrame as the original -
        confirm before mutating the DataFrame in place.

        Returns:
            MultiPhenotypeObject: A new instance of the current object.
        """
        return copy.copy(self)

Create and return a copy of the current MultiPhenotypeObject instance.

Returns:

MultiPhenotypeObject: A new instance of the current object.

 87    def filter_samples(
 88            self, 
 89            samples: Optional[Union[str, Sequence[str], np.ndarray]] = None, 
 90            indexes: Optional[Union[int, Sequence[int], np.ndarray]] = None, 
 91            include: bool = True, 
 92            reorder: bool = False, 
 93            inplace: bool = False
 94        ) -> Optional['MultiPhenotypeObject']:
 95        """
 96        Filter samples in the `MultiPhenotypeObject` based on sample names or indexes.
 97
 98        This method allows you to include or exclude specific samples by their names,
 99        indexes, or both. When both samples and indexes are provided, the union of
100        the specified samples is used. Negative indexes are supported and follow NumPy's indexing 
101        conventions. Set `reorder=True` to match the ordering of the provided `samples` and/or
102        `indexes` lists when including.
103
104        Args:
105            samples (str or array_like of str, optional): 
106                 Names of the samples to include or exclude. Can be a single sample name or a
107                 sequence of sample names. Default is None.
108            indexes (int or array_like of int, optional):
109                Indexes of the samples to include or exclude. Can be a single index or a sequence
110                of indexes. Negative indexes are supported. Default is None.
111            include (bool, default=True): 
112                If True, includes only the specified samples. If False, excludes the specified
113                samples. Default is True.
114            inplace (bool, default=False): 
115                If True, modifies the object in place. If False, returns a new
116                `MultiPhenotypeObject` with the samples filtered. Default is False.
117
118        Returns:
119            Optional[MultiPhenotypeObject]: Returns a new MultiPhenotypeObject with the specified samples 
120            filtered if `inplace=False`. If `inplace=True`, modifies the object in place and returns None.
121        """
122        # Ensure at least one of samples or indexes is provided
123        if samples is None and indexes is None:
124            raise ValueError("At least one of 'samples' or 'indexes' must be provided.")
125
126        n_samples = self.n_samples
127
128        # Create mask based on sample names
129        if samples is not None:
130            samples = np.asarray(samples).ravel()
131            # Extract sample names from the DataFrame
132            sample_names = self.__phen_df.iloc[:, 0].values
133            # Create mask for samples belonging to specified names
134            mask_samples = np.isin(sample_names, samples)
135        else:
136            mask_samples = np.zeros(n_samples, dtype=bool)
137
138        # Create mask based on sample indexes
139        if indexes is not None:
140            indexes = np.asarray(indexes).ravel()
141            # Adjust negative indexes
142            indexes = np.mod(indexes, n_samples)
143            if np.any((indexes < 0) | (indexes >= n_samples)):
144                raise IndexError("One or more sample indexes are out of bounds.")
145            # Create mask for samples at specified indexes
146            mask_indexes = np.zeros(n_samples, dtype=bool)
147            mask_indexes[indexes] = True
148        else:
149            mask_indexes = np.zeros(n_samples, dtype=bool)
150
151        # Combine masks using logical OR (union of samples)
152        mask_combined = mask_samples | mask_indexes
153
154        if not include:
155            # Invert mask if excluding samples
156            mask_combined = ~mask_combined
157
158        # If requested, compute an ordering of selected rows that follows the provided lists
159        ordered_indices = None
160        if include and reorder:
161            sel_indices = np.where(mask_combined)[0]
162            sample_names = self.__phen_df.iloc[:, 0].values
163            ordered_list = []
164            added = np.zeros(n_samples, dtype=bool)
165
166            # Respect the order provided in `samples` (supports duplicate sample names)
167            if samples is not None:
168                for s in samples:
169                    matches = np.where(sample_names == s)[0]
170                    for idx in matches:
171                        if mask_combined[idx] and not added[idx]:
172                            ordered_list.append(int(idx))
173                            added[idx] = True
174
175            # Then respect the order in `indexes`
176            if indexes is not None:
177                adj_idx = np.mod(np.atleast_1d(indexes), n_samples)
178                for idx in adj_idx:
179                    if mask_combined[idx] and not added[idx]:
180                        ordered_list.append(int(idx))
181                        added[idx] = True
182
183            # Finally, append any remaining selected rows in their original order
184            for idx in sel_indices:
185                if not added[idx]:
186                    ordered_list.append(int(idx))
187
188            ordered_indices = np.asarray(ordered_list, dtype=int)
189
190        # Filter the phenotype DataFrame
191        if inplace:
192            if ordered_indices is not None:
193                self['phen_df'] = self['phen_df'].iloc[ordered_indices].reset_index(drop=True)
194            else:
195                self['phen_df'] = self['phen_df'][mask_combined].reset_index(drop=True)
196            return None
197        else:
198            phen_obj = self.copy()
199            if ordered_indices is not None:
200                phen_obj['phen_df'] = phen_obj['phen_df'].iloc[ordered_indices].reset_index(drop=True)
201            else:
202                phen_obj['phen_df'] = phen_obj['phen_df'][mask_combined].reset_index(drop=True)
203            return phen_obj

Filter samples in the MultiPhenotypeObject based on sample names or indexes.

This method allows you to include or exclude specific samples by their names, indexes, or both. When both samples and indexes are provided, the union of the specified samples is used. Negative indexes are supported and follow NumPy's indexing conventions. Set reorder=True to match the ordering of the provided samples and/or indexes lists when including.

Arguments:

samples (str or array_like of str, optional): Names of the samples to include or exclude. Can be a single sample name or a sequence of sample names. Default is None.
indexes (int or array_like of int, optional): Indexes of the samples to include or exclude. Can be a single index or a sequence of indexes. Negative indexes are supported. Default is None.
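include (bool, default=True): If True, includes only the specified samples. If False, excludes the specified samples. Default is True.
reorder (bool, default=False): Only used when include=True. If True, the selected rows follow the order given in samples first, then the order given in indexes, followed by any remaining selected rows in their original order. Default is False.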
inplace (bool, default=False): If True, modifies the object in place. If False, returns a new MultiPhenotypeObject with the samples filtered. Default is False.

Returns:

Optional[MultiPhenotypeObject]: Returns a new MultiPhenotypeObject with the specified samples filtered if inplace=False. If inplace=True, modifies the object in place and returns None.

class MultiPhenReader(snputils.phenotype.io.read.base.PhenotypeBaseReader): View Source

 17class MultiPhenReader(PhenotypeBaseReader):
 18    """
 19    Reader for multi-phenotype data from file (.xlsx, .csv, .tsv, .txt, .phe, .pheno, .map, .smap, .phen),
 20    constructing a `MultiPhenotypeObject`.
 21    """
 22    def __init__(self, file: Union[str, Path]) -> None:
 23        """
 24        Args:
 25            file (str or pathlib.Path):
 26                Path to the file containing phenotype data. Accepted formats: .xlsx, .csv, .tsv, .txt, .phe, .pheno, .map, .smap, .phen.
 27        """
 28        self.__file = file
 29
 30    @property
 31    def file(self) -> Path:
 32        """
 33        Retrieve `file`.
 34
 35        Returns:
 36            pathlib.Path:
 37                Path to the file containing phenotype data. Accepted formats: .xlsx, .csv, .tsv, .txt, .phe, .pheno, .map, .smap, .phen.
 38        """
 39        return self.__file
 40
 41    def read(
 42            self,
 43            samples_idx: int = 0,
 44            phen_names: Optional[List] = None,
 45            sep: str = ',',
 46            header: int = 0,
 47            drop: bool = False
 48        ) -> 'MultiPhenotypeObject':
 49        """
 50        Read data from `file` and construct a `MultiPhenotypeObject`.
 51
 52        Args:
 53            samples_idx (int, default=0): Index of the column containing sample identifiers.
 54                Default is 0, assuming the first column contains sample identifiers.
 55            phen_names (list of str, optional): List of phenotype column names. If provided,
 56                these columns will be renamed to the specified names.
 57            sep (str, default=','): The delimiter for separating values in `.csv`, `.tsv`,
 58                `.txt`, `.phe`, `.pheno`, or `.map` files. Default is ','; use `sep=r'\\s+'` for whitespace-delimited.
 59            header (int, default=0): Row index to use as the column names. By default,
 60                uses the first row (`header=0`). Set to `None` if column names are provided
 61                explicitly.
 62            drop (bool, default=False): If True, removes columns not listed in `phen_names`
 63                (except the samples column).
 64
 65        Returns:
 66            MultiPhenotypeObject:
 67                A multi-phenotype object instance.
 68        """
 69        file_extension = os.path.splitext(self.file)[1]
 70
 71        log.info(f"Reading '{file_extension}' file from '{self.file}'...")
 72
 73        if file_extension == '.xlsx':
 74            phen_df = pd.read_excel(self.file, header=0, index_col=None)
 75        elif file_extension == '.csv':
 76            phen_df = pd.read_csv(self.file, sep=sep, header=header)
 77        elif file_extension in ['.map', '.smap']:
 78            phen_df = pd.read_csv(self.file, sep=sep, header=header)
 79        elif file_extension == '.tsv':
 80            phen_df = pd.read_csv(self.file, sep='\t')
 81        elif file_extension in ['.txt', '.phe', '.pheno']:
 82            phen_df = pd.read_csv(self.file, sep=r'\s+', header=header)
 83        elif file_extension == '.phen':
 84            with open(self.file, 'r') as f:
 85                contents = f.readlines()
 86            phen_dict = {line.split()[0]: line.split()[1].strip() for line in contents[1:]}
 87            phen_df = pd.DataFrame({'samples': list(phen_dict.keys()), 'phenotype': list(phen_dict.values())})
 88        else:
 89            raise ValueError(
 90                f"Unsupported file extension {file_extension}. Supported extensions: {SUPPORTED_EXTENSIONS}."
 91            )
 92
 93        phen_df.rename(columns={phen_df.columns[samples_idx]: 'samples'}, inplace=True)
 94
 95        if samples_idx != 0:
 96            cols = ['samples'] + [col for col in phen_df.columns if col != 'samples']
 97            phen_df = phen_df[cols]
 98
 99        if phen_names is not None:
100            if drop:
101                non_phen_columns = list(set(phen_df.columns) - set(['samples']+phen_names))
102                phen_df = phen_df.drop(non_phen_columns, axis=1)
103
104            phenotype_col_count = phen_df.shape[1] - 1
105            if phenotype_col_count == len(phen_names):
106                phen_df.columns.values[1:] = phen_names
107            else:
108                raise ValueError(f"Mismatch between number of phenotype columns ({phenotype_col_count}) "
109                                 f"and length of `phen_names` ({len(phen_names)}).")
110
111        return MultiPhenotypeObject(phen_df=phen_df)

Reader for multi-phenotype data from file (.xlsx, .csv, .tsv, .txt, .phe, .pheno, .map, .smap, .phen), constructing a MultiPhenotypeObject.

MultiPhenReader(file: str | pathlib.Path) View Source

    def __init__(self, file: Union[str, Path]) -> None:
        """
        Store the path of the phenotype file to read.

        Args:
            file (str or pathlib.Path):
                Path to the file containing phenotype data. Accepted formats: .xlsx, .csv, .tsv, .txt, .phe, .pheno, .map, .smap, .phen.
        """
        # Stored as given (no conversion to pathlib.Path and no existence check here).
        self.__file = file

Arguments:

file (str or pathlib.Path): Path to the file containing phenotype data. Accepted formats: .xlsx, .csv, .tsv, .txt, .phe, .pheno, .map, .smap, .phen.

file: pathlib.Path View Source

    @property
    def file(self) -> Path:
        """
        Retrieve `file`.

        Returns:
            pathlib.Path:
                Path to the file containing phenotype data. Accepted formats: .xlsx, .csv, .tsv, .txt, .phe, .pheno, .map, .smap, .phen.
                The stored private attribute is returned unchanged.
        """
        return self.__file

Retrieve file.

Returns:

pathlib.Path: Path to the file containing phenotype data. Accepted formats: .xlsx, .csv, .tsv, .txt, .phe, .pheno, .map, .smap, .phen.

def read( self, samples_idx: int = 0, phen_names: List | None = None, sep: str = ',', header: int = 0, drop: bool = False) -> MultiPhenotypeObject: View Source

 41    def read(
 42            self,
 43            samples_idx: int = 0,
 44            phen_names: Optional[List] = None,
 45            sep: str = ',',
 46            header: int = 0,
 47            drop: bool = False
 48        ) -> 'MultiPhenotypeObject':
 49        """
 50        Read data from `file` and construct a `MultiPhenotypeObject`.
 51
 52        Args:
 53            samples_idx (int, default=0): Index of the column containing sample identifiers.
 54                Default is 0, assuming the first column contains sample identifiers.
 55            phen_names (list of str, optional): List of phenotype column names. If provided,
 56                these columns will be renamed to the specified names.
 57            sep (str, default=','): The delimiter for separating values in `.csv`, `.tsv`,
 58                `.txt`, `.phe`, `.pheno`, or `.map` files. Default is ','; use `sep=r'\\s+'` for whitespace-delimited.
 59            header (int, default=0): Row index to use as the column names. By default,
 60                uses the first row (`header=0`). Set to `None` if column names are provided
 61                explicitly.
 62            drop (bool, default=False): If True, removes columns not listed in `phen_names`
 63                (except the samples column).
 64
 65        Returns:
 66            MultiPhenotypeObject:
 67                A multi-phenotype object instance.
 68        """
 69        file_extension = os.path.splitext(self.file)[1]
 70
 71        log.info(f"Reading '{file_extension}' file from '{self.file}'...")
 72
 73        if file_extension == '.xlsx':
 74            phen_df = pd.read_excel(self.file, header=0, index_col=None)
 75        elif file_extension == '.csv':
 76            phen_df = pd.read_csv(self.file, sep=sep, header=header)
 77        elif file_extension in ['.map', '.smap']:
 78            phen_df = pd.read_csv(self.file, sep=sep, header=header)
 79        elif file_extension == '.tsv':
 80            phen_df = pd.read_csv(self.file, sep='\t')
 81        elif file_extension in ['.txt', '.phe', '.pheno']:
 82            phen_df = pd.read_csv(self.file, sep=r'\s+', header=header)
 83        elif file_extension == '.phen':
 84            with open(self.file, 'r') as f:
 85                contents = f.readlines()
 86            phen_dict = {line.split()[0]: line.split()[1].strip() for line in contents[1:]}
 87            phen_df = pd.DataFrame({'samples': list(phen_dict.keys()), 'phenotype': list(phen_dict.values())})
 88        else:
 89            raise ValueError(
 90                f"Unsupported file extension {file_extension}. Supported extensions: {SUPPORTED_EXTENSIONS}."
 91            )
 92
 93        phen_df.rename(columns={phen_df.columns[samples_idx]: 'samples'}, inplace=True)
 94
 95        if samples_idx != 0:
 96            cols = ['samples'] + [col for col in phen_df.columns if col != 'samples']
 97            phen_df = phen_df[cols]
 98
 99        if phen_names is not None:
100            if drop:
101                non_phen_columns = list(set(phen_df.columns) - set(['samples']+phen_names))
102                phen_df = phen_df.drop(non_phen_columns, axis=1)
103
104            phenotype_col_count = phen_df.shape[1] - 1
105            if phenotype_col_count == len(phen_names):
106                phen_df.columns.values[1:] = phen_names
107            else:
108                raise ValueError(f"Mismatch between number of phenotype columns ({phenotype_col_count}) "
109                                 f"and length of `phen_names` ({len(phen_names)}).")
110
111        return MultiPhenotypeObject(phen_df=phen_df)

Read data from file and construct a MultiPhenotypeObject.

Arguments:

samples_idx (int, default=0): Index of the column containing sample identifiers. Default is 0, assuming the first column contains sample identifiers.
phen_names (list of str, optional): List of phenotype column names. If provided, these columns will be renamed to the specified names.
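sep (str, default=','): The delimiter used for .csv, .map, and .smap files. Default is ','; use sep=r'\s+' for whitespace-delimited. Files with a .tsv extension are always read as tab-delimited, and .txt, .phe, and .pheno files are always read as whitespace-delimited, regardless of sep.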
header (int, default=0): Row index to use as the column names. By default, uses the first row (header=0). Set to None if column names are provided explicitly.
drop (bool, default=False): If True, removes columns not listed in phen_names (except the samples column).

Returns:

MultiPhenotypeObject: A multi-phenotype object instance.

class PhenotypeReader(snputils.phenotype.io.read.base.PhenotypeBaseReader): View Source

 11class PhenotypeReader(PhenotypeBaseReader):
 12    """
 13    Reader for single-trait phenotype files (any extension; common: .txt, .phe, .pheno).
 14
 15    Expected format (headered, whitespace-delimited):
 16      - Must include `IID` (optionally preceded by `FID`)
 17      - First phenotype column after `IID` is used by default
 18    """
 19
 20    def __init__(self, file: Union[str, Path]) -> None:
 21        super().__init__(file)
 22
 23    @property
 24    def file(self) -> Path:
 25        return Path(self._file)
 26
 27    @staticmethod
 28    def _has_header_with_iid(file_path: Path) -> bool:
 29        with open(file_path, "r", encoding="utf-8") as handle:
 30            for raw_line in handle:
 31                line = raw_line.strip()
 32                if not line:
 33                    continue
 34                tokens = line.split()
 35                return any(token.lstrip("#").upper() == "IID" for token in tokens)
 36        raise ValueError("Empty phenotype file.")
 37
 38    @staticmethod
 39    def _resolve_column(columns, normalized_columns, requested: str) -> Optional[str]:
 40        requested_norm = str(requested).lstrip("#").upper()
 41        for col, col_norm in zip(columns, normalized_columns):
 42            if str(col) == str(requested) or col_norm == requested_norm:
 43                return str(col)
 44        return None
 45
 46    def read(
 47        self,
 48        phenotype_col: Optional[str] = None,
 49        quantitative: Optional[bool] = None,
 50    ) -> PhenotypeObject:
 51        file_path = self.file
 52        if not file_path.exists():
 53            raise FileNotFoundError(f"Phenotype file not found: '{file_path}'")
 54
 55        has_iid_header = self._has_header_with_iid(file_path)
 56        if has_iid_header:
 57            phen_df = pd.read_csv(file_path, sep=r"\s+", dtype=str)
 58        else:
 59            warnings.warn(
 60                (
 61                    "Phenotype file has no header/IID column. Legacy 3-column parsing "
 62                    "(FID IID PHENO) is deprecated; please switch to a headered format."
 63                ),
 64                UserWarning,
 65                stacklevel=2,
 66            )
 67            legacy = pd.read_csv(file_path, header=None, sep=r"\s+", dtype=str)
 68            if legacy.shape[1] < 3:
 69                raise ValueError(
 70                    "Legacy phenotype parsing expects at least 3 columns: FID IID PHENO."
 71                )
 72            phen_df = legacy.iloc[:, :3].copy()
 73            phen_df.columns = ["FID", "IID", "PHENO"]
 74
 75        if phen_df.empty:
 76            raise ValueError("Empty phenotype file.")
 77
 78        columns = [str(col) for col in phen_df.columns]
 79        normalized_columns = [col.lstrip("#").upper() for col in columns]
 80        if "IID" not in normalized_columns:
 81            raise ValueError("Phenotype file must include an IID column in the header.")
 82        iid_col = columns[normalized_columns.index("IID")]
 83
 84        iid_series = phen_df[iid_col].astype(str).str.strip()
 85        if iid_series.eq("").any():
 86            raise ValueError("Phenotype IID column contains empty values.")
 87        if iid_series.duplicated().any():
 88            raise ValueError("Phenotype IID values must be unique.")
 89
 90        if phenotype_col is not None:
 91            resolved = self._resolve_column(columns, normalized_columns, phenotype_col)
 92            if resolved is None:
 93                raise ValueError(
 94                    f"Phenotype column '{phenotype_col}' not found in header: {columns}"
 95                )
 96            target_col = resolved
 97        else:
 98            iid_idx = normalized_columns.index("IID")
 99            if iid_idx + 1 >= len(columns):
100                raise ValueError(
101                    "Phenotype file must include at least one phenotype column after IID."
102                )
103            target_col = columns[iid_idx + 1]
104
105        values = pd.to_numeric(phen_df[target_col], errors="coerce")
106        if values.isna().any():
107            bad_examples = phen_df.loc[values.isna(), target_col].astype(str).head(5).tolist()
108            raise ValueError(
109                f"Phenotype column '{target_col}' contains non-numeric or missing values: "
110                f"{bad_examples}"
111            )
112
113        phenotype_name = str(target_col).lstrip("#")
114        return PhenotypeObject(
115            samples=iid_series.tolist(),
116            values=values.to_numpy(),
117            phenotype_name=phenotype_name,
118            quantitative=quantitative,
119        )

Reader for single-trait phenotype files (any extension; common: .txt, .phe, .pheno).

Expected format (headered, whitespace-delimited):

Must include IID (optionally preceded by FID)
First phenotype column after IID is used by default

PhenotypeReader(file: str | pathlib.Path) View Source

    def __init__(self, file: Union[str, Path]) -> None:
        """
        Args:
            file (str or pathlib.Path): Path to the phenotype file; forwarded
                unchanged to the base-class constructor.
        """
        super().__init__(file)

file: pathlib.Path View Source

    @property
    def file(self) -> Path:
        """
        Retrieve `file`.

        Returns:
            pathlib.Path: `self._file` wrapped in a `pathlib.Path`.
            NOTE(review): `_file` is presumably set by the base-class
            constructor - confirm.
        """
        return Path(self._file)

Retrieve file.

Returns:

pathlib.Path: Path to the file containing phenotype data.

def read( self, phenotype_col: str | None = None, quantitative: bool | None = None) -> PhenotypeObject: View Source

 46    def read(
 47        self,
 48        phenotype_col: Optional[str] = None,
 49        quantitative: Optional[bool] = None,
 50    ) -> PhenotypeObject:
 51        file_path = self.file
 52        if not file_path.exists():
 53            raise FileNotFoundError(f"Phenotype file not found: '{file_path}'")
 54
 55        has_iid_header = self._has_header_with_iid(file_path)
 56        if has_iid_header:
 57            phen_df = pd.read_csv(file_path, sep=r"\s+", dtype=str)
 58        else:
 59            warnings.warn(
 60                (
 61                    "Phenotype file has no header/IID column. Legacy 3-column parsing "
 62                    "(FID IID PHENO) is deprecated; please switch to a headered format."
 63                ),
 64                UserWarning,
 65                stacklevel=2,
 66            )
 67            legacy = pd.read_csv(file_path, header=None, sep=r"\s+", dtype=str)
 68            if legacy.shape[1] < 3:
 69                raise ValueError(
 70                    "Legacy phenotype parsing expects at least 3 columns: FID IID PHENO."
 71                )
 72            phen_df = legacy.iloc[:, :3].copy()
 73            phen_df.columns = ["FID", "IID", "PHENO"]
 74
 75        if phen_df.empty:
 76            raise ValueError("Empty phenotype file.")
 77
 78        columns = [str(col) for col in phen_df.columns]
 79        normalized_columns = [col.lstrip("#").upper() for col in columns]
 80        if "IID" not in normalized_columns:
 81            raise ValueError("Phenotype file must include an IID column in the header.")
 82        iid_col = columns[normalized_columns.index("IID")]
 83
 84        iid_series = phen_df[iid_col].astype(str).str.strip()
 85        if iid_series.eq("").any():
 86            raise ValueError("Phenotype IID column contains empty values.")
 87        if iid_series.duplicated().any():
 88            raise ValueError("Phenotype IID values must be unique.")
 89
 90        if phenotype_col is not None:
 91            resolved = self._resolve_column(columns, normalized_columns, phenotype_col)
 92            if resolved is None:
 93                raise ValueError(
 94                    f"Phenotype column '{phenotype_col}' not found in header: {columns}"
 95                )
 96            target_col = resolved
 97        else:
 98            iid_idx = normalized_columns.index("IID")
 99            if iid_idx + 1 >= len(columns):
100                raise ValueError(
101                    "Phenotype file must include at least one phenotype column after IID."
102                )
103            target_col = columns[iid_idx + 1]
104
105        values = pd.to_numeric(phen_df[target_col], errors="coerce")
106        if values.isna().any():
107            bad_examples = phen_df.loc[values.isna(), target_col].astype(str).head(5).tolist()
108            raise ValueError(
109                f"Phenotype column '{target_col}' contains non-numeric or missing values: "
110                f"{bad_examples}"
111            )
112
113        phenotype_name = str(target_col).lstrip("#")
114        return PhenotypeObject(
115            samples=iid_series.tolist(),
116            values=values.to_numpy(),
117            phenotype_name=phenotype_name,
118            quantitative=quantitative,
119        )

Abstract method to read data from the provided file.

Subclasses must implement this method to read and parse the data. The implementation should construct an instance of snputils.phenotype.genobj.MultiPhenotypeObject or snputils.phenotype.genobj.PhenotypeObject based on the read data.

 34def load_dataset(
 35        name: str,
 36        chromosomes: Union[List[str], List[int], str, int],
 37        variants_ids: Optional[List[str]] = None,
 38        sample_ids: Optional[List[str]] = None,
 39        verbose: bool = True,
 40        **read_kwargs
 41) -> SNPObject:
 42    """
 43    Load a genome dataset.
 44
 45    Args:
 46        name (str): Name of the dataset to load. Call `available_datasets_list()` to get the list of available datasets.
 47        chromosomes (List[str] | List[int] | str | int): Chromosomes to load.
 48        variants_ids (List[str]): List of variant IDs to load.
 49        sample_ids (List[str]): List of sample IDs to load.
 50        verbose (bool): Whether to show progress.
 51        **read_kwargs: Keyword arguments to pass to `PGENReader.read()`.
 52
 53    Returns:
 54        SNPObject: SNPObject containing the loaded dataset.
 55    """
 56    if isinstance(chromosomes, (str, int)):
 57        chromosomes = [chromosomes]
 58    chromosomes = [str(chr).lower().replace("chr", "") for chr in chromosomes]
 59
 60    if variants_ids is not None:
 61        variants_ids_txt = tempfile.NamedTemporaryFile(mode='w')
 62        variants_ids_txt.write("\n".join(variants_ids))
 63        variants_ids_txt.flush()
 64
 65    if sample_ids is not None:
 66        sample_ids_txt = tempfile.NamedTemporaryFile(mode='w')
 67        sample_ids_txt.write("\n".join(sample_ids))
 68        sample_ids_txt.flush()
 69
 70    merge_list_txt = tempfile.NamedTemporaryFile(mode='w')
 71
 72    data_home = get_data_home()
 73
 74    if name == "1kgp":
 75        data_path = data_home / name
 76        data_path.mkdir(parents=True, exist_ok=True)
 77        for chr in chromosomes:
 78            chr_path = data_path / chr_urls[name][chr]
 79            if not Path(chr_path).exists():
 80                log.info(f"Downloading chromosome {chr}...")
 81                download_url(f"{base_urls[name]}/{chr_urls[name][chr]}", chr_path, show_progress=verbose)
 82            else:
 83                log.info(f"Chromosome {chr} already exists. Skipping download.")
 84
 85            # Filter and convert to PGEN
 86            log.info(f"Processing chromosome {chr}...")
 87            out_file = chr_urls[name][chr].replace('.vcf.gz', '')
 88            execute_plink_cmd(
 89                ["--vcf", f"{chr_urls[name][chr]}"]
 90                + (["--keep", sample_ids_txt.name] if sample_ids is not None else [])
 91                + (["--extract", variants_ids_txt.name] if variants_ids is not None else [])
 92                + [
 93                    "--set-missing-var-ids", "@:#",
 94                    "--make-pgen",
 95                    "--out", out_file,
 96                ], cwd=data_path)
 97            merge_list_txt.write(f"{out_file}\n")
 98
 99        if len(chromosomes) > 1:
100            # Merge the PGEN files into single PGEN fileset
101            log.info("Merging PGEN files...")
102            merge_list_txt.flush()
103            print(f"Merge list file contents: {open(merge_list_txt.name, 'r').read()}")
104            execute_plink_cmd(["--pmerge-list", merge_list_txt.name, "--make-pgen", "--out", "1kgp"],
105                              cwd=data_path)
106        else:
107            # Rename the single PGEN file
108            for ext in ["pgen", "psam", "pvar"]:
109                Path(data_path / f"{out_file}.{ext}").rename(data_path / f"1kgp.{ext}")
110
111        # Read PGEN fileset with PGENReader into SNPObject
112        log.info("Reading PGEN fileset...")
113        snpobj = PGENReader(data_path / "1kgp").read(**read_kwargs)
114    else:
115        raise NotImplementedError(f"Dataset {name} not implemented.")
116
117    if variants_ids is not None:
118        variants_ids_txt.close()
119    if sample_ids is not None:
120        sample_ids_txt.close()
121    merge_list_txt.close()
122
123    return snpobj

Load a genome dataset.

Arguments:

name (str): Name of the dataset to load. Call available_datasets_list() to get the list of available datasets.
chromosomes (List[str] | List[int] | str | int): Chromosomes to load.
variants_ids (List[str]): List of variant IDs to load.
sample_ids (List[str]): List of sample IDs to load.
verbose (bool): Whether to show progress.
**read_kwargs: Keyword arguments to pass to PGENReader.read().

Returns:

SNPObject: SNPObject containing the loaded dataset.