# Source code for snputils.phenotype.genobj.multi_phenobj

import copy
import warnings
import numpy as np
import pandas as pd
from typing import Union, Sequence, Optional

from snputils._utils.printing import array_shape, format_repr



# [docs]
class MultiPhenotypeObject():
    """
    A class for multi-phenotype data.

    This class serves as a container for phenotype data, allowing for
    operations such as filtering samples and accessing phenotype information.
    It uses a DataFrame to store the data, with the first column reserved for the sample identifiers.
    """
    def __init__(
        self,
        phen_df: pd.DataFrame
    ) -> None:
        """
        Args:
            phen_df (pd.DataFrame): 
                A Pandas DataFrame containing phenotype data, with the first column 
                representing sample identifiers.
        """
        self.__phen_df = phen_df

    def __getitem__(self, key):
        """
        To access an attribute of the class using the square bracket notation,
        similar to a dictionary.

        Raises:
            KeyError: If `key` does not name an attribute of the object.
        """
        try:
            return getattr(self, key)
        except (AttributeError, TypeError) as exc:
            # AttributeError: unknown attribute; TypeError: non-string key.
            # Anything else (e.g. an error raised inside a property, or a
            # KeyboardInterrupt) is deliberately left to propagate.
            raise KeyError(f'Invalid key: {key}') from exc

    def __setitem__(self, key, value):
        """
        To set an attribute of the class using the square bracket notation,
        similar to a dictionary.

        Raises:
            KeyError: If the attribute cannot be set (for example a read-only
                property such as `n_samples`, or a non-string key).
        """
        try:
            setattr(self, key, value)
        except (AttributeError, TypeError) as exc:
            raise KeyError(f'Invalid key: {key}') from exc

    def __repr__(self) -> str:
        # A DataFrame without columns has no sample-identifier column to report.
        sample_column = None if self.__phen_df.shape[1] == 0 else self.__phen_df.columns[0]
        return format_repr(
            self,
            shape=self.shape,
            n_samples=self.n_samples,
            n_phenotypes=self.n_phenotypes,
            sample_column=sample_column,
        )

    def __str__(self) -> str:
        return self.__repr__()

    @property
    def phen_df(self) -> pd.DataFrame:
        """
        Retrieve `phen_df`.

        Returns:
            pd.DataFrame: 
                A Pandas DataFrame containing phenotype data, with the first column 
                representing sample identifiers.
        """
        return self.__phen_df

    @phen_df.setter
    def phen_df(self, x: pd.DataFrame):
        """
        Update `phen_df`.
        """
        self.__phen_df = x

    @property
    def n_samples(self) -> int:
        """
        Retrieve `n_samples`.

        Returns:
            int: The total number of samples.
        """
        return len(self.phen_df)

    @property
    def n_phenotypes(self) -> int:
        """
        Retrieve `n_phenotypes`.

        Returns:
            int: Number of phenotype columns, excluding the sample identifier column.
        """
        return max(0, self.phen_df.shape[1] - 1)

    @property
    def shape(self) -> tuple[int, int]:
        """
        Retrieve the shape of the phenotype DataFrame.

        Returns:
            tuple[int, int]: The value reported by `array_shape` for the
            DataFrame, or `(n_samples, n_phenotypes + 1)` when `array_shape`
            returns None.
        """
        phen_shape = array_shape(self.__phen_df)
        if phen_shape is None:
            return (self.n_samples, self.n_phenotypes + 1)
        return phen_shape

    def copy(self):
        """
        Create and return a copy of the current `MultiPhenotypeObject` instance.

        The copy is shallow (`copy.copy`): the new object references the same
        underlying DataFrame until its `phen_df` is reassigned.

        Returns:
            MultiPhenotypeObject: A new instance of the current object.
        """
        return copy.copy(self)

    def filter_samples(
            self, 
            samples: Optional[Union[str, Sequence[str], np.ndarray]] = None, 
            indexes: Optional[Union[int, Sequence[int], np.ndarray]] = None, 
            include: bool = True, 
            reorder: bool = False, 
            inplace: bool = False
        ) -> Optional['MultiPhenotypeObject']:
        """
        Filter samples in the `MultiPhenotypeObject` based on sample names or indexes.

        This method allows you to include or exclude specific samples by their names,
        indexes, or both. When both samples and indexes are provided, the union of
        the specified samples is used. Negative indexes are supported and follow NumPy's indexing 
        conventions. Set `reorder=True` to match the ordering of the provided `samples` and/or
        `indexes` lists when including.

        Args:
            samples (str or array_like of str, optional): 
                 Names of the samples to include or exclude. Can be a single sample name or a
                 sequence of sample names. Default is None.
            indexes (int or array_like of int, optional):
                Indexes of the samples to include or exclude. Can be a single index or a sequence
                of indexes. Negative indexes are supported. An empty sequence selects no rows.
                Default is None.
            include (bool, default=True): 
                If True, includes only the specified samples. If False, excludes the specified
                samples. Default is True.
            reorder (bool, default=False):
                Only used when `include=True`. If True, the selected rows are ordered as follows:
                first the rows matching `samples` (in the order given), then the rows referenced
                by `indexes` (in the order given), then any remaining selected rows in their
                original order. Each row appears at most once. If False, the original row order
                is kept. Default is False.
            inplace (bool, default=False): 
                If True, modifies the object in place. If False, returns a new
                `MultiPhenotypeObject` with the samples filtered. Default is False.

        Returns:
            Optional[MultiPhenotypeObject]: Returns a new MultiPhenotypeObject with the specified samples 
            filtered if `inplace=False`. If `inplace=True`, modifies the object in place and returns None.

        Raises:
            ValueError: If neither `samples` nor `indexes` is provided.
            IndexError: If any index lies outside `[-n_samples, n_samples)`.
        """
        # Ensure at least one of samples or indexes is provided
        if samples is None and indexes is None:
            raise ValueError("At least one of 'samples' or 'indexes' must be provided.")

        n_samples = self.n_samples
        mask_combined = np.zeros(n_samples, dtype=bool)
        sample_names = None

        # Mark rows whose identifier (first column) is among the requested names
        if samples is not None:
            samples = np.asarray(samples).ravel()
            sample_names = self.__phen_df.iloc[:, 0].values
            mask_combined |= np.isin(sample_names, samples)

        # Mark rows at the requested positions
        if indexes is not None:
            indexes = np.asarray(indexes).ravel()
            if indexes.size == 0:
                # np.asarray([]) is float64, which NumPy rejects as an index.
                indexes = indexes.astype(int)
            # Validate BEFORE normalising: wrapping with a modulo first would
            # silently map out-of-range values onto valid rows.
            if np.any((indexes < -n_samples) | (indexes >= n_samples)):
                raise IndexError("One or more sample indexes are out of bounds.")
            # Translate negative indexes to their non-negative equivalents
            indexes = np.where(indexes < 0, indexes + n_samples, indexes)
            mask_combined[indexes] = True

        if not include:
            # Invert mask if excluding samples
            mask_combined = ~mask_combined

        if include and reorder:
            # Build the row order following the caller-provided lists
            ordered_list = []
            added = np.zeros(n_samples, dtype=bool)

            # Respect the order provided in `samples` (supports duplicate sample names)
            if samples is not None:
                for name in samples:
                    for idx in np.flatnonzero(sample_names == name):
                        if mask_combined[idx] and not added[idx]:
                            ordered_list.append(int(idx))
                            added[idx] = True

            # Then respect the order in `indexes` (already normalised above)
            if indexes is not None:
                for idx in indexes:
                    if mask_combined[idx] and not added[idx]:
                        ordered_list.append(int(idx))
                        added[idx] = True

            # Finally, append any remaining selected rows in their original order
            ordered_list.extend(
                int(idx) for idx in np.flatnonzero(mask_combined) if not added[idx]
            )
            row_positions = np.asarray(ordered_list, dtype=int)
        else:
            row_positions = np.flatnonzero(mask_combined)

        # Select the rows positionally and renumber the index from zero
        filtered_df = self.phen_df.iloc[row_positions].reset_index(drop=True)

        if inplace:
            self.phen_df = filtered_df
            return None

        phen_obj = self.copy()
        phen_obj.phen_df = filtered_df
        return phen_obj