# Source code for openasce.inference.tree.dataset

#    Copyright 2023 AntGroup CO., Ltd.
#
#    Licensed under the Apache License, Version 2.0 (the "License");
#    you may not use this file except in compliance with the License.
#    You may obtain a copy of the License at
#
#        http://www.apache.org/licenses/LICENSE-2.0
#
#    Unless required by applicable law or agreed to in writing, software
#    distributed under the License is distributed on an "AS IS" BASIS,
#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

import numpy as np
import pandas as pd
from pyhocon import ConfigFactory

from .reflect_utils import get_class
from .utils import to_row_major, logger


class Dataset(object):
    """Abstract interface shared by the dataset implementations.

    The base class provides ``__len__``, a logging summary and the
    ``feature_columns`` lookup on top of three properties (``features``,
    ``targets``, ``treatment``) that raise ``NotImplementedError`` here and
    must be supplied by subclasses.
    """

    def __init__(self):
        pass

    def __len__(self):
        """Return the number of rows, i.e. ``self.features.shape[0]``."""
        return self.features.shape[0]

    @staticmethod
    def new_instance(conf):
        """
        Build a dataset from a configuration object.

        The class name is read from the ``type`` entry of the ``dataset``
        section of ``conf`` (or of ``conf`` itself when there is no such
        section), defaulting to ``'dataset.CSVDataset'``. The name is resolved
        with ``get_class`` and construction is delegated to that class's own
        ``new_instance``.

        Arguments:
            conf: configuration exposing a dict-like ``get(key, default)``.

        Returns:
            Whatever the resolved class's ``new_instance(conf)`` returns.
        """
        data_conf = conf.get('dataset', conf)
        cls_name = data_conf.get('type', 'dataset.CSVDataset')
        # The full `conf` (not only the dataset section) is handed on.
        # NOTE(review): a resolved class that does not override `new_instance`
        # would land back in this method - confirm every target overrides it.
        return get_class(cls_name).new_instance(conf)

    def read(self, filename):
        """Placeholder for loading data from ``filename``; does nothing here."""
        pass

    def sub_dataset(self, index=None):
        """
        Abstract interface of sub-sampling.

        Arguments:
            index (optional): selector of the samples to keep. Defaults to None.

        Raises:
            NotImplementedError: always; subclasses provide the implementation.
        """
        raise NotImplementedError

    def description(self, detail: bool = False) -> None:
        """
        Log a short summary of the dataset.

        Four lines are logged: the number of instances, the number of
        features, the second dimension of ``targets`` and the number of
        distinct treatment values.

        Arguments:
            detail (bool, optional): currently unused. Defaults to False.
        """
        n_ins, n_feat = self.features.shape
        targets = self.targets
        # A 1-D target vector has no second axis; report it as length 1
        # instead of failing on `shape[1]`.
        n_y_len = targets.shape[1] if len(targets.shape) > 1 else 1
        # calculate treatment distinct count
        treats = np.unique(self.treatment)
        logger.info(f'#inst: {n_ins}')
        logger.info(f'#feat: {n_feat}')
        logger.info(f'#time serise length: {n_y_len}')
        logger.info(f'#treatments : {len(treats)}')

    @property
    def targets(self):
        """Outcome data; must be provided by subclasses."""
        raise NotImplementedError

    @property
    def features(self):
        """Feature data; must be provided by subclasses."""
        raise NotImplementedError

    @property
    def treatment(self):
        """Treatment data; must be provided by subclasses."""
        raise NotImplementedError

    @property
    def feature_columns(self):
        """
        Names of the feature columns.

        Returns the ``used_features`` attribute when the instance has one,
        otherwise the column index of ``features`` when that is a
        ``pandas.DataFrame``.

        Raises:
            RuntimeError: when neither source of column names is available.
        """
        if hasattr(self, 'used_features'):
            return self.used_features
        if isinstance(self.features, pd.DataFrame):
            return self.features.columns
        raise RuntimeError('There is no attribute `feature_columns`!')


class PsudoDataset(Dataset):
    """
    A pseudo dataset that wraps data already held in numpy arrays.

    Arguments:
        features (np.ndarray, optional): features. Defaults to None.
        outcome (np.ndarray, optional): outcome. Defaults to None.
        treatment (np.ndarray, optional): treatment. Defaults to None.
        conf (optional): configuration whose ``dataset.feature`` entry lists
            the feature names. When omitted, one is generated with the names
            ``X0``, ``X1``, ... (one per feature column). Defaults to None.
    """

    def __init__(self, features: np.ndarray = None, outcome: np.ndarray = None,
                 treatment: np.ndarray = None, conf=None):
        # Every array is copied (C order) before being passed to
        # `to_row_major`, so the caller's arrays are never stored directly.
        self._features = to_row_major(features.copy('C')) if features is not None else None
        self._treatment = to_row_major(treatment.copy('C')) if treatment is not None else None
        self._outcome = to_row_major(outcome.copy('C')) if outcome is not None else None
        if conf is None:
            # `features` is optional, so fall back to an empty name list
            # rather than dereferencing None.
            n_feat = features.shape[1] if features is not None else 0
            feature_columns = [f'X{i}' for i in range(n_feat)]
            conf = ConfigFactory.from_dict({'dataset': {'feature': feature_columns}})
        self.conf = conf
        self.used_features = self.conf.dataset.feature

    @property
    def features(self):
        """The stored feature array (or None)."""
        return self._features

    @property
    def treatment(self):
        """The stored treatment array (or None)."""
        return self._treatment

    @property
    def targets(self):
        """The stored outcome array (or None)."""
        return self._outcome

    @property
    def weight(self):
        """Uniform weights: an array of ones shaped like ``treatment``."""
        return np.ones_like(self.treatment)

    @property
    def feature_columns(self):
        """Names of the feature columns (the ``used_features`` attribute)."""
        return self.used_features

    def _normalize_index(self, index):
        """Turn ``index`` (None, boolean mask or positions) into a position array."""
        if index is None:
            return np.arange(len(self))
        if not hasattr(index, 'dtype'):
            index = np.asarray(index)
        # `is_bool_dtype` recognises both numpy bool and pandas' nullable
        # boolean dtype, and does not rely on the `np.bool` alias that some
        # NumPy releases do not provide.
        if pd.api.types.is_bool_dtype(index.dtype):
            mask = np.asarray(index, dtype=bool)
            if mask.shape[0] != len(self):
                raise ValueError(
                    f'Boolean index has length {mask.shape[0]}, expected {len(self)}')
            return np.where(mask)[0]
        index = np.asarray(index)
        if index.size == 0:
            # An empty list converts to a float array, which numpy refuses to
            # use as an index; make it an integer array.
            index = index.astype(np.intp)
        return index

    @staticmethod
    def _take(array, selector):
        """Index ``array`` with ``selector``, passing a missing array through as None."""
        return None if array is None else array[selector]

    @staticmethod
    def _pick_columns(columns, cols):
        """Select the names at integer positions ``cols`` from ``columns``."""
        if isinstance(columns, (list, tuple)):
            # Plain sequences cannot be indexed with an integer array.
            return [columns[int(c)] for c in cols]
        return columns[cols]

    def sub_dataset(self, index=None, cols=None) -> Dataset:
        """
        Create a sub-dataset.

        Arguments:
            index: Integer positions or a boolean mask of the samples to
                include in the sub-dataset. ``None`` keeps every sample.
            cols: Integer positions of the feature columns to include in the
                sub-dataset. ``None`` keeps every feature column.

        Returns:
            The sub-dataset.

        Raises:
            ValueError: if a boolean mask does not have one entry per sample.
        """
        index = self._normalize_index(index)
        data_conf = self.conf

        if cols is None:
            feature_columns = self.feature_columns
            feature_selector = index
        else:
            cols = np.asarray(cols, dtype=int)
            feature_columns = self._pick_columns(self.feature_columns, cols)
            feature_selector = np.ix_(index, cols)

        # Integer-array indexing already yields fresh arrays and the
        # constructor copies again, so no extra `.copy()` is needed.
        _feature = self._take(self._features, feature_selector)
        # The column selection applies to features only; treatment and
        # outcome are subset by row.
        _treatment = self._take(self._treatment, index)
        _outcome = self._take(self._outcome, index)
        data = PsudoDataset(_feature, _outcome, _treatment, conf=data_conf)
        data.used_features = feature_columns

        bin_features = getattr(self, 'bin_features', None)
        if bin_features is not None:
            # bin features : DataFrame
            if cols is None:
                cols = bin_features.columns
            # NOTE(review): `.loc` is label-based, so this assumes the frame's
            # row labels match sample positions and (when `cols` is given)
            # that its column labels are integer positions - confirm.
            data.bin_features = bin_features.loc[index, cols]
        return data