# Source code for openasce.inference.tree.csv_dataset

#    Copyright 2023 AntGroup CO., Ltd.
#
#    Licensed under the Apache License, Version 2.0 (the "License");
#    you may not use this file except in compliance with the License.
#    You may obtain a copy of the License at
#
#        http://www.apache.org/licenses/LICENSE-2.0
#
#    Unless required by applicable law or agreed to in writing, software
#    distributed under the License is distributed on an "AS IS" BASIS,
#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

import pandas as pd
import numpy as np

from .dataset import Dataset
from .utils import to_row_major


class CsvDataset(Dataset):
    """A Dataset interface for loading csv data.

    The data is held in a single pandas DataFrame (``self._data``); the
    ``features``, ``targets``, ``treatment`` and ``weight`` properties are
    column selections over that frame.

    NOTE(review): the configuration object is assumed to be mapping-like and
    to expose ``get`` and ``get_list`` (it looks like a pyhocon ConfigTree) --
    confirm against the callers.
    """

    def __init__(self, conf=None, **kwargs):
        """Store the configuration; no data is loaded until ``read`` is called.

        Args:
            conf: Configuration object. Its ``'dataset'`` entry is used as the
                data configuration when present, otherwise ``conf`` itself.
                May be ``None``, in which case ``read`` cannot be used.
            **kwargs: Accepted but not used by this class.
        """
        super().__init__()

        self.conf = conf
        # `conf` defaults to None, so it must not be dereferenced blindly.
        self.data_conf = None if conf is None else conf.get('dataset', conf)
        self.bin_features = None

    def read(self, filename=None):
        """Load a csv file and record the feature/label/treatment/weight columns.

        Args:
            filename: Path of the csv file. When ``None``, the first entry of
                the ``'train_data_path'`` configuration value is used.

        Raises:
            RuntimeError: If no configuration was given, or if a configured
                column is missing from the csv file.
        """
        if self.data_conf is None:
            raise RuntimeError('configuration is required to read csv data!')
        if filename is None:
            filename = self.data_conf.get('train_data_path')[0]
        # The first csv column is used as the row index.
        data = pd.read_csv(filename, index_col=0)
        feat_cols = self.data_conf.get_list('feature')
        treatment_info = self.data_conf.get('treatment_info', None)
        try:
            # First configuration layout: entries are objects with a `.name`.
            label_cols = [f.name for f in self.data_conf.get('label_columns')]
            treat_cols = [f.name for f in self.data_conf.get('treatment_columns')]
            weight_cols = [f.name for f in self.data_conf.get('weight_columns', [])]
        except Exception:
            # Fallback layout: plain column names. The exact exception raised
            # by the first layout depends on the configuration type, so any
            # ordinary Exception triggers the fallback; KeyboardInterrupt and
            # SystemExit are no longer swallowed as they were by a bare except.
            label_cols = list(self.data_conf.get('label'))
            treat_cols = [self.data_conf.get('treatment')]
            weight_cols = list(self.data_conf.get('weight', []))
        for f in feat_cols + label_cols + treat_cols + weight_cols:
            if f not in data.columns:
                raise RuntimeError(f'feature `{f}` not exists in data!')
        # transform the treatment into [0, 1, 2, ....]
        if treatment_info is not None:
            # The first element of each `treatment_info` entry is the raw
            # treatment value; its position becomes the encoded value.
            treatment_map = {info[0]: np.int32(i) for i, info in enumerate(treatment_info)}
            data[treat_cols[0]] = data[treat_cols[0]].apply(lambda x: treatment_map[int(x)])
        self._data = data
        self.feat_cols = feat_cols
        self.label_cols = label_cols
        self.treat_cols = treat_cols
        self.weight_cols = weight_cols
        self.n_feat = len(feat_cols)

    def sub_dataset(self, index=None, cols=None, cols_y=None) -> Dataset:
        """Return a new CsvDataset restricted to some rows and feature columns.

        Args:
            index: Either a boolean mask with one entry per instance or an
                array of positional row indices. ``None`` selects every row.
            cols: Feature columns of the new dataset. ``None`` keeps the
                current feature columns.
            cols_y: Unused; kept for interface compatibility.

        Returns:
            A CsvDataset sharing this dataset's configuration and its label,
            treatment and weight columns. Its frame holds the selected rows
            with all columns; only ``feat_cols`` is narrowed by ``cols``.
        """
        if index is None:
            index = np.arange(self._data.shape[0])
        elif pd.api.types.is_bool_dtype(index.dtype):
            # `np.bool` does not exist on NumPy 1.24-1.26 and comparing a dtype
            # with the `pd.BooleanDtype` class never matches, so rely on the
            # pandas dtype predicate to recognise boolean masks.
            assert index.shape[0] == self.n_inst
            index = np.where(np.asarray(index, dtype=bool))[0]
        data_conf = self.conf
        data = CsvDataset(conf=data_conf)
        data.n_inst = index.shape[0]
        if cols is None:
            data.n_feat = self.n_feat
            cols = self.features.columns
        else:
            data.n_feat = len(cols)
        data.feat_cols = cols
        data.label_cols = self.label_cols
        data.treat_cols = self.treat_cols
        data.weight_cols = self.weight_cols
        data._data = self._data.iloc[index]
        return data

    @property
    def targets(self):
        """The label columns of the underlying frame."""
        return self._data[self.label_cols]

    @property
    def features(self):
        """The feature columns of the underlying frame."""
        return self._data[self.feat_cols]

    @property
    def treatment(self):
        """The first treatment column of the underlying frame."""
        return self._data[self.treat_cols[0]]

    @property
    def weight(self):
        """The first weight column passed through ``to_row_major``, or ``None``
        when no weight column is configured."""
        if len(self.weight_cols) > 0:
            return to_row_major(self._data[self.weight_cols[0]])
        return None

    @staticmethod
    def new_instance(conf):
        """Build a CsvDataset from ``conf``, read its data and describe it.

        The file name is taken from the ``'data.path'`` entry of the data
        configuration (``conf['dataset']`` when present, otherwise ``conf``).
        """
        data_conf = conf.get('dataset', conf)
        data = CsvDataset(conf=conf)
        data.read(data_conf.get('data.path'))
        data.description()
        return data