Source code for pyrelational.datasets.regression

"""Regression datasets that can be used for benchmarking active learning (AL) strategies."""

from typing import Tuple

import numpy as np
import torch
from sklearn.datasets import load_diabetes, make_regression
from sklearn.model_selection import KFold
from torch import Tensor
from torch.utils.data import Dataset

from .uci_datasets import UCIDatasets



[docs]
class SynthReg1(Dataset[Tuple[Tensor, Tensor]]):
    """Synthetic single-feature regression dataset for active learning

    The observations come from ``sklearn.datasets.make_regression`` with one
    feature and one target. Train/test index pairs produced by ``KFold``
    (default settings) are stored in ``data_splits``.

    :param n_splits: an int describing the number of K-fold splits to compute
    :param size: an int describing the number of observations the dataset
            is to have
    :param random_seed: seed handed to ``make_regression`` as ``random_state``
    """

    def __init__(self, n_splits: int = 5, size: int = 1000, random_seed: int = 1234):
        super().__init__()
        self.size = size
        self.random_seed = random_seed
        self.n_splits = n_splits

        features, targets = make_regression(
            n_samples=size,
            n_features=1,
            n_targets=1,
            random_state=random_seed,
        )
        self.x = torch.FloatTensor(features)
        self.y = torch.FloatTensor(targets)

        # Materialise the fold generator so the splits can be re-read at will.
        folds = KFold(n_splits=n_splits).split(self.x, self.y)
        self.data_splits = [(fold[0], fold[1]) for fold in folds]

    def __len__(self) -> int:
        # The number of observations is the leading dimension of the features.
        return int(self.x.shape[0])

    def __getitem__(self, idx: int) -> Tuple[Tensor, Tensor]:
        sample_x = self.x[idx]
        sample_y = self.y[idx]
        return sample_x, sample_y




[docs]
class SynthReg2(Dataset[Tuple[Tensor, Tensor]]):
    """Synthetic dataset for active learning on a regression based task

    A more challenging dataset than SynthReg1 wherein we see a periodic
    pattern with 2 degrees of freedom: the inputs are ``(z, sin(z) + noise)``
    and the target is ``cos(z) + noise`` for ``z`` drawn uniformly from
    ``[0, 15)``.

    Train/test index pairs produced by ``KFold`` (default settings) are
    stored in ``data_splits``.

    :param n_splits: an int describing the number of K-fold splits to compute
    :param size: an int describing the number of observations the dataset
            is to have
    :param random_seed: seed of the random generator used to sample the
            observations, so that equal seeds give equal datasets
    """

    def __init__(self, n_splits: int = 5, size: int = 1000, random_seed: int = 1234):
        super(SynthReg2, self).__init__()
        self.size = size
        self.random_seed = random_seed
        self.n_splits = n_splits

        self.x, self.y = self._sample(size, random_seed)

        kf = KFold(n_splits=n_splits)
        # Materialise the fold generator so the splits can be re-read at will.
        self.data_splits = [(idx[0], idx[1]) for idx in kf.split(self.x, self.y)]

    @staticmethod
    def _sample(size: int, random_seed: int) -> Tuple[Tensor, Tensor]:
        """Draw ``size`` observations from a generator seeded with ``random_seed``

        :param size: number of observations to draw
        :param random_seed: seed of the local random generator
        :return: tuple of a ``(size, 2)`` float tensor of inputs and a
                ``(size,)`` float tensor of targets
        """
        # A local generator makes the sampling reproducible without touching
        # (or depending on) the global numpy random state.
        rng = np.random.default_rng(random_seed)

        zdata = 15 * rng.random(size)
        xdata = np.sin(zdata) + 0.1 * rng.standard_normal(size)
        ydata = np.cos(zdata) + 0.1 * rng.standard_normal(size)

        z_tensor = torch.as_tensor(zdata, dtype=torch.float32)
        x_tensor = torch.as_tensor(xdata, dtype=torch.float32)
        y_tensor = torch.as_tensor(ydata, dtype=torch.float32)

        # Stack the two input coordinates as columns: one row per observation.
        return torch.vstack([z_tensor, x_tensor]).T, y_tensor

    def __len__(self) -> int:
        ret: int = self.x.shape[0]
        return ret

    def __getitem__(self, idx: int) -> Tuple[Tensor, Tensor]:
        return self.x[idx], self.y[idx]




[docs]
class DiabetesDataset(Dataset[Tuple[Tensor, Tensor]]):
    """A small regression dataset for examples

    From Bradley Efron, Trevor Hastie, Iain Johnstone and
    Robert Tibshirani (2004) “Least Angle Regression,”
    Annals of Statistics (with discussion), 407-499.

    The data is loaded with ``sklearn.datasets.load_diabetes`` and the
    train/test index pairs produced by ``KFold`` (default settings) are
    stored in ``data_splits``.

    :param n_splits: an int describing the number of K-fold splits to compute
    """

    def __init__(self, n_splits: int = 5):
        # Initialise the base class like the other datasets in this module do.
        super().__init__()

        # Load the diabetes dataset
        diabetes_X, diabetes_y = load_diabetes(return_X_y=True)
        self.x = torch.FloatTensor(diabetes_X)
        self.y = torch.FloatTensor(diabetes_y)

        kf = KFold(n_splits=n_splits)
        # Materialise the fold generator so the splits can be re-read at will.
        self.data_splits = [(idx[0], idx[1]) for idx in kf.split(self.x, self.y)]

    def __len__(self) -> int:
        ret: int = self.x.shape[0]
        return ret

    def __getitem__(self, idx: int) -> Tuple[Tensor, Tensor]:
        return self.x[idx], self.y[idx]




[docs]
class UCIRegression(Dataset[Tuple[Tensor, Tensor]]):
    """Base class for the UCI regression datasets

    Builds a ``UCIDatasets`` object, copies its ``data_dir``, ``name`` and
    ``data_splits`` attributes, and keeps the inputs and (squeezed) targets
    of its simple dataset as ``x`` and ``y``.

    :param name: name of the dataset, forwarded to ``UCIDatasets``
    :param data_dir: data directory, forwarded to ``UCIDatasets``
    :param n_splits: an int describing the number of splits to compute,
        forwarded to ``UCIDatasets``
    """

    def __init__(self, name: str, data_dir: str = "/tmp/", n_splits: int = 5):
        super().__init__()
        uci = UCIDatasets(name=name, data_dir=data_dir, n_splits=n_splits)
        self.data_dir = uci.data_dir
        self.name = uci.name
        self.data_splits = uci.data_splits

        simple = uci.get_simple_dataset()
        self.len_dataset = len(simple)
        # Element 0 of the full slice holds the inputs, element 1 the targets.
        self.x = simple[:][0]
        # squeeze() drops the size-1 dimensions of the targets.
        self.y = simple[:][1].squeeze()

    def __len__(self) -> int:
        # The number of observations is the leading dimension of the inputs.
        return int(self.x.shape[0])

    def __getitem__(self, idx: int) -> Tuple[Tensor, Tensor]:
        sample_x = self.x[idx]
        sample_y = self.y[idx]
        return sample_x, sample_y




[docs]
class UCIConcrete(UCIRegression):
    """UCI "concrete" regression dataset

    Thin wrapper that loads the dataset identified by the name
    ``"concrete"`` through :class:`UCIRegression`.

    :param data_dir: data directory, forwarded to :class:`UCIRegression`
    :param n_splits: an int describing the number of splits to compute,
        forwarded to :class:`UCIRegression`
    """

    def __init__(self, data_dir: str = "/tmp/", n_splits: int = 5):
        super().__init__(name="concrete", data_dir=data_dir, n_splits=n_splits)




[docs]
class UCIEnergy(UCIRegression):
    """UCI "energy" regression dataset

    Thin wrapper that loads the dataset identified by the name
    ``"energy"`` through :class:`UCIRegression`.

    :param data_dir: data directory, forwarded to :class:`UCIRegression`
    :param n_splits: an int describing the number of splits to compute,
        forwarded to :class:`UCIRegression`
    """

    def __init__(self, data_dir: str = "/tmp/", n_splits: int = 5):
        super().__init__(name="energy", data_dir=data_dir, n_splits=n_splits)




[docs]
class UCIPower(UCIRegression):
    """UCI "power" regression dataset

    Thin wrapper that loads the dataset identified by the name
    ``"power"`` through :class:`UCIRegression`.

    :param data_dir: data directory, forwarded to :class:`UCIRegression`
    :param n_splits: an int describing the number of splits to compute,
        forwarded to :class:`UCIRegression`
    """

    def __init__(self, data_dir: str = "/tmp/", n_splits: int = 5):
        super().__init__(name="power", data_dir=data_dir, n_splits=n_splits)




[docs]
class UCIWine(UCIRegression):
    """UCI "wine" regression dataset

    Thin wrapper that loads the dataset identified by the name
    ``"wine"`` through :class:`UCIRegression`.

    :param data_dir: data directory, forwarded to :class:`UCIRegression`
    :param n_splits: an int describing the number of splits to compute,
        forwarded to :class:`UCIRegression`
    """

    def __init__(self, data_dir: str = "/tmp/", n_splits: int = 5):
        super().__init__(name="wine", data_dir=data_dir, n_splits=n_splits)




[docs]
class UCIYacht(UCIRegression):
    """UCI "yacht" regression dataset

    Thin wrapper that loads the dataset identified by the name
    ``"yacht"`` through :class:`UCIRegression`.

    :param data_dir: data directory, forwarded to :class:`UCIRegression`
    :param n_splits: an int describing the number of splits to compute,
        forwarded to :class:`UCIRegression`
    """

    def __init__(self, data_dir: str = "/tmp/", n_splits: int = 5):
        super().__init__(name="yacht", data_dir=data_dir, n_splits=n_splits)




[docs]
class UCIAirfoil(UCIRegression):
    """UCI "airfoil" regression dataset

    Thin wrapper that loads the dataset identified by the name
    ``"airfoil"`` through :class:`UCIRegression`.

    :param data_dir: data directory, forwarded to :class:`UCIRegression`
    :param n_splits: an int describing the number of splits to compute,
        forwarded to :class:`UCIRegression`
    """

    def __init__(self, data_dir: str = "/tmp/", n_splits: int = 5):
        super().__init__(
            name="airfoil",
            data_dir=data_dir,
            n_splits=n_splits,
        )