Source code for pyrelational.datasets.regression

"""Regression datasets that can be used for benchmarking AL strategies
"""

from typing import Tuple

import numpy as np
import torch
from sklearn.datasets import load_diabetes, make_regression
from sklearn.model_selection import KFold
from torch import Tensor
from torch.utils.data import Dataset

from .uci_datasets import UCIDatasets


class SynthReg1(Dataset[Tuple[Tensor, Tensor]]):
    """Synthetic single-feature regression dataset for active learning experiments.

    Observations are generated with ``sklearn.datasets.make_regression``
    (one feature, one target) and the K-fold index splits are precomputed
    with ``sklearn.model_selection.KFold`` and stored in ``data_splits``.

    :param n_splits: number of folds passed to ``KFold``
    :param size: number of observations the dataset is to have
    :param random_seed: seed passed to ``make_regression`` as ``random_state``
    """

    def __init__(self, n_splits: int = 5, size: int = 1000, random_seed: int = 1234):
        super().__init__()
        self.size = size
        self.random_seed = random_seed
        self.n_splits = n_splits

        features, targets = make_regression(
            n_samples=size,
            n_features=1,
            n_targets=1,
            random_state=random_seed,
        )
        self.x = torch.FloatTensor(features)
        self.y = torch.FloatTensor(targets)

        # Materialise the fold generator into a list of (first, second) index
        # arrays so the splits can be indexed and re-used.
        splitter = KFold(n_splits=n_splits)
        self.data_splits = [
            (fold[0], fold[1]) for fold in splitter.split(self.x, self.y)
        ]

    def __len__(self) -> int:
        """Return the number of observations (rows of ``x``)."""
        n_rows: int = self.x.shape[0]
        return n_rows

    def __getitem__(self, idx: int) -> Tuple[Tensor, Tensor]:
        """Return the ``(x, y)`` pair stored at position ``idx``."""
        return self.x[idx], self.y[idx]
class SynthReg2(Dataset[Tuple[Tensor, Tensor]]):
    """Synthetic dataset for active learning on a regression based task

    A more challenging dataset than SynthReg1 wherein we see a periodic
    pattern with 2 degrees of freedom: the two input features are ``z`` and a
    noisy ``sin(z)``, and the target is a noisy ``cos(z)``.

    K-fold index splits are precomputed with ``sklearn.model_selection.KFold``
    and stored in ``data_splits``.

    :param n_splits: number of folds passed to ``KFold``
    :param size: number of observations the dataset is to have
    :param random_seed: seed for the random generator that draws the samples,
        so that two instances built with the same seed hold identical data
    """

    def __init__(self, n_splits: int = 5, size: int = 1000, random_seed: int = 1234):
        super().__init__()
        self.size = size
        self.random_seed = random_seed
        self.n_splits = n_splits

        self.x, self.y = self._sample(size, random_seed)

        kf = KFold(n_splits=n_splits)
        self.data_splits = [(idx[0], idx[1]) for idx in kf.split(self.x, self.y)]

    @staticmethod
    def _sample(size: int, random_seed: int) -> Tuple[Tensor, Tensor]:
        """Draw ``size`` observations from a generator seeded with ``random_seed``.

        :return: ``(x, y)`` where ``x`` has shape ``(size, 2)`` and ``y`` has
            shape ``(size,)``
        """
        # A local generator (rather than the global ``np.random`` state) makes
        # the data depend only on ``random_seed`` and leaves global RNG state
        # untouched.
        rng = np.random.default_rng(random_seed)

        zdata = 15 * rng.random(size)
        xdata = np.sin(zdata) + 0.1 * rng.standard_normal(size)
        ydata = np.cos(zdata) + 0.1 * rng.standard_normal(size)

        features = torch.vstack(
            [torch.FloatTensor(zdata), torch.FloatTensor(xdata)]
        ).T
        return features, torch.FloatTensor(ydata)

    def __len__(self) -> int:
        """Return the number of observations (rows of ``x``)."""
        ret: int = self.x.shape[0]
        return ret

    def __getitem__(self, idx: int) -> Tuple[Tensor, Tensor]:
        """Return the ``(x, y)`` pair stored at position ``idx``."""
        return self.x[idx], self.y[idx]
class DiabetesDataset(Dataset[Tuple[Tensor, Tensor]]):
    """A small regression dataset for examples

    The data is loaded with ``sklearn.datasets.load_diabetes(return_X_y=True)``
    and K-fold index splits are precomputed with
    ``sklearn.model_selection.KFold`` and stored in ``data_splits``.

    From Bradley Efron, Trevor Hastie, Iain Johnstone and Robert Tibshirani
    (2004) "Least Angle Regression," Annals of Statistics (with discussion),
    407-499.

    :param n_splits: number of folds passed to ``KFold``
    """

    def __init__(self, n_splits: int = 5):
        # Initialise the base class and keep ``n_splits`` on the instance,
        # matching the other dataset classes in this module.
        super().__init__()
        self.n_splits = n_splits

        # Load the diabetes dataset
        diabetes_X, diabetes_y = load_diabetes(return_X_y=True)
        self.x = torch.FloatTensor(diabetes_X)
        self.y = torch.FloatTensor(diabetes_y)

        kf = KFold(n_splits=n_splits)
        self.data_splits = [(idx[0], idx[1]) for idx in kf.split(self.x, self.y)]

    def __len__(self) -> int:
        """Return the number of observations (rows of ``x``)."""
        ret: int = self.x.shape[0]
        return ret

    def __getitem__(self, idx: int) -> Tuple[Tensor, Tensor]:
        """Return the ``(x, y)`` pair stored at position ``idx``."""
        return self.x[idx], self.y[idx]
class UCIRegression(Dataset[Tuple[Tensor, Tensor]]):
    """Base class for regression datasets obtained through ``UCIDatasets``.

    The constructor builds a ``UCIDatasets`` object, copies its ``data_dir``,
    ``name`` and ``data_splits`` onto this instance, and then reads the full
    inputs and targets from ``get_simple_dataset()``.

    :param name: dataset name forwarded to ``UCIDatasets``
    :param data_dir: data directory forwarded to ``UCIDatasets``
    :param n_splits: number of splits forwarded to ``UCIDatasets``
    """

    def __init__(self, name: str, data_dir: str = "/tmp/", n_splits: int = 5):
        super().__init__()

        uci = UCIDatasets(name=name, data_dir=data_dir, n_splits=n_splits)
        self.data_dir = uci.data_dir
        self.name = uci.name
        self.data_splits = uci.data_splits

        simple = uci.get_simple_dataset()
        self.len_dataset = len(simple)
        # Slicing with ``[:]`` is indexed as (inputs, targets); the targets
        # have their size-1 dimensions squeezed out.
        self.x = simple[:][0]
        self.y = simple[:][1].squeeze()

    def __len__(self) -> int:
        """Return the number of observations (rows of ``x``)."""
        n_rows: int = self.x.shape[0]
        return n_rows

    def __getitem__(self, idx: int) -> Tuple[Tensor, Tensor]:
        """Return the ``(x, y)`` pair stored at position ``idx``."""
        return self.x[idx], self.y[idx]
class UCIConcrete(UCIRegression):
    """UCI regression dataset loaded under the name ``"concrete"``

    :param data_dir: data directory forwarded to ``UCIRegression``
    :param n_splits: number of splits forwarded to ``UCIRegression``
    """

    def __init__(self, data_dir: str = "/tmp/", n_splits: int = 5):
        super().__init__(name="concrete", data_dir=data_dir, n_splits=n_splits)
class UCIEnergy(UCIRegression):
    """UCI regression dataset loaded under the name ``"energy"``

    :param data_dir: data directory forwarded to ``UCIRegression``
    :param n_splits: number of splits forwarded to ``UCIRegression``
    """

    def __init__(self, data_dir: str = "/tmp/", n_splits: int = 5):
        super().__init__(name="energy", data_dir=data_dir, n_splits=n_splits)
class UCIPower(UCIRegression):
    """UCI regression dataset loaded under the name ``"power"``

    :param data_dir: data directory forwarded to ``UCIRegression``
    :param n_splits: number of splits forwarded to ``UCIRegression``
    """

    def __init__(self, data_dir: str = "/tmp/", n_splits: int = 5):
        super().__init__(name="power", data_dir=data_dir, n_splits=n_splits)
class UCIWine(UCIRegression):
    """UCI regression dataset loaded under the name ``"wine"``

    :param data_dir: data directory forwarded to ``UCIRegression``
    :param n_splits: number of splits forwarded to ``UCIRegression``
    """

    def __init__(self, data_dir: str = "/tmp/", n_splits: int = 5):
        super().__init__(name="wine", data_dir=data_dir, n_splits=n_splits)
class UCIYacht(UCIRegression):
    """UCI regression dataset loaded under the name ``"yacht"``

    :param data_dir: data directory forwarded to ``UCIRegression``
    :param n_splits: number of splits forwarded to ``UCIRegression``
    """

    def __init__(self, data_dir: str = "/tmp/", n_splits: int = 5):
        super().__init__(name="yacht", data_dir=data_dir, n_splits=n_splits)
class UCIAirfoil(UCIRegression):
    """UCI Airfoil dataset, loaded under the name ``"airfoil"``

    :param data_dir: data directory forwarded to ``UCIRegression``
    :param n_splits: number of splits forwarded to ``UCIRegression``
    """

    def __init__(self, data_dir: str = "/tmp/", n_splits: int = 5):
        super().__init__(name="airfoil", data_dir=data_dir, n_splits=n_splits)