Source code for heat.utils.data.spherical

"""Create a spherical dataset."""

import heat as ht
import torch


def create_spherical_dataset(
    num_samples_cluster, radius=1.0, offset=4.0, dtype=ht.float32, random_state=1
):
    """
    Create k=4 spherical clusters in 3D space along the space diagonal.

    One set of points is sampled in spherical coordinates around the origin and
    then reused, shifted by four different offsets, to form the four clusters.

    Parameters
    ----------
    num_samples_cluster: int
        Requested number of samples per cluster. The value is integer-divided by
        ``ht.MPI_WORLD.size`` and the quotient is passed as the length of every
        coordinate array to ``ht.random.rand(..., split=0)``.
        NOTE(review): if ``ht.random.rand`` interprets that length as the global
        shape, each cluster holds ``num_samples_cluster // size`` points in total
        rather than ``num_samples_cluster`` -- confirm the intended semantics.
    radius: float
        Radius of the sphere
    offset: float
        Shift of the clusters along the axes. The 4 clusters are centered around
        c1=(offset, offset, offset), c2=(2*offset, 2*offset, 2*offset),
        c3=(-offset, -offset, -offset) and c4=(-2*offset, -2*offset, -2*offset)
    dtype: ht.datatype
        Dataset dtype
    random_state: int
        Seed passed to ``ht.random.seed``

    Returns
    -------
    The four clusters concatenated along axis 0; every row holds the (x, y, z)
    coordinates of one sample (coordinates are stacked along axis 1).
    """
    # Local import: the module's top-level import block is left untouched.
    import math

    p = ht.MPI_WORLD.size
    # The requested cluster size is divided by the number of processes
    num_ele = num_samples_cluster // p
    ht.random.seed(random_state)

    # The order of the three draws is kept fixed so that a given seed always
    # yields the same (r, theta, phi) streams.
    # radius between 0 and `radius`
    r = ht.random.rand(num_ele, split=0) * radius
    # theta between 0 and pi (full precision instead of the truncated 3.1415)
    theta = ht.random.rand(num_ele, split=0) * math.pi
    # phi between 0 and 2pi
    phi = ht.random.rand(num_ele, split=0) * 2 * math.pi

    # Cartesian coordinates; the result of astype is assigned explicitly so the
    # cast does not depend on astype mutating its receiver.
    x = (r * ht.sin(theta) * ht.cos(phi)).astype(dtype, copy=False)
    y = (r * ht.sin(theta) * ht.sin(phi)).astype(dtype, copy=False)
    z = (r * ht.cos(theta)).astype(dtype, copy=False)

    # Same point cloud, shifted along the space diagonal by +1, +2, -1, -2 offsets
    cluster1 = ht.stack((x + offset, y + offset, z + offset), axis=1)
    cluster2 = ht.stack((x + 2 * offset, y + 2 * offset, z + 2 * offset), axis=1)
    cluster3 = ht.stack((x - offset, y - offset, z - offset), axis=1)
    cluster4 = ht.stack((x - 2 * offset, y - 2 * offset, z - 2 * offset), axis=1)

    data = ht.concatenate((cluster1, cluster2, cluster3, cluster4), axis=0)
    # Note: enhance when shuffle is available
    return data
def create_clusters(
    n_samples, n_features, n_clusters, cluster_mean, cluster_std, cluster_weight=None, device=None
):
    """
    Create a DNDarray of shape (n_samples, n_features), split=0, dtype=ht.float32, flagged as balanced.

    The data set consists of n_clusters clusters. Cluster k is sampled from a
    ``torch.distributions`` multivariate normal distribution built from
    ``cluster_mean[k, :]`` and ``cluster_std[k]``. On every process the local
    samples are split between the clusters according to ``cluster_weight`` and
    then randomly permuted.

    Parameters
    ----------
    n_samples: int
        Number of overall samples
    n_features: int
        Number of features
    n_clusters: int
        Number of clusters
    cluster_mean: torch.Tensor of shape (n_clusters, n_features)
        Featurewise mean (center) of each cluster; not the true mean of the
        generated data, but the mean of the distribution the cluster is sampled from.
    cluster_std: torch.Tensor of shape (n_clusters, n_features, n_features), or (n_clusters,)
        Spread of each cluster. ``cluster_std[k]`` is passed as the second positional
        argument of ``MultivariateNormal``, i.e. it is used as the covariance matrix
        of cluster k. If the shape is (n_clusters,), entry k is expanded to
        ``torch.eye(n_features) * cluster_std[k]`` (same value in each direction).
    cluster_weight: torch.Tensor of shape (n_clusters,), optional
        On each process, cluster_weight is expected to be a torch.Tensor whose entries
        add up to 1. The i-th entry on process p specifies which share of the samples
        on process p is drawn from cluster i, which allows distributing the clusters
        unevenly over the processes. If None, every cluster gets the share
        ``1 / n_clusters`` on every process.
    device: Optional[str] = None
        The device on which the data is stored. If None, the default device is used.

    Returns
    -------
    DNDarray wrapping the locally sampled data, with global shape (n_samples, n_features).

    Raises
    ------
    TypeError
        If cluster_weight (when given), cluster_mean or cluster_std is not a torch.Tensor.
    ValueError
        If one of these tensors has an unexpected shape, or cluster_weight does not add up to 1.
    """
    device = ht.devices.sanitize_device(device)

    # --- validate / default the cluster weights -------------------------------
    if cluster_weight is None:
        cluster_weight = torch.ones(n_clusters) / n_clusters
    elif not isinstance(cluster_weight, torch.Tensor):
        raise TypeError(
            f"cluster_weight must be None or a torch.Tensor, but is {type(cluster_weight)}"
        )
    elif cluster_weight.shape != (n_clusters,):
        raise ValueError(
            "If a torch.Tensor, cluster_weight must be of shape (n_clusters,), "
            f"but is {cluster_weight.shape}"
        )
    else:
        weight_sum = torch.sum(cluster_weight)
        # Compare against a one of matching dtype/device: torch.allclose raises
        # on mismatching dtypes, so a fixed float32 reference would reject
        # e.g. float64 weights with a RuntimeError instead of validating them.
        if not torch.allclose(weight_sum, torch.ones_like(weight_sum)):
            raise ValueError(
                f"If a torch.Tensor, cluster_weight must add up to 1, but adds up to {weight_sum}"
            )

    # --- validate the cluster means --------------------------------------------
    if not isinstance(cluster_mean, torch.Tensor):
        raise TypeError(f"cluster_mean must be a torch.Tensor, but is {type(cluster_mean)}")
    if cluster_mean.shape != (n_clusters, n_features):
        raise ValueError(
            f"cluster_mean must be of shape (n_clusters, n_features), but is {cluster_mean.shape}"
        )

    # --- validate the cluster spreads ------------------------------------------
    if not isinstance(cluster_std, torch.Tensor):
        raise TypeError(f"cluster_std must be a torch.Tensor, but is {type(cluster_std)}")
    if cluster_std.shape not in ((n_clusters, n_features, n_features), (n_clusters,)):
        raise ValueError(
            "cluster_std must be of shape (n_clusters, n_features, n_features) or (n_clusters,), "
            f"but is {cluster_std.shape}"
        )
    if cluster_std.shape == (n_clusters,):
        # One scalar per cluster: expand to an isotropic (diagonal) matrix
        cluster_std = torch.stack(
            [torch.eye(n_features) * cluster_std[k] for k in range(n_clusters)], dim=0
        )

    # --- determine how many samples of each cluster this process draws ---------
    global_shape = (n_samples, n_features)
    local_shape = ht.MPI_WORLD.chunk(global_shape, 0)[1]
    local_size_of_clusters = [int(local_shape[0] * cluster_weight[k]) for k in range(n_clusters)]
    # int() truncates, so the sizes may not add up to the local sample count;
    # the difference is assigned to the first cluster.
    if sum(local_size_of_clusters) != local_shape[0]:
        local_size_of_clusters[0] += local_shape[0] - sum(local_size_of_clusters)

    # --- sample -----------------------------------------------------------------
    distributions = [
        torch.distributions.multivariate_normal.MultivariateNormal(
            cluster_mean[k, :], cluster_std[k]
        )
        for k in range(n_clusters)
    ]
    # Cast to float32 so the local tensor matches the dtype declared for the
    # DNDarray below, whatever dtype the caller's mean/std tensors have.
    local_data = [
        distributions[k]
        .sample((local_size_of_clusters[k],))
        .to(device=device.torch_device, dtype=torch.float32)
        for k in range(n_clusters)
    ]
    local_data = torch.cat(local_data, dim=0)

    # Shuffle the local rows so the clusters are not stored as contiguous blocks
    rand_perm = torch.randperm(local_shape[0], device=device.torch_device)
    local_data = local_data[rand_perm, :]

    data = ht.DNDarray(
        local_data,
        global_shape,
        dtype=ht.float32,
        split=0,
        device=device,
        comm=ht.MPI_WORLD,
        balanced=True,
    )
    return data