# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# sampler.py

import math
from typing import Callable, Iterator, Optional

import numpy as np
import torch
import torch.distributed as dist
from torch.utils.data.dataset import Dataset
from torch.utils.data.sampler import Sampler


def _shard_wrapped_indices_across_workers(dataset_index_list, num_shards, num_samples_per_shard):
    """Yield successive num_shards-sized chunks from dataset_index_list.

    The index list is traversed cyclically (wrapping around to its start) until
    ``max(1, num_samples_per_shard)`` chunks have been produced. Element ``k`` of every
    chunk belongs to shard ``k``.

    Nothing is yielded when ``dataset_index_list`` is empty or ``num_shards`` is not positive.
    """
    list_len = len(dataset_index_list)
    if list_len == 0 or num_shards <= 0:
        # Nothing to wrap around; avoids a modulo-by-zero on empty input.
        return

    num_samples = max(1, num_samples_per_shard)
    num_elements = num_samples * num_shards
    for chunk_begin in range(0, num_elements, num_shards):
        yield [dataset_index_list[i % list_len] for i in range(chunk_begin, chunk_begin + num_shards)]


def shard_wrapped_indices_for_worker(dataset_index_list, shard_id, num_shards):
    """Shard wrapped around dataset_index_list across num_shards and return the indices for this shard_id"""
    # Ceiling division: every shard receives the same number of samples.
    num_samples_per_worker = (len(dataset_index_list) + num_shards - 1) // num_shards
    return [
        chunk[shard_id]
        for chunk in _shard_wrapped_indices_across_workers(dataset_index_list, num_shards, num_samples_per_worker)
    ]


# Implementation is adapted from bagua/load_balancing_data_loader.py
# https://github.com/BaguaSys/bagua/blob/01874a7c3f90904c37c5612a9db866b5d4b8b5ed/bagua/torch_api/contrib/load_balancing_data_loader.py#L12
class LoadBalancingDistributedSampler:
    r"""Sampler that balances the data load across workers based on the sample's complexity.

    This sampler uses a :attr:`complexity_fn` to calculate each sample's computational
    complexity and make each batch get similar computational complexity.

    This is useful in scenarios like speech and NLP, where each batch has variable
    length and distributed training suffers from straggler problem. In such scenarios,
    the complexity function could be defined to return the length of the input sample sequence.

    The usage is similar to `torch.utils.data.DistributedSampler`, where each process loads a
    subset of the original dataset that is exclusive to it.

    The sampler sorts the dataset in increasing order of complexity. If the :attr:`group_size` is
    provided, the sorting happens within dataset groups of size :attr:`group_size` before the
    group order is shuffled followed by sharding of data across workers. If :attr:`group_size`
    is not provided, the data is distributed across workers before the data indices for each
    worker is shuffled deterministically.

    .. note::
        Dataset is assumed to be of constant size (map-style dataset).

    Args:
        dataset: Dataset (map-style) used for sampling.
        complexity_fn(Callable): A function whose input is a sample and output is an integer as a
            measure of the computational complexity of the sample.
        world_size (int, optional): Number of processes participating in
            distributed training. By default, :attr:`world_size` is retrieved from the
            current distributed group.
        rank (int, optional): Rank of the current process within :attr:`world_size`.
            By default, :attr:`rank` is retrieved from the current distributed
            group.
        shuffle (bool, optional): If ``True`` (default), sampler will shuffle the
            indices within the dataset if :attr:`group_size` is None, else will shuffle
            the groups if :attr:`group_size` is not None.
        group_size (int, optional): If provided, the dataset will be broken down into
            :attr:`group_size` sized groups. Indices will only be sorted within the groups
            and not across the entire dataset. If :attr:`shuffle` is ``True`` and
            :attr:`group_size` is not ``None``, the position of each group in the dataset
            will be shuffled. Must be a positive integer when provided. Default: ``None``
        seed (int, optional): random seed used to shuffle the sampler if
            :attr:`shuffle=True`. This number should be identical across all
            processes in the distributed group. Default: 0.
        drop_last (bool, optional): if ``True``, then the sampler will drop the
            tail of the data to make it evenly divisible across the number of
            shards. If ``False``, the sampler will add extra indices to make
            the data evenly divisible across the shards. Default: ``False``.
        random_level (float, optional): A float varies from 0 and 1 that controls the extent
            of load balance. 0 means the best load balance, while 1 means the opposite.

    Raises:
        RuntimeError: If :attr:`world_size` or :attr:`rank` must be inferred but the
            distributed package is not available.
        ValueError: If :attr:`rank`, :attr:`group_size` or :attr:`random_level` is out of range.

    .. warning::
        In distributed mode, calling the :meth:`set_epoch` method at
        the beginning of each epoch **before** creating the `torch.utils.data.DataLoader` iterator
        is necessary to make shuffling work properly across multiple epochs. Otherwise,
        the same ordering will be always used.

    Example::
        Define your :attr:`complexity_fn`, which accepts a dataset sample as its input and produces an integer
        as the sample's computational complexity:

        >>> dataset = MyVariableSequenceLengthDataset(dataset_samples)
        >>> complexity_fn = lambda x: len(x)

        Below is the usage of :class:`LoadBalancingDistributedSampler`
        and `torch.utils.data.DataLoader`:

        >>> sampler = onnxruntime.training.utils.data.LoadBalancingDistributedSampler(
        ...     dataset,
        ...     complexity_fn=complexity_fn)
        >>> loader = torch.utils.data.DataLoader(dataset,
        ...     sampler=sampler)
        >>>
        >>> for epoch in range(start_epoch, n_epochs):
        ...     if is_distributed:
        ...         sampler.set_epoch(epoch)
        ...     train(loader)
    """

    def __init__(
        self,
        dataset: Dataset,
        complexity_fn: Callable[..., int],
        world_size: Optional[int] = None,
        rank: Optional[int] = None,
        shuffle: bool = True,
        group_size: Optional[int] = None,
        seed: int = 0,
        drop_last: bool = False,
        random_level: float = 0,
    ) -> None:
        if world_size is None:
            if not dist.is_available():
                raise RuntimeError("Requires distributed package to be available")
            world_size = dist.get_world_size()
        if rank is None:
            if not dist.is_available():
                raise RuntimeError("Requires distributed package to be available")
            rank = dist.get_rank()
        if rank >= world_size or rank < 0:
            raise ValueError(f"Invalid rank {rank}, rank should be in the interval [0, {world_size - 1}]")
        if group_size is not None and group_size <= 0:
            # A non-positive group size cannot partition the dataset; fail early with a clear message.
            raise ValueError(f"Invalid group size {group_size}, group size should be a positive integer")
        if random_level < 0.0 or random_level > 1.0:
            raise ValueError(f"Invalid random level {random_level}, shoule be in the range [0.0, 1.0]")

        self.dataset = dataset
        self.world_size = world_size
        self.rank = rank
        self.epoch = 0
        self.drop_last = drop_last
        self.group_size = group_size

        # If the dataset length is evenly divisible by number of shards, then there
        # is no need to drop any data, since the dataset will be split equally.
        dataset_len = len(self.dataset)
        if self.drop_last and dataset_len % self.world_size != 0:
            # Split to nearest available length that is evenly divisible.
            # This is to ensure each rank receives the same amount of data when
            # using this Sampler.
            self.num_samples = dataset_len // self.world_size
        else:
            self.num_samples = math.ceil(dataset_len / self.world_size)
        self.total_size = self.num_samples * self.world_size
        self.shuffle = shuffle
        self.seed = seed

        self.complexity_fn = complexity_fn
        # (index, complexity) rows, computed lazily on first use and then cached.
        self.sample_complexities = None
        # Cached sorted rows; reused across epochs when no random noise is applied.
        self.ordered_sample_complexities = None

        self.random_level = random_level
        # Exclusive upper bound of the per-sample noise; derived lazily from the complexity range.
        self.random_number = None

    @staticmethod
    def _sort_in_groups(sample_complexities, group_size):
        """Sort the (index, complexity) rows in place inside each group of size group_size.

        If ``group_size`` is ``None`` the entire dataset is considered a single group.
        Returns the same (now sorted) array.
        """
        num_rows = len(sample_complexities)
        if group_size is None:
            # max(1, ...) keeps the range step valid for an empty dataset.
            group_size = max(1, num_rows)

        for group_begin_index in range(0, num_rows, group_size):
            group_end_index = min(group_begin_index + group_size, num_rows)
            sorted_indices = group_begin_index + np.argsort(sample_complexities[group_begin_index:group_end_index, 1])
            # Fancy indexing copies the rows first, so the in-place assignment is safe.
            sample_complexities[group_begin_index:group_end_index, :] = sample_complexities[sorted_indices]

        return sample_complexities

    def _compute_sample_complexities(self):
        """Build the (index, complexity) table by applying complexity_fn to every dataset sample."""
        dataset_len = len(self.dataset)
        table = np.empty((dataset_len, 2), dtype=np.int64)
        for sample_index in range(dataset_len):
            table[sample_index, 0] = sample_index
            table[sample_index, 1] = self.complexity_fn(self.dataset[sample_index])
        return table

    def _sort_shard_and_shuffle_dataset(self):
        """Return ``(index_chunks, chunk_indices)`` for the current epoch.

        ``index_chunks[i][rank]`` is the dataset index assigned to ``rank`` in chunk ``i`` and
        ``chunk_indices`` is the order in which the chunks are visited.

        The sorting of the dataset happens based on the group_size and complexities of each sample.
        Sharding happens across the number of workers.
        Shuffling is done either before sharding on the group indices (if group_size is provided)
        or on the sharded chunks if the group_size is not provided.
        """
        # Get the samples and their complexities from the complexity_fn (only once).
        # NOTE: this must be an identity check; the truth value of a NumPy array is ambiguous.
        if self.sample_complexities is None:
            self.sample_complexities = self._compute_sample_complexities()

        if self.random_number is None:
            if len(self.sample_complexities) == 0:
                self.random_number = 1
            else:
                complexity_column = self.sample_complexities[:, 1]
                complexity_range = complexity_column.max() - complexity_column.min()
                self.random_number = int(complexity_range * self.random_level + 1)

        sample_complexities = self.sample_complexities.copy()

        # Control the degree of load balancing by modifying the complexities of
        # all samples using the random_number.
        g = torch.Generator()
        g.manual_seed(self.seed + self.epoch)

        if self.random_number > 1:
            complexity_random_ints = torch.randint(self.random_number, (len(sample_complexities),), generator=g)
            sample_complexities[:, 1] += complexity_random_ints.numpy()

        # Sort the data based on the computed complexities and group sizes.
        # Sort only once if random_number <= 1 else sort everytime
        if self.ordered_sample_complexities is None or self.random_number > 1:
            self.ordered_sample_complexities = self._sort_in_groups(sample_complexities, self.group_size)
        ordered_sample_complexities = self.ordered_sample_complexities

        # If group_size is not None, shuffle the index of each group instead
        # of shuffling the data indices.
        if self.shuffle and self.group_size is not None:
            num_groups = (len(ordered_sample_complexities) + self.group_size - 1) // self.group_size
            group_order = torch.randperm(num_groups, generator=g).tolist()
            if group_order:
                # Always read from the *sorted* rows (which may be the cached ones) and build a
                # new array so the cache itself is never reordered.
                ordered_sample_complexities = np.concatenate(
                    [
                        ordered_sample_complexities[
                            self.group_size * group_index : self.group_size * (group_index + 1)
                        ]
                        for group_index in group_order
                    ]
                )

        # Shard the data across the different workers.
        index_chunks = list(
            _shard_wrapped_indices_across_workers(
                ordered_sample_complexities[:, 0].tolist(),
                self.world_size,
                self.num_samples,
            )
        )

        # Shuffle the sharded data indices deterministically based on epoch and seed.
        chunk_indices = list(range(len(index_chunks)))
        if self.shuffle and self.group_size is None:
            chunk_indices = torch.randperm(len(index_chunks), generator=g).tolist()

        if not self.drop_last:
            # Add extra samples to make it evenly divisible
            padding_size = self.num_samples - len(chunk_indices)
            if padding_size > 0:
                repeats = math.ceil(padding_size / len(chunk_indices))
                chunk_indices += (chunk_indices * repeats)[:padding_size]
        else:
            # Remove tail of data to make it evenly divisible.
            chunk_indices = chunk_indices[: self.num_samples]

        assert len(chunk_indices) == self.num_samples
        return index_chunks, chunk_indices

    def __iter__(self) -> Iterator:
        index_chunks, chunk_indices = self._sort_shard_and_shuffle_dataset()
        # Extract indices based on current rank.
        indices = [index_chunks[i][self.rank] for i in chunk_indices]
        assert len(indices) == self.num_samples
        return iter(indices)

    def __len__(self) -> int:
        return self.num_samples

    def set_epoch(self, epoch: int) -> None:
        r"""Sets the epoch for this sampler.
        When :attr:`shuffle=True`, this ensures all shards use a different random ordering for each epoch.
        Otherwise, the next iteration of this sampler will yield the same ordering.

        Args:
            epoch (int): Epoch number.
        """
        self.epoch = epoch


class LoadBalancingDistributedBatchSampler(Sampler):
    r"""Wraps another load balance sampler to yield variable sized mini-batches.

    Args:
        sampler (LoadBalancingDistributedSampler): Load balance sampler.
        batch_fn (Callable): Callable to yield mini-batch indices.
        drop_last (bool): If ``True``, the sampler will drop the last few batches exceeding
            the least number of batches among replicas, otherwise, the number of batches
            on each replica will be padded to the same.

    :attr:`batch_fn` will have the signature of::
        def batch_fn(indices: List[int]) -> List[List[int]]

    Raises:
        ValueError: If :attr:`sampler` is not a :class:`LoadBalancingDistributedSampler` or
            was created with ``drop_last=True``.

    Example::
        >>> from onnxruntime.training.utils.data import LoadBalancingDistributedSampler, \
        ...     LoadBalancingDistributedBatchSampler
        >>>
        >>> sampler = LoadBalancingDistributedSampler(dataset, complexity_fn=complexity_fn)
        >>> batch_sampler = LoadBalancingDistributedBatchSampler(sampler, batch_fn=batch_fn)
        >>> loader = torch.utils.data.DataLoader(dataset, batch_sampler=batch_sampler)
        >>>
        >>> for epoch in range(start_epoch, n_epochs):
        ...     batch_sampler.set_epoch(epoch)
        ...     train(loader)
    """

    def __init__(
        self,
        sampler: LoadBalancingDistributedSampler,
        batch_fn,
        drop_last: bool = False,
    ) -> None:
        if not isinstance(sampler, LoadBalancingDistributedSampler):
            raise ValueError("sampler should be of LoadBalancingDistributedSampler type.")

        if sampler.drop_last:
            raise ValueError("drop_last of sampler should be False")

        self.sampler = sampler
        self.batch_fn = batch_fn
        self.drop_last = drop_last

        self.world_size = self.sampler.world_size
        self.rank = self.sampler.rank

        self.generate_batches()

    @staticmethod
    def _fit_to_length(batches, target_length):
        """Return exactly target_length batches: truncate a longer list, cyclically repeat a shorter one.

        An empty list is returned unchanged because there is nothing to repeat.
        """
        batches = list(batches)
        if not batches:
            return batches
        repeats = math.ceil(target_length / len(batches))
        return (batches * repeats)[:target_length]

    def generate_batches(self):
        """Recompute the batches of every rank for the wrapped sampler's current epoch.

        Batches are computed for all ranks (not only this one) so that every process agrees on
        the common number of batches: the minimum across ranks when :attr:`drop_last` is
        ``True``, otherwise the maximum.
        """
        index_chunks, chunk_indices = self.sampler._sort_shard_and_shuffle_dataset()

        batches = []
        for rank in range(self.world_size):
            sub_indices = [index_chunks[i][rank] for i in chunk_indices]
            batches.append(self.batch_fn(sub_indices))

        batch_counts = [len(rank_batches) for rank_batches in batches]
        self.total_batch = min(batch_counts) if self.drop_last else max(batch_counts)

        # Every rank must iterate exactly total_batch batches, otherwise replicas fall out of
        # step: longer ranks drop their surplus batches and shorter ranks are padded.
        self.padded_batches = [self._fit_to_length(rank_batches, self.total_batch) for rank_batches in batches]

    def __iter__(self):
        return iter(self.padded_batches[self.rank])

    def __len__(self):
        return self.total_batch

    def set_epoch(self, epoch: int) -> None:
        r"""
        Sets the epoch for this sampler. When :attr:`shuffle=True`, this ensures all replicas
        use a different random ordering for each epoch. Otherwise, the next iteration of this
        sampler will yield the same ordering.

        Args:
            epoch (int): Epoch number.
        """
        self.sampler.set_epoch(epoch)
        self.generate_batches()