ctheodoris
committed
update pretrainer to not use distributed sampler (Trainer uses accelerate)
geneformer/pretrainer.py: +5 -171
geneformer/pretrainer.py
CHANGED
@@ -607,7 +607,7 @@ class GeneformerPretrainer(Trainer):
         )
         super().__init__(*args, **kwargs)
 
-    #
+    # updated to not use distributed sampler since Trainer now distributes with accelerate
     def _get_train_sampler(self) -> Optional[torch.utils.data.sampler.Sampler]:
         if not isinstance(self.train_dataset, collections.abc.Sized):
             return None
@@ -630,181 +630,15 @@ class GeneformerPretrainer(Trainer):
                 if self.tokenizer is not None
                 else None
             )
-            if self.args.world_size <= 1:
-                return LengthGroupedSampler(
-                    dataset=self.train_dataset,
-                    batch_size=self.args.train_batch_size,
-                    lengths=lengths,
-                    model_input_name=model_input_name,
-                    generator=generator,
-                )
-            else:
-                return CustomDistributedLengthGroupedSampler(
-                    dataset=self.train_dataset,
-                    batch_size=self.args.train_batch_size,
-                    num_replicas=self.args.world_size,
-                    rank=self.args.process_index,
-                    lengths=lengths,
-                    model_input_name=model_input_name,
-                    seed=self.args.seed,
-                )
-
-        else:
-            if self.args.world_size <= 1:
-                if _is_torch_generator_available:
-                    return RandomSampler(self.train_dataset, generator=generator)
-                return RandomSampler(self.train_dataset)
-            elif (
-                self.args.parallel_mode
-                in [ParallelMode.TPU, ParallelMode.SAGEMAKER_MODEL_PARALLEL]
-                and not self.args.dataloader_drop_last
-            ):
-                # Use a loop for TPUs when drop_last is False to have all batches have the same size.
-                return DistributedSamplerWithLoop(
-                    self.train_dataset,
-                    batch_size=self.args.per_device_train_batch_size,
-                    num_replicas=self.args.world_size,
-                    rank=self.args.process_index,
-                    seed=self.args.seed,
-                )
-            else:
-                return DistributedSampler(
-                    self.train_dataset,
-                    num_replicas=self.args.world_size,
-                    rank=self.args.process_index,
-                    seed=self.args.seed,
-                )
-
-
-class CustomDistributedLengthGroupedSampler(DistributedLengthGroupedSampler):
-    r"""
-    Distributed Sampler that samples indices in a way that groups together features of the dataset of roughly the same
-    length while keeping a bit of randomness.
-    """
-
-    # Copied and adapted from PyTorch DistributedSampler.
-    def __init__(
-        self,
-        dataset: Dataset,
-        batch_size: int,
-        num_replicas: Optional[int] = None,
-        rank: Optional[int] = None,
-        seed: int = 0,
-        drop_last: bool = False,
-        lengths: Optional[List[int]] = None,
-        model_input_name: Optional[str] = None,
-    ):
-        if num_replicas is None:
-            if not dist.is_available():
-                raise RuntimeError("Requires distributed package to be available")
-            num_replicas = dist.get_world_size()
-        if rank is None:
-            if not dist.is_available():
-                raise RuntimeError("Requires distributed package to be available")
-            rank = dist.get_rank()
-        self.dataset = dataset
-        self.batch_size = batch_size
-        self.num_replicas = num_replicas
-        self.rank = rank
-        self.epoch = 0
-        self.drop_last = drop_last
-        # If the dataset length is evenly divisible by # of replicas, then there
-        # is no need to drop any data, since the dataset will be split equally.
-        if self.drop_last and len(self.dataset) % self.num_replicas != 0:
-            # Split to nearest available length that is evenly divisible.
-            # This is to ensure each rank receives the same amount of data when
-            # using this Sampler.
-            self.num_samples = math.ceil(
-                (len(self.dataset) - self.num_replicas) / self.num_replicas
-            )
-        else:
-            self.num_samples = math.ceil(len(self.dataset) / self.num_replicas)
-        self.total_size = self.num_samples * self.num_replicas
-        self.seed = seed
-        self.model_input_name = (
-            model_input_name if model_input_name is not None else "input_ids"
-        )
-
-        if lengths is None:
-            print("Lengths is none - calculating lengths.")
-            if (
-                not (
-                    isinstance(dataset[0], dict)
-                    or isinstance(dataset[0], BatchEncoding)
-                )
-                or self.model_input_name not in dataset[0]
-            ):
-                raise ValueError(
-                    "Can only automatically infer lengths for datasets whose items are dictionaries with an "
-                    f"'{self.model_input_name}' key."
-                )
-            lengths = [len(feature[self.model_input_name]) for feature in dataset]
-        self.lengths = lengths
-
-    def __iter__(self) -> Iterator:
-        # Deterministically shuffle based on epoch and seed
-        g = torch.Generator()
-        g.manual_seed(self.seed + self.epoch)
-
-        indices = get_length_grouped_indices(self.lengths, self.batch_size, generator=g)
-
-        if not self.drop_last:
-            # add extra samples to make it evenly divisible
-            indices += indices[: (self.total_size - len(indices))]
-        else:
-            # remove tail of data to make it evenly divisible
-            indices = indices[: self.total_size]
-        assert len(indices) == self.total_size
-
-        # subsample
-        indices = indices[self.rank : self.total_size : self.num_replicas]
-        assert len(indices) == self.num_samples
-
-        return iter(indices)
-
-
-def get_length_grouped_indices(
-    lengths, batch_size, mega_batch_mult=None, generator=None
-):
-    """
-    Return a list of indices so that each slice of :obj:`batch_size` consecutive indices correspond to elements of
-    similar lengths. To do this, the indices are:
-
-    - randomly permuted
-    - grouped in mega-batches of size :obj:`mega_batch_mult * batch_size`
-    - sorted by length in each mega-batch
-
-    The result is the concatenation of all mega-batches, with the batch of :obj:`batch_size` containing the element of
-    maximum length placed first, so that an OOM happens sooner rather than later.
-    """
-    # Default for mega_batch_mult: 50 or the number to get 4 megabatches, whichever is smaller.
-    if mega_batch_mult is None:
-        # mega_batch_mult = min(len(lengths) // (batch_size * 4), 50)
-        mega_batch_mult = min(len(lengths) // (batch_size * 4), 1000)
-        # Just in case, for tiny datasets
-        if mega_batch_mult == 0:
-            mega_batch_mult = 1
-
-    # We need to use torch for the random part as a distributed sampler will set the random seed for torch.
-    indices = torch.randperm(len(lengths), generator=generator)
-    megabatch_size = mega_batch_mult * batch_size
-    megabatches = [
-        indices[i : i + megabatch_size].tolist()
-        for i in range(0, len(lengths), megabatch_size)
-    ]
-    megabatches = [
-        list(sorted(megabatch, key=lambda i: lengths[i], reverse=True))
-        for megabatch in megabatches
-    ]
-
-    # The rest is to get the biggest batch first.
-    # Since each megabatch is sorted by descending length, the longest element is the first
-    megabatch_maximums = [lengths[megabatch[0]] for megabatch in megabatches]
-    max_idx = torch.argmax(torch.tensor(megabatch_maximums)).item()
-    # Switch to put the longest element in first position
-    megabatches[0][0], megabatches[max_idx][0] = (
-        megabatches[max_idx][0],
-        megabatches[0][0],
-    )
-
-    return [item for sublist in megabatches for item in sublist]
+            return LengthGroupedSampler(
+                dataset=self.train_dataset,
+                batch_size=self.args.train_batch_size,
+                lengths=lengths,
+                model_input_name=model_input_name,
+                generator=generator,
+            )
+
+        else:
+            if _is_torch_generator_available:
+                return RandomSampler(self.train_dataset, generator=generator)
+            return RandomSampler(self.train_dataset)
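
The rationale for dropping the distributed samplers is that the Trainer now builds its dataloaders through accelerate, which shards batches across processes on its own, so `_get_train_sampler` only has to define a per-process ordering. The snippet below is a minimal standalone sketch of that behavior, not repository code; the toy dataset, loader, and launch command are illustrative assumptions.

# Illustrative sketch (not part of pretrainer.py): a plain sampler plus
# Accelerator.prepare() replaces an explicit DistributedSampler.
import torch
from accelerate import Accelerator
from torch.utils.data import DataLoader, RandomSampler, TensorDataset

accelerator = Accelerator()

dataset = TensorDataset(torch.arange(32))
# Plain per-process sampler; no rank or world_size handling here.
loader = DataLoader(dataset, batch_size=4, sampler=RandomSampler(dataset))

# prepare() wraps the DataLoader so each process receives its own shard of batches.
loader = accelerator.prepare(loader)

for batch in loader:
    # Under e.g. `accelerate launch --num_processes 2`, each rank sees different indices.
    print(accelerator.process_index, batch[0].tolist())

Because the sharding happens at the DataLoader level, the sampler returned by the simplified `_get_train_sampler` (length-grouped or plain random) works unchanged for both single-process and multi-process training.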
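The removed `get_length_grouped_indices` helper and `CustomDistributedLengthGroupedSampler` duplicated the megabatch-sort logic that the stock `LengthGroupedSampler` (now returned directly) already provides. A compressed sketch of that idea follows; the function name and toy `lengths` list are illustrative, and the "longest element first" swap of the original helper is omitted for brevity.

# Illustrative sketch of length grouping: shuffle, split into megabatches,
# sort each megabatch by length so consecutive batches hold similar lengths.
import torch


def length_grouped_indices(lengths, batch_size, mega_batch_mult=50, generator=None):
    indices = torch.randperm(len(lengths), generator=generator)
    megabatch_size = mega_batch_mult * batch_size
    megabatches = [
        sorted(indices[i : i + megabatch_size].tolist(), key=lambda j: lengths[j], reverse=True)
        for i in range(0, len(lengths), megabatch_size)
    ]
    return [j for megabatch in megabatches for j in megabatch]


lengths = [3, 11, 2, 9, 8, 4, 10, 5]  # toy sequence lengths
order = length_grouped_indices(lengths, batch_size=2, mega_batch_mult=2)
print([lengths[i] for i in order])  # similar lengths land in the same batch of 2, reducing padding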