add support for v3-32
run_mlm_flax_stream.py  (+5, −25)
@@ -262,29 +262,6 @@ class FlaxDataCollatorForLanguageModeling:
         return inputs, labels


-@dataclass
-class SamplingArguments:
-    """
-    Arguments pertaining to how to perform sampling of the dataset.
-    """
-
-    perplexity_model: Optional[str] = field(
-        default="./es.arpa.bin", metadata={"help": "Path to KenLM model to use to get perplexity values."}
-    )
-    sampling_method: Optional[str] = field(
-        default=None, metadata={"help": "Sample using a 'step' or 'gaussian' perplexity function per document, or 'random'."}
-    )
-    sampling_factor: Optional[float] = field(
-        default=None, metadata={"help": "Sampling factor. Integers for step function, decimals for gaussian."}
-    )
-    boundaries: Optional[str] = field(
-        default="536394.99320948,662247.50212365,919250.87225178", metadata={"help": "Quartile boundaries"}
-    )
-
-    def __post_init__(self):
-        self.boundaries = [float(q.strip()) for q in self.boundaries.split(",")]
-
-
 def generate_batch_splits(samples_idx: jnp.ndarray, batch_size: int) -> jnp.ndarray:
     num_samples = len(samples_idx)
     samples_to_remove = num_samples % batch_size
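The trailing context above is the start of generate_batch_splits. A minimal sketch of how the rest of that function plausibly continues, following the stock Flax MLM examples (everything past the two lines shown is assumed, not part of this diff):

import jax.numpy as jnp

def generate_batch_splits_sketch(samples_idx: jnp.ndarray, batch_size: int) -> jnp.ndarray:
    # Drop the tail that does not fill a full batch, then reshape the
    # shuffled sample indices into (num_batches, batch_size).
    num_samples = len(samples_idx)
    samples_to_remove = num_samples % batch_size
    if samples_to_remove != 0:
        samples_idx = samples_idx[:-samples_to_remove]
    sections_split = num_samples // batch_size
    return samples_idx.reshape((sections_split, batch_size))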
@@ -310,7 +287,9 @@ def advance_iter_and_group_samples(train_iterator, num_samples, max_seq_length):
         i += len(tokenized_samples["input_ids"])

         # concatenate tokenized samples to list
-        samples = {
+        samples = {
+            k: samples[k] + tokenized_samples[k] for k in ["input_ids", "attention_mask", "special_tokens_mask"]
+        }

     # Concatenated tokens are split to lists of length `max_seq_length`.
     # Note that the remainder of % max_seq_length is thrown away.
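The three `+` lines restore the accumulator dict that the previous commit left dangling. A self-contained sketch of the loop they live in, assuming the usual streaming pattern (accumulate tokenized samples until `num_samples` are seen, then chunk; `group_texts_sketch` and its arguments are illustrative names, not the script's):

def group_texts_sketch(tokenized_batches, num_samples, max_seq_length):
    samples = {k: [] for k in ["input_ids", "attention_mask", "special_tokens_mask"]}
    i = 0
    for tokenized_samples in tokenized_batches:
        i += len(tokenized_samples["input_ids"])
        # concatenate tokenized samples to list, as in the hunk above
        samples = {
            k: samples[k] + tokenized_samples[k]
            for k in ["input_ids", "attention_mask", "special_tokens_mask"]
        }
        if i >= num_samples:
            break

    # Concatenated tokens are split to lists of length `max_seq_length`;
    # the remainder modulo max_seq_length is thrown away.
    def chunk(list_of_seqs):
        flat = [tok for seq in list_of_seqs for tok in seq]
        usable = (len(flat) // max_seq_length) * max_seq_length
        return [flat[j : j + max_seq_length] for j in range(0, usable, max_seq_length)]

    return {k: chunk(v) for k, v in samples.items()}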
@@ -404,7 +383,7 @@ if __name__ == "__main__":
     # or by passing the --help flag to this script.
     # We now keep distinct sets of args, for a cleaner separation of concerns.

-    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments
+    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
     if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
         # If we pass only one argument to the script and it's the path to a json file,
         # let's parse it to get our arguments.
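The fixed line hands the three remaining dataclasses (ModelArguments and DataTrainingArguments are defined earlier in the script; TrainingArguments comes from transformers) to HfArgumentParser. For reference, this block typically continues like so in the transformers examples (the else-branch below is assumed, not shown in the diff):

import os
import sys

from transformers import HfArgumentParser

parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
    # A single JSON file on the command line is parsed into all three dataclasses.
    model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
else:
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()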
@@ -528,6 +507,7 @@ if __name__ == "__main__":

     # Data collator
     # This one will take care of randomly masking the tokens.
+    print("DATA COLLATOR")
     data_collator = FlaxDataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=data_args.mlm_probability)

     # Initialize our training
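The added print is a plain debug marker; the collator built on the next line is what later turns grouped samples into masked model inputs. A minimal usage sketch, assuming the collator's __call__ matches the Flax MLM examples (`grouped_samples` is an illustrative name):

# grouped_samples: a list of per-sequence dicts with "input_ids" /
# "attention_mask" / "special_tokens_mask", as produced by the grouping step.
batch = data_collator(grouped_samples, pad_to_multiple_of=16)
# batch["input_ids"] holds the randomly masked tokens; batch["labels"] keeps
# the original ids at masked positions and -100 elsewhere, so unmasked
# positions are ignored by the loss.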