In [1]:
from datasets import load_dataset

# Load the "google/boolq" dataset via `datasets.load_dataset`; later cells read
# its "train" and "validation" splits.
dataset = load_dataset("google/boolq")

 from .autonotebook import tqdm as notebook_tqdm


In [None]:
from transformers import AutoTokenizer
# Tokenizer for the same "bert-base-uncased" checkpoint that the classification
# model is loaded from further down.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [None]:
# Show the loaded dataset object as this cell's output.
dataset

In [None]:
# Peek at the first record of the "train" split.
dataset["train"][0]

In [None]:
def encode_question_context_pairs(example):
    """Tokenize one example as a (question, passage) pair and attach its label.

    Args:
        example: Mapping with a "question" string, a "passage" string and a
            truthy/falsy "answer".

    Returns:
        The tokenizer output for the pair, with an extra "labels" entry holding
        a one-element list: ``[1.0]`` when "answer" is truthy, ``[0.0]`` otherwise.
    """
    # Pass the two texts separately rather than gluing them into one string
    # with a literal " [SEP] ": the tokenizer then inserts the separator itself
    # and can tell the question segment apart from the passage segment.
    inputs = tokenizer(example["question"], example["passage"], truncation=True)
    # One float target per example, wrapped in a list so each example carries a
    # length-1 label vector (the model below is built with num_labels=1).
    inputs["labels"] = [float(bool(example["answer"]))]
    return inputs

In [None]:
# Tokenize every training example and drop the raw columns, so that only the
# fields returned by encode_question_context_pairs remain.
train_dataset = dataset["train"].map(
    encode_question_context_pairs,
    remove_columns=dataset["train"].column_names,
)

In [None]:
# Tokenize every validation example. The columns to drop are taken from the
# validation split itself (the split being mapped), not from the train split.
validation_dataset = dataset["validation"].map(
    encode_question_context_pairs,
    remove_columns=dataset["validation"].column_names,
)

In [None]:
# train_dataset['labels']

In [None]:
# tokenizer("question","answer",max_length=512,padding="max_length",truncation="only_second",)

In [None]:
from transformers import DataCollatorWithPadding

# Collator built from the tokenizer and handed to the Trainer as `data_collator`.
# The tokenization step above requests no padding, so batching presumably relies
# on this collator to pad — see the transformers docs to confirm.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
import evaluate

# Load the "accuracy" metric; compute_metrics below calls its `.compute(...)`.
accuracy = evaluate.load("accuracy")

In [None]:
import numpy as np


def compute_metrics(eval_pred):
    """Turn the model's single-output predictions into an accuracy score.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. Both are treated as
            array-likes holding one value per example; a trailing length-1
            axis (as produced by the ``[float]`` labels built during encoding)
            is flattened away.

    Returns:
        The result of ``accuracy.compute`` on the thresholded predictions and
        the integer-cast labels.
    """
    predictions, labels = eval_pred
    # Flatten to 1-D so the metric receives one scalar class id per example
    # instead of (n, 1)-shaped arrays.
    scores = np.asarray(predictions).reshape(-1)
    # Same decision rule as before: below 0.5 -> class 0, otherwise class 1.
    predicted_classes = np.where(scores < 0.5, 0, 1)
    # Labels are stored as floats (0.0 / 1.0); cast them to integer class ids.
    reference_classes = np.asarray(labels).reshape(-1).astype(int)
    return accuracy.compute(predictions=predicted_classes, references=reference_classes)

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Sequence-classification model from the same "bert-base-uncased" checkpoint as
# the tokenizer, configured with a single output (num_labels=1); compute_metrics
# thresholds that output at 0.5.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=1)

In [None]:
# Hyperparameters and bookkeeping for the fine-tuning run.
training_args = TrainingArguments(
    # Checkpoints and saved artefacts go to the current working directory.
    output_dir="./",
    # Optimisation.
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
    # Gradients are accumulated over 4 batches of 16 before each optimizer step.
    gradient_accumulation_steps=4,
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-08,
    seed=42,
    # Evaluate and checkpoint once per epoch.
    # NOTE(review): newer transformers releases spell this `eval_strategy` —
    # confirm against the installed version before renaming.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Reload the best checkpoint when training ends, and make "best" mean the
    # highest accuracy reported by compute_metrics rather than relying on the
    # library's default selection metric.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    # Logging and publishing.
    logging_steps=50,
    report_to="tensorboard",
    push_to_hub=True,
)

# Bundle the model, run configuration, datasets and metric callback into one
# Trainer; training itself is started from a separate cell.
trainer = Trainer(
    model=model,
    args=training_args,
    # Data: tokenized splits plus the pieces needed to batch them.
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    # Evaluation: accuracy on the validation split.
    compute_metrics=compute_metrics,
)

# trainer.train()  -- left disabled here; training is started from its own cell further down.

In [None]:
# NOTE(review): in top-to-bottom order this cell runs before the `trainer.train()`
# cell, so the model written here is the one as loaded, before fine-tuning —
# confirm this is intended, or move the save after training.
model.save_pretrained(training_args.output_dir)
# Write the tokenizer to the same output directory as the model.
tokenizer.save_pretrained(training_args.output_dir)

In [None]:
# Start fine-tuning with the Trainer configured above.
trainer.train()

In [None]:
# Keyword arguments forwarded to trainer.push_to_hub(**kwargs) in the next cell
# (presumably model-card metadata — verify against the transformers docs).
kwargs = dict(
    dataset_tags="google/boolq",
    dataset="boolq",  # a 'pretty' name for the training dataset
    language="en",
    model_name="Bert Base Uncased Boolean Question Answer model",  # a 'pretty' name for your model
    finetuned_from="bert-base-uncased",
    tasks="text-classification",
)

In [None]:
# Publish through the Trainer, passing the metadata dictionary defined above as
# keyword arguments.
trainer.push_to_hub(**kwargs)