In [1]:
from datasets import load_dataset

dataset = load_dataset("google/boolq")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import AutoTokenizer
tokenizer=AutoTokenizer.from_pretrained("bert-base-uncased")

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'passage'],
        num_rows: 9427
    })
    validation: Dataset({
        features: ['question', 'answer', 'passage'],
        num_rows: 3270
    })
})

In [4]:
dataset["train"][0]

{'question': 'do iran and afghanistan speak the same language',
 'answer': True,
 'passage': 'Persian (/ˈpɜːrʒən, -ʃən/), also known by its endonym Farsi (فارسی fārsi (fɒːɾˈsiː) ( listen)), is one of the Western Iranian languages within the Indo-Iranian branch of the Indo-European language family. It is primarily spoken in Iran, Afghanistan (officially known as Dari since 1958), and Tajikistan (officially known as Tajiki since the Soviet era), and some other regions which historically were Persianate societies and considered part of Greater Iran. It is written in the Persian alphabet, a modified variant of the Arabic script, which itself evolved from the Aramaic alphabet.'}

In [5]:
def encode_question_context_pairs(example):
    text=f'{example["question"]} [SEP] {example["passage"]}'
    label= 0 if not example["answer"] else 1
    inputs=tokenizer(text,truncation=True)
    inputs["labels"]=[float(label)]
    return inputs

In [6]:
train_dataset=dataset["train"].map(encode_question_context_pairs,remove_columns=dataset["train"].column_names)

In [7]:
validation_dataset=dataset["validation"].map(encode_question_context_pairs,remove_columns=dataset["train"].column_names)

In [8]:
# train_dataset['labels']

In [9]:
# tokenizer("question","answer",max_length=512,padding="max_length",truncation="only_second",)

In [10]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [11]:
import evaluate

accuracy = evaluate.load("accuracy")

In [12]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.where(predictions<0.5,0,1)
    return accuracy.compute(predictions=predictions, references=labels)

In [13]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=1,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
training_args = TrainingArguments(
    output_dir="./",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    gradient_accumulation_steps=4,
    logging_steps=50,
    seed=42,
    adam_beta1= 0.9,
    adam_beta2= 0.999,
    adam_epsilon= 1e-08,
    report_to="tensorboard",
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# trainer.train()

In [15]:
model.save_pretrained(training_args.output_dir)
tokenizer.save_pretrained(training_args.output_dir)

('./tokenizer_config.json',
 './special_tokens_map.json',
 './vocab.txt',
 './added_tokens.json',
 './tokenizer.json')

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
0,0.2317,0.219812,0.656881
2,0.1741,0.196769,0.712232


In [None]:
kwargs = {
    "dataset_tags": "google/boolq",
    "dataset": "boolq",  # a 'pretty' name for the training dataset
    "language": "en",
    "model_name": "Bert Base Uncased Boolean Question Answer model",  # a 'pretty' name for your model
    "finetuned_from": "bert-base-uncased",
    "tasks": "text-classification",
}

In [None]:
trainer.push_to_hub(**kwargs)