# -*- coding: utf-8 -*-
"""turkish-sentence-embedding.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1jvsd0ZRXCjsd5-lH6EI7GaEYIjHN-6d8
"""
# import sys
# import torch
# if not torch.cuda.is_available():
#     print("CUDA NOT FOUND!")
#     sys.exit(1)  # non-zero exit status, so the failure is visible to callers
from datasets import load_dataset

# ds_multinli = load_dataset("nli_tr", "multinli_tr")
# ds_snli = load_dataset("nli_tr", "snli_tr")
ds_stsb = load_dataset("emrecan/stsb-mt-turkish")
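
# Optional sanity check (a minimal sketch): confirm the split sizes and the
# column names the training code below relies on (sentence1, sentence2,
# and a score on a 0..5 scale).
for split in ("train", "validation", "test"):
    print(split, len(ds_stsb[split]))
print(ds_stsb["train"][0])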
# """# ALLNLI Training""" | |
# import math | |
# from sentence_transformers import models, losses, datasets | |
from sentence_transformers import LoggingHandler, SentenceTransformer, util, InputExample | |
# from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator | |
import logging | |
# from datetime import datetime | |
# import sys | |
# import os | |
# import gzip | |
# import csv | |
# import random | |
# #### Just some code to print debug information to stdout | |
logging.basicConfig( | |
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()] | |
) | |
# #### /print debug information to stdout | |
# model_name = "ytu-ce-cosmos/turkish-small-bert-uncased"
train_batch_size = 64  # Larger batch sizes usually improve results, but require more GPU memory
max_seq_length = 75
num_epochs = 5

# Save path of the model
model_save_path = "e5_b64_turkish_small_bert_uncased-mean-nli"

# # Here we define our SentenceTransformer model: a Transformer encoder followed by mean pooling
# word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
# pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), pooling_mode="mean")
# model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
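# (Mean pooling averages all token embeddings into a single fixed-size
# sentence vector; this Transformer + Pooling stack is the standard
# Sentence-BERT construction.)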
# def add_to_samples(sent1, sent2, label):
#     if sent1 not in train_data:
#         train_data[sent1] = {"contradiction": set(), "entailment": set(), "neutral": set()}
#     train_data[sent1][label].add(sent2)
#
# # nli_tr label ids: 0: entailment, 1: neutral, 2: contradiction
# # (-1 marks rows without a gold label; those are skipped below)
# id_to_label = {0: "entailment", 1: "neutral", 2: "contradiction"}
# train_data = {}
# no_gold_label_count = 0
# ds_allnli_train = [ds_multinli["train"], ds_snli["train"]]
# for ds in ds_allnli_train:
#     for row in ds:
#         sent1 = row["premise"].strip()
#         sent2 = row["hypothesis"].strip()
#         label = id_to_label.get(row["label"])
#         if label:
#             add_to_samples(sent1, sent2, label)
#             add_to_samples(sent2, sent1, label)  # Also add the reverse direction
#         else:
#             no_gold_label_count += 1
# print("Rows skipped (no gold label):", no_gold_label_count)
#
# train_samples = []
# for sent1, others in train_data.items():
#     if len(others["entailment"]) > 0 and len(others["contradiction"]) > 0:
#         train_samples.append(
#             InputExample(
#                 texts=[sent1, random.choice(list(others["entailment"])), random.choice(list(others["contradiction"]))]
#             )
#         )
#         train_samples.append(
#             InputExample(
#                 texts=[random.choice(list(others["entailment"])), sent1, random.choice(list(others["contradiction"]))]
#             )
#         )
# logging.info("Train samples: {}".format(len(train_samples)))
#
# train_dataloader = datasets.NoDuplicatesDataLoader(train_samples, batch_size=train_batch_size)
# # Our training loss
# train_loss = losses.MultipleNegativesRankingLoss(model)
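# (MultipleNegativesRankingLoss treats each (anchor, entailment) pair as a
# positive and uses the other in-batch sentences, plus the explicit
# contradiction, as negatives; this is one reason larger batches tend to help.)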
# logging.info("Read STSbenchmark dev dataset")
# dev_samples = []
# for row in ds_stsb["validation"]:
#     score = float(row["score"]) / 5.0  # Normalize score to range 0 ... 1
#     dev_samples.append(InputExample(texts=[row["sentence1"], row["sentence2"]], label=score))
# dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
#     dev_samples, batch_size=train_batch_size, name="sts-dev"
# )
#
# test_samples = []
# for row in ds_stsb["test"]:
#     score = float(row["score"]) / 5.0  # Normalize score to range 0 ... 1
#     test_samples.append(InputExample(texts=[row["sentence1"], row["sentence2"]], label=score))
# test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
#     test_samples, batch_size=train_batch_size, name="sts-test"
# )
#
# # Configure the training
# warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  # 10% of training steps for warm-up
# logging.info("Warmup-steps: {}".format(warmup_steps))
# print(test_evaluator(model))  # Baseline test score before fine-tuning
#
# model.fit(
#     train_objectives=[(train_dataloader, train_loss)],
#     evaluator=dev_evaluator,
#     epochs=num_epochs,
#     evaluation_steps=int(len(train_dataloader) * 0.1),
#     warmup_steps=warmup_steps,
#     output_path=model_save_path,
#     use_amp=False,  # Set to True if your GPU supports FP16 operations
# )
#
# ft_model = SentenceTransformer(model_save_path)
# print(test_evaluator(ft_model, output_path=model_save_path))
"""# STSb Fine-tuning"""
from torch.utils.data import DataLoader
import math
from sentence_transformers import losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# Load the NLI-trained model from the first stage and continue training on STSb
model = SentenceTransformer(model_save_path, device="cuda")
model_save_path = "e5_b64_turkish_small_bert_uncased-mean-nli-stsb"
# from datetime import datetime
# model_save_path = (
#     "output/training_stsbenchmark_continue_training-" + model_name.replace("/", "-") + "-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
# )

# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark train dataset")
def generate_samples(split):
    samples = []
    for row in ds_stsb[split]:
        score = float(row["score"]) / 5.0  # Normalize score to range 0 ... 1
        samples.append(InputExample(texts=[row["sentence1"], row["sentence2"]], label=score))
    return samples


train_samples = generate_samples("train")
dev_samples = generate_samples("validation")
test_samples = generate_samples("test")

train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)
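# (CosineSimilarityLoss regresses the cosine similarity of the two sentence
# embeddings toward the normalized gold score, so the evaluator's correlation
# is measured on exactly the quantity being optimized.)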

# Development set: Measure correlation between cosine score and gold labels
logging.info("Read STSbenchmark dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name="sts-dev")

# Configure the training; evaluate on the dev set twice per epoch
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  # 10% of training steps for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name="sts-test")
print(test_evaluator(model))  # Test score before STSb fine-tuning

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    epochs=num_epochs,
    evaluation_steps=int(len(train_dataloader) * 0.5),  # i.e. twice per epoch
    warmup_steps=warmup_steps,
    output_path=model_save_path,
)

# Reload the best checkpoint and evaluate on the test set
ft_model = SentenceTransformer(model_save_path)
print(test_evaluator(ft_model, output_path=model_save_path))
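
# A minimal usage sketch (the two example sentences are illustrative, not
# taken from the dataset): encode sentences with the fine-tuned model and
# compare them with cosine similarity.
sentences = [
    "Bu örnek bir cümledir.",
    "Bu da benzer bir örnek cümle.",
]
embeddings = ft_model.encode(sentences, convert_to_tensor=True)
print(util.cos_sim(embeddings[0], embeddings[1]))  # similarity in [-1, 1]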