# -*- coding: utf-8 -*-
"""turkish-sentence-embedding.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1jvsd0ZRXCjsd5-lH6EI7GaEYIjHN-6d8
"""
# import sys
# import torch
#
# if not torch.cuda.is_available():
#     print("CUDA NOT FOUND!")
#     sys.exit(1)  # non-zero exit: missing CUDA is a failure
from datasets import load_dataset
# ds_multinli = load_dataset("nli_tr", "multinli_tr")
# ds_snli = load_dataset("nli_tr", "snli_tr")
ds_stsb = load_dataset("emrecan/stsb-mt-turkish")
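# stsb-mt-turkish is the STS benchmark machine-translated to Turkish, with
# train/validation/test splits and a 0-5 similarity score per sentence pair
# (normalized to 0-1 below).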
# """# ALLNLI Training"""
# import math
# from sentence_transformers import models, losses, datasets
from sentence_transformers import LoggingHandler, SentenceTransformer, util, InputExample
# from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import logging
# from datetime import datetime
# import sys
# import os
# import gzip
# import csv
# import random
#### Just some code to print debug information to stdout
logging.basicConfig(
    format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
# model_name = "ytu-ce-cosmos/turkish-small-bert-uncased"
train_batch_size = 64  # Larger batches usually improve results (more in-batch negatives) but need more GPU memory
max_seq_length = 75
num_epochs = 5
# Save path of the NLI-stage model (loaded again for the STSb stage below)
model_save_path = "e5_b64_turkish_small_bert_uncased-mean-nli"
# # Here we define our SentenceTransformer model
# word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
# pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), pooling_mode="mean")
# model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
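# # Note: mean pooling averages all token embeddings into a single fixed-size
# # vector, which is what turns the transformer output into a sentence embedding.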
# def add_to_samples(sent1, sent2, label):
#     if sent1 not in train_data:
#         train_data[sent1] = {"contradiction": set(), "entailment": set(), "neutral": set()}
#     train_data[sent1][label].add(sent2)
# """
# 0: neutral
# 1: entailment
# 2: contradiction
# """
# id_to_label = {0: "entailment", 1: "neutral", 2: "contradiction"}
# train_data = {}
# nan_count = 0
# ds_allnli_train = [ds_multinli["train"], ds_snli["train"]]
# for ds in ds_allnli_train:
#     for row in ds:
#         sent1 = row["premise"].strip()
#         sent2 = row["hypothesis"].strip()
#         label = id_to_label.get(row["label"])
#         if label:
#             add_to_samples(sent1, sent2, label)
#             add_to_samples(sent2, sent1, label)  # Also add the reverse direction
#         else:
#             nan_count += 1
# print("Total rows without a valid label:", nan_count)
# train_samples = []
# for sent1, others in train_data.items():
#     if len(others["entailment"]) > 0 and len(others["contradiction"]) > 0:
#         train_samples.append(
#             InputExample(
#                 texts=[sent1, random.choice(list(others["entailment"])), random.choice(list(others["contradiction"]))]
#             )
#         )
#         train_samples.append(
#             InputExample(
#                 texts=[random.choice(list(others["entailment"])), sent1, random.choice(list(others["contradiction"]))]
#             )
#         )
# logging.info("Train samples: {}".format(len(train_samples)))
# train_dataloader = datasets.NoDuplicatesDataLoader(train_samples, batch_size=train_batch_size)
# # Our training loss
# train_loss = losses.MultipleNegativesRankingLoss(model)
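# # With MultipleNegativesRankingLoss, each (anchor, entailment, contradiction)
# # triple also treats every other in-batch entailment as a negative, which is
# # why a larger train_batch_size usually improves results.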
# logging.info("Read STSbenchmark dev dataset")
# dev_samples = []
# for row in ds_stsb["validation"]:
#     score = float(row["score"]) / 5.0  # Normalize score to range 0 ... 1
#     dev_samples.append(InputExample(texts=[row["sentence1"], row["sentence2"]], label=score))
# dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
#     dev_samples, batch_size=train_batch_size, name="sts-dev"
# )
# test_samples = []
# for row in ds_stsb["test"]:
#     score = float(row["score"]) / 5.0  # Normalize score to range 0 ... 1
#     test_samples.append(InputExample(texts=[row["sentence1"], row["sentence2"]], label=score))
# test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
#     test_samples, batch_size=train_batch_size, name="sts-test"
# )
# # Configure the training
# warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up
# logging.info("Warmup-steps: {}".format(warmup_steps))
# print(test_evaluator(model))
# model.fit(
#     train_objectives=[(train_dataloader, train_loss)],
#     evaluator=dev_evaluator,
#     epochs=num_epochs,
#     evaluation_steps=int(len(train_dataloader) * 0.1),
#     warmup_steps=warmup_steps,
#     output_path=model_save_path,
#     use_amp=False,  # Set to True if your GPU supports FP16 operations
# )
# ft_model = SentenceTransformer(model_save_path)
# print(test_evaluator(ft_model, output_path=model_save_path))
from torch.utils.data import DataLoader
import math
from sentence_transformers import losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

"""# STSb Fine-tuning"""

# Load the NLI-trained sentence transformer model saved above
model = SentenceTransformer(model_save_path, device="cuda")
# Save the STSb-fine-tuned model under a new path so the NLI-stage model is kept
model_save_path = "e5_b64_turkish_small_bert_uncased-mean-nli-stsb"
# model_save_path = (
# "output/training_stsbenchmark_continue_training-" + model_name.replace("/", "-") + "-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
# )
# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark train dataset")
def generate_samples(split):
    """Convert one STSb split into InputExamples with scores normalized to [0, 1]."""
    samples = []
    for row in ds_stsb[split]:
        score = float(row["score"]) / 5.0  # Normalize score to range 0 ... 1
        samples.append(InputExample(texts=[row["sentence1"], row["sentence2"]], label=score))
    return samples
train_samples = generate_samples("train")
dev_samples = generate_samples("validation")
test_samples = generate_samples("test")
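# Optional sanity check: log split sizes so an empty or truncated split is caught early
logging.info(
    "Samples - train: {}, dev: {}, test: {}".format(len(train_samples), len(dev_samples), len(test_samples))
)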
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)
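# CosineSimilarityLoss fits cos_sim(embedding1, embedding2) to the normalized
# gold score of each pair with a mean-squared-error objective.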
# Development set: Measure correlation between cosine score and gold labels
logging.info("Read STSbenchmark dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name="sts-dev")
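# Rough sketch of what the evaluator reports (it computes this Spearman
# correlation, among other metrics, internally; requires scipy):
# from scipy.stats import spearmanr
# emb1 = model.encode([s.texts[0] for s in dev_samples])
# emb2 = model.encode([s.texts[1] for s in dev_samples])
# cosine_scores = util.cos_sim(emb1, emb2).diagonal()
# print(spearmanr(cosine_scores, [s.label for s in dev_samples]).correlation)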
# Configure the training
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name="sts-test")
print(test_evaluator(model))  # Baseline test score before STSb fine-tuning
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    epochs=num_epochs,
    evaluation_steps=int(len(train_dataloader) * 0.5),
    warmup_steps=warmup_steps,
    output_path=model_save_path,
)
# Reload the best checkpoint and evaluate on the test split
ft_model = SentenceTransformer(model_save_path)
print(test_evaluator(ft_model, output_path=model_save_path))
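# Quick smoke test of the fine-tuned model (illustrative sentences, not from
# the training data):
sentences = ["Bir adam gitar çalıyor.", "Bir adam müzik yapıyor."]
embeddings = ft_model.encode(sentences, convert_to_tensor=True)
print("Cosine similarity:", util.cos_sim(embeddings[0], embeddings[1]).item())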