|
--- |
|
language: |
|
- pt |
|
- en |
|
license: mit |
|
base_model: |
|
- google/bert_uncased_L-4_H-256_A-4 |
|
--- |
|
|
|
```python |
|
from transformers import AutoTokenizer, AutoModelForSequenceClassification |
|
import torch |
|
|
|
# Checkpoint identifier handed to both ``from_pretrained`` calls below.
model_id = "cnmoro/BertMini-Reranker-EnPt"

# Run on CUDA when PyTorch reports it as available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Sequence classifier with a two-label head, moved to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=2)
model.to(device)

# Tokenizer loaded from the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Text layout that pairs a query with one candidate document.
template = "Query: {query}\nSentence: {document}"
|
|
|
def rank(query, documents, normalize_scores=True):
    """Rank ``documents`` by their score for ``query``, best first.

    Every document is combined with the query through the module-level
    ``template``; the batch is tokenized (truncation at 512 tokens, padding
    enabled) and passed through ``model``. A document's score is the softmax
    probability of its predicted class, replaced by ``1 - probability`` when
    the predicted class is 0.

    Args:
        query: Query text inserted into the template.
        documents: Iterable of candidate document texts. It is materialized
            once, so one-shot iterables such as generators are accepted.
        normalize_scores: If true, divide each score by the sum of all
            scores so the returned scores add up to 1.

    Returns:
        A list of ``(document, score)`` tuples ordered from the highest to
        the lowest score. Empty ``documents`` yields an empty list.
    """
    # The documents are walked twice (prompt building, then pairing with the
    # scores); materializing keeps a generator from being exhausted early.
    documents = list(documents)
    if not documents:
        # Nothing to score: return before tokenizing or running the model.
        return []

    texts = [template.format(query=query, document=document) for document in documents]

    inputs = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=512,
        truncation=True,
        padding=True,
        return_tensors="pt",
    )
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    model.eval()
    with torch.no_grad():
        logits = model(input_ids, attention_mask=attention_mask).logits
        probabilities = torch.softmax(logits, dim=1)

    # Per-row winning class and the probability assigned to it.
    predicted_classes = torch.argmax(probabilities, dim=1).tolist()
    confidences = probabilities.max(dim=1).values.tolist()

    # A class-0 prediction reports the probability of class 0, so it is
    # flipped to its complement; any other prediction keeps its probability.
    scored = [
        (document, 1 - confidence if predicted == 0 else confidence)
        for document, predicted, confidence in zip(documents, predicted_classes, confidences)
    ]

    # Highest score first; the sort is stable, so ties keep input order.
    scored.sort(key=lambda pair: pair[1], reverse=True)

    if normalize_scores:
        total_score = sum(score for _, score in scored)
        # A zero total cannot be normalized; leave the raw scores untouched
        # instead of raising ZeroDivisionError.
        if total_score:
            scored = [(document, score / total_score) for document, score in scored]

    return scored
|
|
|
# Sample 1: a Portuguese query ranked against six Portuguese passages.
query = "O que é o Pantanal?"
documents = [
    "É um dos ecossistemas mais ricos em biodiversidade do mundo, abrigando uma grande variedade de espécies animais e vegetais.",
    "Sua beleza natural, com rios e lagos interligados, atrai turistas de todo o mundo.",
    "O Pantanal sofre com impactos ambientais, como a exploração mineral e o desmatamento.",
    "O Pantanal é uma extensa planície alagável localizada na América do Sul, principalmente no Brasil, mas também em partes da Bolívia e Paraguai.",
    "É um local com importância histórica e cultural para as populações locais.",
    "O Pantanal é um importante habitat para diversas espécies de animais, inclusive aves migratórias.",
]

# Bare expression: in a notebook/REPL the ranked list below is echoed.
rank(query=query, documents=documents)
# Output recorded by the author (document, normalized score), best first:
# [('O Pantanal é uma extensa planície alagável localizada na América do Sul, principalmente no Brasil, mas também em partes da Bolívia e Paraguai.',
#   0.36703487634136817),
#  ('O Pantanal é um importante habitat para diversas espécies de animais, inclusive aves migratórias.',
#   0.36591911362645174),
#  ('O Pantanal sofre com impactos ambientais, como a exploração mineral e o desmatamento.',
#   0.13708830048931145),
#  ('É um local com importância histórica e cultural para as populações locais.',
#   0.0718928987255767),
#  ('Sua beleza natural, com rios e lagos interligados, atrai turistas de todo o mundo.',
#   0.02968024567026795),
#  ('É um dos ecossistemas mais ricos em biodiversidade do mundo, abrigando uma grande variedade de espécies animais e vegetais.',
#   0.02838456514702401)]
|
|
|
# Sample 2: an English query ranked against five English passages.
query = "What is the speed of light?"
documents = [
    "Isaac Newton's laws of motion and gravity laid the groundwork for classical mechanics.",
    "The theory of relativity, proposed by Albert Einstein, has revolutionized our understanding of space, time, and gravity.",
    "The Earth orbits the Sun at an average distance of about 93 million miles, taking roughly 365.25 days to complete one revolution.",
    "The speed of light in a vacuum is approximately 299,792 kilometers per second (km/s), or about 186,282 miles per second.",
    "Light can be described as both a wave and a particle, a concept known as wave-particle duality.",
]

# Bare expression: in a notebook/REPL the ranked list below is echoed.
rank(query=query, documents=documents)
# Output recorded by the author (document, normalized score), best first:
# [('The speed of light in a vacuum is approximately 299,792 kilometers per second (km/s), or about 186,282 miles per second.',
#   0.33902196713184685),
#  ("Isaac Newton's laws of motion and gravity laid the groundwork for classical mechanics.",
#   0.2309855191720416),
#  ('The Earth orbits the Sun at an average distance of about 93 million miles, taking roughly 365.25 days to complete one revolution.',
#   0.20293087063400417),
#  ('Light can be described as both a wave and a particle, a concept known as wave-particle duality.',
#   0.188980879354878),
#  ('The theory of relativity, proposed by Albert Einstein, has revolutionized our understanding of space, time, and gravity.',
#   0.03808076370722937)]
|
``` |