|
--- |
|
language: ru |
|
tags: |
|
- russian |
|
- text-to-text |
|
- PyTorch |
|
- Transformers |
|
license: apache-2.0 |
|
widget: |
|
- text: <LM>Водка "Русская валюта" премиум люкс 38% 0,25л, Россия |
|
pipeline_tag: text2text-generation |
|
--- |
|
|
|
This is a named entity recognizer that extracts goods and brands from Russian-language receipts issued by fiscal data operators. |
|
|
|
It was developed for a special multi-stage competition devoted to receipt structurization. This competition was organized by the [Open Data Science community](https://ods.ai) and [Alpha Bank](https://alfabank.ru), and it consisted of [the first](https://ods.ai/competitions/nlp-receipts), [the second](https://ods.ai/competitions/alfabank-nlp-receipts-2) and [the final](https://ods.ai/competitions/alfabank-nlp-receipts-final) stages. However, this model can be used for parsing and structuring any receipts in Russian. The repository with the code for fine-tuning and inference is available on [gitflic.ru](https://gitflic.ru/project/bond005/ods-ner-2023). |
|
|
|
Usage example: |
|
|
|
``` |
|
from typing import Tuple |
|
import torch |
|
from transformers import T5ForConditionalGeneration, GPT2Tokenizer |
|
|
|
|
|
# Model identifier passed to ``initialize_recognizer`` in the example below.
MODEL_NAME = 'bond005/FRED-T5-large-ods-ner-2023'
# Prefix that ``recognize`` prepends to the input text when it is missing.
START_TAG = '<LM>'
# Suffix that ``recognize`` strips from the end of the decoded prediction.
END_TAG = '</s>'
|
|
|
|
|
def initialize_recognizer(model_path: str) -> Tuple[GPT2Tokenizer, T5ForConditionalGeneration]:
    """Load the model and its tokenizer and place the model on the GPU.

    Args:
        model_path: Value forwarded to ``from_pretrained`` for both the
            model and the tokenizer.

    Returns:
        A ``(tokenizer, model)`` pair. The model has been moved to CUDA
        and switched to evaluation mode.

    Raises:
        ValueError: If CUDA is not available.
    """
    # Fail fast: check for CUDA before loading the weights, so that a machine
    # without a GPU does not pay for the model load only to raise afterwards.
    if not torch.cuda.is_available():
        raise ValueError('CUDA is not available!')
    model = T5ForConditionalGeneration.from_pretrained(model_path)
    model = model.cuda()
    model.eval()
    tokenizer = GPT2Tokenizer.from_pretrained(model_path)
    return tokenizer, model
|
|
|
|
|
def recognize(text: str, tokenizer: GPT2Tokenizer, model: T5ForConditionalGeneration) -> Tuple[str, str]:
    """Extract goods and brands from a single receipt text.

    Args:
        text: Receipt text. ``START_TAG`` is prepended when the text does
            not already start with it.
        tokenizer: Tokenizer used to encode the input and to decode the
            generated output.
        model: Model whose ``generate`` method produces the prediction.
            The encoded input is moved to ``model.device``.

    Returns:
        A ``(goods, brands)`` pair of whitespace-stripped strings.
        ``brands`` is empty when the prediction contains no ``';'``
        separator; anything after a second ``';'`` is ignored.
    """
    prompt = text if text.startswith(START_TAG) else START_TAG + text
    x = tokenizer(prompt, return_tensors='pt', padding=True).to(model.device)
    out = model.generate(**x)
    predictions = tokenizer.decode(out[0], skip_special_tokens=True).strip()
    # Remove any END_TAG occurrences left at the end of the decoded text.
    while predictions.endswith(END_TAG):
        predictions = predictions[:-len(END_TAG)].strip()
    # The prediction has the form "goods;brands". ``partition`` always yields
    # a (possibly empty) head and tail, so no empty-list case needs handling.
    goods, _, remainder = predictions.partition(';')
    brands = remainder.partition(';')[0]
    return goods.strip(), brands.strip()
|
|
|
|
|
# Load the tokenizer and the model once; this raises ValueError without CUDA.
recognizer = initialize_recognizer(MODEL_NAME)
tokenizer, model = recognizer

goods_and_brands = recognize(
    text='Водка "Русская валюта" премиум люкс 38% 0,25л, Россия',
    tokenizer=tokenizer,
    model=model,
)
goods, brands = goods_and_brands

print(f'GOODS: {goods}')
# Expected output: GOODS: водка

print(f'BRANDS: {brands}')
# Expected output: BRANDS: русская валюта
|
``` |