# hu_core_news_trf / lemma_postprocessing.py
# oroszgy's picture
# Update spacy pipeline to 3.5.0
# 2db3482
"""
This module contains various rule-based components aiming to improve on baseline lemmatization tools.
"""
import re
from typing import List, Callable
from spacy.lang.hu import Hungarian
from spacy.pipeline import Pipe
from spacy.tokens import Token
from spacy.tokens.doc import Doc
@Hungarian.component(
    "lemma_case_smoother",
    assigns=["token.lemma"],
    requires=["token.lemma", "token.pos"],
)
def lemma_case_smoother(doc: Doc) -> Doc:
    """Smooth lemma casing by POS.

    Lowercases the lemma of every sentence-initial token that is not tagged as
    a proper noun. All other tokens are left untouched.

    DEPRECATED: This is not needed anymore, as the lemmatizer is now case-insensitive.

    Args:
        doc (Doc): Input document; its tokens' lemmas are modified in place.

    Returns:
        Doc: The same document object that was passed in.
    """
    for token in doc:
        # "PROPN" is a coarse-grained (UPOS) label, so it is checked against
        # `pos_` -- the attribute this component declares in `requires` --
        # rather than the fine-grained `tag_`.
        if token.is_sent_start and token.pos_ != "PROPN":
            token.lemma_ = token.lemma_.lower()
    return doc
class LemmaSmoother(Pipe):
    """Smooths lemma by fixing common errors of the edit-tree lemmatizer.

    Calling the component runs a fixed sequence of rules on every token of the
    document, in this order: cut the lemma at an exclamation mark, cut it at a
    question mark, normalise date-like noun lemmas, and reset the lemma of
    numerals to the numeric prefix of the token text.
    """

    # Digits, a hyphen, then an optional date suffix (e.g. "1-jén", "2-án",
    # "3-ától"). Only the digits are captured (group 1).
    # NOTE(review): every part after the hyphen is optional and the pattern is
    # used with `match`, so any lemma starting with digits and a hyphen
    # matches -- confirm this breadth is intended.
    _DATE_PATTERN = re.compile(r"(\d+)-j?[éá]?n?a?(t[őó]l)?")

    # A run of digits, optionally continued by further digit groups joined by
    # one of "-,/_.:", with an optional trailing percent sign.
    _NUMBER_PATTERN = re.compile(r"(\d+([-,/_.:]?(._)?\d+)*%?)")

    # noinspection PyUnusedLocal
    @staticmethod
    @Hungarian.factory("lemma_smoother", assigns=["token.lemma"], requires=["token.lemma", "token.pos"])
    def create_lemma_smoother(nlp: Hungarian, name: str) -> "LemmaSmoother":
        """Factory registered under the name ``lemma_smoother``.

        Args:
            nlp (Hungarian): The language object; not used.
            name (str): The component instance name; not used.

        Returns:
            LemmaSmoother: A new component instance.
        """
        return LemmaSmoother()

    def __call__(self, doc: Doc) -> Doc:
        """Applies every smoothing rule to every token of the document.

        Args:
            doc (Doc): Input document; its tokens' lemmas are modified in place.

        Returns:
            Doc: The same document object that was passed in.
        """
        # Order matters: later rules see the lemma written by earlier ones.
        rules: List[Callable] = [
            self._remove_exclamation_marks,
            self._remove_question_marks,
            self._remove_date_suffixes,
            self._remove_suffix_after_numbers,
        ]

        for token in doc:
            for rule in rules:
                rule(token)

        return doc

    @classmethod
    def _truncate_lemma_at(cls, token: Token, mark: str) -> None:
        """Cuts the lemma right before the first occurrence of ``mark``.

        The lemma is left unchanged when it does not contain ``mark`` or when
        it starts with ``mark``.

        Args:
            token (Token): The token whose lemma is updated in place.
            mark (str): The character at which the lemma is cut.
        """
        lemma = token.lemma_
        mark_index = lemma.find(mark)
        # -1: mark not present. 0: the lemma starts with the mark (punctuation
        # such as "!", "!!!" or "!?"); cutting there would leave an empty
        # lemma, so such lemmas are kept as they are.
        if mark_index > 0:
            token.lemma_ = lemma[:mark_index]

    @classmethod
    def _remove_exclamation_marks(cls, token: Token) -> None:
        """Removes exclamation marks from the lemma.

        Everything from the first "!" onwards is dropped. Lemmas that start
        with "!" are left unchanged.

        Args:
            token (Token): The original token.
        """
        cls._truncate_lemma_at(token, "!")

    @classmethod
    def _remove_question_marks(cls, token: Token) -> None:
        """Removes question marks from the lemma.

        Everything from the first "?" onwards is dropped. Lemmas that start
        with "?" are left unchanged.

        Args:
            token (Token): The original token.
        """
        cls._truncate_lemma_at(token, "?")

    @classmethod
    def _remove_date_suffixes(cls, token: Token) -> None:
        """Fixes the suffixes of dates.

        For tokens whose POS is ``NOUN`` and whose lemma matches the date
        pattern, the lemma is replaced by the leading digits followed by a
        period.

        Args:
            token (Token): The original token.
        """
        if token.pos_ == "NOUN":
            match = cls._DATE_PATTERN.match(token.lemma_)
            if match is not None:
                token.lemma_ = match.group(1) + "."

    @classmethod
    def _remove_suffix_after_numbers(cls, token: Token) -> None:
        """Removes suffixes after numbers.

        For tokens whose POS is ``NUM``, the lemma is replaced by the numeric
        prefix of the token's surface text (not of its current lemma).

        Args:
            token (Token): The original token.
        """
        if token.pos_ == "NUM":
            match = cls._NUMBER_PATTERN.match(token.text)
            if match is not None:
                token.lemma_ = match.group(0)