Emanuela Boros committed
Commit d868172 · 1 Parent(s): 57b93e7

added pipeline

Files changed (1)
  1. generic_nel.py +152 -0
generic_nel.py ADDED
@@ -0,0 +1,152 @@
+ from transformers import Pipeline
+ import numpy as np
+ import torch
+ import nltk
+ from nltk.chunk import conlltags2tree
+ from nltk import pos_tag
+ from nltk.tree import Tree
+ import requests
+ import torch.nn.functional as F
+ import re, string
+
+ # Fetch the POS-tagger resources once at import time
+ nltk.download("averaged_perceptron_tagger")
+ nltk.download("averaged_perceptron_tagger_eng")
+
+
+ def get_wikipedia_page_props(input_str: str):
+     """
+     Retrieves the Wikidata QID for a given Wikipedia page name from the
+     specified language edition of Wikipedia.
+
+     Args:
+         input_str (str): The input string in the format "page_name >> language".
+
+     Returns:
+         str: The QID, or "NIL" if the QID cannot be found.
+     """
+     try:
+         # Split the input into the page title and the language code
+         page_name, language = input_str.split(" >> ")
+         page_name = page_name.strip()
+         language = language.strip()
+     except ValueError:
+         return "Invalid input format. Use 'page_name >> language'."
+
+     wikipedia_url = f"https://{language}.wikipedia.org/w/api.php"
+     wikipedia_params = {
+         "action": "query",
+         "prop": "pageprops",
+         "format": "json",
+         "titles": page_name,
+     }
+
+     qid = "NIL"
+     try:
+         # Query the Wikipedia API for the page properties
+         response = requests.get(wikipedia_url, params=wikipedia_params)
+         response.raise_for_status()
+         data = response.json()
+
+         if "pages" in data["query"]:
+             page_id = list(data["query"]["pages"].keys())[0]
+
+             if "pageprops" in data["query"]["pages"][page_id]:
+                 page_props = data["query"]["pages"][page_id]["pageprops"]
+
+                 if "wikibase_item" in page_props:
+                     return page_props["wikibase_item"]
+
+         return qid
+     except Exception:
+         return qid
+
+
+ def get_wikipedia_title(qid, language="en"):
+     """Resolves a QID to the title and URL of the corresponding Wikipedia article."""
+     url = "https://www.wikidata.org/w/api.php"
+     params = {
+         "action": "wbgetentities",
+         "format": "json",
+         "ids": qid,
+         "props": "sitelinks/urls",
+         "sitefilter": f"{language}wiki",
+     }
+
+     response = requests.get(url, params=params)
+     data = response.json()
+
+     try:
+         title = data["entities"][qid]["sitelinks"][f"{language}wiki"]["title"]
+         url = data["entities"][qid]["sitelinks"][f"{language}wiki"]["url"]
+         return title, url
+     except KeyError:
+         # The entity has no sitelink for the requested language edition
+         return "NIL", "None"
+
+
+ class NelPipeline(Pipeline):
+     """Generation-based named entity linking pipeline: the model generates
+     "page_name >> language" candidates that are resolved to Wikidata QIDs."""
+
+     def _sanitize_parameters(self, **kwargs):
+         preprocess_kwargs = {}
+         if "text" in kwargs:
+             preprocess_kwargs["text"] = kwargs["text"]
+
+         return preprocess_kwargs, {}, {}
+
+     def preprocess(self, text, **kwargs):
+         # Generate candidate Wikipedia page names with beam search
+         outputs = self.model.generate(
+             **self.tokenizer([text], return_tensors="pt"),
+             num_beams=5,
+             num_return_sequences=5,
+             max_new_tokens=30,
+         )
+         wikipedia_predictions = self.tokenizer.batch_decode(
+             outputs, skip_special_tokens=True
+         )
+         print(f"Decoded: {wikipedia_predictions}")
+
+         return wikipedia_predictions
+
+     def _forward(self, inputs):
+         # Generation already happens in preprocess; pass the candidates through
+         return inputs
+
+     def postprocess(self, outputs, **kwargs):
+         """
+         Postprocess the outputs of the model.
+         :param outputs: decoded "page_name >> language" candidates
+         :param kwargs:
+         :return: list of dicts with the Wikipedia title, QID and URL
+         """
+         # outputs
+         #
+         # predictions = {}
+         # confidence_scores = {}
+         # for task, logits in tokens_result.logits.items():
+         #     predictions[task] = torch.argmax(logits, dim=-1).tolist()[0]
+         #     confidence_scores[task] = F.softmax(logits, dim=-1).tolist()[0]
+         #
+         # entities = {}
+         # for task in predictions.keys():
+         #     words_list, preds_list, confidence_list = realign(
+         #         text_sentence,
+         #         predictions[task],
+         #         confidence_scores[task],
+         #         self.tokenizer,
+         #         self.id2label[task],
+         #     )
+         #
+         #     entities[task] = get_entities(words_list, preds_list, confidence_list, text)
+         #
+         # postprocessed_entities = self.postprocess_entities(entities, text_sentence)
+         results = []
+         for wikipedia_name in outputs:
+             # Get the QID for the predicted page name
+             qid = get_wikipedia_page_props(wikipedia_name)
+             print(f"{wikipedia_name} -- QID: {qid}")
+
+             # Get the Wikipedia title and URL for that QID
+             title, url = get_wikipedia_title(qid)
+             results.append({"title": title, "qid": qid, "url": url})
+
+         return results
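For readers landing on this commit, a minimal usage sketch follows. It is illustrative only: the checkpoint name and the mention markers in the example sentence are assumptions, not part of this commit; the only contract the pipeline itself imposes is that the underlying seq2seq model produce candidates in the "page_name >> language" format that get_wikipedia_page_props expects.

    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

    # Hypothetical checkpoint; substitute the actual NEL model this pipeline targets.
    MODEL_NAME = "impresso-project/nel-mgenre-multilingual"

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

    # Instantiate the custom pipeline directly with the loaded model and tokenizer.
    nel = NelPipeline(model=model, tokenizer=tokenizer)

    # The mention markers below are an assumed convention for flagging the entity span;
    # the model returns up to five "page_name >> language" candidates, which
    # postprocess() resolves into {"title", "qid", "url"} dicts.
    print(nel("Napoleon was exiled to [START] Elba [END] in 1814."))

Alternatively, the class could be registered with the Transformers PIPELINE_REGISTRY so it loads through pipeline(...), but that wiring is not part of this file.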