Emanuela Boros committed on
Commit
0ed70a2
·
1 Parent(s): f9cb0bf

confidence doesn't work properly

Browse files
Files changed (1) hide show
  1. generic_nel.py +21 -27
generic_nel.py CHANGED
@@ -115,9 +115,9 @@ class NelPipeline(Pipeline):
115
  output_scores=True,
116
  )
117
  # Decode the predictions into readable text
118
- wikipedia_predictions = self.tokenizer.batch_decode(
119
  outputs.sequences, skip_special_tokens=True
120
- )
121
  # Process the scores for each token
122
 
123
  transition_scores = self.model.compute_transition_scores(
@@ -127,12 +127,12 @@ class NelPipeline(Pipeline):
127
 
128
  # Calculate the probability for the entire sequence by exponentiating the sum of log probabilities
129
  sequence_confidence = torch.exp(log_prob_sum)
130
- percentages = sequence_confidence.cpu().numpy() * 100.0
131
 
132
- print(wikipedia_predictions, enclosed_entity, lOffset, rOffset, [percentages])
133
 
134
  # Return the predictions along with the extracted entity, lOffset, and rOffset
135
- return wikipedia_predictions, enclosed_entity, lOffset, rOffset, [percentages]
136
 
137
  def _forward(self, inputs):
138
  return inputs
@@ -157,27 +157,21 @@ class NelPipeline(Pipeline):
157
  # ], # This can be improved with a real API call to get the QID
158
  # "confidence_nel": np.round(percentages[i], 2),
159
  # }
160
- wikipedia_predictions, enclosed_entity, lOffset, rOffset, percentages = outputs
161
- results = []
162
- for idx, wikipedia_name in enumerate(wikipedia_predictions):
163
- # Get QID
164
- qid, language = get_wikipedia_page_props(wikipedia_name)
165
- # print(f"{wikipedia_name} -- QID: {qid}")
166
-
167
- # Get Wikipedia title and URL
168
- wkpedia_pagename, url = get_wikipedia_title(qid, language)
169
- results.append(
170
- {
171
- # "id": f"{lOffset}:{rOffset}:{enclosed_entity}:{NEL_MODEL}",
172
- "surface": enclosed_entity,
173
- "wkd_id": qid,
174
- "wkpedia_pagename": wkpedia_pagename,
175
- "wkpedia_url": url,
176
- "type": "UNK",
177
- "confidence_nel": round(percentages[idx], 2),
178
- "lOffset": lOffset,
179
- "rOffset": rOffset,
180
- }
181
- )
182
  print(results)
183
  return results
 
115
  output_scores=True,
116
  )
117
  # Decode the predictions into readable text
118
+ wikipedia_prediction = self.tokenizer.batch_decode(
119
  outputs.sequences, skip_special_tokens=True
120
+ )[0]
121
  # Process the scores for each token
122
 
123
  transition_scores = self.model.compute_transition_scores(
 
127
 
128
  # Calculate the probability for the entire sequence by exponentiating the sum of log probabilities
129
  sequence_confidence = torch.exp(log_prob_sum)
130
+ percentage = sequence_confidence.cpu().numpy() * 100.0
131
 
132
+ # print(wikipedia_prediction, enclosed_entity, lOffset, rOffset, percentage)
133
 
134
  # Return the predictions along with the extracted entity, lOffset, and rOffset
135
+ return wikipedia_prediction, enclosed_entity, lOffset, rOffset, percentage
136
 
137
  def _forward(self, inputs):
138
  return inputs
 
157
  # ], # This can be improved with a real API call to get the QID
158
  # "confidence_nel": np.round(percentages[i], 2),
159
  # }
160
+ wikipedia_prediction, enclosed_entity, lOffset, rOffset, percentage = outputs
161
+ qid, language = get_wikipedia_page_props(wikipedia_prediction)
162
+ title, url = get_wikipedia_title(qid, language="en")
163
+ results = [
164
+ {
165
+ # "id": f"{lOffset}:{rOffset}:{enclosed_entity}:{NEL_MODEL}",
166
+ "surface": enclosed_entity,
167
+ "wkd_id": qid,
168
+ "wkpedia_pagename": title,
169
+ "wkpedia_url": url,
170
+ "type": "UNK",
171
+ "confidence_nel": round(percentage, 2),
172
+ "lOffset": lOffset,
173
+ "rOffset": rOffset,
174
+ }
175
+ ]
 
 
 
 
 
 
176
  print(results)
177
  return results