Simon Clematide
committed on
Commit
·
fc83ec7
1
Parent(s):
feeca2c
Initial commit with models, scripts, and JAR files
Browse files- .gitattributes +4 -0
- lib/__init__.py +0 -0
- lib/mallet2topic_assignment_jsonl.py +346 -0
- lib/mallet_topic_inferencer.py +739 -0
- mallet/lib/mallet-deps.jar +3 -0
- mallet/lib/mallet.jar +3 -0
- models/tm/tm-de-all-v2.0.inferencer +3 -0
- models/tm/tm-de-all-v2.0.pipe +3 -0
- models/tm/tm-de-all-v2.0.vocab.lemmatization.tsv.gz +3 -0
- models/tm/tm-fr-all-v2.0.inferencer +3 -0
- models/tm/tm-fr-all-v2.0.pipe +3 -0
- models/tm/tm-fr-all-v2.0.vocab.lemmatization.tsv.gz +3 -0
- models/tm/tm-lb-all-v2.0.inferencer +3 -0
- models/tm/tm-lb-all-v2.0.pipe +3 -0
- models/tm/tm-lb-all-v2.0.vocab.lemmatization.tsv.gz +3 -0
- requirements.txt +45 -0
.gitattributes
CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
*.jar filter=lfs diff=lfs merge=lfs -text
|
37 |
+
*.inferencer filter=lfs diff=lfs merge=lfs -text
|
38 |
+
*.pipe filter=lfs diff=lfs merge=lfs -text
|
39 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
lib/__init__.py
ADDED
File without changes
|
lib/mallet2topic_assignment_jsonl.py
ADDED
@@ -0,0 +1,346 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
|
4 |
+
Typical output of the script:
|
5 |
+
{"topic_model":"tm-fr-all-v2.0","topic_count":100,"lang":"fr","ci_ref":"actionfem-1936-02-15-a-i0022","topics":[],"min_p":0.02}
|
6 |
+
|
7 |
+
|
8 |
+
{
|
9 |
+
"topic_count": 100,
|
10 |
+
"lang": "de",
|
11 |
+
"topics": [
|
12 |
+
{"t": "tm-de-all-v2.0_tp02_de", "p": 0.027},
|
13 |
+
{"t": "tm-de-all-v2.0_tp11_de", "p": 0.119},
|
14 |
+
{"t": "tm-de-all-v2.0_tp26_de", "p": 0.045}
|
15 |
+
],
|
16 |
+
"min_p": 0.02,
|
17 |
+
"ts": "2024.08.29",
|
18 |
+
"id": "actionfem-1927-12-15-a-i0001",
|
19 |
+
"sys_id": "tm-de-all-v2.0"
|
20 |
+
}
|
21 |
+
"""
|
22 |
+
import datetime
|
23 |
+
import logging
|
24 |
+
import argparse
|
25 |
+
import traceback
|
26 |
+
import math
|
27 |
+
import json
|
28 |
+
import re
|
29 |
+
import collections
|
30 |
+
from typing import Generator, List, Dict, Any, Optional
|
31 |
+
from smart_open import open
|
32 |
+
|
33 |
+
|
34 |
+
CI_ID_REGEX = re.compile(r"^(.+?/)?([^/]+?-\d{4}-\d{2}-\d{2}-\w-i\d{4})[^/]*$")
|
35 |
+
|
36 |
+
|
37 |
+
class Mallet2TopicAssignment:
|
38 |
+
def __init__(
|
39 |
+
self,
|
40 |
+
args: Optional[argparse.Namespace] = None,
|
41 |
+
topic_assignment_threshold: Optional[float] = None,
|
42 |
+
lang: Optional[str] = None,
|
43 |
+
topic_model: Optional[str] = None,
|
44 |
+
numeric_topic_ids: Optional[bool] = None,
|
45 |
+
format_type: Optional[str] = None,
|
46 |
+
topic_count: Optional[int] = None,
|
47 |
+
output: Optional[str] = None,
|
48 |
+
) -> None:
|
49 |
+
|
50 |
+
self.eps = args.topic_assignment_threshold
|
51 |
+
self.lang = args.lang
|
52 |
+
self.topic_model = args.topic_model
|
53 |
+
self.numeric_topic_ids = args.numeric_topic_ids
|
54 |
+
self.format_type = args.format_type.lower() # Normalize case
|
55 |
+
self.topic_count = args.topic_count
|
56 |
+
self.output = args.output
|
57 |
+
self.args = args # Ensure we keep the args namespace
|
58 |
+
|
59 |
+
self.validate_options()
|
60 |
+
|
61 |
+
self.precision = math.ceil(abs(math.log10(self.eps))) + 1
|
62 |
+
self.padding_length = math.ceil(math.log10(self.topic_count))
|
63 |
+
self.topic_id_format = (
|
64 |
+
f"{self.topic_model}_tp{{t:0{self.padding_length}d}}_{self.lang}"
|
65 |
+
)
|
66 |
+
self.last_timestamp = (
|
67 |
+
datetime.datetime.now(tz=datetime.timezone.utc)
|
68 |
+
.replace(microsecond=0)
|
69 |
+
.isoformat()
|
70 |
+
+ "Z"
|
71 |
+
)
|
72 |
+
|
73 |
+
def validate_options(self) -> None:
    """Check option consistency; raise ValueError on invalid combinations."""
    if not 0 < self.eps < 1:
        raise ValueError("topic_assignment_threshold must be between 0 and 1.")
    if self.format_type == "sparse" and not self.topic_count:
        raise ValueError(
            "The --topic_count option is required when using the 'sparse' format."
        )
|
80 |
+
|
81 |
+
def read_tsv_files(self, filenames: List[str]) -> Generator[List[str], None, None]:
    """Chain the rows of several TSV files into a single stream of rows."""
    for name in filenames:
        yield from self.read_tsv_file(name)
|
85 |
+
|
86 |
+
def read_tsv_file(self, filename: str) -> Generator[List[str], None, None]:
    """Yield tab-split rows from one TSV file, skipping '#' comment lines.

    Progress is logged every 1000 physical lines.
    """
    with open(filename, "r", encoding="utf-8") as fh:
        for line_no, line in enumerate(fh, start=1):
            if not line.startswith("#"):
                yield line.strip().split("\t")
            if line_no % 1000 == 0:
                logging.info("Processed lines: %s", line_no)
|
95 |
+
|
96 |
+
def convert_matrix_row(self, row: List[str]) -> Dict[str, Any]:
    """Convert one dense doc-topic row into a topic-assignment dict.

    ``row`` layout: [doc_no, doc_path, p(t0), p(t1), ...]. Only topics
    whose probability is >= self.eps are kept; topic labels are numeric
    or formatted via self.topic_id_format depending on configuration.
    """
    ci_id = re.sub(CI_ID_REGEX, r"\2", row[1])
    raw_probs = row[2:]
    topics = [
        {
            "t": idx if self.numeric_topic_ids else self.topic_id_format.format(t=idx),
            "p": round(prob, self.precision),
        }
        for idx, raw in enumerate(raw_probs)
        if (prob := float(raw)) >= self.eps
    ]

    return {
        "ci_id": ci_id,
        "model_id": self.topic_model,
        "lang": self.lang,
        "topic_count": len(raw_probs),
        "topics": topics,
        "min_p": self.eps,
        "ts": self.last_timestamp,
    }
|
125 |
+
|
126 |
+
def convert_sparse_row(self, row: List[str]) -> Dict[str, Any]:
    """Convert one sparse doc-topic row into a topic-assignment dict.

    ``row`` layout: [doc_no, doc_path, t0, p0, t1, p1, ...] — alternating
    topic-id / probability pairs. Only topics with p >= self.eps are kept.
    """
    ci_id = re.sub(CI_ID_REGEX, r"\2", row[1])
    topic_pairs = row[2:]
    topics = []
    for i in range(0, len(topic_pairs), 2):
        t = int(topic_pairs[i])
        p = float(topic_pairs[i + 1])
        if p < self.eps:
            continue
        # Fixed: use self.precision (computed once in __init__) instead of
        # recomputing ceil(|log10(eps)|) + 1 for every single topic — the
        # value is identical, and this keeps the sparse path consistent
        # with convert_matrix_row.
        label = t if self.numeric_topic_ids else self.topic_id_format.format(t=t)
        topics.append({"t": label, "p": round(p, self.precision)})

    return {
        "ci_id": ci_id,
        "model_id": self.topic_model,
        "lang": self.lang,
        "topic_count": self.topic_count,
        "topics": topics,
        "min_p": self.eps,
        "ts": self.last_timestamp,
    }
|
158 |
+
|
159 |
+
def parse_mallet_files(
    self, filenames: List[str]
) -> Generator[Dict[str, Any], None, None]:
    """
    Process the Mallet doc-topic rows from multiple files and yield topic
    assignments as dicts, skipping rows whose content-item id was already seen.

    Args:
        filenames (List[str]): List of paths to the input files.

    Yields:
        Dict[str, Any]: Parsed topic assignment from each line in the input files.
    """
    # Fixed: a plain set + int counter replace the former Counter whose
    # sentinel key "DUPLICATE_COUNT" shared the namespace of real ci_ids —
    # a document literally named "DUPLICATE_COUNT" would have corrupted
    # both deduplication and the final count.
    seen_ci_ids = set()
    duplicate_count = 0
    if self.format_type == "sparse":
        convert_row = self.convert_sparse_row
    elif self.format_type == "matrix":
        convert_row = self.convert_matrix_row
    else:
        raise ValueError(f"Invalid format type: {self.format_type}")

    for row in self.read_tsv_files(filenames):
        ci_id = re.sub(CI_ID_REGEX, r"\2", row[1])
        if ci_id in seen_ci_ids:
            duplicate_count += 1
            continue
        seen_ci_ids.add(ci_id)

        yield convert_row(row)

    logging.info("DUPLICATE-COUNT: %d", duplicate_count)
|
189 |
+
|
190 |
+
def run(self) -> Optional[Generator[Dict[str, Any], None, None]]:
    """
    Process the input files according to the parsed command-line arguments.

    Returns:
        Optional[Generator[Dict[str, Any], None, None]]: A generator for
        topic assignments when ``self.output`` is '<generator>'; otherwise
        the assignments are written to ``self.output`` as compact JSONL and
        None is returned. On any write/parsing error the process exits
        with status 1 after logging the traceback.
    """
    if self.output == "<generator>":
        # Caller consumes the results lazily.
        return self.parse_mallet_files(self.args.INPUT_FILES)

    try:
        with open(self.output, "w", encoding="utf-8") as sink:
            for assignment in self.parse_mallet_files(self.args.INPUT_FILES):
                line = json.dumps(
                    assignment, ensure_ascii=False, separators=(",", ":")
                )
                sink.write(line + "\n")
    except Exception as e:
        logging.error(f"An error occurred: {e}")
        logging.error("Traceback: %s", traceback.format_exc())
        exit(1)
|
216 |
+
|
217 |
+
@staticmethod
|
218 |
+
def setup_logging(options: argparse.Namespace) -> None:
|
219 |
+
"""
|
220 |
+
Set up logging configuration based on command line options.
|
221 |
+
"""
|
222 |
+
log_level = logging.DEBUG if options.debug else logging.INFO
|
223 |
+
logging.basicConfig(
|
224 |
+
level=log_level, filename=options.logfile if options.logfile else None
|
225 |
+
)
|
226 |
+
|
227 |
+
@staticmethod
|
228 |
+
def main(
|
229 |
+
args: Optional[List[str]],
|
230 |
+
) -> Optional[Generator[Dict[str, Any], None, None]]:
|
231 |
+
"""
|
232 |
+
Static method serving as the entry point of the script.
|
233 |
+
If the output option is set to '<generator>', it returns a Python generator
|
234 |
+
for topic assignments, otherwise prints results or writes to a file.
|
235 |
+
|
236 |
+
Returns:
|
237 |
+
Optional[Generator[Dict[str, Any], None, None]]: Generator for topic assignments
|
238 |
+
if output is set to '<generator>', otherwise None.
|
239 |
+
"""
|
240 |
+
parser = argparse.ArgumentParser(
|
241 |
+
usage="%(prog)s [OPTIONS] INPUT [INPUT ...]",
|
242 |
+
description=(
|
243 |
+
"Return topic assignments from mallet textual topic modeling output."
|
244 |
+
),
|
245 |
+
epilog="Contact [email protected] for more information.",
|
246 |
+
)
|
247 |
+
|
248 |
+
parser.add_argument("--version", action="version", version="2024.10.23")
|
249 |
+
parser.add_argument(
|
250 |
+
"-l", "--logfile", help="Write log information to FILE", metavar="FILE"
|
251 |
+
)
|
252 |
+
parser.add_argument(
|
253 |
+
"-q",
|
254 |
+
"--quiet",
|
255 |
+
action="store_true",
|
256 |
+
help="Do not print status messages to stderr",
|
257 |
+
)
|
258 |
+
parser.add_argument(
|
259 |
+
"-d", "--debug", action="store_true", help="Print debug information"
|
260 |
+
)
|
261 |
+
parser.add_argument(
|
262 |
+
"-L",
|
263 |
+
"--lang",
|
264 |
+
"--language",
|
265 |
+
default="und",
|
266 |
+
help="ISO 639 language code two-letter or 'und' for undefined",
|
267 |
+
)
|
268 |
+
parser.add_argument(
|
269 |
+
"-M",
|
270 |
+
"--topic_model",
|
271 |
+
default="tm000",
|
272 |
+
help="Topic model identifier, e.g., tm001",
|
273 |
+
)
|
274 |
+
parser.add_argument(
|
275 |
+
"-N",
|
276 |
+
"--numeric_topic_ids",
|
277 |
+
action="store_true",
|
278 |
+
help="Use numeric topic IDs in the topic assignment",
|
279 |
+
)
|
280 |
+
parser.add_argument(
|
281 |
+
"-T",
|
282 |
+
"--topic_assignment_threshold",
|
283 |
+
type=float,
|
284 |
+
default=0.02,
|
285 |
+
help="Minimum probability for inclusion in the output",
|
286 |
+
)
|
287 |
+
parser.add_argument(
|
288 |
+
"-F",
|
289 |
+
"--format_type",
|
290 |
+
choices=["matrix", "sparse"],
|
291 |
+
default="matrix",
|
292 |
+
help="Format of the input file: 'matrix' or 'sparse'",
|
293 |
+
)
|
294 |
+
parser.add_argument(
|
295 |
+
"-C",
|
296 |
+
"--topic_count",
|
297 |
+
type=int,
|
298 |
+
help="Needed for formatting ",
|
299 |
+
required=True,
|
300 |
+
)
|
301 |
+
parser.add_argument(
|
302 |
+
"-o",
|
303 |
+
"--output",
|
304 |
+
help=(
|
305 |
+
"Path to the output file (%(default)s). If set to '<generator>' it will"
|
306 |
+
" return a generator that can be used to enumerate all results in a"
|
307 |
+
" flexible way. "
|
308 |
+
),
|
309 |
+
default="/dev/stdout",
|
310 |
+
)
|
311 |
+
|
312 |
+
parser.add_argument(
|
313 |
+
"--level",
|
314 |
+
default="INFO",
|
315 |
+
choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
|
316 |
+
help="Set the logging level. Default: %(default)s",
|
317 |
+
)
|
318 |
+
parser.add_argument(
|
319 |
+
"INPUT_FILES", nargs="+", help="One or more input files to process."
|
320 |
+
)
|
321 |
+
|
322 |
+
options = parser.parse_args(args=args)
|
323 |
+
|
324 |
+
# Configure logging
|
325 |
+
Mallet2TopicAssignment.setup_logging(options)
|
326 |
+
|
327 |
+
# Validate specific arguments
|
328 |
+
if options.format_type == "sparse" and not options.topic_count:
|
329 |
+
parser.error(
|
330 |
+
"The --topic_count option is required when using the 'sparse' format"
|
331 |
+
)
|
332 |
+
|
333 |
+
# Create the application instance
|
334 |
+
app = Mallet2TopicAssignment(args=options)
|
335 |
+
|
336 |
+
# Check if output is set to '<generator>' and return a generator if so
|
337 |
+
if options.output == "<generator>":
|
338 |
+
return app.run()
|
339 |
+
|
340 |
+
# Otherwise, run normally (output to file or stdout)
|
341 |
+
app.run()
|
342 |
+
return None
|
343 |
+
|
344 |
+
|
345 |
+
if __name__ == "__main__":
    # Fixed: main()'s ``args`` parameter had no default, so the previous
    # bare ``main()`` call raised TypeError. Passing None explicitly makes
    # argparse fall back to sys.argv.
    Mallet2TopicAssignment.main(None)
|
lib/mallet_topic_inferencer.py
ADDED
@@ -0,0 +1,739 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python3
|
2 |
+
|
3 |
+
"""
|
4 |
+
DOCUMENTATION: This script performs vectorization and topic inference using Mallet models. It accepts a raw JSONL file,
|
5 |
+
identifies the language of the text, and applies the corresponding Mallet model for topic inference. It also supports
|
6 |
+
other input formats through a flexible InputReader abstraction (e.g., CSV, JSONL).
|
7 |
+
|
8 |
+
The benefit of this script with respect to the Mallet CLI is that it can handle
|
9 |
+
multiple languages in a single run without calling Mallet multiple times.
|
10 |
+
|
11 |
+
Classes:
|
12 |
+
- MalletVectorizer: Handles text-to-Mallet vectorization.
|
13 |
+
- LanguageInferencer: Performs topic inference using a Mallet inferencer and
|
14 |
+
the vectorizer.
|
15 |
+
- InputReader (abstract class): Defines the interface for reading input
|
16 |
+
documents.
|
17 |
+
- JsonlInputReader: Reads input from JSONL files.
|
18 |
+
- CsvInputReader: Reads input from CSV files (Mallet format).
|
19 |
+
- MalletTopicInferencer: Coordinates the process, identifies language, and manages
|
20 |
+
inference.
|
21 |
+
|
22 |
+
USAGE:
|
23 |
+
python mallet_topic_inferencer.py --input input.jsonl --output output.txt
|
24 |
+
--logfile logfile.log --input-format jsonl --level INFO --num_iterations 1000
|
25 |
+
--languages de,en --de_inferencer models/de.inferencer --de_pipe models/de.pipe
|
26 |
+
"""
|
27 |
+
|
28 |
+
import collections
|
29 |
+
import traceback
|
30 |
+
import jpype
|
31 |
+
import jpype.imports
|
32 |
+
|
33 |
+
import spacy
|
34 |
+
|
35 |
+
# from jpype.types import JString
|
36 |
+
import os
|
37 |
+
import logging
|
38 |
+
import argparse
|
39 |
+
import json
|
40 |
+
import csv
|
41 |
+
import tempfile
|
42 |
+
from typing import List, Dict, Generator, Tuple, Optional, Set
|
43 |
+
from abc import ABC, abstractmethod
|
44 |
+
import mallet2topic_assignment_jsonl as m2taj
|
45 |
+
from smart_open import open
|
46 |
+
|
47 |
+
log = logging.getLogger(__name__)
|
48 |
+
|
49 |
+
|
50 |
+
def save_text_as_csv(text: str) -> str:
    """
    Write *text* to a temporary Mallet-style tab-separated CSV file and
    return the file's path.

    The file contains a header row plus one data row with a fixed dummy
    document id and class, as expected by the Mallet vectorizer.

    Args:
        text (str): The text to be saved in the CSV file.

    Returns:
        str: The name of the temporary CSV file (caller owns deletion).
    """
    tmp = tempfile.NamedTemporaryFile(
        delete=False, mode="w", suffix=".csv", newline="", encoding="utf-8"
    )
    # The with-block guarantees the data is flushed and the handle closed.
    with tmp:
        writer = csv.writer(tmp, delimiter="\t")
        writer.writerow(["ID", "DUMMYCLASS", "TEXT"])  # Header
        writer.writerow(["USERINPUT-2024-10-24-a-i0042", "dummy_class", text])
    return tmp.name
|
74 |
+
|
75 |
+
|
76 |
+
class Lemmatizer:
    """Dictionary-backed lemmatizer built on per-language spacy pipelines."""

    def __init__(
        self,
        languages_dict: Dict[str, str],
        lang_lemmatization_dict: Dict[str, Dict[str, str]],
    ):
        """
        Initializes the linguistic lemmatizer with specified languages and lemmatization dictionary.

        Args:
            languages_dict (Dict[str, str]): Maps language codes to the spacy
                model names to load for them.
            lang_lemmatization_dict (Dict[str, Dict[str, str]]): Per-language
                dictionaries mapping tokens to their lemmas.
        """
        self.languages_dict = languages_dict
        self.lemmatization_dict = lang_lemmatization_dict
        self.language_processors = self._load_language_processors(languages_dict)

    def _load_language_processors(
        self, languages_dict
    ) -> "Dict[str, spacy.language.Language]":
        """
        Loads spacy language processors for the specified languages.

        Parser and NER components are disabled; a sentencizer is added.

        Returns:
            Dict[str, spacy.language.Language]: Dictionary mapping language codes to spacy NLP pipelines.
        """
        processors = {}
        for code, model_name in languages_dict.items():
            nlp = spacy.load(model_name, disable=["parser", "ner"])
            nlp.add_pipe("sentencizer")
            processors[code] = nlp
        return processors

    def analyze_text(self, text: str, lang: str) -> List[str]:
        """
        Tokenize *text* and map each token (lowercased) through the
        lemmatization dictionary for *lang*.

        Args:
            text (str): Text to process.
            lang (str): Language code for the text.

        Returns:
            List[str]: Lemmas of the tokens that have entries in the
            lemmatization dictionary; unmatched tokens are dropped.

        Raises:
            ValueError: if no pipeline was loaded for ``lang``.
        """
        if lang not in self.language_processors:
            raise ValueError(f"No processing pipeline for language '{lang}'")

        doc = self.language_processors[lang](text)
        lookup = self.lemmatization_dict[lang]
        return [lemma for tok in doc if (lemma := lookup.get(tok.text.lower()))]
|
132 |
+
|
133 |
+
|
134 |
+
# ==================== Vectorization ====================
|
135 |
+
|
136 |
+
|
137 |
+
class MalletVectorizer:
    """
    Handles the vectorization of multiple documents into a format usable by Mallet using the pipe file from the model.
    """

    def __init__(self, language: str, pipe_file: str) -> None:
        # noinspection PyUnresolvedReferences
        from cc.mallet.classify.tui import Csv2Vectors  # type: ignore # Import after JVM is started

        self.vectorizer = Csv2Vectors()
        self.pipe_file = pipe_file
        self.language = language

    def run_csv2vectors(
        self,
        input_file: str,
        output_file: Optional[str] = None,
        delete_input_file_after: bool = True,
    ) -> str:
        """
        Run Csv2Vectors to vectorize the input file.

        Simple java-internal command line interface to the Csv2Vectors class in Mallet.

        Args:
            input_file: Path to the csv input file to be vectorized.
            output_file: Path where the output .mallet file should be saved;
                defaults to ``input_file + ".mallet"``.
            delete_input_file_after: Remove the input file afterwards
                (skipped while debug logging is active).

        Returns:
            str: path of the written .mallet file.
        """
        target = output_file or (input_file + ".mallet")

        # Arguments for Csv2Vectors java main class
        java_args = [
            "--input",
            input_file,
            "--output",
            target,
            "--keep-sequence",  # Keep sequence for feature extraction
            "--encoding",
            "UTF-8",
            "--use-pipe-from",
            self.pipe_file,
        ]

        logging.info("Calling mallet Csv2Vector: %s", java_args)
        self.vectorizer.main(java_args)
        logging.debug("Csv2Vector call finished.")

        # Check the flag first: a disabled cleanup never consults the logger.
        if delete_input_file_after and log.getEffectiveLevel() != logging.DEBUG:
            os.remove(input_file)
            logging.info("Cleaning up input file: %s", input_file)
        return target
|
192 |
+
|
193 |
+
|
194 |
+
class LanguageInferencer:
    """
    A class to manage Mallet inferencing for a specific language.
    Loads the inferencer and pipe file during initialization.
    """

    def __init__(self, language: str, inferencer_file: str, pipe_file: str) -> None:
        """Load the Java InferTopics entry point and the matching vectorizer.

        Raises:
            FileNotFoundError: if ``inferencer_file`` does not exist — checked
                before constructing any Java objects (fail fast; previously
                the check ran only after the JVM objects were built).
        """
        # noinspection PyUnresolvedReferences
        from cc.mallet.topics.tui import InferTopics  # type: ignore # Import after JVM is started

        if not os.path.exists(inferencer_file):
            raise FileNotFoundError(
                f"Inferencer file not found: {inferencer_file}"
            )

        self.language = language
        self.inferencer_file = inferencer_file
        self.inferencer = InferTopics()
        self.pipe_file = pipe_file
        self.vectorizer = MalletVectorizer(language=language, pipe_file=self.pipe_file)

    def run_csv2topics(
        self, csv_file: str, delete_mallet_file_after: bool = True
    ) -> str:
        """
        Perform topic inference on a single input file.

        The CSV input is first vectorized with the model's pipe file, then
        fed to the Mallet inferencer with a fixed random seed.

        Returns:
            str: path to the written doc-topics file. (Fixed: the former
            annotation and docstring claimed a ``Dict[str, str]`` of topic
            distributions was returned, but the method has always returned
            the output file path.)
        """
        # Vectorize the input file and write to a temporary file
        mallet_file = self.vectorizer.run_csv2vectors(csv_file)

        topics_file = mallet_file + ".doctopics"

        arguments = [
            "--input",
            mallet_file,
            "--inferencer",
            self.inferencer_file,
            "--output-doc-topics",
            topics_file,
            "--random-seed",
            "42",
        ]

        logging.info("Calling mallet InferTopics: %s", arguments)

        self.inferencer.main(arguments)
        logging.debug("InferTopics call finished.")

        # Check the flag first: a disabled cleanup never consults the logger.
        if delete_mallet_file_after and log.getEffectiveLevel() != logging.DEBUG:
            os.remove(mallet_file)
            logging.info("Cleaning up input file: %s", mallet_file)

        return topics_file
|
252 |
+
|
253 |
+
|
254 |
+
# ==================== Input Reader Abstraction ====================
|
255 |
+
|
256 |
+
|
257 |
+
class InputReader(ABC):
    """
    Abstract base class for input readers.
    Subclasses should implement the `read_documents` method to yield documents.
    """

    @abstractmethod
    def read_documents(self) -> Generator[Tuple[str, str], None, None]:
        """
        Yields a tuple of (document_id, text).
        Each implementation should handle its specific input format.
        """


class JsonlInputReader(InputReader):
    """
    Reads input from a JSONL file, where each line contains a JSON object
    with at least "id" and "text" fields.
    """

    def __init__(self, input_file: str) -> None:
        self.input_file = input_file

    def read_documents(self) -> Generator[Tuple[str, str], None, None]:
        """Yield (id, text) per line; missing fields default to 'unknown_id' / ''."""
        with open(self.input_file, "r", encoding="utf-8") as handle:
            for raw in handle:
                record = json.loads(raw)
                yield record.get("id", "unknown_id"), record.get("text", "")


class CsvInputReader(InputReader):
    """
    Reads input from a CSV file in Mallet's format (document ID, dummy class, text).
    Assumes that the CSV has three columns: "id", "dummyclass", and "text".
    """

    def __init__(self, input_file: str) -> None:
        self.input_file = input_file

    def read_documents(self) -> Generator[Tuple[str, str], None, None]:
        """Yield (id, lowercased text); rows with fewer than 3 columns are skipped."""
        with open(self.input_file, mode="r", encoding="utf-8") as handle:
            for fields in csv.reader(handle, delimiter="\t"):
                if len(fields) >= 3:
                    yield fields[0], fields[2].lower()
|
307 |
+
|
308 |
+
|
309 |
+
# ==================== Main Application ====================
|
310 |
+
|
311 |
+
|
312 |
+
class MalletTopicInferencer:
|
313 |
+
"""
|
314 |
+
MalletTopicInferencer class coordinates the process of reading input documents, identifying their language, and performing topic inference using Mallet models.
|
315 |
+
"""
|
316 |
+
|
317 |
+
def __init__(self, args: argparse.Namespace) -> None:
|
318 |
+
self.args = args
|
319 |
+
self.languages = set(args.languages)
|
320 |
+
self.language_inferencers: Optional[Dict[str, LanguageInferencer]] = None
|
321 |
+
self.language_lemmatizations: Optional[Dict[str, Dict[str, str]]] = None
|
322 |
+
self.input_reader = None
|
323 |
+
self.inference_results: List[Dict[str, str]] = []
|
324 |
+
self.language_dict: Dict[str, str] = {}
|
325 |
+
self.seen_languages: Set[str] = set()
|
326 |
+
self.stats = collections.Counter()
|
327 |
+
|
328 |
+
def initialize(self) -> None:
    """Initialize the inferencers after JVM startup."""
    cfg = self.args
    self.language_inferencers = self.init_language_inferencers(cfg)
    self.input_reader = self.build_input_reader(cfg)
    self.language_lemmatizations = self.init_language_lemmatizations(cfg)
    if cfg.language_file:
        self.language_dict = self.read_language_file(cfg.language_file)
|
335 |
+
|
336 |
+
@staticmethod
def start_jvm() -> None:
    """Start the Java Virtual Machine (once) with the Mallet jars on the classpath.

    The jars are looked up first relative to the current working directory,
    then relative to this source file's directory.
    """
    if jpype.isJVMStarted():
        log.warning("JVM already running.")
        return

    jar_names = ("mallet/lib/mallet-deps.jar", "mallet/lib/mallet.jar")

    # Prefer jars relative to the current working directory.
    classpath = [os.path.join(os.getcwd(), j) for j in jar_names]
    if not all(os.path.exists(p) for p in classpath):
        # Fall back to jars next to this source file.
        here = os.path.dirname(os.path.abspath(__file__))
        classpath = [os.path.join(here, j) for j in jar_names]

    jpype.startJVM(classpath=classpath)
    log.info(f"JVM started successfully with classpath {classpath}.")
|
361 |
+
|
362 |
+
def read_language_file(self, language_file: str) -> Dict[str, str]:
|
363 |
+
"""Read the language file (JSONL) and return a dictionary of document_id -> language."""
|
364 |
+
|
365 |
+
language_dict = {}
|
366 |
+
with open(language_file, "r", encoding="utf-8") as f:
|
367 |
+
for line in f:
|
368 |
+
data = json.loads(line)
|
369 |
+
doc_id = data.get("doc_id")
|
370 |
+
language = data.get("language")
|
371 |
+
if doc_id and language:
|
372 |
+
language_dict[doc_id] = language
|
373 |
+
return language_dict
|
374 |
+
|
375 |
+
@staticmethod
|
376 |
+
def load_lemmatization_file(
|
377 |
+
lemmatization_file_path: str,
|
378 |
+
bidi: bool = False,
|
379 |
+
lowercase: bool = True,
|
380 |
+
ignore_pos: bool = True,
|
381 |
+
) -> Dict[str, str]:
|
382 |
+
"""
|
383 |
+
Load lemmatization data from the file.
|
384 |
+
:param lemmatization_file_path: Path to the lemmatization file.
|
385 |
+
:return: A dictionary mapping tokens to their corresponding lemmas.
|
386 |
+
"""
|
387 |
+
|
388 |
+
token2lemma = {}
|
389 |
+
n = 0
|
390 |
+
with open(lemmatization_file_path, "r", "utf-8") as file:
|
391 |
+
for line in file:
|
392 |
+
token, _, lemma = line.strip().split("\t")
|
393 |
+
if lowercase:
|
394 |
+
token2lemma[token.lower()] = lemma.lower()
|
395 |
+
else:
|
396 |
+
token2lemma[token] = lemma
|
397 |
+
n += 1
|
398 |
+
|
399 |
+
logging.info(
|
400 |
+
"Read %d lemmatization entries from %s", n, lemmatization_file_path
|
401 |
+
)
|
402 |
+
return token2lemma
|
403 |
+
|
404 |
+
def init_language_lemmatizations(
|
405 |
+
self, args: argparse.Namespace
|
406 |
+
) -> Dict[str, Dict[str, str]]:
|
407 |
+
"""Build a mapping of languages to their respective lemmatization dictionaries."""
|
408 |
+
|
409 |
+
language_lemmatizations: Dict[str, Dict[str, str]] = {}
|
410 |
+
for language in args.languages:
|
411 |
+
lemmatization_key = f"{language}_lemmatization"
|
412 |
+
if getattr(args, lemmatization_key, None):
|
413 |
+
lemmatization_file = getattr(args, lemmatization_key)
|
414 |
+
language_lemmatizations[language] = self.load_lemmatization_file(
|
415 |
+
lemmatization_file
|
416 |
+
)
|
417 |
+
else:
|
418 |
+
log.info(
|
419 |
+
f"Lemmatization file for language: {language} not provided by"
|
420 |
+
" arguments. Skipping."
|
421 |
+
)
|
422 |
+
return language_lemmatizations
|
423 |
+
|
424 |
+
def identify_language(self, document_id: str, text: str) -> str:
|
425 |
+
"""Identify the language of the text using the language file or a dummy method."""
|
426 |
+
# Check if the document ID is in the language dictionary
|
427 |
+
if document_id in self.language_dict:
|
428 |
+
return self.language_dict[document_id]
|
429 |
+
# Placeholder: Assume German ("de") for now if not found in the dictionary
|
430 |
+
return "de"
|
431 |
+
|
432 |
+
def init_language_inferencers(
|
433 |
+
self, args: argparse.Namespace
|
434 |
+
) -> Dict[str, LanguageInferencer]:
|
435 |
+
"""Build a mapping of languages to their respective inferencers
|
436 |
+
|
437 |
+
Includes the vectorizer pipe for each language as well.
|
438 |
+
"""
|
439 |
+
|
440 |
+
language_inferencers: Dict[str, LanguageInferencer] = {}
|
441 |
+
for language in args.languages:
|
442 |
+
inferencer_key = f"{language}_inferencer"
|
443 |
+
pipe_key = f"{language}_pipe"
|
444 |
+
if getattr(args, inferencer_key, None) and getattr(args, pipe_key, None):
|
445 |
+
language_inferencers[language] = LanguageInferencer(
|
446 |
+
language=language,
|
447 |
+
inferencer_file=getattr(args, inferencer_key),
|
448 |
+
pipe_file=getattr(args, pipe_key),
|
449 |
+
)
|
450 |
+
else:
|
451 |
+
log.info(
|
452 |
+
f"Inferencer or pipe file for language: {language} not provided by"
|
453 |
+
" arguments. Skipping."
|
454 |
+
)
|
455 |
+
return language_inferencers
|
456 |
+
|
457 |
+
def build_input_reader(self, args: argparse.Namespace) -> InputReader:
|
458 |
+
"""Select the appropriate input reader based on the input format."""
|
459 |
+
if args.input_format == "jsonl":
|
460 |
+
return JsonlInputReader(args.input)
|
461 |
+
elif args.input_format == "csv":
|
462 |
+
return CsvInputReader(args.input)
|
463 |
+
else:
|
464 |
+
raise ValueError(f"Unsupported input format: {args.input_format}")
|
465 |
+
|
466 |
+
def process_input_file(self) -> None:
|
467 |
+
"""Process the input file, identify language, and apply the appropriate Mallet model"""
|
468 |
+
temp_files_by_language = self.write_language_specific_csv_files()
|
469 |
+
|
470 |
+
doctopics_files = self.run_topic_inference(temp_files_by_language)
|
471 |
+
logging.info(doctopics_files)
|
472 |
+
if self.args.output_format == "csv":
|
473 |
+
self.merge_inference_results(doctopics_files)
|
474 |
+
elif self.args.output_format == "jsonl":
|
475 |
+
self.merge_inference_results_jsonl(doctopics_files)
|
476 |
+
|
477 |
+
def merge_inference_results_jsonl(self, doctopics_files_by_language):
|
478 |
+
|
479 |
+
args = ["--output", "<generator>"]
|
480 |
+
m2ta_converters = {}
|
481 |
+
for lang, doctopics_file in doctopics_files_by_language.items():
|
482 |
+
topic_model_id = self.args.__dict__[f"{lang}_model_id"]
|
483 |
+
if "{lang}" in topic_model_id:
|
484 |
+
topic_model_id.format(lang=lang)
|
485 |
+
args += [
|
486 |
+
"--topic_model",
|
487 |
+
topic_model_id,
|
488 |
+
"--topic_count",
|
489 |
+
str(self.args.__dict__[f"{lang}_topic_count"]),
|
490 |
+
"--lang",
|
491 |
+
lang,
|
492 |
+
doctopics_file, # input comes last!
|
493 |
+
]
|
494 |
+
m2ta_converters[lang] = m2taj.Mallet2TopicAssignment.main(args)
|
495 |
+
for lang, m2ta_converter in m2ta_converters.items():
|
496 |
+
with open(self.args.output, "w", encoding="utf-8") as out_f:
|
497 |
+
for row in m2ta_converter:
|
498 |
+
self.stats["content_items"] += 1
|
499 |
+
print(
|
500 |
+
json.dumps(row, ensure_ascii=False, separators=(",", ":")),
|
501 |
+
file=out_f,
|
502 |
+
)
|
503 |
+
|
504 |
+
def merge_inference_results(
|
505 |
+
self, doctopics_files_by_language: Dict[str, str]
|
506 |
+
) -> None:
|
507 |
+
"""Merge the inference results from multiple languages into a single output file."""
|
508 |
+
|
509 |
+
logging.info(
|
510 |
+
"Saving CSV inference results into file %s from multiple languages: %s",
|
511 |
+
self.args.output,
|
512 |
+
doctopics_files_by_language,
|
513 |
+
)
|
514 |
+
with open(self.args.output, "w", encoding="utf-8") as out_f:
|
515 |
+
for language, doctopics_file in doctopics_files_by_language.items():
|
516 |
+
with open(doctopics_file, "r", encoding="utf-8") as f:
|
517 |
+
for line in f:
|
518 |
+
if line.startswith("#"):
|
519 |
+
continue
|
520 |
+
doc_id, topic_dist = line.strip().split("\t", 1)
|
521 |
+
print(
|
522 |
+
doc_id + "__" + language,
|
523 |
+
topic_dist,
|
524 |
+
sep="\t",
|
525 |
+
end="\n",
|
526 |
+
file=out_f,
|
527 |
+
)
|
528 |
+
|
529 |
+
def write_language_specific_csv_files(self) -> Dict[str, str]:
|
530 |
+
"""Read documents and write to language-specific temporary files"""
|
531 |
+
tsv_files_by_language = {}
|
532 |
+
|
533 |
+
for document_id, text in self.input_reader.read_documents():
|
534 |
+
language_code = self.identify_language(document_id, text)
|
535 |
+
self.stats["LANGUAGE: " + language_code] += 1
|
536 |
+
if language_code not in self.languages:
|
537 |
+
continue
|
538 |
+
|
539 |
+
if language_code not in tsv_files_by_language:
|
540 |
+
tsv_files_by_language[language_code] = tempfile.NamedTemporaryFile(
|
541 |
+
delete=False,
|
542 |
+
mode="w",
|
543 |
+
suffix=f".{language_code}.tsv",
|
544 |
+
encoding="utf-8",
|
545 |
+
)
|
546 |
+
logging.info(
|
547 |
+
"Writing documents for language: %s in temp file: %s",
|
548 |
+
language_code,
|
549 |
+
tsv_files_by_language[language_code].name,
|
550 |
+
)
|
551 |
+
|
552 |
+
print(
|
553 |
+
document_id,
|
554 |
+
language_code,
|
555 |
+
text,
|
556 |
+
sep="\t",
|
557 |
+
end="\n",
|
558 |
+
file=tsv_files_by_language[language_code],
|
559 |
+
)
|
560 |
+
|
561 |
+
# Close all temporary files
|
562 |
+
for temp_file in tsv_files_by_language.values():
|
563 |
+
temp_file.close()
|
564 |
+
|
565 |
+
# noinspection PyShadowingNames
|
566 |
+
result = {
|
567 |
+
lang: temp_file.name for lang, temp_file in tsv_files_by_language.items()
|
568 |
+
}
|
569 |
+
return result
|
570 |
+
|
571 |
+
def run_topic_inference(
|
572 |
+
self, language_specific_csv_files: Dict[str, str]
|
573 |
+
) -> Dict[str, str]:
|
574 |
+
"""Run inference for each language"""
|
575 |
+
doctopics_files_by_language = {}
|
576 |
+
for language_code, csv_file in language_specific_csv_files.items():
|
577 |
+
inferencer = self.language_inferencers.get(language_code)
|
578 |
+
if not inferencer:
|
579 |
+
log.error(f"No inferencer found for language: {language_code}")
|
580 |
+
continue
|
581 |
+
|
582 |
+
doctopics_file = inferencer.run_csv2topics(csv_file)
|
583 |
+
doctopics_files_by_language[language_code] = doctopics_file
|
584 |
+
|
585 |
+
# Clean up the temporary vectorized file if logging level is not DEBUG
|
586 |
+
if log.getEffectiveLevel() != logging.DEBUG:
|
587 |
+
logging.info("Cleaning language specific csv file: %s", csv_file)
|
588 |
+
os.remove(csv_file)
|
589 |
+
logging.debug("Resulting doctopic files: %s", doctopics_files_by_language)
|
590 |
+
return doctopics_files_by_language
|
591 |
+
|
592 |
+
def write_results_to_output(self) -> None:
|
593 |
+
"""Write the final merged inference results to the output file."""
|
594 |
+
with open(self.args.output, "w", encoding="utf-8") as out_file:
|
595 |
+
for result in self.inference_results:
|
596 |
+
out_file.write(json.dumps(result) + "\n")
|
597 |
+
log.info(f"All inferences merged and written to {self.args.output}")
|
598 |
+
|
599 |
+
    def run(self) -> None:
        """Main execution method.

        Starts the JVM, initializes all language resources, processes the
        input file, and always shuts the JVM down and reports counters —
        even when processing fails.
        """
        try:
            self.start_jvm()
            self.initialize()
            self.process_input_file()
            # self.write_results_to_output()
        except Exception as e:
            # Broad catch is deliberate: this is the top-level entry point,
            # and failures are logged with a full traceback instead of
            # propagating past the JVM shutdown below.
            log.error(f"An error occurred: {e}")
            log.error("Traceback: %s", traceback.format_exc())
        finally:
            # Always release the JVM and emit collected statistics.
            jpype.shutdownJVM()
            log.info("JVM shutdown.")
            for key, value in sorted(self.stats.items()):
                log.info(f"STATS: {key}: {value}")
614 |
+
|
615 |
+
|
616 |
+
if __name__ == "__main__":
    languages = ["de", "fr", "lb"]  # You can add more languages as needed
    parser = argparse.ArgumentParser(description="Mallet Topic Inference in Python")

    parser.add_argument("--logfile", help="Path to log file", default=None)
    parser.add_argument(
        "--level",
        default="DEBUG",
        choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
        # BUG FIX: was "$(default)s" (shell syntax); argparse interpolates
        # "%(default)s" into help strings.
        help="Logging level (%(default)s)",
    )
    parser.add_argument(
        "--input", help="Path to input file (%(default)s)", required=True
    )
    parser.add_argument(
        "--input-format",
        choices=["jsonl", "csv"],
        default="jsonl",
        help="Format of the input file",
    )
    parser.add_argument(
        "--output-format",
        choices=["jsonl", "csv"],
        help=(
            "Format of the output file: csv: raw Mallet output with docids patched into"
            " numericID-LANG, jsonl: impresso JSONL format"
        ),
    )
    parser.add_argument(
        "--output",
        help="Path to final output file. (%(default)s)",
        default="out.jsonl",
    )
    parser.add_argument(
        "--languages",
        nargs="+",
        default=languages,
        help="List of languages to support (%(default)s)",
    )
    parser.add_argument(
        "--language-file",
        help="Path to JSONL containing document_id to language mappings",
        required=False,
    )
    parser.add_argument("--model_dir", help="Path to model directory", required=True)

    # Dynamically generate per-language options for model files and settings
    # (one loop instead of three identical ones).
    for lang in languages:
        parser.add_argument(
            f"--{lang}_inferencer",
            help=f"Path to {lang} inferencer file",
        )
        parser.add_argument(f"--{lang}_pipe", help=f"Path to {lang} pipe file")
        parser.add_argument(
            f"--{lang}_lemmatization", help=f"Path to {lang} lemmatization file"
        )
        parser.add_argument(
            f"--{lang}_model_id",
            default=f"tm-{lang}-all-v2.0",
            help="Model ID can take a {lang} format placeholder (%(default)s)",
        )
        parser.add_argument(
            f"--{lang}_topic_count",
            default=100,
            type=int,  # parse CLI values as int; the default already was one
            help="Number of topics of model (%(default)s)",
        )
    args = parser.parse_args()

    logging.basicConfig(
        filename=args.logfile,
        level=args.level,
        format="%(asctime)-15s %(filename)s:%(lineno)d %(levelname)s: %(message)s",
        force=True,
    )

    # Automatically construct model file paths if not explicitly specified.
    # getattr(..., None) keeps this from raising AttributeError for languages
    # supplied via --languages that have no pre-registered per-language flags.
    for lang in args.languages:
        model_id = getattr(args, f"{lang}_model_id", f"tm-{lang}-all-v2.0")
        model_dir = args.model_dir

        pipe_path = os.path.join(model_dir, f"{model_id}.pipe")
        inferencer_path = os.path.join(model_dir, f"{model_id}.inferencer")
        lemmatization_path = os.path.join(
            model_dir, f"{model_id}.vocab.lemmatization.tsv.gz"
        )

        if not getattr(args, f"{lang}_pipe", None) and os.path.exists(pipe_path):
            logging.info("Automatically setting pipe path to %s", pipe_path)
            setattr(args, f"{lang}_pipe", pipe_path)
        if not getattr(args, f"{lang}_inferencer", None) and os.path.exists(
            inferencer_path
        ):
            logging.info("Automatically setting inferencer path to %s", inferencer_path)
            setattr(args, f"{lang}_inferencer", inferencer_path)
        if not getattr(args, f"{lang}_lemmatization", None) and os.path.exists(
            lemmatization_path
        ):
            logging.info(
                "Automatically setting lemmatization path to %s", lemmatization_path
            )
            setattr(args, f"{lang}_lemmatization", lemmatization_path)

    if not args.output_format:
        args.output_format = "jsonl" if "jsonl" in args.output else "csv"
        logging.warning("Unspecified output format set to %s", args.output_format)

    # BUG FIX: the original called args.languages.remove(lang) while iterating
    # over args.languages, which silently skips the element following each
    # removal. Build a filtered list instead.
    usable_languages = []
    for lang in args.languages:
        if getattr(args, f"{lang}_inferencer", None) and getattr(
            args, f"{lang}_pipe", None
        ):
            usable_languages.append(lang)
        else:
            logging.warning(
                "Inferencer or pipe file not provided for language: %s. Ignoring"
                " content items for this language.",
                lang,
            )
    args.languages = usable_languages
    logging.info(
        "Performing monolingual topic inference for the following languages: %s",
        args.languages,
    )

    logging.info("Arguments: %s", args)
    app = MalletTopicInferencer(args)
    app.run()
mallet/lib/mallet-deps.jar
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7cca84272e6a3aee57490de4691702a5d1264d56d2a651b741f034aa09052023
|
3 |
+
size 2644050
|
mallet/lib/mallet.jar
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:facdf051fb2775a8b8015d154d23c7fcb1b276172ba5b517fd3877f45332cf30
|
3 |
+
size 2235683
|
models/tm/tm-de-all-v2.0.inferencer
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9f3961a2b868ea35605a27e19d6d7d74ab2dc692de8d41799e1ccb68e3af0b5b
|
3 |
+
size 23330363
|
models/tm/tm-de-all-v2.0.pipe
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bc69ca2007dd2380cdda1c4a4f029f1325363ea89ceacbc14bf82788abd166be
|
3 |
+
size 748181
|
models/tm/tm-de-all-v2.0.vocab.lemmatization.tsv.gz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:06d163134a44cc3f338b187df9fd6b4936ea4ecf7d1157b29efddefccd6456cc
|
3 |
+
size 1288289
|
models/tm/tm-fr-all-v2.0.inferencer
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a7acfecbf88e50ea4ca0e872ed159413bcfc8eeea2bcf51c10b6aeb13ec11864
|
3 |
+
size 8718608
|
models/tm/tm-fr-all-v2.0.pipe
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:916b0ff537284e232922016dd6fd6924ade4a405b69776d16adb5b8d61593af8
|
3 |
+
size 249343
|
models/tm/tm-fr-all-v2.0.vocab.lemmatization.tsv.gz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6476a943c3126eaf249685273af3445105763c6cef22e76eb32b5618724de147
|
3 |
+
size 311945
|
models/tm/tm-lb-all-v2.0.inferencer
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bcffa061a4de61cee46ade61ff8cdb9637839a0bb0750a1001c0a9934ec8f53e
|
3 |
+
size 26498144
|
models/tm/tm-lb-all-v2.0.pipe
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b08b13c18f0388100820b6cffc2753c99f4a0be48da9b3e2190926fa473283bf
|
3 |
+
size 8060126
|
models/tm/tm-lb-all-v2.0.vocab.lemmatization.tsv.gz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7b65f8503e341fad42cae0f26c93378be24208ab2453a052d742a34e6e5ee8db
|
3 |
+
size 3893233
|
requirements.txt
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
-i https://pypi.org/simple
|
2 |
+
blis==0.7.11
|
3 |
+
boto3==1.35.50; python_version >= '3.8'
|
4 |
+
botocore==1.35.50; python_version >= '3.8'
|
5 |
+
catalogue==2.0.10; python_version >= '3.6'
|
6 |
+
certifi==2024.8.30; python_version >= '3.6'
|
7 |
+
charset-normalizer==3.4.0; python_full_version >= '3.7.0'
|
8 |
+
click==8.1.7; python_version >= '3.7'
|
9 |
+
confection==0.1.5; python_version >= '3.6'
|
10 |
+
cymem==2.0.8
|
11 |
+
https://github.com/explosion/spacy-models/releases/download/de_core_news_md-3.6.0/de_core_news_md-3.6.0.tar.gz
|
12 |
+
https://github.com/explosion/spacy-models/releases/download/fr_core_news_md-3.6.0/fr_core_news_md-3.6.0.tar.gz
|
13 |
+
idna==3.10; python_version >= '3.6'
|
14 |
+
jinja2==3.1.4; python_version >= '3.7'
|
15 |
+
jmespath==1.0.1; python_version >= '3.7'
|
16 |
+
jpype1==1.5.0; python_version >= '3.7'
|
17 |
+
langcodes==3.4.1; python_version >= '3.8'
|
18 |
+
language-data==1.2.0
|
19 |
+
marisa-trie==1.2.1; python_version >= '3.7'
|
20 |
+
markupsafe==3.0.2; python_version >= '3.9'
|
21 |
+
murmurhash==1.0.10; python_version >= '3.6'
|
22 |
+
numpy==1.26.4; python_version >= '3.9'
|
23 |
+
packaging==24.1; python_version >= '3.8'
|
24 |
+
pathlib-abc==0.1.1; python_version >= '3.8'
|
25 |
+
pathy==0.11.0; python_version >= '3.8'
|
26 |
+
preshed==3.0.9; python_version >= '3.6'
|
27 |
+
pydantic==1.10.18; python_version >= '3.7'
|
28 |
+
python-dateutil==2.9.0.post0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
|
29 |
+
python-dotenv==1.0.1; python_version >= '3.8'
|
30 |
+
requests==2.32.3; python_version >= '3.8'
|
31 |
+
s3transfer==0.10.3; python_version >= '3.8'
|
32 |
+
setuptools==75.2.0; python_version >= '3.8'
|
33 |
+
six==1.16.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
|
34 |
+
smart-open==6.4.0; python_version >= '3.6' and python_version < '4.0'
|
35 |
+
spacy==3.6.0; python_version >= '3.6'
|
36 |
+
spacy-legacy==3.0.12; python_version >= '3.6'
|
37 |
+
spacy-loggers==1.0.5; python_version >= '3.6'
|
38 |
+
spacy-lookups-data==1.0.5; python_version >= '3.6'
|
39 |
+
srsly==2.4.8; python_version >= '3.6'
|
40 |
+
thinc==8.1.12; python_version >= '3.6'
|
41 |
+
tqdm==4.66.6; python_version >= '3.7'
|
42 |
+
typer==0.9.4; python_version >= '3.6'
|
43 |
+
typing-extensions==4.12.2; python_version >= '3.8'
|
44 |
+
urllib3==2.2.3; python_version >= '3.10'
|
45 |
+
wasabi==1.1.3; python_version >= '3.6'
|