import json
import logging


def process_json_files(start, end):
    """
    Process JSON files containing Tanach text and return a dictionary
    mapping book IDs to their data.

    Args:
        start: The starting book ID (inclusive).
        end: The ending book ID (inclusive).

    Returns:
        A dictionary where keys are book IDs and values are dictionaries
        containing 'title' and 'text' fields.
    """
    base_path = "texts"
    results = {}
    for i in range(start, end + 1):
        # Book files use zero-padded two-digit names, e.g. texts/01.json.
        file_name = f"{base_path}/{i:02}.json"
        try:
            with open(file_name, "r", encoding="utf-8") as file:
                data = json.load(file)
            if data:
                # .get() supplies defaults, so missing keys never raise.
                results[i] = {
                    "title": data.get("title", "No title"),
                    "text": data.get("text", []),
                }
        except FileNotFoundError:
            logging.warning(f"File {file_name} not found.")
        except json.JSONDecodeError as e:
            logging.warning(f"File {file_name} could not be read as JSON: {e}")
    return results


def flatten_text_with_line_breaks(text):
    """
    Flatten arbitrarily nested lists into a flat list of strings,
    preserving element order (and hence line breaks).
    """
    flattened_text = []
    for item in text:
        if isinstance(item, list):
            # Recurse into nested lists (e.g. chapters containing verses).
            flattened_text.extend(flatten_text_with_line_breaks(item))
        elif isinstance(item, str):
            flattened_text.append(item)
        else:
            # Coerce any non-string leaf (numbers, None) to text.
            flattened_text.append(str(item))
    return flattened_text


def calculate_tanach_statistics(tanach_data):
    """
    Calculate statistics for the Tanach corpus.
    """
    # ... (rest of the function remains the same)


def build_word_index(tanach_data):
    """
    Build a word index keyed by second of the day, spacing the words
    evenly so that the last word aligns with the last second of the day.
    """
    word_index = {}
    word_count = 0
    total_seconds = 24 * 60 * 60  # Total seconds in a day

    # First pass: count every word so the corpus can be spread over the day.
    total_words = 0
    for book_id in tanach_data:
        for chapter in tanach_data[book_id]["text"]:
            flattened_chapter = flatten_text_with_line_breaks(chapter)
            total_words += len(flattened_chapter)

    # Seconds allotted to each word; guard against an empty corpus.
    seconds_per_word = total_seconds / total_words if total_words > 0 else 0

    # Second pass: assign each word its target second. When the corpus has
    # more words than the day has seconds, several words map to the same
    # second and the last one assigned wins.
    for book_id in tanach_data:
        for chapter_index, chapter in enumerate(tanach_data[book_id]["text"]):
            flattened_chapter = flatten_text_with_line_breaks(chapter)
            for verse_index, word in enumerate(flattened_chapter):
                target_second = int(word_count * seconds_per_word)
                word_index[target_second] = {
                    "book_id": book_id,
                    "chapter_id": chapter_index,
                    "verse_id": verse_index + 1,
                }
                word_count += 1
    return word_index
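

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only). It assumes book files named
# texts/01.json .. texts/39.json, each shaped like
# {"title": "...", "text": [[verse, verse, ...], ...]}; the 1..39 ID range
# and this file layout are assumptions, not confirmed properties of the
# real corpus. The sketch shows how the second-of-day keys produced by
# build_word_index are meant to be consumed.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import datetime

    logging.basicConfig(level=logging.WARNING)

    tanach_data = process_json_files(1, 39)  # assumed book ID range
    if tanach_data:
        word_index = build_word_index(tanach_data)

        # Map the current time of day onto a position in the corpus.
        now = datetime.datetime.now()
        second_of_day = now.hour * 3600 + now.minute * 60 + now.second

        # Not every second is guaranteed to be a key when the corpus has
        # fewer words than the day has seconds, so look up defensively.
        entry = word_index.get(second_of_day)
        if entry is not None:
            book = tanach_data[entry["book_id"]]
            print(
                f"{book['title']}, chapter {entry['chapter_id'] + 1}, "
                f"verse {entry['verse_id']}"
            )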