# Tanach text processing: load per-book JSON files and map words to seconds of the day.
import json
import logging
def process_json_files(start, end, base_path="texts"):
    """
    Load JSON files containing Tanach text and return a dictionary
    mapping book IDs to their data.

    Each book ID ``i`` is read from ``<base_path>/<i:02>.json``. Files
    that are missing or not valid JSON are skipped with a warning; files
    whose top-level value is falsy (e.g. ``{}`` or ``null``) are skipped
    silently.

    Args:
        start: The starting book ID (inclusive).
        end: The ending book ID (inclusive).
        base_path: Directory holding the numbered JSON files. Defaults
            to "texts", matching the previously hard-coded path.

    Returns:
        A dictionary where keys are book IDs and values are dictionaries
        containing 'title' and 'text' fields.
    """
    results = {}
    for book_id in range(start, end + 1):
        file_name = f"{base_path}/{book_id:02}.json"
        try:
            with open(file_name, 'r', encoding='utf-8') as file:
                data = json.load(file)
        except FileNotFoundError:
            # Missing books are expected (sparse ranges) — warn and move on.
            logging.warning("File %s not found.", file_name)
            continue
        except json.JSONDecodeError as e:
            logging.warning("File %s could not be read as JSON: %s", file_name, e)
            continue
        # NOTE: the original version also caught KeyError here, but nothing
        # in the try block can raise it (dict.get never raises), so that
        # handler was unreachable and has been removed.
        if data:
            results[book_id] = {
                "title": data.get("title", "No title"),
                "text": data.get("text", []),
            }
    return results
def flatten_text_with_line_breaks(text):
    """
    Recursively flatten arbitrarily nested lists into a flat list of strings.

    String leaves pass through unchanged; any other leaf is converted
    with str(). Line-break entries survive because they are ordinary
    string elements.
    """
    flat = []
    for element in text:
        if isinstance(element, list):
            # Descend into nested lists and splice their contents in order.
            flat += flatten_text_with_line_breaks(element)
        else:
            flat.append(element if isinstance(element, str) else str(element))
    return flat
def calculate_tanach_statistics(tanach_data):
    """
    Calculates statistics for the Tanach corpus.

    NOTE(review): the body visible here is only a placeholder comment —
    the implementation appears to have been elided from this chunk. As
    written, the function performs no work and implicitly returns None.
    Restore or locate the original body before relying on this function.
    """
    # ... (rest of the function remains the same)
def build_word_index(tanach_data):
    """
    Build a mapping from a second-of-day to a word position in the corpus.

    Words are spread evenly across the 86,400 seconds of a day: word k is
    keyed by int(k * seconds_per_word). Each value identifies the word's
    location as {"book_id", "chapter_id" (0-based), "verse_id" (1-based
    position within the flattened chapter)}.

    Note: if the corpus holds more than 86,400 words, multiple words map
    to the same second and later words overwrite earlier ones (plain dict
    assignment). Returns {} for an empty corpus.

    Args:
        tanach_data: dict of book_id -> {"title": ..., "text": [chapters]}
            as produced by process_json_files.

    Returns:
        dict mapping int second-of-day -> position dict.
    """
    total_seconds = 24 * 60 * 60  # seconds in a day

    # Flatten every chapter exactly once and cache the result; the
    # original flattened each chapter twice (once to count words, once
    # to build the index), doubling the recursive work.
    flattened_books = {
        book_id: [
            flatten_text_with_line_breaks(chapter)
            for chapter in tanach_data[book_id]["text"]
        ]
        for book_id in tanach_data
    }

    total_words = sum(
        len(chapter) for chapters in flattened_books.values() for chapter in chapters
    )
    # Guard against an empty corpus to avoid ZeroDivisionError.
    seconds_per_word = total_seconds / total_words if total_words > 0 else 0

    word_index = {}
    word_count = 0
    for book_id, chapters in flattened_books.items():
        for chapter_index, flattened_chapter in enumerate(chapters):
            # The word text itself is not stored, only its position, so
            # iterate by index (the original bound an unused `word` var).
            for verse_index in range(len(flattened_chapter)):
                target_second = int(word_count * seconds_per_word)
                word_index[target_second] = {
                    "book_id": book_id,
                    "chapter_id": chapter_index,
                    "verse_id": verse_index + 1,
                }
                word_count += 1
    return word_index