# tanach_clock/utils.py
# Author: bartman081523
# Commit 105bad1: fix word position
import json
import logging
def process_json_files(start, end, base_path="texts"):
    """
    Processes JSON files containing Tanach text and returns a dictionary
    mapping book IDs to their data.

    Files are expected at "<base_path>/NN.json", where NN is the
    zero-padded two-digit book ID. Missing or malformed files are
    logged as warnings and skipped rather than aborting the whole run.

    Args:
        start: The starting book ID (inclusive).
        end: The ending book ID (inclusive).
        base_path: Directory holding the numbered JSON files
            (defaults to "texts" for backward compatibility).

    Returns:
        A dictionary where keys are book IDs and values are dictionaries
        containing 'title' and 'text' fields.
    """
    results = {}
    for book_id in range(start, end + 1):
        file_name = f"{base_path}/{book_id:02}.json"
        try:
            with open(file_name, 'r', encoding='utf-8') as file:
                data = json.load(file)
        except FileNotFoundError:
            logging.warning(f"File {file_name} not found.")
            continue
        except json.JSONDecodeError as e:
            logging.warning(f"File {file_name} could not be read as JSON: {e}")
            continue
        # Skip falsy payloads (e.g. an empty JSON object or null).
        # Note: the former `except KeyError` branch was removed — it was
        # unreachable because only .get() lookups are performed here.
        if data:
            results[book_id] = {
                "title": data.get("title", "No title"),
                "text": data.get("text", []),
            }
    return results
def flatten_text_with_line_breaks(text):
    """
    Flattens arbitrarily nested lists into a single flat list.

    Strings are kept as-is (preserving any embedded line breaks);
    any non-string, non-list element is converted with str().
    Original left-to-right order is preserved.
    """
    flattened = []
    # Explicit stack instead of recursion; reversed() keeps the
    # left-to-right traversal order identical to a recursive walk.
    pending = list(reversed(text))
    while pending:
        current = pending.pop()
        if isinstance(current, list):
            pending.extend(reversed(current))
        elif isinstance(current, str):
            flattened.append(current)
        else:
            flattened.append(str(current))
    return flattened
def calculate_tanach_statistics(tanach_data):
    """
    Calculates statistics for the Tanach corpus.
    """
    # ... (rest of the function remains the same)
    # NOTE(review): the implementation is elided in this revision — as
    # written the function body is only its docstring and this comment,
    # so calling it returns None. Restore the original body (presumably
    # word/verse counts over tanach_data) before relying on it.
def build_word_index(tanach_data):
    """
    Builds a word index for efficient lookup, ensuring the last word
    aligns with the last second of the day.

    Every word in the corpus is assigned a target second of the day so
    that the words are spread evenly across 24 hours. The returned dict
    maps that second to the word's location:

        {"book_id": <book id>,
         "chapter_id": <0-based chapter index>,
         "verse_id": <1-based word position within the flattened chapter>}

    Note: despite its name, "verse_id" is the word's position inside the
    flattened chapter (key name kept for caller compatibility).
    """
    seconds_in_day = 24 * 60 * 60

    # First pass: total word count, needed to compute the even spacing.
    total_words = sum(
        len(flatten_text_with_line_breaks(chapter))
        for book in tanach_data.values()
        for chapter in book["text"]
    )
    # Guard against an empty corpus to avoid division by zero.
    seconds_per_word = seconds_in_day / total_words if total_words > 0 else 0

    # Second pass: assign each word its target second of the day.
    word_index = {}
    words_seen = 0
    for book_id, book in tanach_data.items():
        for chapter_number, chapter in enumerate(book["text"]):
            words = flatten_text_with_line_breaks(chapter)
            for position in range(len(words)):
                target_second = int(words_seen * seconds_per_word)
                word_index[target_second] = {
                    "book_id": book_id,
                    "chapter_id": chapter_number,
                    "verse_id": position + 1,
                }
                words_seen += 1
    return word_index