# NOTE: The following lines were non-code artifacts scraped from the hosting
# page (status banners, file size, commit hashes, a line-number gutter) and
# are preserved here as a comment so the module parses:
#   Spaces: / Runtime error / Runtime error / File size: 2,911 Bytes
#   97260a5 105bad1 ... | 1 2 3 ... 90 |
import json
import logging
def process_json_files(start, end, base_path="texts"):
    """
    Processes JSON files containing Tanach text and returns a dictionary
    mapping book IDs to their data.

    Args:
        start: The starting book ID (inclusive).
        end: The ending book ID (inclusive).
        base_path: Directory containing the zero-padded ``NN.json`` files.
            Defaults to "texts" for backward compatibility.

    Returns:
        A dictionary where keys are book IDs and values are dictionaries
        containing 'title' and 'text' fields. Books whose files are missing,
        unparseable, or empty are skipped (with a warning for the first two).
    """
    results = {}
    for i in range(start, end + 1):
        file_name = f"{base_path}/{i:02}.json"
        try:
            with open(file_name, 'r', encoding='utf-8') as file:
                data = json.load(file)
            # Skip empty/falsy payloads. All key access below goes through
            # .get() with defaults, so no KeyError is possible here — the
            # previous `except KeyError` handler was unreachable dead code.
            if data:
                results[i] = {
                    "title": data.get("title", "No title"),
                    "text": data.get("text", []),
                }
        except FileNotFoundError:
            logging.warning(f"File {file_name} not found.")
        except json.JSONDecodeError as e:
            logging.warning(f"File {file_name} could not be read as JSON: {e}")
    return results
def flatten_text_with_line_breaks(text):
    """
    Flattens nested lists while preserving line breaks.

    Nested lists are expanded recursively; strings pass through unchanged
    and any other value is coerced with str().
    """
    flattened = []
    for element in text:
        if isinstance(element, list):
            # Recurse into sub-lists and splice their contents in order.
            flattened += flatten_text_with_line_breaks(element)
        else:
            flattened.append(element if isinstance(element, str) else str(element))
    return flattened
def calculate_tanach_statistics(tanach_data):
    """
    Calculates statistics for the Tanach corpus.
    """
    # NOTE(review): stub — the body was elided in this copy of the file, so
    # calling this as-is returns None. Restore the implementation before use.
    # ... (rest of the function remains the same)
def build_word_index(tanach_data):
    """
    Builds a word index for efficient lookup, ensuring the last word
    aligns with the last second of the day.

    Maps a second-of-day (int) to {"book_id", "chapter_id", "verse_id"},
    spreading all words in the corpus evenly over 86,400 seconds.
    """
    seconds_in_day = 24 * 60 * 60

    # First pass: count every word in the corpus so the spacing is known
    # before any index entries are assigned.
    total_words = sum(
        len(flatten_text_with_line_breaks(chapter))
        for book in tanach_data.values()
        for chapter in book["text"]
    )

    # Guard against an empty corpus (avoids division by zero).
    seconds_per_word = seconds_in_day / total_words if total_words > 0 else 0

    # Second pass: assign each word its scheduled second of the day.
    index = {}
    position = 0
    for book_id, book in tanach_data.items():
        for chapter_id, chapter in enumerate(book["text"]):
            for word_id, _ in enumerate(flatten_text_with_line_breaks(chapter)):
                index[int(position * seconds_per_word)] = {
                    "book_id": book_id,
                    "chapter_id": chapter_id,
                    "verse_id": word_id + 1,
                }
                position += 1
    return index