import json
import logging

def process_json_files(start, end):
    """
    Processes JSON files containing Tanach text and returns a dictionary
    mapping book IDs to their data.

    Args:
        start: The starting book ID (inclusive).
        end: The ending book ID (inclusive).

    Returns:
        A dictionary where keys are book IDs and values are dictionaries
        containing 'title' and 'text' fields.
    """
    base_path = "texts"
    results = {}

    for i in range(start, end + 1):
        file_name = f"{base_path}/{i:02}.json"
        try:
            with open(file_name, 'r', encoding='utf-8') as file:
                data = json.load(file)
            if data:
                # Missing fields fall back to defaults via .get(), so no
                # KeyError can occur here.
                results[i] = {
                    "title": data.get("title", "No title"),
                    "text": data.get("text", []),
                }
        except FileNotFoundError:
            logging.warning(f"File {file_name} not found.")
        except json.JSONDecodeError as e:
            logging.warning(f"File {file_name} could not be read as JSON: {e}")

    return results
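
# A minimal usage sketch, assuming files like texts/01.json exist with
# "title" and "text" fields (the range 1..3 below is an arbitrary example,
# not a claim about how many files the corpus contains):
#
#   books = process_json_files(1, 3)
#   for book_id, book in books.items():
#       print(book_id, book["title"], len(book["text"]))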

def flatten_text_with_line_breaks(text):
    """
    Recursively flattens arbitrarily nested lists into a single flat list,
    preserving item order (and therefore line breaks stored as separate
    items). Non-string leaves are coerced with str().
    """
    flattened_text = []
    for item in text:
        if isinstance(item, list):
            # Recurse into nested lists (e.g. chapters containing verses).
            flattened_text.extend(flatten_text_with_line_breaks(item))
        elif isinstance(item, str):
            flattened_text.append(item)
        else:
            flattened_text.append(str(item))
    return flattened_text
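
# Example: flatten_text_with_line_breaks([["In the", "beginning"], "created"])
# returns ["In the", "beginning", "created"], regardless of nesting depth.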

def calculate_tanach_statistics(tanach_data):
    """
    Calculates statistics for the Tanach corpus.
    """
    # ... (rest of the function remains the same)

def build_word_index(tanach_data):
    """
    Builds a word index for efficient lookup, ensuring the last word
    aligns with the last second of the day.

    Returns a dictionary keyed by second-of-day; each value records the
    book, chapter, and position of the word scheduled for that second.
    """
    word_index = {}
    word_count = 0
    total_seconds = 24 * 60 * 60  # Total seconds in a day

    # First pass: count the flattened items so the whole corpus can be
    # spread evenly across the day.
    total_words = 0
    for book_id in tanach_data:
        for chapter in tanach_data[book_id]["text"]:
            flattened_chapter = flatten_text_with_line_breaks(chapter)
            total_words += len(flattened_chapter)

    # Guard against division by zero when no text was loaded.
    seconds_per_word = total_seconds / total_words if total_words > 0 else 0

    # Second pass: assign each word a target second. If there are more
    # words than seconds, several words map to the same key and the last
    # assignment wins.
    for book_id in tanach_data:
        for chapter_index, chapter in enumerate(tanach_data[book_id]["text"]):
            flattened_chapter = flatten_text_with_line_breaks(chapter)
            for verse_index, _word in enumerate(flattened_chapter):
                # Scale the running word count into the 0..86399 range.
                target_second = int(word_count * seconds_per_word)

                word_index[target_second] = {
                    "book_id": book_id,
                    "chapter_id": chapter_index,
                    "verse_id": verse_index + 1,
                }
                word_count += 1
    return word_index
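

if __name__ == "__main__":
    # Minimal demo sketch tying the pieces together: load the corpus and
    # look up the word scheduled for the current second of the day. The
    # book-ID range (1, 5) and the texts/NN.json layout are assumptions
    # about the data files, not part of the functions above.
    from datetime import datetime

    logging.basicConfig(level=logging.WARNING)

    tanach = process_json_files(1, 5)
    index = build_word_index(tanach)

    now = datetime.now()
    second_of_day = now.hour * 3600 + now.minute * 60 + now.second

    # When there are fewer words than seconds, not every second is a key,
    # so fall back to the closest earlier second (second 0 always exists
    # if any text was loaded).
    while second_of_day > 0 and second_of_day not in index:
        second_of_day -= 1

    print(index.get(second_of_day))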