import json
import logging

def process_json_files(start, end):
    """
    Processes JSON files containing Tanach text and returns a dictionary
    mapping book IDs to their data.

    Args:
        start: The starting book ID (inclusive).
        end: The ending book ID (inclusive).

    Returns:
        A dictionary where keys are book IDs and values are dictionaries
        containing 'title' and 'text' fields.
    """
    base_path = "texts"
    results = {}

    for i in range(start, end + 1):
        file_name = f"{base_path}/{i:02}.json"
        try:
            with open(file_name, 'r', encoding='utf-8') as file:
                data = json.load(file)
            if data:
                # Missing fields fall back to defaults via .get(), so no
                # KeyError can occur here.
                results[i] = {
                    "title": data.get("title", "No title"),
                    "text": data.get("text", []),
                }
        except FileNotFoundError:
            logging.warning(f"File {file_name} not found.")
        except json.JSONDecodeError as e:
            logging.warning(f"File {file_name} could not be read as JSON: {e}")

    return results
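
# A minimal usage sketch, assuming files like texts/01.json exist with
# "title" and "text" fields (the range 1..3 below is an arbitrary example,
# not a claim about how many files the corpus contains):
#
#   books = process_json_files(1, 3)
#   for book_id, book in books.items():
#       print(book_id, book["title"], len(book["text"]))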

def flatten_text_with_line_breaks(text):
    """
    Recursively flattens arbitrarily nested lists into a single flat list,
    preserving item order (and therefore line breaks stored as separate
    items). Non-string leaves are coerced with str().
    """
    flattened_text = []
    for item in text:
        if isinstance(item, list):
            # Recurse into nested lists (e.g. chapters containing verses).
            flattened_text.extend(flatten_text_with_line_breaks(item))
        elif isinstance(item, str):
            flattened_text.append(item)
        else:
            flattened_text.append(str(item))
    return flattened_text
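
# Example: flatten_text_with_line_breaks([["In the", "beginning"], "created"])
# returns ["In the", "beginning", "created"], regardless of nesting depth.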

def calculate_tanach_statistics(tanach_data):
    """
    Calculates statistics for the Tanach corpus.
    """
    # ... (rest of the function remains the same)

def build_word_index(tanach_data):
    """
    Builds a word index for efficient lookup, ensuring the last word
    aligns with the last second of the day.

    Returns a dictionary keyed by second-of-day; each value records the
    book, chapter, and position of the word scheduled for that second.
    """
    word_index = {}
    word_count = 0
    total_seconds = 24 * 60 * 60  # Total seconds in a day

    # First pass: count the flattened items so the whole corpus can be
    # spread evenly across the day.
    total_words = 0
    for book_id in tanach_data:
        for chapter in tanach_data[book_id]["text"]:
            flattened_chapter = flatten_text_with_line_breaks(chapter)
            total_words += len(flattened_chapter)

    # Guard against division by zero when no text was loaded.
    seconds_per_word = total_seconds / total_words if total_words > 0 else 0

    # Second pass: assign each word a target second. If there are more
    # words than seconds, several words map to the same key and the last
    # assignment wins.
    for book_id in tanach_data:
        for chapter_index, chapter in enumerate(tanach_data[book_id]["text"]):
            flattened_chapter = flatten_text_with_line_breaks(chapter)
            for verse_index, _word in enumerate(flattened_chapter):
                # Scale the running word count into the 0..86399 range.
                target_second = int(word_count * seconds_per_word)

                word_index[target_second] = {
                    "book_id": book_id,
                    "chapter_id": chapter_index,
                    "verse_id": verse_index + 1,
                }
                word_count += 1
    return word_index
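

if __name__ == "__main__":
    # Minimal demo sketch tying the pieces together: load the corpus and
    # look up the word scheduled for the current second of the day. The
    # book-ID range (1, 5) and the texts/NN.json layout are assumptions
    # about the data files, not part of the functions above.
    from datetime import datetime

    logging.basicConfig(level=logging.WARNING)

    tanach = process_json_files(1, 5)
    index = build_word_index(tanach)

    now = datetime.now()
    second_of_day = now.hour * 3600 + now.minute * 60 + now.second

    # When there are fewer words than seconds, not every second is a key,
    # so fall back to the closest earlier second (second 0 always exists
    # if any text was loaded).
    while second_of_day > 0 and second_of_day not in index:
        second_of_day -= 1

    print(index.get(second_of_day))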