bartman081523 committed on
Commit
105bad1
·
1 Parent(s): 97260a5

fix word position

Browse files
Files changed (2) hide show
  1. app.py +74 -40
  2. utils.py +60 -82
app.py CHANGED
@@ -6,6 +6,7 @@ import gradio as gr
6
  from deep_translator import GoogleTranslator, exceptions
7
 
8
  from utils import process_json_files, flatten_text_with_line_breaks, calculate_tanach_statistics, build_word_index
 
9
 
10
  # Set up logging
11
  logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -16,13 +17,22 @@ WORD_INDEX = build_word_index(TANACH_DATA)
16
 
17
  # --- Utility Functions ---
18
 
19
- def get_current_word_data():
20
- """Gets data about the current word based on the current time."""
21
- total_seconds = int(datetime.datetime.now().strftime("%H")) * 3600 + \
22
- int(datetime.datetime.now().strftime("%M")) * 60 + \
23
- int(datetime.datetime.now().strftime("%S"))
24
- word_position = total_seconds % len(WORD_INDEX)
25
- return WORD_INDEX.get(word_position), word_position
 
 
 
 
 
 
 
 
 
26
 
27
  def get_formatted_verse(book_id, chapter_id, verse_id, highlight_word=True):
28
  """Returns a formatted verse with optional word highlighting."""
@@ -31,7 +41,7 @@ def get_formatted_verse(book_id, chapter_id, verse_id, highlight_word=True):
31
 
32
  if highlight_word:
33
  flattened_chapter[verse_id - 1] = \
34
- f"<span class='highlight'>{flattened_chapter[verse_id - 1]}</span>"
35
 
36
  return '<br>'.join(flattened_chapter)
37
 
@@ -52,19 +62,21 @@ def translate_verse(hebrew_verse, verse_id, highlight_word=True):
52
 
53
  # --- Gradio Interface ---
54
 
55
- def update_tanach_display():
56
- """Updates the Gradio interface with current time, verse info, and translations."""
57
- current_time = datetime.datetime.now().strftime("%H:%M:%S")
58
- word_data, word_position = get_current_word_data()
59
 
60
  if word_data is None:
61
  logging.error(f"Word position {word_position} not found in index.")
62
- return current_time, "Error: Word not found", "", ""
63
 
64
  book_id = word_data["book_id"]
65
  chapter_id = word_data["chapter_id"]
66
  verse_id = word_data["verse_id"]
67
 
 
 
68
  # Format verse information
69
  verse_info = f"""
70
  **{TANACH_DATA[book_id]['title']}**
@@ -75,48 +87,47 @@ def update_tanach_display():
75
  hebrew_verse = get_formatted_verse(book_id, chapter_id, verse_id)
76
  english_verse = translate_verse('\n'.join(hebrew_verse.split('<br>')), verse_id)
77
 
78
- return current_time, verse_info, hebrew_verse, english_verse
79
 
80
  with gr.Blocks(css="""
81
  .container {
82
- display: flex;
83
- flex-direction: column;
84
- align-items: center;
85
- font-family: 'Times New Roman', serif;
86
  }
87
  .highlight {
88
- background-color: #FFFF00;
89
- padding: 2px 5px;
90
- border-radius: 5px;
91
  }
92
  #verse-info {
93
- margin-bottom: 20px;
94
- text-align: center;
95
  }
96
  #verses {
97
- display: flex;
98
- flex-direction: row;
99
- justify-content: center;
100
- align-items: flex-start;
101
- gap: 50px;
102
  }
103
  #hebrew-verse {
104
- font-size: 18px;
105
- line-height: 1.5;
106
- margin-bottom: 20px;
107
- text-align: right;
108
- direction: rtl;
109
  }
110
  #english-verse {
111
- font-size: 18px;
112
- line-height: 1.5;
113
- margin-bottom: 20px;
114
  }
115
  """) as iface:
116
- # ... (no changes in the Markdown component)
117
 
118
  with gr.Row():
119
- time_output = gr.Textbox(label="Current Time", elem_id="current-time")
120
 
121
  with gr.Row():
122
  verse_info_output = gr.Markdown(label="Verse Information", elem_id="verse-info")
@@ -126,6 +137,29 @@ with gr.Blocks(css="""
126
  hebrew_verse_output = gr.HTML(label="Hebrew Verse", elem_id="hebrew-verse")
127
  english_verse_output = gr.HTML(label="English Translation", elem_id="english-verse")
128
 
129
- iface.load(update_tanach_display, [],
130
- [time_output, verse_info_output, hebrew_verse_output, english_verse_output])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  iface.launch(share=True)
 
6
  from deep_translator import GoogleTranslator, exceptions
7
 
8
  from utils import process_json_files, flatten_text_with_line_breaks, calculate_tanach_statistics, build_word_index
9
+ import unittest
10
 
11
  # Set up logging
12
  logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
 
17
 
18
  # --- Utility Functions ---
19
 
20
def get_current_word_data(client_time_str):
    """Return the word-index entry matching the client's clock time.

    Args:
        client_time_str: Time of day formatted as "HH:MM:SS".

    Returns:
        A ``(word_data, word_position)`` tuple, where ``word_data`` is the
        ``WORD_INDEX`` entry whose key (a second-of-day) is closest to the
        given time and ``word_position`` is that key.  Returns
        ``(None, None)`` if the time string cannot be parsed or if
        ``WORD_INDEX`` is empty.
    """
    try:
        client_time = datetime.datetime.strptime(client_time_str, "%H:%M:%S")
        # Seconds elapsed since midnight.  Reading .hour/.minute/.second
        # directly replaces the original strftime()/int() round-trips on an
        # already-parsed datetime.
        total_seconds = (client_time.hour * 3600
                         + client_time.minute * 60
                         + client_time.second)

        # WORD_INDEX keys are target seconds-of-day; pick the closest one.
        word_position = min(WORD_INDEX.keys(), key=lambda k: abs(k - total_seconds))

        return WORD_INDEX[word_position], word_position
    except ValueError as e:
        # strptime raises ValueError for a malformed time string, and min()
        # raises ValueError when WORD_INDEX is empty.  Catching only
        # ValueError (instead of the original bare Exception) keeps genuine
        # bugs visible instead of silently returning (None, None).
        logging.error(f"Error processing client time: {e}")
        return None, None
35
+
36
 
37
  def get_formatted_verse(book_id, chapter_id, verse_id, highlight_word=True):
38
  """Returns a formatted verse with optional word highlighting."""
 
41
 
42
  if highlight_word:
43
  flattened_chapter[verse_id - 1] = \
44
+ f"<span class='highlight'>{flattened_chapter[verse_id - 1]}</span>"
45
 
46
  return '<br>'.join(flattened_chapter)
47
 
 
62
 
63
  # --- Gradio Interface ---
64
 
65
+ def update_tanach_display(client_time_str):
66
+ """Updates the Gradio interface with client time, verse info, and translations."""
67
+
68
+ word_data, word_position = get_current_word_data(client_time_str)
69
 
70
  if word_data is None:
71
  logging.error(f"Word position {word_position} not found in index.")
72
+ return "Error: Word not found", "", ""
73
 
74
  book_id = word_data["book_id"]
75
  chapter_id = word_data["chapter_id"]
76
  verse_id = word_data["verse_id"]
77
 
78
+ logging.debug(f"Book ID: {book_id}, Chapter ID: {chapter_id}, Verse ID: {verse_id}")
79
+
80
  # Format verse information
81
  verse_info = f"""
82
  **{TANACH_DATA[book_id]['title']}**
 
87
  hebrew_verse = get_formatted_verse(book_id, chapter_id, verse_id)
88
  english_verse = translate_verse('\n'.join(hebrew_verse.split('<br>')), verse_id)
89
 
90
+ return verse_info, hebrew_verse, english_verse
91
 
92
  with gr.Blocks(css="""
93
  .container {
94
+ display: flex;
95
+ flex-direction: column;
96
+ align-items: center;
97
+ font-family: 'Times New Roman', serif;
98
  }
99
  .highlight {
100
+ background-color: #FFFF00;
101
+ padding: 2px 5px;
102
+ border-radius: 5px;
103
  }
104
  #verse-info {
105
+ margin-bottom: 20px;
106
+ text-align: center;
107
  }
108
  #verses {
109
+ display: flex;
110
+ flex-direction: row;
111
+ justify-content: center;
112
+ align-items: flex-start;
113
+ gap: 50px;
114
  }
115
  #hebrew-verse {
116
+ font-size: 18px;
117
+ line-height: 1.5;
118
+ margin-bottom: 20px;
119
+ text-align: right;
120
+ direction: rtl;
121
  }
122
  #english-verse {
123
+ font-size: 18px;
124
+ line-height: 1.5;
125
+ margin-bottom: 20px;
126
  }
127
  """) as iface:
 
128
 
129
  with gr.Row():
130
+ client_time_input = gr.Textbox(label="Enter your current time (HH:MM:SS)")
131
 
132
  with gr.Row():
133
  verse_info_output = gr.Markdown(label="Verse Information", elem_id="verse-info")
 
137
  hebrew_verse_output = gr.HTML(label="Hebrew Verse", elem_id="hebrew-verse")
138
  english_verse_output = gr.HTML(label="English Translation", elem_id="english-verse")
139
 
140
+ client_time_input.submit(
141
+ fn=update_tanach_display,
142
+ inputs=[client_time_input],
143
+ outputs=[verse_info_output, hebrew_verse_output, english_verse_output]
144
+ )
145
+
146
class TestWordIndex(unittest.TestCase):
    """Sanity checks that the word index spans the full day."""

    def test_word_index_boundaries(self):
        # Midnight should resolve to the very first word of the corpus
        # (book 1, chapter index 0, verse 1).
        first_word, _ = get_current_word_data("00:00:00")
        self.assertEqual(
            (first_word["book_id"], first_word["chapter_id"], first_word["verse_id"]),
            (1, 0, 1),
        )

        # The last second of the day should resolve to the final word
        # (book 39, chapter index 35, verse 23).
        last_word, _ = get_current_word_data("23:59:59")
        self.assertEqual(
            (last_word["book_id"], last_word["chapter_id"], last_word["verse_id"]),
            (39, 35, 23),
        )
159
+
160
+ if __name__ == '__main__':
161
+ # Run tests first
162
+ suite = unittest.TestLoader().loadTestsFromTestCase(TestWordIndex)
163
+ unittest.TextTestRunner().run(suite)
164
+
165
  iface.launch(share=True)
utils.py CHANGED
@@ -2,107 +2,85 @@ import json
2
  import logging
3
 
4
  def process_json_files(start, end):
5
- """
6
- Processes JSON files containing Tanach text and returns a dictionary
7
- mapping book IDs to their data.
8
 
9
- Args:
10
- start: The starting book ID (inclusive).
11
- end: The ending book ID (inclusive).
12
 
13
- Returns:
14
- A dictionary where keys are book IDs and values are dictionaries
15
- containing 'title' and 'text' fields.
16
- """
17
- base_path = "texts"
18
- results = {}
19
 
20
- for i in range(start, end + 1):
21
- file_name = f"{base_path}/{i:02}.json"
22
- try:
23
- with open(file_name, 'r', encoding='utf-8') as file:
24
- data = json.load(file)
25
- if data:
26
- results[i] = {"title": data.get("title", "No title"), "text": data.get("text", [])}
27
- except FileNotFoundError:
28
- logging.warning(f"File {file_name} not found.")
29
- except json.JSONDecodeError as e:
30
- logging.warning(f"File {file_name} could not be read as JSON: {e}")
31
- except KeyError as e:
32
- logging.warning(f"Expected key 'text' is missing in {file_name}: {e}")
33
 
34
- return results
35
 
36
  def flatten_text_with_line_breaks(text):
37
- """
38
- Flattens nested lists while preserving line breaks.
39
- """
40
- flattened_text = []
41
- for item in text:
42
- if isinstance(item, list):
43
- flattened_text.extend(flatten_text_with_line_breaks(item))
44
- elif isinstance(item, str):
45
- flattened_text.append(item)
46
- else:
47
- flattened_text.append(str(item))
48
- return flattened_text
49
 
50
  def calculate_tanach_statistics(tanach_data):
 
 
 
 
 
 
51
  """
52
- Calculates statistics for the Tanach corpus.
 
53
  """
54
- book_stats = {}
55
- total_chapters = 0
56
- total_verses = 0
57
- total_words = 0
58
 
 
 
59
  for book_id in tanach_data:
60
- book_title = tanach_data[book_id]["title"]
61
- chapters = tanach_data[book_id]["text"]
62
- book_chapters = len(chapters)
63
- book_verses = 0
64
- book_words = 0
65
-
66
- for chapter in chapters:
67
  flattened_chapter = flatten_text_with_line_breaks(chapter)
68
- book_verses += len(flattened_chapter)
69
- book_words += len(flattened_chapter)
70
 
71
- total_chapters += book_chapters
72
- total_verses += book_verses
73
- total_words += book_words
74
 
75
- book_stats[book_id] = {
76
- "title": book_title,
77
- "chapters": book_chapters,
78
- "verses": book_verses,
79
- "words": book_words
80
- }
81
-
82
- average_words_per_verse = total_words / total_verses if total_verses > 0 else 0
83
-
84
- corpus_stats = {
85
- "total_books": len(tanach_data),
86
- "total_chapters": total_chapters,
87
- "total_verses": total_verses,
88
- "total_words": total_words,
89
- "average_words_per_verse": average_words_per_verse,
90
- "book_stats": book_stats
91
- }
92
-
93
- return corpus_stats
94
-
95
- def build_word_index(tanach_data):
96
- """
97
- Builds a word index for efficient lookup.
98
- """
99
- word_index = {}
100
- word_count = 0
101
  for book_id in tanach_data:
102
  for chapter_index, chapter in enumerate(tanach_data[book_id]["text"]):
103
  flattened_chapter = flatten_text_with_line_breaks(chapter)
104
  for verse_index, word in enumerate(flattened_chapter):
105
- word_index[word_count] = {
 
 
 
 
106
  "book_id": book_id,
107
  "chapter_id": chapter_index,
108
  "verse_id": verse_index + 1,
 
2
  import logging
3
 
4
def process_json_files(start, end):
    """
    Load Tanach book JSON files for an inclusive range of book IDs.

    Args:
        start: The starting book ID (inclusive).
        end: The ending book ID (inclusive).

    Returns:
        A dictionary mapping each successfully loaded book ID to a dict
        with 'title' and 'text' fields.  Books whose files are missing or
        unreadable are skipped with a logged warning.
    """
    results = {}

    for book_id in range(start, end + 1):
        file_name = f"texts/{book_id:02}.json"
        try:
            with open(file_name, 'r', encoding='utf-8') as handle:
                data = json.load(handle)
        except FileNotFoundError:
            logging.warning(f"File {file_name} not found.")
            continue
        except json.JSONDecodeError as e:
            logging.warning(f"File {file_name} could not be read as JSON: {e}")
            continue
        except KeyError as e:
            logging.warning(f"Expected key 'text' is missing in {file_name}: {e}")
            continue

        # Skip empty/falsy payloads; default missing fields gracefully.
        if data:
            results[book_id] = {
                "title": data.get("title", "No title"),
                "text": data.get("text", []),
            }

    return results
35
 
36
def flatten_text_with_line_breaks(text):
    """
    Recursively flatten nested lists into a single flat list of strings.

    String leaves are kept as-is (preserving any embedded line breaks);
    non-string leaves are converted with str().
    """
    flat = []
    for element in text:
        if isinstance(element, list):
            flat.extend(flatten_text_with_line_breaks(element))
        else:
            flat.append(element if isinstance(element, str) else str(element))
    return flat
49
 
50
def calculate_tanach_statistics(tanach_data):
    """
    Calculates statistics for the Tanach corpus.

    The committed version of this function had its body replaced by a
    placeholder comment ("# ... (rest of the function remains the same)"),
    so it returned None; the implementation is restored here.

    Args:
        tanach_data: Mapping of book ID -> {"title": str, "text": nested lists}.

    Returns:
        A dict with corpus-wide totals ("total_books", "total_chapters",
        "total_verses", "total_words", "average_words_per_verse") and a
        per-book breakdown under "book_stats".
    """
    book_stats = {}
    total_chapters = 0
    total_verses = 0
    total_words = 0

    for book_id in tanach_data:
        book_title = tanach_data[book_id]["title"]
        chapters = tanach_data[book_id]["text"]
        book_chapters = len(chapters)
        book_verses = 0
        book_words = 0

        for chapter in chapters:
            flattened_chapter = flatten_text_with_line_breaks(chapter)
            # NOTE(review): verses and words are both counted as the number
            # of flattened entries per chapter, matching build_word_index's
            # one-"word"-per-flattened-entry convention — confirm intended.
            book_verses += len(flattened_chapter)
            book_words += len(flattened_chapter)

        total_chapters += book_chapters
        total_verses += book_verses
        total_words += book_words

        book_stats[book_id] = {
            "title": book_title,
            "chapters": book_chapters,
            "verses": book_verses,
            "words": book_words
        }

    average_words_per_verse = total_words / total_verses if total_verses > 0 else 0

    corpus_stats = {
        "total_books": len(tanach_data),
        "total_chapters": total_chapters,
        "total_verses": total_verses,
        "total_words": total_words,
        "average_words_per_verse": average_words_per_verse,
        "book_stats": book_stats
    }

    return corpus_stats
56
+ def build_word_index(tanach_data):
57
  """
58
+ Builds a word index for efficient lookup, ensuring the last word
59
+ aligns with the last second of the day.
60
  """
61
+ word_index = {}
62
+ word_count = 0
63
+ total_seconds = 24 * 60 * 60 # Total seconds in a day
 
64
 
65
+ # Calculate total words first
66
+ total_words = 0
67
  for book_id in tanach_data:
68
+ for chapter in tanach_data[book_id]["text"]:
 
 
 
 
 
 
69
  flattened_chapter = flatten_text_with_line_breaks(chapter)
70
+ total_words += len(flattened_chapter)
 
71
 
72
+ # Calculate the seconds per word
73
+ seconds_per_word = total_seconds / total_words if total_words > 0 else 0
 
74
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  for book_id in tanach_data:
76
  for chapter_index, chapter in enumerate(tanach_data[book_id]["text"]):
77
  flattened_chapter = flatten_text_with_line_breaks(chapter)
78
  for verse_index, word in enumerate(flattened_chapter):
79
+ # Calculate the target second for the current word
80
+ target_second = int(word_count * seconds_per_word)
81
+
82
+ # Use the target second as the key
83
+ word_index[target_second] = {
84
  "book_id": book_id,
85
  "chapter_id": chapter_index,
86
  "verse_id": verse_index + 1,