Suprath committed on
Commit
8c1767f
·
verified ·
1 Parent(s): 7bb32ce

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -18
app.py CHANGED
@@ -1,6 +1,5 @@
1
  import os
2
  import sys
3
- import xml.etree.ElementTree as ET
4
 
5
  os.system('git clone https://github.com/facebookresearch/av_hubert.git')
6
  os.chdir('/home/user/app/av_hubert')
@@ -17,8 +16,16 @@ os.system('pip install gradio==3.12')
17
  os.system('pip install numpy==1.23.3')
18
 
19
 
 
20
  sys.path.append('/home/user/app/av_hubert/avhubert')
21
 
 
 
 
 
 
 
 
22
  import dlib, cv2, os
23
  import numpy as np
24
  import skvideo
@@ -37,6 +44,8 @@ from huggingface_hub import hf_hub_download
37
  import gradio as gr
38
  from pytube import YouTube
39
 
 
 
40
  user_dir = "/home/user/app/av_hubert/avhubert"
41
  utils.import_user_module(Namespace(user_dir=user_dir))
42
  data_dir = "/home/user/app/video"
@@ -126,26 +135,13 @@ def predict(process_video):
126
  ref = decode_fn(sample['target'][0].int().cpu())
127
  hypo = hypos[0][0]['tokens'].int().cpu()
128
  hypo = decode_fn(hypo)
129
-
130
- # Create XML file
131
- root = ET.Element("transcript")
132
- for i, word in enumerate(hypo.split()):
133
- word_element = ET.SubElement(root, "word")
134
- word_element.set("timecode", str(i))
135
- word_element.text = word
136
-
137
- xml_tree = ET.ElementTree(root)
138
- xml_tree.write("transcript.xml")
139
-
140
- return hypo, "transcript.xml"
141
 
142
 
143
  # ---- Gradio Layout -----
144
  youtube_url_in = gr.Textbox(label="Youtube url", lines=1, interactive=True)
145
  video_in = gr.Video(label="Input Video", mirror_webcam=False, interactive=True)
146
  video_out = gr.Video(label="Audio Visual Video", mirror_webcam=False, interactive=True)
147
- xml_output = gr.File(label="Download XML", download=True)
148
-
149
  demo = gr.Blocks()
150
  demo.encrypt = False
151
  text_output = gr.Textbox()
@@ -154,7 +150,7 @@ with demo:
154
  gr.Markdown('''
155
  <div>
156
  <h1 style='text-align: center'>Speech Recognition from Visual Lip Movement by Audio-Visual Hidden Unit BERT Model (AV-HuBERT)</h1>
157
- This space uses AV-HuBERT models from <a href='https://github.com/facebookresearch' target='_blank'><b>Meta Research</b></a> to recognize the speech from Lip Movement
158
  <figure>
159
  <img src="https://huggingface.co/vumichien/AV-HuBERT/resolve/main/lipreading.gif" alt="Audio-Visual Speech Recognition">
160
  <figcaption> Speech Recognition from visual lip movement
@@ -194,11 +190,11 @@ with demo:
194
  video_out])
195
  predict_btn = gr.Button("Predict")
196
  predict_btn.click(predict, [video_out], [
197
- text_output, xml_output])
198
  with gr.Row():
199
  # video_lip = gr.Video(label="Audio Visual Video", mirror_webcam=False)
200
  text_output.render()
201
 
202
 
203
 
204
- demo.launch(debug=True)
 
1
  import os
2
  import sys
 
3
 
4
  os.system('git clone https://github.com/facebookresearch/av_hubert.git')
5
  os.chdir('/home/user/app/av_hubert')
 
16
  os.system('pip install numpy==1.23.3')
17
 
18
 
19
+ # sys.path.append('/home/user/app/av_hubert')
20
  sys.path.append('/home/user/app/av_hubert/avhubert')
21
 
22
+ print(sys.path)
23
+ print(os.listdir())
24
+ print(sys.argv, type(sys.argv))
25
+ sys.argv.append('dummy')
26
+
27
+
28
+
29
  import dlib, cv2, os
30
  import numpy as np
31
  import skvideo
 
44
  import gradio as gr
45
  from pytube import YouTube
46
 
47
+ # os.chdir('/home/user/app/av_hubert/avhubert')
48
+
49
  user_dir = "/home/user/app/av_hubert/avhubert"
50
  utils.import_user_module(Namespace(user_dir=user_dir))
51
  data_dir = "/home/user/app/video"
 
135
  ref = decode_fn(sample['target'][0].int().cpu())
136
  hypo = hypos[0][0]['tokens'].int().cpu()
137
  hypo = decode_fn(hypo)
138
+ return hypo
 
 
 
 
 
 
 
 
 
 
 
139
 
140
 
141
  # ---- Gradio Layout -----
142
  youtube_url_in = gr.Textbox(label="Youtube url", lines=1, interactive=True)
143
  video_in = gr.Video(label="Input Video", mirror_webcam=False, interactive=True)
144
  video_out = gr.Video(label="Audio Visual Video", mirror_webcam=False, interactive=True)
 
 
145
  demo = gr.Blocks()
146
  demo.encrypt = False
147
  text_output = gr.Textbox()
 
150
  gr.Markdown('''
151
  <div>
152
  <h1 style='text-align: center'>Speech Recognition from Visual Lip Movement by Audio-Visual Hidden Unit BERT Model (AV-HuBERT)</h1>
153
+ This space uses AV-HuBERT models from <a href='https://github.com/facebookresearch' target='_blank'><b>Meta Research</b></a> to recoginze the speech from Lip Movement
154
  <figure>
155
  <img src="https://huggingface.co/vumichien/AV-HuBERT/resolve/main/lipreading.gif" alt="Audio-Visual Speech Recognition">
156
  <figcaption> Speech Recognition from visual lip movement
 
190
  video_out])
191
  predict_btn = gr.Button("Predict")
192
  predict_btn.click(predict, [video_out], [
193
+ text_output])
194
  with gr.Row():
195
  # video_lip = gr.Video(label="Audio Visual Video", mirror_webcam=False)
196
  text_output.render()
197
 
198
 
199
 
200
+ demo.launch(debug=True)