Suprath committed on
Commit d1cebce · verified · 1 Parent(s): 9ea956e

Update app.py

Files changed (1)
  1. app.py +85 -80
app.py CHANGED
@@ -1,6 +1,21 @@
 import os
 import sys
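+# Dependencies for mouth-ROI preprocessing (dlib, cv2, skvideo), AV-HuBERT
+# inference (fairseq), and the demo UI (gradio, pytube).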
+import dlib
+import cv2
+import numpy as np
+import skvideo
+import skvideo.io
+from tqdm import tqdm
+from preparation.align_mouth import landmarks_interpolate, crop_patch, write_video_ffmpeg
+from argparse import Namespace
+import fairseq
+from fairseq import checkpoint_utils, options, tasks, utils
+from fairseq.dataclass.configs import GenerationConfig
+from huggingface_hub import hf_hub_download
+import gradio as gr
+from pytube import YouTube
 
+# ---- Download AV-HuBERT and install dependencies ----
 os.system('git clone https://github.com/facebookresearch/av_hubert.git')
 os.chdir('/home/user/app/av_hubert')
 os.system('git submodule init')
@@ -14,38 +29,9 @@ os.system('pip install scikit-video')
 os.system('pip install transformers')
 os.system('pip install gradio==3.12')
 os.system('pip install numpy==1.23.3')
-
-
-# sys.path.append('/home/user/app/av_hubert')
 sys.path.append('/home/user/app/av_hubert/avhubert')
 
-print(sys.path)
-print(os.listdir())
-print(sys.argv, type(sys.argv))
-sys.argv.append('dummy')
-
-
-
-import dlib, cv2, os
-import numpy as np
-import skvideo
-import skvideo.io
-from tqdm import tqdm
-from preparation.align_mouth import landmarks_interpolate, crop_patch, write_video_ffmpeg
-from base64 import b64encode
-import torch
-import cv2
-import tempfile
-from argparse import Namespace
-import fairseq
-from fairseq import checkpoint_utils, options, tasks, utils
-from fairseq.dataclass.configs import GenerationConfig
-from huggingface_hub import hf_hub_download
-import gradio as gr
-from pytube import YouTube
-
-# os.chdir('/home/user/app/av_hubert/avhubert')
-
+# ---- Load AV-HuBERT models and setup Gradio interface ----
 user_dir = "/home/user/app/av_hubert/avhubert"
 utils.import_user_module(Namespace(user_dir=user_dir))
 data_dir = "/home/user/app/video"
@@ -110,6 +96,16 @@ def preprocess_video(input_video_path):
     write_video_ffmpeg(rois, mouth_roi_path, "/usr/bin/ffmpeg")
     return mouth_roi_path
 
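+# Naive heuristic: assign word i the timestamp i * 0.04 s (one 25 fps video
+# frame per word index); a rough placeholder, not a true word-level alignment.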
+def extract_word_timings(hypo):
+    words = hypo.split()
+    word_timings = [(idx * 0.04, word) for idx, word in enumerate(words)]
+    return word_timings
+
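+# Writes one "<seconds>\t<word>" line per word, e.g. "0.00\thello" then
+# "0.04\tworld" for the hypothesis "hello world".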
+def save_word_timings(word_timings, output_file):
+    with open(output_file, "w") as f:
+        for timing, word in word_timings:
+            f.write(f"{timing:.2f}\t{word}\n")
+
 def predict(process_video):
     num_frames = int(cv2.VideoCapture(process_video).get(cv2.CAP_PROP_FRAME_COUNT))
 
@@ -135,10 +131,18 @@ def predict(process_video):
     ref = decode_fn(sample['target'][0].int().cpu())
     hypo = hypos[0][0]['tokens'].int().cpu()
     hypo = decode_fn(hypo)
-    return hypo
+
+    # Extract word timings
+    word_timings = extract_word_timings(hypo)
+
+    # Save word timings to a txt file
+    output_file = "/home/user/app/av_hubert/avhubert/word_timings.txt"
+    save_word_timings(word_timings, output_file)
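+    # Note: word_timings.txt is rewritten on every prediction, so it only
+    # ever holds the timings from the most recent run.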
 
+    return hypo
 
 # ---- Gradio Layout -----
+
 youtube_url_in = gr.Textbox(label="Youtube url", lines=1, interactive=True)
 video_in = gr.Video(label="Input Video", mirror_webcam=False, interactive=True)
 video_out = gr.Video(label="Audio Visual Video", mirror_webcam=False, interactive=True)
@@ -148,52 +152,53 @@ text_output = gr.Textbox()
 
 with demo:
     gr.Markdown('''
-    <div>
-    <h1 style='text-align: center'>Speech Recognition from Visual Lip Movement by Audio-Visual Hidden Unit BERT Model (AV-HuBERT)</h1>
-    This space uses AV-HuBERT models from <a href='https://github.com/facebookresearch' target='_blank'><b>Meta Research</b></a> to recoginze the speech from Lip Movement 🤗
-    <figure>
-    <img src="https://huggingface.co/vumichien/AV-HuBERT/resolve/main/lipreading.gif" alt="Audio-Visual Speech Recognition">
-    <figcaption> Speech Recognition from visual lip movement
-    </figcaption>
-    </figure>
-    </div>
-    ''')
-    with gr.Row():
-        gr.Markdown('''
-        ### Reading Lip movement with youtube link using Avhubert
-        ##### Step 1a. Download video from youtube (Note: the length of video should be less than 10 seconds if not it will be cut and the face should be stable for better result)
-        ##### Step 1b. You also can upload video directly
-        ##### Step 2. Generating landmarks surrounding mouth area
-        ##### Step 3. Reading lip movement.
-        ''')
-    with gr.Row():
-        gr.Markdown('''
-        ### You can test by following examples:
-        ''')
-    examples = gr.Examples(examples=
-        [ "https://www.youtube.com/watch?v=ZXVDnuepW2s",
-          "https://www.youtube.com/watch?v=X8_glJn1B8o",
-          "https://www.youtube.com/watch?v=80yqL2KzBVw"],
-        label="Examples", inputs=[youtube_url_in])
-    with gr.Column():
-        youtube_url_in.render()
-        download_youtube_btn = gr.Button("Download Youtube video")
-        download_youtube_btn.click(get_youtube, [youtube_url_in], [
-            video_in])
-        print(video_in)
-        with gr.Row():
-            video_in.render()
-            video_out.render()
-        with gr.Row():
-            detect_landmark_btn = gr.Button("Detect landmark")
-            detect_landmark_btn.click(preprocess_video, [video_in], [
-                video_out])
-            predict_btn = gr.Button("Predict")
-            predict_btn.click(predict, [video_out], [
-                text_output])
-    with gr.Row():
-        # video_lip = gr.Video(label="Audio Visual Video", mirror_webcam=False)
-        text_output.render()
-
-
-demo.launch(debug=True)
+    <div>
+    <h1 style='text-align: center'>Speech Recognition from Visual Lip Movement by Audio-Visual Hidden Unit BERT Model (AV-HuBERT)</h1>
+    This space uses AV-HuBERT models from <a href='https://github.com/facebookresearch' target='_blank'><b>Meta Research</b></a> to recognize speech from lip movement 🤗
+    <figure>
+    <img src="https://huggingface.co/vumichien/AV-HuBERT/resolve/main/lipreading.gif" alt="Audio-Visual Speech Recognition">
+    <figcaption> Speech recognition from visual lip movement
+    </figcaption>
+    </figure>
+    </div>
+    ''')
+
+    gr.Markdown('''
+    ### Reading lip movement from a YouTube link using AV-HuBERT
+    ##### Step 1a. Download a video from YouTube (note: videos longer than 10 seconds will be cut, and a stable face gives better results)
+    ##### Step 1b. You can also upload a video directly
+    ##### Step 2. Generate landmarks around the mouth area
+    ##### Step 3. Read the lip movement.
+    ''')
+
+    gr.Markdown('''
+    ### You can test with the following examples:
+    ''')
+
+    examples = gr.Examples(examples=[
+        "https://www.youtube.com/watch?v=ZXVDnuepW2s",
+        "https://www.youtube.com/watch?v=X8_glJn1B8o",
+        "https://www.youtube.com/watch?v=80yqL2KzBVw"],
+        label="Examples", inputs=[youtube_url_in])
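+
+    # The three buttons wire the pipeline in order: download the YouTube clip,
+    # crop the mouth ROI, then run lip-reading prediction.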
+    youtube_url_in.render()
+
+    download_youtube_btn = gr.Button("Download Youtube video")
+    download_youtube_btn.click(get_youtube, [youtube_url_in], [video_in])
+
+    detect_landmark_btn = gr.Button("Detect landmark")
+    detect_landmark_btn.click(preprocess_video, [video_in], [video_out])
+
+    predict_btn = gr.Button("Predict")
+    predict_btn.click(predict, [video_out], [text_output])
+
+    video_in.render()
+    video_out.render()
+    text_output.render()
+
+    # Download button for word timings file.
+    # Note: gr.Download does not exist in Gradio 3.12; a gr.File output plus a
+    # button is an assumed replacement that exposes the file for download.
+    word_timings_file = gr.File(label="Word Timings")
+    download_word_timings_btn = gr.Button("Download Word Timings")
+    download_word_timings_btn.click(lambda: "/home/user/app/av_hubert/avhubert/word_timings.txt", None, [word_timings_file])
+
+demo.launch(debug=True)