Suprath committed on
Commit
af4b5be
·
verified ·
1 Parent(s): dc33607

Upload 8 files

Browse files
.gitattributes CHANGED
@@ -25,7 +25,6 @@
25
  *.safetensors filter=lfs diff=lfs merge=lfs -text
26
  saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
  *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
  *.tflite filter=lfs diff=lfs merge=lfs -text
30
  *.tgz filter=lfs diff=lfs merge=lfs -text
31
  *.wasm filter=lfs diff=lfs merge=lfs -text
@@ -33,3 +32,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
25
  *.safetensors filter=lfs diff=lfs merge=lfs -text
26
  saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
  *.tar.* filter=lfs diff=lfs merge=lfs -text
 
28
  *.tflite filter=lfs diff=lfs merge=lfs -text
29
  *.tgz filter=lfs diff=lfs merge=lfs -text
30
  *.wasm filter=lfs diff=lfs merge=lfs -text
 
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ shape_predictor_68_face_landmarks.dat filter=lfs diff=lfs merge=lfs -text
36
+ demo1.mp4 filter=lfs diff=lfs merge=lfs -text
37
+ demo2.mp4 filter=lfs diff=lfs merge=lfs -text
38
+ lipreading.gif filter=lfs diff=lfs merge=lfs -text
20words_mean_face.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dbf68b2044171e1160716df7c53e8bbfaa0ee8c61fb41171d04cb6092bb81422
3
+ size 1168
README.md CHANGED
@@ -1,13 +1,14 @@
1
  ---
2
- title: Liptotext
3
- emoji: 📚
4
- colorFrom: red
5
- colorTo: red
6
  sdk: gradio
7
- sdk_version: 4.22.0
8
  app_file: app.py
9
  pinned: false
10
- license: unknown
 
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Speech Recognition from visual lip movement
3
+ emoji: 🫧
4
+ colorFrom: indigo
5
+ colorTo: pink
6
  sdk: gradio
7
+ sdk_version: 3.16.1
8
  app_file: app.py
9
  pinned: false
10
+ tags:
11
+ - making-demos
12
  ---
13
 
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+
4
# Bootstrap: fetch AV-HuBERT (with its fairseq submodule) and install the
# Python packages the app needs. This runs on every start-up because it lives
# at module top level.
# NOTE(review): the checkout location is hard-coded to a Windows path while the
# asset paths used further down in this file point at /home/user/app - confirm
# which environment this script is meant to run in.
os.system('git clone https://github.com/facebookresearch/av_hubert.git')
os.chdir(r'D:\vsCode\lip2text\av_hubert')
os.system('git submodule init')
os.system('git submodule update')
os.chdir(r'D:\vsCode\lip2text\av_hubert\fairseq')
os.system('pip install ./')
os.system('pip install scipy')
os.system('pip install sentencepiece')
os.system('pip install python_speech_features')
os.system('pip install scikit-video')
os.system('pip install transformers')
os.system('pip install gradio==3.12')
os.system('pip install numpy==1.23.3')


# sys.path.append('/home/user/app/av_hubert')
# Raw string on purpose: in a plain literal "\v" and "\a" are the vertical-tab
# and bell escapes, which silently corrupted the directory added to sys.path.
sys.path.append(r'D:\vsCode\lip2text\av_hubert')

print(sys.path)
print(os.listdir())
print(sys.argv, type(sys.argv))
# NOTE(review): presumably a placeholder argument for code that inspects
# sys.argv later on - confirm it is still needed.
sys.argv.append('dummy')
26
+
27
+
28
+
29
+ import dlib, cv2, os
30
+ import numpy as np
31
+ import skvideo
32
+ import skvideo.io
33
+ from tqdm import tqdm
34
+ from preparation.align_mouth import landmarks_interpolate, crop_patch, write_video_ffmpeg
35
+ from base64 import b64encode
36
+ import torch
37
+ import cv2
38
+ import tempfile
39
+ from argparse import Namespace
40
+ import fairseq
41
+ from fairseq import checkpoint_utils, options, tasks, utils
42
+ from fairseq.dataclass.configs import GenerationConfig
43
+ from huggingface_hub import hf_hub_download
44
+ import gradio as gr
45
+ from pytube import YouTube
46
+
47
+ # os.chdir('/home/user/app/av_hubert/avhubert')
48
+
49
# --- AV-HuBERT runtime configuration -----------------------------------------
# Directory handed to fairseq as the user module to import.
user_dir = "/home/user/app/av_hubert/avhubert"
utils.import_user_module(Namespace(user_dir=user_dir))
# Used both as the task data directory and as the label directory below.
data_dir = "/home/user/app/video"

# Model checkpoint is fetched from the Hugging Face Hub; the remaining assets
# are read from fixed locations on disk.
ckpt_path = hf_hub_download('vumichien/AV-HuBERT', 'model.pt')
face_detector_path = "/home/user/app/mmod_human_face_detector.dat"
face_predictor_path = "/home/user/app/shape_predictor_68_face_landmarks.dat"
mean_face_path = "/home/user/app/20words_mean_face.npy"
mouth_roi_path = "/home/user/app/roi.mp4"
modalities = ["video"]
gen_subset = "test"
gen_cfg = GenerationConfig(beam=20)


def _prepare_for_inference(model):
    """Put *model* in eval mode, moving it to the GPU when CUDA is available."""
    if torch.cuda.is_available():
        return model.eval().cuda()
    return model.eval()


# --- Load the checkpoint and build the generator ------------------------------
models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task([ckpt_path])
models = [_prepare_for_inference(model) for model in models]
# Override the checkpoint's task settings, then rebuild the task from them.
saved_cfg.task.modalities = modalities
saved_cfg.task.data = data_dir
saved_cfg.task.label_dir = data_dir
task = tasks.setup_task(saved_cfg.task)
generator = task.build_generator(models, gen_cfg)
68
+
69
def get_youtube(video_url):
    """Download a YouTube video as an MP4 file and return its local path.

    Streams are filtered to progressive MP4, ordered by resolution in
    descending order, and the first match is downloaded to pytube's default
    output location.

    Args:
        video_url: URL of the YouTube video.

    Returns:
        The path returned by the stream's ``download()`` call.

    Raises:
        ValueError: If the video offers no progressive MP4 stream.
    """
    yt = YouTube(video_url)
    stream = (
        yt.streams.filter(progressive=True, file_extension='mp4')
        .order_by('resolution')
        .desc()
        .first()
    )
    # first() yields None for an empty query; fail with a readable message
    # instead of an AttributeError on None.download().
    if stream is None:
        raise ValueError(f"No progressive MP4 stream available for {video_url}")
    abs_video_path = stream.download()
    print("Success download video")
    print(abs_video_path)
    return abs_video_path
75
+
76
def detect_landmark(image, detector, predictor):
    """Locate the 68 facial landmarks in a single frame.

    Args:
        image: Frame to analyse; converted to grayscale with
            ``cv2.COLOR_RGB2GRAY`` before detection.
        detector: Face detector, called as ``detector(gray, 1)``.
        predictor: Shape predictor, called as ``predictor(gray, rect)``.

    Returns:
        A ``(68, 2)`` ``int32`` array of ``(x, y)`` coordinates, or ``None``
        when the detector finds no face. When several faces are found, the
        landmarks of the last one win.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
    face_locations = detector(gray, 1)
    coords = None
    for face_location in face_locations:
        # Some detectors wrap the rectangle in an object exposing `.rect`,
        # others yield the rectangle itself. Decide from the detection rather
        # than from global CUDA availability, so the function works with
        # whichever detector the caller passes in.
        rect = getattr(face_location, 'rect', face_location)
        shape = predictor(gray, rect)
        coords = np.array(
            [(shape.part(i).x, shape.part(i).y) for i in range(68)],
            dtype=np.int32,
        )
    return coords
90
+
91
def preprocess_video(input_video_path):
    """Crop the mouth region of a video and write it to ``mouth_roi_path``.

    Detects facial landmarks on every frame, interpolates them, crops a
    96x96 patch around landmark indices 48-68 and encodes the result with
    ffmpeg.

    Args:
        input_video_path: Path of the video to process.

    Returns:
        ``mouth_roi_path``, the location of the written mouth-ROI video.
    """
    # Detector choice follows CUDA availability: CNN model on GPU machines,
    # dlib's frontal face detector otherwise.
    if torch.cuda.is_available():
        detector = dlib.cnn_face_detection_model_v1(face_detector_path)
    else:
        detector = dlib.get_frontal_face_detector()

    predictor = dlib.shape_predictor(face_predictor_path)
    STD_SIZE = (256, 256)
    mean_face_landmarks = np.load(mean_face_path)
    stablePntsIDs = [33, 36, 39, 42, 45]
    # vread already returns every frame in one array; iterate it directly
    # rather than copying the whole decoded video into a second array.
    frames = skvideo.io.vread(input_video_path)
    landmarks = [detect_landmark(frame, detector, predictor) for frame in tqdm(frames)]
    # NOTE(review): entries are None for frames without a detected face;
    # presumably landmarks_interpolate fills those gaps - confirm what happens
    # when no frame contains a face.
    preprocessed_landmarks = landmarks_interpolate(landmarks)
    rois = crop_patch(input_video_path, preprocessed_landmarks, mean_face_landmarks, stablePntsIDs, STD_SIZE,
                      window_margin=12, start_idx=48, stop_idx=68, crop_height=96, crop_width=96)
    write_video_ffmpeg(rois, mouth_roi_path, "/usr/bin/ffmpeg")
    return mouth_roi_path
112
+
113
def predict(process_video):
    """Run lip-reading inference on a mouth-ROI video and return the text.

    Writes a one-entry manifest (``test.tsv``) and a dummy label file
    (``test.wrd``) into ``data_dir``, loads them as the ``gen_subset``
    dataset, runs one inference step and decodes the first hypothesis.

    Args:
        process_video: Path of the video to transcribe.

    Returns:
        The decoded hypothesis for the video.
    """
    # Release the capture handle explicitly instead of relying on garbage
    # collection of the temporary object.
    capture = cv2.VideoCapture(process_video)
    try:
        num_frames = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
    finally:
        capture.release()

    # NOTE(review): the last column looks like an audio sample count assuming
    # 25 fps video and 16 kHz audio - confirm against the dataset loader.
    tsv_cont = ["/\n", f"test-0\t{process_video}\t{None}\t{num_frames}\t{int(16_000*num_frames/25)}\n"]
    label_cont = ["DUMMY\n"]
    with open(f"{data_dir}/test.tsv", "w") as fo:
        fo.write("".join(tsv_cont))
    with open(f"{data_dir}/test.wrd", "w") as fo:
        fo.write("".join(label_cont))
    task.load_dataset(gen_subset, task_cfg=saved_cfg.task)

    def decode_fn(x):
        """Decode token ids to text, stripping the generator's ignore set and padding."""
        dictionary = task.target_dictionary
        symbols_ignore = generator.symbols_to_strip_from_output
        symbols_ignore.add(dictionary.pad())
        return task.datasets[gen_subset].label_processors[0].decode(x, symbols_ignore)

    itr = task.get_batch_iterator(dataset=task.dataset(gen_subset)).next_epoch_itr(shuffle=False)
    sample = next(itr)
    if torch.cuda.is_available():
        sample = utils.move_to_cuda(sample)
    hypos = task.inference_step(generator, models, sample)
    # Only the first hypothesis of the first sample is returned; the dummy
    # reference label is not decoded because its value was never used.
    hypo = hypos[0][0]['tokens'].int().cpu()
    return decode_fn(hypo)
139
+
140
+
141
+ # ---- Gradio Layout -----
# Components are created up front and placed into the page later through
# their .render() calls inside the `with demo:` section.
142
+ youtube_url_in = gr.Textbox(label="Youtube url", lines=1, interactive=True)
143
+ video_in = gr.Video(label="Input Video", mirror_webcam=False, interactive=True)
144
+ video_out = gr.Video(label="Audio Visual Video", mirror_webcam=False, interactive=True)
145
+ demo = gr.Blocks()
146
+ demo.encrypt = False
147
+ text_output = gr.Textbox()
148
+
149
# Page content: intro banner, usage steps, example URLs, then the controls.
+ with demo:
150
+ gr.Markdown('''
151
+ <div>
152
+ <h1 style='text-align: center'>Speech Recognition from Visual Lip Movement by Audio-Visual Hidden Unit BERT Model (AV-HuBERT)</h1>
153
+ This space uses AV-HuBERT models from <a href='https://github.com/facebookresearch' target='_blank'><b>Meta Research</b></a> to recoginze the speech from Lip Movement 🤗
154
+ <figure>
155
+ <img src="https://huggingface.co/vumichien/AV-HuBERT/resolve/main/lipreading.gif" alt="Audio-Visual Speech Recognition">
156
+ <figcaption> Speech Recognition from visual lip movement
157
+ </figcaption>
158
+ </figure>
159
+ </div>
160
+ ''')
161
+ with gr.Row():
162
+ gr.Markdown('''
163
+ ### Reading Lip movement with youtube link using Avhubert
164
+ ##### Step 1a. Download video from youtube (Note: the length of video should be less than 10 seconds if not it will be cut and the face should be stable for better result)
165
+ ##### Step 1b. You also can upload video directly
166
+ ##### Step 2. Generating landmarks surrounding mouth area
167
+ ##### Step 3. Reading lip movement.
168
+ ''')
169
+ with gr.Row():
170
+ gr.Markdown('''
171
+ ### You can test by following examples:
172
+ ''')
173
# Example URLs populate the YouTube URL textbox.
+ examples = gr.Examples(examples=
174
+ [ "https://www.youtube.com/watch?v=ZXVDnuepW2s",
175
+ "https://www.youtube.com/watch?v=X8_glJn1B8o",
176
+ "https://www.youtube.com/watch?v=80yqL2KzBVw"],
177
+ label="Examples", inputs=[youtube_url_in])
178
+ with gr.Column():
179
+ youtube_url_in.render()
180
+ download_youtube_btn = gr.Button("Download Youtube video")
181
# Step 1a: get_youtube receives the URL textbox value; its result goes to video_in.
+ download_youtube_btn.click(get_youtube, [youtube_url_in], [
182
+ video_in])
183
# NOTE(review): prints the gr.Video component object while the layout is
# built - looks like a leftover debug statement.
+ print(video_in)
184
+ with gr.Row():
185
+ video_in.render()
186
+ video_out.render()
187
+ with gr.Row():
188
+ detect_landmark_btn = gr.Button("Detect landmark")
189
# Step 2: preprocess_video receives video_in; its result goes to video_out.
+ detect_landmark_btn.click(preprocess_video, [video_in], [
190
+ video_out])
191
+ predict_btn = gr.Button("Predict")
192
# Step 3: predict receives video_out; its result goes to text_output.
+ predict_btn.click(predict, [video_out], [
193
+ text_output])
194
+ with gr.Row():
195
+ # video_lip = gr.Video(label="Audio Visual Video", mirror_webcam=False)
196
+ text_output.render()
197
+
198
+
199
# Start the Gradio app with debug mode enabled.
+ demo.launch(debug=True)
mmod_human_face_detector.dat ADDED
Binary file (730 kB). View file
 
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ git+https://github.com/facebookresearch/fairseq.git
2
+ scipy
3
+ sentencepiece
4
+ python_speech_features
5
+ scikit-video
6
+ scikit-image
7
+ dlib
8
+ opencv-python
9
+ pytube
10
+ httpx==0.24.1
shape_predictor_68_face_landmarks.dat ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fbdc2cb80eb9aa7a758672cbfdda32ba6300efe9b6e6c7a299ff7e736b11b92f
3
+ size 99693937
video/lipreading.gif ADDED

Git LFS Details

  • SHA256: 8cf0498b502e01bd6eb72f0985854a64793a6b4f0513181a8bc474abc3e8e75f
  • Pointer size: 132 Bytes
  • Size of remote file: 1.82 MB