Spaces:

Suprath
/

liptotext

Running

App Files Files Community

Suprath committed on Mar 24, 2024

Commit

af4b5be

verified ·

1 Parent(s): dc33607

Upload 8 files

Browse files

Files changed (8) hide show

.gitattributes +4 -1
20words_mean_face.npy +3 -0
README.md +8 -7
app.py +199 -0
mmod_human_face_detector.dat +0 -0
requirements.txt +10 -0
shape_predictor_68_face_landmarks.dat +3 -0
video/lipreading.gif +3 -0

.gitattributes CHANGED Viewed

@@ -25,7 +25,6 @@
 *.safetensors filter=lfs diff=lfs merge=lfs -text
 saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
 *.wasm filter=lfs diff=lfs merge=lfs -text
@@ -33,3 +32,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.safetensors filter=lfs diff=lfs merge=lfs -text
 saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tar.* filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
 *.wasm filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+shape_predictor_68_face_landmarks.dat filter=lfs diff=lfs merge=lfs -text
+demo1.mp4 filter=lfs diff=lfs merge=lfs -text
+demo2.mp4 filter=lfs diff=lfs merge=lfs -text
+lipreading.gif filter=lfs diff=lfs merge=lfs -text

20words_mean_face.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dbf68b2044171e1160716df7c53e8bbfaa0ee8c61fb41171d04cb6092bb81422
+size 1168

README.md CHANGED Viewed

@@ -1,13 +1,14 @@
 ---
-title: Liptotext
-emoji: 📚
-colorFrom: red
-colorTo: red
 sdk: gradio
-sdk_version: 4.22.0
 app_file: app.py
 pinned: false
-license: unknown
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Speech Recognition from visual lip movement
+emoji: 🫧
+colorFrom: indigo
+colorTo: pink
 sdk: gradio
+sdk_version: 3.16.1
 app_file: app.py
 pinned: false
+tags:
+- making-demos
 ---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,199 @@

+import os
+import sys
+os.system('git clone https://github.com/facebookresearch/av_hubert.git')
+os.chdir(r'D:\vsCode\lip2text\av_hubert')
+os.system('git submodule init')
+os.system('git submodule update')
+os.chdir(r'D:\vsCode\lip2text\av_hubert\fairseq')
+os.system('pip install ./')
+os.system('pip install scipy')
+os.system('pip install sentencepiece')
+os.system('pip install python_speech_features')
+os.system('pip install scikit-video')
+os.system('pip install transformers')
+os.system('pip install gradio==3.12')
+os.system('pip install numpy==1.23.3')
+# sys.path.append('/home/user/app/av_hubert')
+sys.path.append('D:\vsCode\lip2text\av_hubert')
+print(sys.path)
+print(os.listdir())
+print(sys.argv, type(sys.argv))
+sys.argv.append('dummy')
+import dlib, cv2, os
+import numpy as np
+import skvideo
+import skvideo.io
+from tqdm import tqdm
+from preparation.align_mouth import landmarks_interpolate, crop_patch, write_video_ffmpeg
+from base64 import b64encode
+import torch
+import cv2
+import tempfile
+from argparse import Namespace
+import fairseq
+from fairseq import checkpoint_utils, options, tasks, utils
+from fairseq.dataclass.configs import GenerationConfig
+from huggingface_hub import hf_hub_download
+import gradio as gr
+from pytube import YouTube
+# os.chdir('/home/user/app/av_hubert/avhubert')
+user_dir = "/home/user/app/av_hubert/avhubert"  # NOTE(review): every path below is hard-coded to /home/user/app
+utils.import_user_module(Namespace(user_dir=user_dir))  # registers the code under user_dir with fairseq
+data_dir = "/home/user/app/video"  # predict() writes test.tsv / test.wrd into this directory
+ckpt_path = hf_hub_download('vumichien/AV-HuBERT', 'model.pt')  # fetches model.pt from the vumichien/AV-HuBERT Hub repo
+face_detector_path = "/home/user/app/mmod_human_face_detector.dat"  # loaded by preprocess_video() when CUDA is available
+face_predictor_path = "/home/user/app/shape_predictor_68_face_landmarks.dat"  # loaded by preprocess_video()
+mean_face_path = "/home/user/app/20words_mean_face.npy"  # loaded with np.load in preprocess_video()
+mouth_roi_path = "/home/user/app/roi.mp4"  # output file written by preprocess_video()
+modalities = ["video"]  # assigned to saved_cfg.task.modalities below
+gen_subset = "test"  # dataset split name used by predict()
+gen_cfg = GenerationConfig(beam=20)  # generation config with beam=20
+models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task([ckpt_path])
+models = [model.eval().cuda() if torch.cuda.is_available() else model.eval() for model in models]  # eval mode; moved to GPU only when CUDA is available
+saved_cfg.task.modalities = modalities
+saved_cfg.task.data = data_dir
+saved_cfg.task.label_dir = data_dir
+task = tasks.setup_task(saved_cfg.task)  # rebuilds the task from the patched config, replacing the one returned above
+generator = task.build_generator(models, gen_cfg)  # shared by every predict() call
+def get_youtube(video_url):
+    yt = YouTube(video_url)
+    abs_video_path = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first().download()
+    print("Success download video")
+    print(abs_video_path)
+    return abs_video_path
+def detect_landmark(image, detector, predictor):
+    gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
+    face_locations  = detector(gray, 1)
+    coords = None
+    for (_, face_location) in enumerate(face_locations):
+        if torch.cuda.is_available():
+            rect = face_location.rect
+        else:
+            rect = face_location
+        shape = predictor(gray, rect)
+        coords = np.zeros((68, 2), dtype=np.int32)
+        for i in range(0, 68):
+            coords[i] = (shape.part(i).x, shape.part(i).y)
+    return coords
+def preprocess_video(input_video_path):
+    if torch.cuda.is_available():
+        detector = dlib.cnn_face_detection_model_v1(face_detector_path)
+    else:
+        detector = dlib.get_frontal_face_detector()
+    predictor = dlib.shape_predictor(face_predictor_path)
+    STD_SIZE = (256, 256)
+    mean_face_landmarks = np.load(mean_face_path)
+    stablePntsIDs = [33, 36, 39, 42, 45]
+    videogen = skvideo.io.vread(input_video_path)
+    frames = np.array([frame for frame in videogen])
+    landmarks = []
+    for frame in tqdm(frames):
+        landmark = detect_landmark(frame, detector, predictor)
+        landmarks.append(landmark)
+    preprocessed_landmarks = landmarks_interpolate(landmarks)
+    rois = crop_patch(input_video_path, preprocessed_landmarks, mean_face_landmarks, stablePntsIDs, STD_SIZE,
+                          window_margin=12, start_idx=48, stop_idx=68, crop_height=96, crop_width=96)
+    write_video_ffmpeg(rois, mouth_roi_path, "/usr/bin/ffmpeg")
+    return mouth_roi_path
+def predict(process_video):
+    num_frames = int(cv2.VideoCapture(process_video).get(cv2.CAP_PROP_FRAME_COUNT))
+    tsv_cont = ["/\n", f"test-0\t{process_video}\t{None}\t{num_frames}\t{int(16_000*num_frames/25)}\n"]
+    label_cont = ["DUMMY\n"]
+    with open(f"{data_dir}/test.tsv", "w") as fo:
+      fo.write("".join(tsv_cont))
+    with open(f"{data_dir}/test.wrd", "w") as fo:
+      fo.write("".join(label_cont))
+    task.load_dataset(gen_subset, task_cfg=saved_cfg.task)
+    def decode_fn(x):
+        dictionary = task.target_dictionary
+        symbols_ignore = generator.symbols_to_strip_from_output
+        symbols_ignore.add(dictionary.pad())
+        return task.datasets[gen_subset].label_processors[0].decode(x, symbols_ignore)
+    itr = task.get_batch_iterator(dataset=task.dataset(gen_subset)).next_epoch_itr(shuffle=False)
+    sample = next(itr)
+    if torch.cuda.is_available():
+        sample = utils.move_to_cuda(sample)
+    hypos = task.inference_step(generator, models, sample)
+    ref = decode_fn(sample['target'][0].int().cpu())
+    hypo = hypos[0][0]['tokens'].int().cpu()
+    hypo = decode_fn(hypo)
+    return hypo
+# ---- Gradio Layout -----
+youtube_url_in = gr.Textbox(label="Youtube url", lines=1, interactive=True)  # created up front, placed later with .render()
+video_in = gr.Video(label="Input Video", mirror_webcam=False, interactive=True)  # source clip; also receives the get_youtube result
+video_out = gr.Video(label="Audio Visual Video", mirror_webcam=False, interactive=True)  # receives the preprocess_video result
+demo = gr.Blocks()
+demo.encrypt = False  # NOTE(review): purpose not visible in this file - confirm gradio reads this attribute
+text_output = gr.Textbox()  # receives the string returned by predict
+with demo:
+    gr.Markdown('''
+            <div>
+            <h1 style='text-align: center'>Speech Recognition from Visual Lip Movement by Audio-Visual Hidden Unit BERT Model (AV-HuBERT)</h1>
+            This space uses AV-HuBERT models from <a href='https://github.com/facebookresearch' target='_blank'><b>Meta Research</b></a> to recoginze the speech from Lip Movement 🤗
+            <figure>
+              <img src="https://huggingface.co/vumichien/AV-HuBERT/resolve/main/lipreading.gif" alt="Audio-Visual Speech Recognition">
+              <figcaption> Speech Recognition from visual lip movement
+              </figcaption>
+            </figure>
+            </div>
+        ''')
+    with gr.Row():
+            gr.Markdown('''
+            ### Reading Lip movement with youtube link using Avhubert
+            ##### Step 1a. Download video from youtube (Note: the length of video should be less than 10 seconds if not it will be cut and the face should be stable for better result)
+            ##### Step 1b. You also can upload video directly
+            ##### Step 2. Generating landmarks surrounding mouth area
+            ##### Step 3. Reading lip movement.
+            ''')
+    with gr.Row():
+        gr.Markdown('''
+            ### You can test by following examples:
+            ''')
+    examples = gr.Examples(examples=
+            [ "https://www.youtube.com/watch?v=ZXVDnuepW2s",
+              "https://www.youtube.com/watch?v=X8_glJn1B8o",
+              "https://www.youtube.com/watch?v=80yqL2KzBVw"],
+          label="Examples", inputs=[youtube_url_in])  # examples are wired to the URL textbox
+    with gr.Column():
+          youtube_url_in.render()
+          download_youtube_btn = gr.Button("Download Youtube video")
+          download_youtube_btn.click(get_youtube, [youtube_url_in], [
+              video_in])  # step 1a: URL -> downloaded file shown in video_in
+          print(video_in)  # NOTE(review): runs once while the layout is built, not per click - looks like leftover debugging
+    with gr.Row():
+        video_in.render()
+        video_out.render()
+    with gr.Row():
+        detect_landmark_btn = gr.Button("Detect landmark")
+        detect_landmark_btn.click(preprocess_video, [video_in], [
+            video_out])  # step 2: input video -> mouth-ROI video
+        predict_btn = gr.Button("Predict")
+        predict_btn.click(predict, [video_out], [
+            text_output])  # step 3: mouth-ROI video -> text
+    with gr.Row():
+        # video_lip = gr.Video(label="Audio Visual Video", mirror_webcam=False)
+        text_output.render()
+demo.launch(debug=True)  # starts the app when this module is executed

mmod_human_face_detector.dat ADDED Viewed

Binary file (730 kB). View file

requirements.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+git+https://github.com/facebookresearch/fairseq.git
+scipy
+sentencepiece
+python_speech_features
+scikit-video
+scikit-image
+dlib
+opencv-python
+pytube
+httpx==0.24.1

shape_predictor_68_face_landmarks.dat ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fbdc2cb80eb9aa7a758672cbfdda32ba6300efe9b6e6c7a299ff7e736b11b92f
+size 99693937

video/lipreading.gif ADDED Viewed

Git LFS Details

SHA256: 8cf0498b502e01bd6eb72f0985854a64793a6b4f0513181a8bc474abc3e8e75f
Pointer size: 132 Bytes
Size of remote file: 1.82 MB