Vaibhav Srivastav committed on
Commit
5d906de
·
0 Parent(s):

Squash for release.

Browse files
.gitattributes ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.mp3 filter=lfs diff=lfs merge=lfs -text
37
+ *.whl filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio_cached_examples/
2
+
3
+ # Byte-compiled / optimized / DLL files
4
+ __pycache__/
5
+ *.py[cod]
6
+ *$py.class
7
+
8
+ # C extensions
9
+ *.so
10
+
11
+ # Distribution / packaging
12
+ .Python
13
+ build/
14
+ develop-eggs/
15
+ dist/
16
+ downloads/
17
+ eggs/
18
+ .eggs/
19
+ lib/
20
+ lib64/
21
+ parts/
22
+ sdist/
23
+ var/
24
+ wheels/
25
+ share/python-wheels/
26
+ *.egg-info/
27
+ .installed.cfg
28
+ *.egg
29
+ MANIFEST
30
+
31
+ # PyInstaller
32
+ # Usually these files are written by a python script from a template
33
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
34
+ *.manifest
35
+ *.spec
36
+
37
+ # Installer logs
38
+ pip-log.txt
39
+ pip-delete-this-directory.txt
40
+
41
+ # Unit test / coverage reports
42
+ htmlcov/
43
+ .tox/
44
+ .nox/
45
+ .coverage
46
+ .coverage.*
47
+ .cache
48
+ nosetests.xml
49
+ coverage.xml
50
+ *.cover
51
+ *.py,cover
52
+ .hypothesis/
53
+ .pytest_cache/
54
+ cover/
55
+
56
+ # Translations
57
+ *.mo
58
+ *.pot
59
+
60
+ # Django stuff:
61
+ *.log
62
+ local_settings.py
63
+ db.sqlite3
64
+ db.sqlite3-journal
65
+
66
+ # Flask stuff:
67
+ instance/
68
+ .webassets-cache
69
+
70
+ # Scrapy stuff:
71
+ .scrapy
72
+
73
+ # Sphinx documentation
74
+ docs/_build/
75
+
76
+ # PyBuilder
77
+ .pybuilder/
78
+ target/
79
+
80
+ # Jupyter Notebook
81
+ .ipynb_checkpoints
82
+
83
+ # IPython
84
+ profile_default/
85
+ ipython_config.py
86
+
87
+ # pyenv
88
+ # For a library or package, you might want to ignore these files since the code is
89
+ # intended to run in multiple environments; otherwise, check them in:
90
+ # .python-version
91
+
92
+ # pipenv
93
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
94
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
95
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
96
+ # install all needed dependencies.
97
+ #Pipfile.lock
98
+
99
+ # poetry
100
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
101
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
102
+ # commonly ignored for libraries.
103
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
104
+ #poetry.lock
105
+
106
+ # pdm
107
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
108
+ #pdm.lock
109
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
110
+ # in version control.
111
+ # https://pdm.fming.dev/#use-with-ide
112
+ .pdm.toml
113
+
114
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115
+ __pypackages__/
116
+
117
+ # Celery stuff
118
+ celerybeat-schedule
119
+ celerybeat.pid
120
+
121
+ # SageMath parsed files
122
+ *.sage.py
123
+
124
+ # Environments
125
+ .env
126
+ .venv
127
+ env/
128
+ venv/
129
+ ENV/
130
+ env.bak/
131
+ venv.bak/
132
+
133
+ # Spyder project settings
134
+ .spyderproject
135
+ .spyproject
136
+
137
+ # Rope project settings
138
+ .ropeproject
139
+
140
+ # mkdocs documentation
141
+ /site
142
+
143
+ # mypy
144
+ .mypy_cache/
145
+ .dmypy.json
146
+ dmypy.json
147
+
148
+ # Pyre type checker
149
+ .pyre/
150
+
151
+ # pytype static type analyzer
152
+ .pytype/
153
+
154
+ # Cython debug symbols
155
+ cython_debug/
156
+
157
+ # PyCharm
158
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
161
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162
+ #.idea/
.pre-commit-config.yaml ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ repos:
2
+ - repo: https://github.com/pre-commit/pre-commit-hooks
3
+ rev: v4.5.0
4
+ hooks:
5
+ - id: check-executables-have-shebangs
6
+ - id: check-json
7
+ - id: check-merge-conflict
8
+ - id: check-shebang-scripts-are-executable
9
+ - id: check-toml
10
+ - id: check-yaml
11
+ - id: end-of-file-fixer
12
+ - id: mixed-line-ending
13
+ args: ["--fix=lf"]
14
+ - id: requirements-txt-fixer
15
+ - id: trailing-whitespace
16
+ - repo: https://github.com/myint/docformatter
17
+ rev: v1.7.5
18
+ hooks:
19
+ - id: docformatter
20
+ args: ["--in-place"]
21
+ - repo: https://github.com/pycqa/isort
22
+ rev: 5.12.0
23
+ hooks:
24
+ - id: isort
25
+ args: ["--profile", "black"]
26
+ - repo: https://github.com/pre-commit/mirrors-mypy
27
+ rev: v1.7.0
28
+ hooks:
29
+ - id: mypy
30
+ args: ["--ignore-missing-imports"]
31
+ additional_dependencies:
32
+ ["types-python-slugify", "types-requests", "types-PyYAML"]
33
+ - repo: https://github.com/psf/black
34
+ rev: 23.11.0
35
+ hooks:
36
+ - id: black
37
+ language_version: python3.10
38
+ args: ["--line-length", "119"]
39
+ - repo: https://github.com/kynan/nbstripout
40
+ rev: 0.6.1
41
+ hooks:
42
+ - id: nbstripout
43
+ args:
44
+ [
45
+ "--extra-keys",
46
+ "metadata.interpreter metadata.kernelspec cell.metadata.pycharm",
47
+ ]
48
+ - repo: https://github.com/nbQA-dev/nbQA
49
+ rev: 1.7.0
50
+ hooks:
51
+ - id: nbqa-black
52
+ - id: nbqa-pyupgrade
53
+ args: ["--py37-plus"]
54
+ - id: nbqa-isort
55
+ args: ["--float-to-top"]
.vscode/settings.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "[python]": {
3
+ "editor.defaultFormatter": "ms-python.black-formatter",
4
+ "editor.formatOnType": true,
5
+ "editor.codeActionsOnSave": {
6
+ "source.organizeImports": true
7
+ }
8
+ },
9
+ "black-formatter.args": [
10
+ "--line-length=119"
11
+ ],
12
+ "isort.args": ["--profile", "black"],
13
+ "flake8.args": [
14
+ "--max-line-length=119"
15
+ ],
16
+ "ruff.args": [
17
+ "--line-length=119"
18
+ ],
19
+ "editor.formatOnSave": true,
20
+ "files.insertFinalNewline": true
21
+ }
Dockerfile ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# CUDA 12.1 / cuDNN 8 development image on Ubuntu 22.04.
FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04
ENV DEBIAN_FRONTEND=noninteractive

# System packages: VCS/download tools, the libraries pyenv needs to compile CPython,
# ffmpeg for Gradio audio handling and libsndfile for fairseq2.
RUN apt-get update && \
    apt-get upgrade -y && \
    apt-get install -y --no-install-recommends \
    git \
    git-lfs \
    wget \
    curl \
    # python build dependencies \
    build-essential \
    libssl-dev \
    zlib1g-dev \
    libbz2-dev \
    libreadline-dev \
    libsqlite3-dev \
    libncursesw5-dev \
    xz-utils \
    tk-dev \
    libxml2-dev \
    libxmlsec1-dev \
    libffi-dev \
    liblzma-dev \
    # gradio dependencies \
    ffmpeg \
    # fairseq2 dependencies \
    libsndfile-dev && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Run everything below as an unprivileged user with UID 1000.
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:${PATH}
WORKDIR ${HOME}/app

# Install pyenv. -f makes curl fail on HTTP errors instead of piping an error page into bash,
# -sS silences progress output but keeps error messages, -L follows redirects.
RUN curl -fsSL https://pyenv.run | bash
ENV PATH=${HOME}/.pyenv/shims:${HOME}/.pyenv/bin:${PATH}
ARG PYTHON_VERSION=3.10.13
# --no-cache-dir keeps pip's download/wheel cache out of the image layers.
RUN pyenv install ${PYTHON_VERSION} && \
    pyenv global ${PYTHON_VERSION} && \
    pyenv rehash && \
    pip install --no-cache-dir -U pip setuptools wheel && \
    pip install --no-cache-dir "huggingface-hub==0.19.3" "hf-transfer==0.1.4"

# Application code, its pinned requirements, the fairseq2 pre-release build for
# PyTorch 2.1.0 / CUDA 12.1, and the bundled seamless_communication wheel.
COPY --chown=1000 . ${HOME}/app
RUN pip install --no-cache-dir -r ${HOME}/app/requirements.txt && \
    pip install --no-cache-dir fairseq2 --pre --extra-index-url https://fair.pkg.atmeta.com/fairseq2/pt2.1.0/cu121 && \
    pip install --no-cache-dir ${HOME}/app/whl/seamless_communication-1.0.0-py3-none-any.whl

ENV PYTHONPATH=${HOME}/app \
    PYTHONUNBUFFERED=1 \
    HF_HUB_ENABLE_HF_TRANSFER=1 \
    GRADIO_ALLOW_FLAGGING=never \
    GRADIO_NUM_PORTS=1 \
    GRADIO_SERVER_NAME=0.0.0.0 \
    GRADIO_THEME=huggingface \
    TQDM_POSITION=-1 \
    TQDM_MININTERVAL=1 \
    SYSTEM=spaces
CMD ["python", "app.py"]
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Seamless M4T v2
3
+ emoji: 📞
4
+ colorFrom: blue
5
+ colorTo: yellow
6
+ sdk: docker
7
+ pinned: false
8
+ suggested_hardware: t4-medium
9
+ models:
10
+ - facebook/seamless-m4t-v2-large
11
+ - facebook/SONAR
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,417 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import pathlib
5
+
6
+ import gradio as gr
7
+ import numpy as np
8
+ import torch
9
+ import torchaudio
10
+ from fairseq2.assets import InProcAssetMetadataProvider, asset_store
11
+ from huggingface_hub import snapshot_download
12
+ from seamless_communication.inference import Translator
13
+
14
+ from lang_list import (
15
+ ASR_TARGET_LANGUAGE_NAMES,
16
+ LANGUAGE_NAME_TO_CODE,
17
+ S2ST_TARGET_LANGUAGE_NAMES,
18
+ S2TT_TARGET_LANGUAGE_NAMES,
19
+ T2ST_TARGET_LANGUAGE_NAMES,
20
+ T2TT_TARGET_LANGUAGE_NAMES,
21
+ TEXT_SOURCE_LANGUAGE_NAMES,
22
+ )
23
+
24
# Directory holding the model checkpoints; can be overridden with the CHECKPOINTS_PATH env var.
CHECKPOINTS_PATH = pathlib.Path(os.getenv("CHECKPOINTS_PATH", "/home/user/app/models"))
if not CHECKPOINTS_PATH.exists():
    # Nothing on disk yet: fetch the checkpoint repository from the Hugging Face Hub.
    snapshot_download(repo_id="meta-private/M4Tv2", repo_type="model", local_dir=CHECKPOINTS_PATH)

# Replace fairseq2's environment resolvers with a single one that always answers "demo";
# presumably this is what makes the "@demo" asset cards registered below take effect.
asset_store.env_resolvers.clear()
asset_store.env_resolvers.append(lambda: "demo")

# Asset cards pointing the model and vocoder at the local checkpoint files.
demo_metadata = [
    dict(
        name="seamlessM4T_v2_large@demo",
        checkpoint=f"file://{CHECKPOINTS_PATH}/seamlessM4T_v2_large.pt",
        char_tokenizer=f"file://{CHECKPOINTS_PATH}/spm_char_lang38_tc.model",
    ),
    dict(
        name="vocoder_v2@demo",
        checkpoint=f"file://{CHECKPOINTS_PATH}/vocoder_v2.pt",
    ),
]
asset_store.metadata_providers.append(InProcAssetMetadataProvider(demo_metadata))
41
+
42
+ DESCRIPTION = """\
43
+ # SeamlessM4T
44
+
45
+ [SeamlessM4T](https://github.com/facebookresearch/seamless_communication) is designed to provide high-quality
46
+ translation, allowing people from different linguistic communities to communicate effortlessly through speech and text.
47
+ This unified model enables multiple tasks like Speech-to-Speech (S2ST), Speech-to-Text (S2TT), Text-to-Speech (T2ST)
48
+ translation and more, without relying on multiple separate models.
49
+ """
50
+
51
# Example outputs are cached only when CACHE_EXAMPLES=1 is set and a CUDA device is available.
CACHE_EXAMPLES = os.getenv("CACHE_EXAMPLES") == "1" and torch.cuda.is_available()

AUDIO_SAMPLE_RATE = 16000.0
MAX_INPUT_AUDIO_LENGTH = 60  # in seconds
DEFAULT_TARGET_LANGUAGE = "French"

# float16 on the first CUDA device when one is available, float32 on CPU otherwise.
if not torch.cuda.is_available():
    device, dtype = torch.device("cpu"), torch.float32
else:
    device, dtype = torch.device("cuda:0"), torch.float16

# Single shared Translator instance used by every task handler in this module.
translator = Translator(
    model_name_or_card="seamlessM4T_v2_large",
    vocoder_name_or_card="vocoder_v2",
    device=device,
    dtype=dtype,
    apply_mintox=True,
)
71
+
72
+
73
def preprocess_audio(input_audio: str) -> None:
    """Resample an audio file to ``AUDIO_SAMPLE_RATE`` and cap its length, in place.

    The file is loaded, resampled, truncated to at most ``MAX_INPUT_AUDIO_LENGTH`` seconds
    (a ``gr.Warning`` is issued when truncation happens) and written back to the same path.

    Args:
        input_audio: Path of the audio file to normalise. The file is overwritten.
    """
    # AUDIO_SAMPLE_RATE is stored as a float; torchaudio documents integer frequencies,
    # so convert once and use the same integer rate for resampling and saving.
    target_sr = int(AUDIO_SAMPLE_RATE)
    waveform, source_sr = torchaudio.load(input_audio)
    waveform = torchaudio.functional.resample(waveform, orig_freq=source_sr, new_freq=target_sr)

    # Length is measured along dimension 1 of the loaded tensor.
    max_samples = MAX_INPUT_AUDIO_LENGTH * target_sr
    if waveform.shape[1] > max_samples:
        waveform = waveform[:, :max_samples]
        gr.Warning(f"Input audio is too long. Only the first {MAX_INPUT_AUDIO_LENGTH} seconds is used.")
    torchaudio.save(input_audio, waveform, sample_rate=target_sr)
81
+
82
+
83
def run_s2st(
    input_audio: str, source_language: str, target_language: str
) -> tuple[tuple[int, np.ndarray] | None, str]:
    """Speech-to-speech translation (task ``"S2ST"``).

    Args:
        input_audio: Path of the input audio file; it is normalised in place by ``preprocess_audio``.
        source_language: Display name of the spoken language (a key of ``LANGUAGE_NAME_TO_CODE``).
        target_language: Display name of the language to translate into.

    Returns:
        ``((sample_rate, waveform), text)`` where ``sample_rate`` is ``int(AUDIO_SAMPLE_RATE)``,
        ``waveform`` is the first generated waveform as a NumPy array and ``text`` is the first
        translated text.

    Raises:
        KeyError: If either language name is missing from ``LANGUAGE_NAME_TO_CODE``.
    """
    preprocess_audio(input_audio)
    src_code = LANGUAGE_NAME_TO_CODE[source_language]
    tgt_code = LANGUAGE_NAME_TO_CODE[target_language]
    texts, speech = translator.predict(input=input_audio, task_str="S2ST", src_lang=src_code, tgt_lang=tgt_code)
    translated = str(texts[0])
    # Move the first waveform off the accelerator before handing it to Gradio as NumPy.
    waveform = speech.audio_wavs[0].cpu().detach().numpy()
    return (int(AUDIO_SAMPLE_RATE), waveform), translated
98
+
99
+
100
def run_s2tt(input_audio: str, source_language: str, target_language: str) -> str:
    """Speech-to-text translation (task ``"S2TT"``).

    Args:
        input_audio: Path of the input audio file; it is normalised in place by ``preprocess_audio``.
        source_language: Display name of the spoken language (a key of ``LANGUAGE_NAME_TO_CODE``).
        target_language: Display name of the language to translate into.

    Returns:
        The first translated text, converted to ``str``.

    Raises:
        KeyError: If either language name is missing from ``LANGUAGE_NAME_TO_CODE``.
    """
    preprocess_audio(input_audio)
    src_code = LANGUAGE_NAME_TO_CODE[source_language]
    tgt_code = LANGUAGE_NAME_TO_CODE[target_language]
    # The second element of the prediction (audio) is not needed for a text-only task.
    texts, _ = translator.predict(input=input_audio, task_str="S2TT", src_lang=src_code, tgt_lang=tgt_code)
    return str(texts[0])
111
+
112
+
113
def run_t2st(input_text: str, source_language: str, target_language: str) -> tuple[tuple[int, np.ndarray] | None, str]:
    """Text-to-speech translation (task ``"T2ST"``).

    Args:
        input_text: Text to translate.
        source_language: Display name of the language ``input_text`` is written in.
        target_language: Display name of the language to translate into.

    Returns:
        ``((sample_rate, waveform), text)`` where ``sample_rate`` is ``int(AUDIO_SAMPLE_RATE)``,
        ``waveform`` is the first generated waveform as a NumPy array and ``text`` is the first
        translated text.

    Raises:
        KeyError: If either language name is missing from ``LANGUAGE_NAME_TO_CODE``.
    """
    src_code = LANGUAGE_NAME_TO_CODE[source_language]
    tgt_code = LANGUAGE_NAME_TO_CODE[target_language]
    texts, speech = translator.predict(input=input_text, task_str="T2ST", src_lang=src_code, tgt_lang=tgt_code)
    translated = str(texts[0])
    # Move the first waveform off the accelerator before handing it to Gradio as NumPy.
    waveform = speech.audio_wavs[0].cpu().detach().numpy()
    return (int(AUDIO_SAMPLE_RATE), waveform), translated
125
+
126
+
127
def run_t2tt(input_text: str, source_language: str, target_language: str) -> str:
    """Text-to-text translation (task ``"T2TT"``).

    Args:
        input_text: Text to translate.
        source_language: Display name of the language ``input_text`` is written in.
        target_language: Display name of the language to translate into.

    Returns:
        The first translated text, converted to ``str``.

    Raises:
        KeyError: If either language name is missing from ``LANGUAGE_NAME_TO_CODE``.
    """
    src_code = LANGUAGE_NAME_TO_CODE[source_language]
    tgt_code = LANGUAGE_NAME_TO_CODE[target_language]
    # The second element of the prediction (audio) is not needed for a text-only task.
    texts, _ = translator.predict(input=input_text, task_str="T2TT", src_lang=src_code, tgt_lang=tgt_code)
    return str(texts[0])
137
+
138
+
139
def run_asr(input_audio: str, target_language: str) -> str:
    """Automatic speech recognition (task ``"ASR"``).

    Args:
        input_audio: Path of the input audio file; it is normalised in place by ``preprocess_audio``.
        target_language: Display name of the language; its code is passed as both the source
            and the target language of the prediction.

    Returns:
        The first output text, converted to ``str``.

    Raises:
        KeyError: If ``target_language`` is missing from ``LANGUAGE_NAME_TO_CODE``.
    """
    preprocess_audio(input_audio)
    lang_code = LANGUAGE_NAME_TO_CODE[target_language]
    texts, _ = translator.predict(input=input_audio, task_str="ASR", src_lang=lang_code, tgt_lang=lang_code)
    return str(texts[0])
149
+
150
+
151
+ with gr.Blocks() as demo_s2st:
152
+ with gr.Row():
153
+ with gr.Column():
154
+ with gr.Group():
155
+ input_audio = gr.Audio(label="Input speech", type="filepath")
156
+ source_language = gr.Dropdown(
157
+ label="Source language",
158
+ choices=ASR_TARGET_LANGUAGE_NAMES,
159
+ value="English",
160
+ )
161
+ target_language = gr.Dropdown(
162
+ label="Target language",
163
+ choices=S2ST_TARGET_LANGUAGE_NAMES,
164
+ value=DEFAULT_TARGET_LANGUAGE,
165
+ )
166
+ btn = gr.Button("Translate")
167
+ with gr.Column():
168
+ with gr.Group():
169
+ output_audio = gr.Audio(
170
+ label="Translated speech",
171
+ autoplay=False,
172
+ streaming=False,
173
+ type="numpy",
174
+ )
175
+ output_text = gr.Textbox(label="Translated text")
176
+
177
+ gr.Examples(
178
+ examples=[
179
+ ["assets/sample_input.mp3", "English", "French"],
180
+ ["assets/sample_input.mp3", "English", "Mandarin Chinese"],
181
+ ["assets/sample_input_2.mp3", "English", "Hindi"],
182
+ ["assets/sample_input_2.mp3", "English", "Spanish"],
183
+ ],
184
+ inputs=[input_audio, source_language, target_language],
185
+ outputs=[output_audio, output_text],
186
+ fn=run_s2st,
187
+ cache_examples=CACHE_EXAMPLES,
188
+ api_name=False,
189
+ )
190
+
191
+ btn.click(
192
+ fn=run_s2st,
193
+ inputs=[input_audio, source_language, target_language],
194
+ outputs=[output_audio, output_text],
195
+ api_name="s2st",
196
+ )
197
+
198
+ with gr.Blocks() as demo_s2tt:
199
+ with gr.Row():
200
+ with gr.Column():
201
+ with gr.Group():
202
+ input_audio = gr.Audio(label="Input speech", type="filepath")
203
+ source_language = gr.Dropdown(
204
+ label="Source language",
205
+ choices=ASR_TARGET_LANGUAGE_NAMES,
206
+ value="English",
207
+ )
208
+ target_language = gr.Dropdown(
209
+ label="Target language",
210
+ choices=S2TT_TARGET_LANGUAGE_NAMES,
211
+ value=DEFAULT_TARGET_LANGUAGE,
212
+ )
213
+ btn = gr.Button("Translate")
214
+ with gr.Column():
215
+ output_text = gr.Textbox(label="Translated text")
216
+
217
+ gr.Examples(
218
+ examples=[
219
+ ["assets/sample_input.mp3", "English", "French"],
220
+ ["assets/sample_input.mp3", "English", "Mandarin Chinese"],
221
+ ["assets/sample_input_2.mp3", "English", "Hindi"],
222
+ ["assets/sample_input_2.mp3", "English", "Spanish"],
223
+ ],
224
+ inputs=[input_audio, source_language, target_language],
225
+ outputs=output_text,
226
+ fn=run_s2tt,
227
+ cache_examples=CACHE_EXAMPLES,
228
+ api_name=False,
229
+ )
230
+
231
+ btn.click(
232
+ fn=run_s2tt,
233
+ inputs=[input_audio, source_language, target_language],
234
+ outputs=output_text,
235
+ api_name="s2tt",
236
+ )
237
+
238
+ with gr.Blocks() as demo_t2st:
239
+ with gr.Row():
240
+ with gr.Column():
241
+ with gr.Group():
242
+ input_text = gr.Textbox(label="Input text")
243
+ with gr.Row():
244
+ source_language = gr.Dropdown(
245
+ label="Source language",
246
+ choices=TEXT_SOURCE_LANGUAGE_NAMES,
247
+ value="English",
248
+ )
249
+ target_language = gr.Dropdown(
250
+ label="Target language",
251
+ choices=T2ST_TARGET_LANGUAGE_NAMES,
252
+ value=DEFAULT_TARGET_LANGUAGE,
253
+ )
254
+ btn = gr.Button("Translate")
255
+ with gr.Column():
256
+ with gr.Group():
257
+ output_audio = gr.Audio(
258
+ label="Translated speech",
259
+ autoplay=False,
260
+ streaming=False,
261
+ type="numpy",
262
+ )
263
+ output_text = gr.Textbox(label="Translated text")
264
+
265
+ gr.Examples(
266
+ examples=[
267
+ [
268
+ "My favorite animal is the elephant.",
269
+ "English",
270
+ "French",
271
+ ],
272
+ [
273
+ "My favorite animal is the elephant.",
274
+ "English",
275
+ "Mandarin Chinese",
276
+ ],
277
+ [
278
+ "Meta AI's Seamless M4T model is democratising spoken communication across language barriers",
279
+ "English",
280
+ "Hindi",
281
+ ],
282
+ [
283
+ "Meta AI's Seamless M4T model is democratising spoken communication across language barriers",
284
+ "English",
285
+ "Spanish",
286
+ ],
287
+ ],
288
+ inputs=[input_text, source_language, target_language],
289
+ outputs=[output_audio, output_text],
290
+ fn=run_t2st,
291
+ cache_examples=CACHE_EXAMPLES,
292
+ api_name=False,
293
+ )
294
+
295
+ gr.on(
296
+ triggers=[input_text.submit, btn.click],
297
+ fn=run_t2st,
298
+ inputs=[input_text, source_language, target_language],
299
+ outputs=[output_audio, output_text],
300
+ api_name="t2st",
301
+ )
302
+
303
+ with gr.Blocks() as demo_t2tt:
304
+ with gr.Row():
305
+ with gr.Column():
306
+ with gr.Group():
307
+ input_text = gr.Textbox(label="Input text")
308
+ with gr.Row():
309
+ source_language = gr.Dropdown(
310
+ label="Source language",
311
+ choices=TEXT_SOURCE_LANGUAGE_NAMES,
312
+ value="English",
313
+ )
314
+ target_language = gr.Dropdown(
315
+ label="Target language",
316
+ choices=T2TT_TARGET_LANGUAGE_NAMES,
317
+ value=DEFAULT_TARGET_LANGUAGE,
318
+ )
319
+ btn = gr.Button("Translate")
320
+ with gr.Column():
321
+ output_text = gr.Textbox(label="Translated text")
322
+
323
+ gr.Examples(
324
+ examples=[
325
+ [
326
+ "My favorite animal is the elephant.",
327
+ "English",
328
+ "French",
329
+ ],
330
+ [
331
+ "My favorite animal is the elephant.",
332
+ "English",
333
+ "Mandarin Chinese",
334
+ ],
335
+ [
336
+ "Meta AI's Seamless M4T model is democratising spoken communication across language barriers",
337
+ "English",
338
+ "Hindi",
339
+ ],
340
+ [
341
+ "Meta AI's Seamless M4T model is democratising spoken communication across language barriers",
342
+ "English",
343
+ "Spanish",
344
+ ],
345
+ ],
346
+ inputs=[input_text, source_language, target_language],
347
+ outputs=output_text,
348
+ fn=run_t2tt,
349
+ cache_examples=CACHE_EXAMPLES,
350
+ api_name=False,
351
+ )
352
+
353
+ gr.on(
354
+ triggers=[input_text.submit, btn.click],
355
+ fn=run_t2tt,
356
+ inputs=[input_text, source_language, target_language],
357
+ outputs=output_text,
358
+ api_name="t2tt",
359
+ )
360
+
361
+ with gr.Blocks() as demo_asr:
362
+ with gr.Row():
363
+ with gr.Column():
364
+ with gr.Group():
365
+ input_audio = gr.Audio(label="Input speech", type="filepath")
366
+ target_language = gr.Dropdown(
367
+ label="Target language",
368
+ choices=ASR_TARGET_LANGUAGE_NAMES,
369
+ value=DEFAULT_TARGET_LANGUAGE,
370
+ )
371
+ btn = gr.Button("Translate")
372
+ with gr.Column():
373
+ output_text = gr.Textbox(label="Translated text")
374
+
375
+ gr.Examples(
376
+ examples=[
377
+ ["assets/sample_input.mp3", "English"],
378
+ ["assets/sample_input_2.mp3", "English"],
379
+ ],
380
+ inputs=[input_audio, target_language],
381
+ outputs=output_text,
382
+ fn=run_asr,
383
+ cache_examples=CACHE_EXAMPLES,
384
+ api_name=False,
385
+ )
386
+
387
+ btn.click(
388
+ fn=run_asr,
389
+ inputs=[input_audio, target_language],
390
+ outputs=output_text,
391
+ api_name="asr",
392
+ )
393
+
394
+
395
# Top-level page: description, optional duplicate button, then one tab per task.
with gr.Blocks(css="style.css") as demo:
    gr.Markdown(DESCRIPTION)
    # The button is only visible when SHOW_DUPLICATE_BUTTON=1 is set in the environment.
    gr.DuplicateButton(
        value="Duplicate Space for private use",
        elem_id="duplicate-button",
        visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1",
    )

    with gr.Tabs():
        # Each tab renders the corresponding per-task Blocks defined earlier in this module.
        for tab_label, tab_demo in (
            ("S2ST", demo_s2st),
            ("S2TT", demo_s2tt),
            ("T2ST", demo_t2st),
            ("T2TT", demo_t2tt),
            ("ASR", demo_asr),
        ):
            with gr.Tab(label=tab_label):
                tab_demo.render()


if __name__ == "__main__":
    # Queue up to 50 pending requests, then start the server.
    demo.queue(max_size=50).launch()
assets/sample_input.mp3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:982369687f05bf8fcd6923c4ffcccda0fcce92f44eceae5a9d00a431f07ea87b
3
+ size 10272
assets/sample_input_2.mp3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a505a4641e3f5f0ddec9508832793aa20e63d2545530b66bc04a9bd19a742e6
3
+ size 30624
lang_list.py ADDED
@@ -0,0 +1,255 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Language code -> human-readable language name.
language_code_to_name = {
    "afr": "Afrikaans",
    "amh": "Amharic",
    "arb": "Modern Standard Arabic",
    "ary": "Moroccan Arabic",
    "arz": "Egyptian Arabic",
    "asm": "Assamese",
    "ast": "Asturian",
    "azj": "North Azerbaijani",
    "bel": "Belarusian",
    "ben": "Bengali",
    "bos": "Bosnian",
    "bul": "Bulgarian",
    "cat": "Catalan",
    "ceb": "Cebuano",
    "ces": "Czech",
    "ckb": "Central Kurdish",
    "cmn": "Mandarin Chinese",
    "cym": "Welsh",
    "dan": "Danish",
    "deu": "German",
    "ell": "Greek",
    "eng": "English",
    "est": "Estonian",
    "eus": "Basque",
    "fin": "Finnish",
    "fra": "French",
    "gaz": "West Central Oromo",
    "gle": "Irish",
    "glg": "Galician",
    "guj": "Gujarati",
    "heb": "Hebrew",
    "hin": "Hindi",
    "hrv": "Croatian",
    "hun": "Hungarian",
    "hye": "Armenian",
    "ibo": "Igbo",
    "ind": "Indonesian",
    "isl": "Icelandic",
    "ita": "Italian",
    "jav": "Javanese",
    "jpn": "Japanese",
    "kam": "Kamba",
    "kan": "Kannada",
    "kat": "Georgian",
    "kaz": "Kazakh",
    "kea": "Kabuverdianu",
    "khk": "Halh Mongolian",
    "khm": "Khmer",
    "kir": "Kyrgyz",
    "kor": "Korean",
    "lao": "Lao",
    "lit": "Lithuanian",
    "ltz": "Luxembourgish",
    "lug": "Ganda",
    "luo": "Luo",
    "lvs": "Standard Latvian",
    "mai": "Maithili",
    "mal": "Malayalam",
    "mar": "Marathi",
    "mkd": "Macedonian",
    "mlt": "Maltese",
    "mni": "Meitei",
    "mya": "Burmese",
    "nld": "Dutch",
    "nno": "Norwegian Nynorsk",
    "nob": "Norwegian Bokm\u00e5l",
    "npi": "Nepali",
    "nya": "Nyanja",
    "oci": "Occitan",
    "ory": "Odia",
    "pan": "Punjabi",
    "pbt": "Southern Pashto",
    "pes": "Western Persian",
    "pol": "Polish",
    "por": "Portuguese",
    "ron": "Romanian",
    "rus": "Russian",
    "slk": "Slovak",
    "slv": "Slovenian",
    "sna": "Shona",
    "snd": "Sindhi",
    "som": "Somali",
    "spa": "Spanish",
    "srp": "Serbian",
    "swe": "Swedish",
    "swh": "Swahili",
    "tam": "Tamil",
    "tel": "Telugu",
    "tgk": "Tajik",
    "tgl": "Tagalog",
    "tha": "Thai",
    "tur": "Turkish",
    "ukr": "Ukrainian",
    "urd": "Urdu",
    "uzn": "Northern Uzbek",
    "vie": "Vietnamese",
    "xho": "Xhosa",
    "yor": "Yoruba",
    "yue": "Cantonese",
    "zlm": "Colloquial Malay",
    "zsm": "Standard Malay",
    "zul": "Zulu",
}
# Reverse lookup: language name -> language code.
LANGUAGE_NAME_TO_CODE = {name: code for code, name in language_code_to_name.items()}

# Source languages for text input (T2TT / T2ST). The derived name list is also re-exported
# below as ASR_TARGET_LANGUAGE_NAMES, which app.py offers as the source-language choices
# of its speech tabs.
text_source_language_codes = (
    "afr amh arb ary arz asm azj bel ben bos "
    "bul cat ceb ces ckb cmn cym dan deu ell "
    "eng est eus fin fra gaz gle glg guj heb "
    "hin hrv hun hye ibo ind isl ita jav jpn "
    "kan kat kaz khk khm kir kor lao lit lug "
    "luo lvs mai mal mar mkd mlt mni mya nld "
    "nno nob npi nya ory pan pbt pes pol por "
    "ron rus slk slv sna snd som spa srp swe "
    "swh tam tel tgk tgl tha tur ukr urd uzn "
    "vie yor yue zsm zul"
).split()
TEXT_SOURCE_LANGUAGE_NAMES = sorted(language_code_to_name[code] for code in text_source_language_codes)

# Target languages for the speech-output tasks (S2ST / T2ST).
s2st_target_language_codes = (
    "eng arb ben cat ces cmn cym dan deu est "
    "fin fra hin ind ita jpn kor mlt nld pes "
    "pol por ron rus slk spa swe swh tel tgl "
    "tha tur ukr urd uzn vie"
).split()
S2ST_TARGET_LANGUAGE_NAMES = sorted(language_code_to_name[code] for code in s2st_target_language_codes)
T2ST_TARGET_LANGUAGE_NAMES = S2ST_TARGET_LANGUAGE_NAMES

# The text-output tasks (S2TT / T2TT / ASR) share the text source language list.
S2TT_TARGET_LANGUAGE_NAMES = T2TT_TARGET_LANGUAGE_NAMES = ASR_TARGET_LANGUAGE_NAMES = TEXT_SOURCE_LANGUAGE_NAMES
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ gradio==4.5.0
2
+ omegaconf==2.3.0
3
+ torch==2.1.0
4
+ torchaudio==2.1.0
style.css ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ h1 {
2
+ text-align: center;
3
+ }
4
+
5
+ #duplicate-button {
6
+ margin: auto;
7
+ color: #fff;
8
+ background: #1565c0;
9
+ border-radius: 100vh;
10
+ }
whl/seamless_communication-1.0.0-py3-none-any.whl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1df10e0c85ee0ffbc9f2e1bf8896850a52c551383df0332a94d26d9d39770c85
3
+ size 201552