avans06 committed on
Commit
8e80889
·
1 Parent(s): 1a74319

Added GGUF and CTranslate2 versions of the ALMA model, and the web UI options now include VAD Process Timeout.

Browse files

1. In the web UI, in addition to the GPTQ version, GGUF and CTranslate2 versions of the ALMA model have also been added.

2. Because GPTQ is poorly supported on CPUs, the GPTQ models are removed from the list when the system does not have a GPU.

3. In the web UI's VAD options, the "VAD Process Timeout (s)" (vad_process_timeout) has been added, allowing users to decide whether to continue maintaining the VAD process until the specified timeout. VRAM will continue to be occupied as long as the VAD process is not stopped. The default value for this timeout is 1800 seconds (30 minutes).

app.py CHANGED
@@ -250,6 +250,7 @@ class WhisperTranscriber:
250
  vadPadding: float = decodeOptions.pop("vadPadding", self.app_config.vad_padding)
251
  vadPromptWindow: float = decodeOptions.pop("vadPromptWindow", self.app_config.vad_prompt_window)
252
  vadInitialPromptMode: str = decodeOptions.pop("vadInitialPromptMode", self.app_config.vad_initial_prompt_mode)
 
253
 
254
  diarization: bool = decodeOptions.pop("diarization", False)
255
  diarization_speakers: int = decodeOptions.pop("diarization_speakers", 2)
@@ -832,7 +833,9 @@ def create_ui(app_config: ApplicationConfig):
832
  m2m100_models = app_config.get_model_names("m2m100")
833
  mt5_models = app_config.get_model_names("mt5")
834
  ALMA_models = app_config.get_model_names("ALMA")
835
-
 
 
836
  common_whisper_inputs = lambda : {
837
  gr.Dropdown(label="Whisper - Model (for audio)", choices=whisper_models, value=app_config.default_model_name, elem_id="whisperModelName"),
838
  gr.Dropdown(label="Whisper - Language", choices=sorted(get_lang_whisper_names()), value=app_config.language, elem_id="whisperLangName"),
@@ -864,6 +867,7 @@ def create_ui(app_config: ApplicationConfig):
864
  gr.Dropdown(choices=["none", "silero-vad", "silero-vad-skip-gaps", "silero-vad-expand-into-gaps", "periodic-vad"], value=app_config.default_vad, label="VAD", elem_id="vad"),
865
  gr.Number(label="VAD - Merge Window (s)", precision=0, value=app_config.vad_merge_window, elem_id="vadMergeWindow"),
866
  gr.Number(label="VAD - Max Merge Size (s)", precision=0, value=app_config.vad_max_merge_size, elem_id="vadMaxMergeSize"),
 
867
  }
868
 
869
  common_word_timestamps_inputs = lambda : {
 
250
  vadPadding: float = decodeOptions.pop("vadPadding", self.app_config.vad_padding)
251
  vadPromptWindow: float = decodeOptions.pop("vadPromptWindow", self.app_config.vad_prompt_window)
252
  vadInitialPromptMode: str = decodeOptions.pop("vadInitialPromptMode", self.app_config.vad_initial_prompt_mode)
253
+ self.vad_process_timeout: float = decodeOptions.pop("vadPocessTimeout", self.vad_process_timeout)
254
 
255
  diarization: bool = decodeOptions.pop("diarization", False)
256
  diarization_speakers: int = decodeOptions.pop("diarization_speakers", 2)
 
833
  m2m100_models = app_config.get_model_names("m2m100")
834
  mt5_models = app_config.get_model_names("mt5")
835
  ALMA_models = app_config.get_model_names("ALMA")
836
+ if not torch.cuda.is_available(): #Due to the poor support of GPTQ for CPUs, the execution time per iteration exceeds a thousand seconds when operating on a CPU. Therefore, when the system does not support a GPU, the GPTQ model is removed from the list.
837
+ ALMA_models = list(filter(lambda alma: "GPTQ" not in alma, ALMA_models))
838
+
839
  common_whisper_inputs = lambda : {
840
  gr.Dropdown(label="Whisper - Model (for audio)", choices=whisper_models, value=app_config.default_model_name, elem_id="whisperModelName"),
841
  gr.Dropdown(label="Whisper - Language", choices=sorted(get_lang_whisper_names()), value=app_config.language, elem_id="whisperLangName"),
 
867
  gr.Dropdown(choices=["none", "silero-vad", "silero-vad-skip-gaps", "silero-vad-expand-into-gaps", "periodic-vad"], value=app_config.default_vad, label="VAD", elem_id="vad"),
868
  gr.Number(label="VAD - Merge Window (s)", precision=0, value=app_config.vad_merge_window, elem_id="vadMergeWindow"),
869
  gr.Number(label="VAD - Max Merge Size (s)", precision=0, value=app_config.vad_max_merge_size, elem_id="vadMaxMergeSize"),
870
+ gr.Number(label="VAD - Process Timeout (s)", precision=0, value=app_config.vad_process_timeout, elem_id="vadPocessTimeout"),
871
  }
872
 
873
  common_word_timestamps_inputs = lambda : {
config.json5 CHANGED
@@ -203,6 +203,32 @@
203
  "url": "TheBloke/ALMA-13B-GPTQ",
204
  "type": "huggingface"
205
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
  ]
207
  },
208
  // Configuration options that will be used if they are not specified in the command line arguments.
 
203
  "url": "TheBloke/ALMA-13B-GPTQ",
204
  "type": "huggingface"
205
  },
206
+ {
207
+ "name": "ALMA-7B-GGUF-Q4_K_M/TheBloke",
208
+ "url": "TheBloke/ALMA-7B-GGUF",
209
+ "type": "huggingface",
210
+ "model_file": "alma-7b.Q4_K_M.gguf",
211
+ "tokenizer_url": "haoranxu/ALMA-7B"
212
+ },
213
+ {
214
+ "name": "ALMA-13B-GGUF-Q4_K_M/TheBloke",
215
+ "url": "TheBloke/ALMA-13B-GGUF",
216
+ "type": "huggingface",
217
+ "model_file": "alma-13b.Q4_K_M.gguf",
218
+ "tokenizer_url": "haoranxu/ALMA-13B"
219
+ },
220
+ {
221
+ "name": "ALMA-7B-ct2:int8_float16/avan",
222
+ "url": "avans06/ALMA-7B-ct2-int8_float16",
223
+ "type": "huggingface",
224
+ "tokenizer_url": "haoranxu/ALMA-7B"
225
+ },
226
+ {
227
+ "name": "ALMA-13B-ct2:int8_float16/avan",
228
+ "url": "avans06/ALMA-13B-ct2-int8_float16",
229
+ "type": "huggingface",
230
+ "tokenizer_url": "haoranxu/ALMA-13B"
231
+ },
232
  ]
233
  },
234
  // Configuration options that will be used if they are not specified in the command line arguments.
docs/options.md CHANGED
@@ -67,6 +67,9 @@ If set, any adjacent speech sections that are at most this number of seconds apa
67
  ## VAD - Max Merge Size (s)
68
  Disables merging of adjacent speech sections if they are this number of seconds long.
69
 
 
 
 
70
  ## VAD - Padding (s)
71
  The number of seconds (floating point) to add to the beginning and end of each speech section. Setting this to a number
72
  larger than zero ensures that Whisper is more likely to correctly transcribe a sentence in the beginning of
 
67
  ## VAD - Max Merge Size (s)
68
  Disables merging of adjacent speech sections if they are this number of seconds long.
69
 
70
+ ## VAD - Process Timeout (s)
71
+ This configures the number of seconds until the VAD process is killed due to inactivity, freeing RAM and video memory. The default value is 1800 seconds (30 minutes).
72
+
73
  ## VAD - Padding (s)
74
  The number of seconds (floating point) to add to the beginning and end of each speech section. Setting this to a number
75
  larger than zero ensures that Whisper is more likely to correctly transcribe a sentence in the beginning of
docs/translateModel.md CHANGED
@@ -20,7 +20,7 @@ M2M100 is a multilingual translation model introduced by Facebook AI in October
20
 
21
  | Name | Parameters | Size | type/quantize | Required VRAM |
22
  |------|------------|------|---------------|---------------|
23
- | [facebook/m2m100_418M](https://huggingface.co/facebook/m2m100_418M) | 480M | 1.94 GB | float32 | ≈2 GB |
24
  | [facebook/m2m100_1.2B](https://huggingface.co/facebook/m2m100_1.2B) | 1.2B | 4.96 GB | float32 | ≈5 GB |
25
  | [facebook/m2m100-12B-last-ckpt](https://huggingface.co/facebook/m2m100-12B-last-ckpt) | 12B | 47.2 GB | float32 | N/A |
26
 
@@ -28,7 +28,7 @@ M2M100 is a multilingual translation model introduced by Facebook AI in October
28
 
29
  | Name | Parameters | Size | type/quantize | Required VRAM |
30
  |------|------------|------|---------------|---------------|
31
- | [michaelfeil/ct2fast-m2m100_418M](https://huggingface.co/michaelfeil/ct2fast-m2m100_418M) | 480M | 970 MB | float16 | ≈0.6 GB |
32
  | [michaelfeil/ct2fast-m2m100_1.2B](https://huggingface.co/michaelfeil/ct2fast-m2m100_1.2B) | 1.2B | 2.48 GB | float16 | ≈1.3 GB |
33
  | [michaelfeil/ct2fast-m2m100-12B-last-ckpt](https://huggingface.co/michaelfeil/ct2fast-m2m100-12B-last-ckpt) | 12B | 23.6 GB | float16 | N/A |
34
 
@@ -73,7 +73,6 @@ The 'mt5-zh-ja-en-trimmed' model is finetuned from Google's 'mt5-base' model. Th
73
 
74
  ## ALMA
75
 
76
- ALMA is an excellent translation model, but it is strongly discouraged to operate it on CPU.
77
  ALMA is a many-to-many LLM-based translation model introduced by Haoran Xu and colleagues in September 2023. It is based on the fine-tuning of a large language model (LLaMA-2). The approach used for this model is referred to as Advanced Language Model-based trAnslator (ALMA). The paper is titled "`A Paradigm Shift in Machine Translation: Boosting Translation Performance of Large Language Models`" ([arXiv:2309.11674](https://arxiv.org/abs/2309.11674)).
78
  The official support for ALMA currently includes 10 language directions: English↔German, English↔Czech, English↔Icelandic, English↔Chinese, and English↔Russian. However, the author hints that there might be surprises in other directions, so there are currently no restrictions on the languages that ALMA can be chosen for in the web UI.
79
 
@@ -84,12 +83,33 @@ The official support for ALMA currently includes 10 language directions: English
84
 
85
  ## ALMA-GPTQ
86
 
 
87
  GPTQ is a technique used to quantize the parameters of large language models into integer formats such as int8 or int4. Although the quantization process may lead to a loss in model performance, it significantly reduces both file size and the required VRAM.
88
 
89
  | Name | Parameters | Size | type/quantize | Required VRAM |
90
  |------|------------|------|---------------|---------------|
91
  | [TheBloke/ALMA-7B-GPTQ](https://huggingface.co/TheBloke/ALMA-7B-GPTQ) | 7B | 3.9 GB | 4 Bits | ≈4.3 GB |
92
- | [TheBloke/ALMA-13B-GPTQ](https://huggingface.co/TheBloke/ALMA-13B-GPTQ) | 13B | 7.26 GB | 4 Bits | ≈8.1 |
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
 
94
 
95
  # Options
 
20
 
21
  | Name | Parameters | Size | type/quantize | Required VRAM |
22
  |------|------------|------|---------------|---------------|
23
+ | [facebook/m2m100_418M](https://huggingface.co/facebook/m2m100_418M) | 418M | 1.94 GB | float32 | ≈2 GB |
24
  | [facebook/m2m100_1.2B](https://huggingface.co/facebook/m2m100_1.2B) | 1.2B | 4.96 GB | float32 | ≈5 GB |
25
  | [facebook/m2m100-12B-last-ckpt](https://huggingface.co/facebook/m2m100-12B-last-ckpt) | 12B | 47.2 GB | float32 | N/A |
26
 
 
28
 
29
  | Name | Parameters | Size | type/quantize | Required VRAM |
30
  |------|------------|------|---------------|---------------|
31
+ | [michaelfeil/ct2fast-m2m100_418M](https://huggingface.co/michaelfeil/ct2fast-m2m100_418M) | 418M | 970 MB | float16 | ≈0.6 GB |
32
  | [michaelfeil/ct2fast-m2m100_1.2B](https://huggingface.co/michaelfeil/ct2fast-m2m100_1.2B) | 1.2B | 2.48 GB | float16 | ≈1.3 GB |
33
  | [michaelfeil/ct2fast-m2m100-12B-last-ckpt](https://huggingface.co/michaelfeil/ct2fast-m2m100-12B-last-ckpt) | 12B | 23.6 GB | float16 | N/A |
34
 
 
73
 
74
  ## ALMA
75
 
 
76
  ALMA is a many-to-many LLM-based translation model introduced by Haoran Xu and colleagues in September 2023. It is based on the fine-tuning of a large language model (LLaMA-2). The approach used for this model is referred to as Advanced Language Model-based trAnslator (ALMA). The paper is titled "`A Paradigm Shift in Machine Translation: Boosting Translation Performance of Large Language Models`" ([arXiv:2309.11674](https://arxiv.org/abs/2309.11674)).
77
  The official support for ALMA currently includes 10 language directions: English↔German, English↔Czech, English↔Icelandic, English↔Chinese, and English↔Russian. However, the author hints that there might be surprises in other directions, so there are currently no restrictions on the languages that ALMA can be chosen for in the web UI.
78
 
 
83
 
84
  ## ALMA-GPTQ
85
 
86
+ Because GPTQ is poorly supported on CPUs, the execution time per iteration exceeds a thousand seconds when running on a CPU. Running the GPTQ models on a CPU is therefore strongly discouraged.
87
  GPTQ is a technique used to quantize the parameters of large language models into integer formats such as int8 or int4. Although the quantization process may lead to a loss in model performance, it significantly reduces both file size and the required VRAM.
88
 
89
  | Name | Parameters | Size | type/quantize | Required VRAM |
90
  |------|------------|------|---------------|---------------|
91
  | [TheBloke/ALMA-7B-GPTQ](https://huggingface.co/TheBloke/ALMA-7B-GPTQ) | 7B | 3.9 GB | 4 Bits | ≈4.3 GB |
92
+ | [TheBloke/ALMA-13B-GPTQ](https://huggingface.co/TheBloke/ALMA-13B-GPTQ) | 13B | 7.26 GB | 4 Bits | ≈8.4 GB |
93
+
94
+ ## ALMA-GGUF
95
+
96
+ [GGUF](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md) is a new format introduced by the llama.cpp team on August 21st 2023. It is a replacement for GGML, which is no longer supported by llama.cpp.
97
+ GGUF is a file format for storing models for inference with GGML and executors based on GGML. GGUF is a binary format that is designed for fast loading and saving of models, and for ease of reading. Models are traditionally developed using PyTorch or another framework, and then converted to GGUF for use in GGML.
98
+ [k-quants](https://github.com/ggerganov/llama.cpp/pull/1684): a series of 2-6 bit quantization methods, along with quantization mixes
99
+
100
+ | Name | Parameters | Size | type/quantize | Required VRAM |
101
+ |------|------------|------|---------------|---------------|
102
+ | [TheBloke/ALMA-7B-GGUF-Q4_K_M](https://huggingface.co/TheBloke/ALMA-7B-GGUF) | 7B | 4.08 GB | Q4_K_M(4 Bits medium) | ≈5.3 GB |
103
+ | [TheBloke/ALMA-13B-GGUF-Q4_K_M](https://huggingface.co/TheBloke/ALMA-13B-GGUF) | 13B | 7.87 GB | Q4_K_M(4 Bits medium) | ≈9.3 GB |
104
+
105
+ ## ALMA-CTranslate2
106
+
107
+ [CTranslate2](https://opennmt.net/CTranslate2/) does not currently support 4-bit quantization. Currently, it can only use int8_float16 quantization, so the file size and required VRAM will be larger than the GPTQ model quantized with 4 bits. However, it runs much faster on the CPU than GPTQ. If you plan to run ALMA in an environment without a GPU, you may consider choosing the CTranslate2 version of the ALMA model.
108
+
109
+ | Name | Parameters | Size | type/quantize | Required VRAM |
110
+ |------|------------|------|---------------|---------------|
111
+ | [avans06/ALMA-7B-ct2-int8_float16](https://huggingface.co/avans06/ALMA-7B-ct2-int8_float16) | 7B | 6.74 GB | int8_float16 | ≈6.6 GB |
112
+ | [avans06/ALMA-13B-ct2-int8_float16](https://huggingface.co/avans06/ALMA-13B-ct2-int8_float16) | 13B | 13 GB | int8_float16 | ≈12.6 GB |
113
 
114
 
115
  # Options
requirements-fasterWhisper.txt CHANGED
@@ -17,7 +17,9 @@ srt
17
  torch
18
  https://github.com/pyannote/pyannote-audio/archive/refs/heads/develop.zip
19
 
20
- # Needed by ALMA(GPTQ)
21
  accelerate
22
  auto-gptq
23
- optimum
 
 
 
17
  torch
18
  https://github.com/pyannote/pyannote-audio/archive/refs/heads/develop.zip
19
 
20
+ # Needed by ALMA-GPTQ
21
  accelerate
22
  auto-gptq
23
+ optimum
24
+ # Needed by ALMA-GGUF
25
+ ctransformers[cuda]
requirements-whisper.txt CHANGED
@@ -16,7 +16,9 @@ srt
16
  torch
17
  https://github.com/pyannote/pyannote-audio/archive/refs/heads/develop.zip
18
 
19
- # Needed by ALMA(GPTQ)
20
  accelerate
21
  auto-gptq
22
- optimum
 
 
 
16
  torch
17
  https://github.com/pyannote/pyannote-audio/archive/refs/heads/develop.zip
18
 
19
+ # Needed by ALMA-GPTQ
20
  accelerate
21
  auto-gptq
22
+ optimum
23
+ # Needed by ALMA-GGUF
24
+ ctransformers[cuda]
requirements.txt CHANGED
@@ -17,7 +17,9 @@ srt
17
  torch
18
  https://github.com/pyannote/pyannote-audio/archive/refs/heads/develop.zip
19
 
20
- # Needed by ALMA(GPTQ)
21
  accelerate
22
  auto-gptq
23
- optimum
 
 
 
17
  torch
18
  https://github.com/pyannote/pyannote-audio/archive/refs/heads/develop.zip
19
 
20
+ # Needed by ALMA-GPTQ
21
  accelerate
22
  auto-gptq
23
+ optimum
24
+ # Needed by ALMA-GGUF
25
+ ctransformers[cuda]
src/config.py CHANGED
@@ -5,7 +5,7 @@ from typing import List, Dict, Literal
5
 
6
 
7
  class ModelConfig:
8
- def __init__(self, name: str, url: str, path: str = None, type: str = "whisper", tokenizer_url: str = None, revision: str = None):
9
  """
10
  Initialize a model configuration.
11
 
@@ -17,6 +17,7 @@ class ModelConfig:
17
  It can be a branch name, a tag name, or a commit id,
18
  since we use a git-based system for storing models and other artifacts on huggingface.co,
19
  so revision can be any identifier allowed by git.
 
20
  """
21
  self.name = name
22
  self.url = url
@@ -24,6 +25,7 @@ class ModelConfig:
24
  self.type = type
25
  self.tokenizer_url = tokenizer_url
26
  self.revision = revision
 
27
 
28
  VAD_INITIAL_PROMPT_MODE_VALUES=["prepend_all_segments", "prepend_first_segment", "json_prompt_mode"]
29
 
 
5
 
6
 
7
  class ModelConfig:
8
+ def __init__(self, name: str, url: str, path: str = None, type: str = "whisper", tokenizer_url: str = None, revision: str = None, model_file: str = None,):
9
  """
10
  Initialize a model configuration.
11
 
 
17
  It can be a branch name, a tag name, or a commit id,
18
  since we use a git-based system for storing models and other artifacts on huggingface.co,
19
  so revision can be any identifier allowed by git.
20
+ model_file: The name of the model file in repo or directory.[from marella/ctransformers]
21
  """
22
  self.name = name
23
  self.url = url
 
25
  self.type = type
26
  self.tokenizer_url = tokenizer_url
27
  self.revision = revision
28
+ self.model_file = model_file
29
 
30
  VAD_INITIAL_PROMPT_MODE_VALUES=["prepend_all_segments", "prepend_first_segment", "json_prompt_mode"]
31
 
src/translation/translationModel.py CHANGED
@@ -65,7 +65,7 @@ class TranslationModel:
65
  if os.path.isdir(modelConfig.url):
66
  self.modelPath = modelConfig.url
67
  else:
68
- self.modelPath = download_model(
69
  modelConfig,
70
  localFilesOnly=localFilesOnly,
71
  cacheDir=downloadRoot,
@@ -137,6 +137,12 @@ class TranslationModel:
137
  If you're doing inference on a CPU with AutoGPTQ (version > 0.4.2), then you'll need to disable the ExLlama kernel.
138
  This overwrites the attributes related to the ExLlama kernels in the quantization config of the config.json file.
139
  https://github.com/huggingface/transformers/blob/main/docs/source/en/quantization.md#exllama
 
 
 
 
 
 
140
  """
141
  try:
142
  print('\n\nLoading model: %s\n\n' % self.modelPath)
@@ -152,7 +158,7 @@ class TranslationModel:
152
  elif "ALMA" in self.modelPath:
153
  self.transTokenizer = transformers.AutoTokenizer.from_pretrained(self.modelConfig.tokenizer_url if self.modelConfig.tokenizer_url is not None and len(self.modelConfig.tokenizer_url) > 0 else self.modelPath)
154
  self.ALMAPrefix = "Translate this from " + self.whisperLang.whisper.names[0] + " to " + self.translationLang.whisper.names[0] + ":\n" + self.whisperLang.whisper.names[0] + ": "
155
- self.transModel = ctranslate2.Generator(self.modelPath, device=self.device)
156
  elif "mt5" in self.modelPath:
157
  self.mt5Prefix = self.whisperLang.whisper.code + "2" + self.translationLang.whisper.code + ": "
158
  self.transTokenizer = transformers.T5Tokenizer.from_pretrained(self.modelPath, legacy=False) #requires spiece.model
@@ -160,16 +166,24 @@ class TranslationModel:
160
  self.transTranslator = transformers.pipeline('text2text-generation', model=self.transModel, device=self.device, tokenizer=self.transTokenizer)
161
  elif "ALMA" in self.modelPath:
162
  self.ALMAPrefix = "Translate this from " + self.whisperLang.whisper.names[0] + " to " + self.translationLang.whisper.names[0] + ":\n" + self.whisperLang.whisper.names[0] + ": "
163
- self.transTokenizer = transformers.AutoTokenizer.from_pretrained(self.modelPath, use_fast=True)
164
- transModelConfig = transformers.AutoConfig.from_pretrained(self.modelPath)
165
- if self.device == "cpu":
166
- # ALMA is an excellent translation model, but it is strongly discouraged to operate it on CPU.
167
- # set torch_dtype=torch.float32 to prevent the occurrence of the exception "addmm_impl_cpu_ not implemented for 'Half'."
168
- transModelConfig.quantization_config["use_exllama"] = False
169
- self.transModel = transformers.AutoModelForCausalLM.from_pretrained(self.modelPath, device_map="auto", low_cpu_mem_usage=True, trust_remote_code=False, revision=self.modelConfig.revision, config=transModelConfig, torch_dtype=torch.float32)
170
- else:
171
- # transModelConfig.quantization_config["exllama_config"] = {"version":2} # After configuring to use ExLlamaV2, VRAM cannot be effectively released, which may be an issue. Temporarily not adopting the V2 version.
172
- self.transModel = transformers.AutoModelForCausalLM.from_pretrained(self.modelPath, device_map="auto", low_cpu_mem_usage=True, trust_remote_code=False, revision=self.modelConfig.revision)
 
 
 
 
 
 
 
 
173
  self.transTranslator = transformers.pipeline("text-generation", model=self.transModel, tokenizer=self.transTokenizer, do_sample=True, temperature=0.7, top_k=40, top_p=0.95, repetition_penalty=1.1)
174
  else:
175
  self.transTokenizer = transformers.AutoTokenizer.from_pretrained(self.modelPath)
@@ -180,22 +194,31 @@ class TranslationModel:
180
  self.transTranslator = transformers.pipeline('translation', model=self.transModel, device=self.device, tokenizer=self.transTokenizer, src_lang=self.whisperLang.nllb.code, tgt_lang=self.translationLang.nllb.code)
181
 
182
  except Exception as e:
183
- print(traceback.format_exc())
184
  self.release_vram()
 
 
185
 
186
  def release_vram(self):
187
  try:
188
  if torch.cuda.is_available():
189
  if "ct2" not in self.modelPath:
190
  try:
191
- device = torch.device("cpu")
192
- self.transModel.to(device)
 
193
  except Exception as e:
194
  print(traceback.format_exc())
195
  print("\tself.transModel.to cpu, error: " + str(e))
196
- del self.transTranslator
197
- del self.transTokenizer
198
- del self.transModel
 
 
 
 
 
 
 
199
  try:
200
  torch.cuda.empty_cache()
201
  except Exception as e:
@@ -205,6 +228,7 @@ class TranslationModel:
205
  gc.collect()
206
  print("release vram end.")
207
  except Exception as e:
 
208
  print("Error release vram: " + str(e))
209
 
210
 
@@ -257,7 +281,10 @@ class TranslationModel:
257
  output = self.transTranslator(self.mt5Prefix + text, max_length=max_length, batch_size=self.batchSize, no_repeat_ngram_size=self.noRepeatNgramSize, num_beams=self.numBeams) #, num_return_sequences=2
258
  result = output[0]['generated_text']
259
  elif "ALMA" in self.modelPath:
260
- output = self.transTranslator(self.ALMAPrefix + text + "\n" + self.translationLang.whisper.names[0] + ": ", max_length=max_length, batch_size=self.batchSize, no_repeat_ngram_size=self.noRepeatNgramSize, num_beams=self.numBeams, return_full_text=False)
 
 
 
261
  result = output[0]['generated_text']
262
  else: #M2M100 & NLLB
263
  output = self.transTranslator(text, max_length=max_length, batch_size=self.batchSize, no_repeat_ngram_size=self.noRepeatNgramSize, num_beams=self.numBeams)
@@ -332,7 +359,8 @@ def download_model(
332
  "vocab.json", #m2m100
333
  "model.safetensors",
334
  "quantize_config.json",
335
- "tokenizer.model"
 
336
  ]
337
 
338
  kwargs = {
 
65
  if os.path.isdir(modelConfig.url):
66
  self.modelPath = modelConfig.url
67
  else:
68
+ self.modelPath = modelConfig.url if getattr(modelConfig, "model_file", None) is not None else download_model(
69
  modelConfig,
70
  localFilesOnly=localFilesOnly,
71
  cacheDir=downloadRoot,
 
137
  If you're doing inference on a CPU with AutoGPTQ (version > 0.4.2), then you'll need to disable the ExLlama kernel.
138
  This overwrites the attributes related to the ExLlama kernels in the quantization config of the config.json file.
139
  https://github.com/huggingface/transformers/blob/main/docs/source/en/quantization.md#exllama
140
+
141
+ [ctransformers]
142
+ gpu_layers
143
+ means number of layers to run on GPU. Depending on how much GPU memory is available you can increase gpu_layers. Start with a larger value gpu_layers=100 and if it runs out of memory, try smaller values.
144
+ To run some of the model layers on GPU, set the `gpu_layers` parameter
145
+ https://github.com/marella/ctransformers/issues/68
146
  """
147
  try:
148
  print('\n\nLoading model: %s\n\n' % self.modelPath)
 
158
  elif "ALMA" in self.modelPath:
159
  self.transTokenizer = transformers.AutoTokenizer.from_pretrained(self.modelConfig.tokenizer_url if self.modelConfig.tokenizer_url is not None and len(self.modelConfig.tokenizer_url) > 0 else self.modelPath)
160
  self.ALMAPrefix = "Translate this from " + self.whisperLang.whisper.names[0] + " to " + self.translationLang.whisper.names[0] + ":\n" + self.whisperLang.whisper.names[0] + ": "
161
+ self.transModel = ctranslate2.Generator(self.modelPath, compute_type="auto", device=self.device)
162
  elif "mt5" in self.modelPath:
163
  self.mt5Prefix = self.whisperLang.whisper.code + "2" + self.translationLang.whisper.code + ": "
164
  self.transTokenizer = transformers.T5Tokenizer.from_pretrained(self.modelPath, legacy=False) #requires spiece.model
 
166
  self.transTranslator = transformers.pipeline('text2text-generation', model=self.transModel, device=self.device, tokenizer=self.transTokenizer)
167
  elif "ALMA" in self.modelPath:
168
  self.ALMAPrefix = "Translate this from " + self.whisperLang.whisper.names[0] + " to " + self.translationLang.whisper.names[0] + ":\n" + self.whisperLang.whisper.names[0] + ": "
169
+ if "GPTQ" in self.modelPath:
170
+ self.transTokenizer = transformers.AutoTokenizer.from_pretrained(self.modelPath, use_fast=True)
171
+ if self.device == "cpu":
172
+ # GPTQ is poorly supported on CPUs, so running it on a CPU is strongly discouraged.
173
+ # set torch_dtype=torch.float32 to prevent the occurrence of the exception "addmm_impl_cpu_ not implemented for 'Half'."
174
+ transModelConfig = transformers.AutoConfig.from_pretrained(self.modelPath)
175
+ transModelConfig.quantization_config["use_exllama"] = False
176
+ self.transModel = transformers.AutoModelForCausalLM.from_pretrained(self.modelPath, device_map="auto", low_cpu_mem_usage=True, trust_remote_code=False, revision=self.modelConfig.revision, config=transModelConfig, torch_dtype=torch.float32)
177
+ else:
178
+ # transModelConfig.quantization_config["exllama_config"] = {"version":2} # After configuring to use ExLlamaV2, VRAM cannot be effectively released, which may be an issue. Temporarily not adopting the V2 version.
179
+ self.transModel = transformers.AutoModelForCausalLM.from_pretrained(self.modelPath, device_map="auto", low_cpu_mem_usage=True, trust_remote_code=False, revision=self.modelConfig.revision)
180
+ elif "GGUF" in self.modelPath:
181
+ import ctransformers
182
+ self.transTokenizer = transformers.AutoTokenizer.from_pretrained(self.modelConfig.tokenizer_url)
183
+ if self.device == "cpu":
184
+ self.transModel = ctransformers.AutoModelForCausalLM.from_pretrained(self.modelPath, hf=True, model_file=self.modelConfig.model_file)
185
+ else:
186
+ self.transModel = ctransformers.AutoModelForCausalLM.from_pretrained(self.modelPath, hf=True, model_file=self.modelConfig.model_file, gpu_layers=50)
187
  self.transTranslator = transformers.pipeline("text-generation", model=self.transModel, tokenizer=self.transTokenizer, do_sample=True, temperature=0.7, top_k=40, top_p=0.95, repetition_penalty=1.1)
188
  else:
189
  self.transTokenizer = transformers.AutoTokenizer.from_pretrained(self.modelPath)
 
194
  self.transTranslator = transformers.pipeline('translation', model=self.transModel, device=self.device, tokenizer=self.transTokenizer, src_lang=self.whisperLang.nllb.code, tgt_lang=self.translationLang.nllb.code)
195
 
196
  except Exception as e:
 
197
  self.release_vram()
198
+ raise e
199
+
200
 
201
  def release_vram(self):
202
  try:
203
  if torch.cuda.is_available():
204
  if "ct2" not in self.modelPath:
205
  try:
206
+ if getattr(self, "transModel", None) is not None:
207
+ device = torch.device("cpu")
208
+ self.transModel.to(device)
209
  except Exception as e:
210
  print(traceback.format_exc())
211
  print("\tself.transModel.to cpu, error: " + str(e))
212
+ if getattr(self, "transTranslator", None) is not None:
213
+ del self.transTranslator
214
+ if "ct2" in self.modelPath:
215
+ if getattr(self, "transModel", None) is not None and getattr(self.transModel, "unload_model", None) is not None:
216
+ self.transModel.unload_model()
217
+
218
+ if getattr(self, "transTokenizer", None) is not None:
219
+ del self.transTokenizer
220
+ if getattr(self, "transModel", None) is not None:
221
+ del self.transModel
222
  try:
223
  torch.cuda.empty_cache()
224
  except Exception as e:
 
228
  gc.collect()
229
  print("release vram end.")
230
  except Exception as e:
231
+ print(traceback.format_exc())
232
  print("Error release vram: " + str(e))
233
 
234
 
 
281
  output = self.transTranslator(self.mt5Prefix + text, max_length=max_length, batch_size=self.batchSize, no_repeat_ngram_size=self.noRepeatNgramSize, num_beams=self.numBeams) #, num_return_sequences=2
282
  result = output[0]['generated_text']
283
  elif "ALMA" in self.modelPath:
284
+ if "GPTQ" in self.modelPath:
285
+ output = self.transTranslator(self.ALMAPrefix + text + "\n" + self.translationLang.whisper.names[0] + ": ", max_length=max_length, batch_size=self.batchSize, no_repeat_ngram_size=self.noRepeatNgramSize, num_beams=self.numBeams, return_full_text=False)
286
+ elif "GGUF" in self.modelPath:
287
+ output = self.transTranslator(self.ALMAPrefix + text + "\n" + self.translationLang.whisper.names[0] + ": ", max_length=max_length, batch_size=self.batchSize, no_repeat_ngram_size=self.noRepeatNgramSize, num_beams=self.numBeams, return_full_text=False)
288
  result = output[0]['generated_text']
289
  else: #M2M100 & NLLB
290
  output = self.transTranslator(text, max_length=max_length, batch_size=self.batchSize, no_repeat_ngram_size=self.noRepeatNgramSize, num_beams=self.numBeams)
 
359
  "vocab.json", #m2m100
360
  "model.safetensors",
361
  "quantize_config.json",
362
+ "tokenizer.model",
363
+ "vocabulary.json"
364
  ]
365
 
366
  kwargs = {