Commit 4fa997e by Yuekai Zhang (1 parent: 21f2415)

update files
README.md CHANGED
@@ -1,5 +1,5 @@
-### usage
-
+### Tutorial
+Start Server
 ```
 docker pull soar97/triton-wenet:22.12
 docker run -it --rm --name "wenet_tlg_test" --gpus all --shm-size 1g --net host soar97/triton-wenet:22.12
@@ -8,4 +8,13 @@ git clone https://huggingface.co/yuekai/model_repo_conformer_aishell_wenet_tlg.git
 cd model_repo_conformer_aishell_wenet_tlg
 bash run.sh
 ```
+Start Client
+```
+pip3 install tritonclient[all]==2.29
+apt-get install -y libsndfile1
+pip3 install soundfile
+
+python3 generate_perf_input.py --audio_file ./mid.wav
+perf_analyzer -m attention_rescoring -b 1 -p 20000 --concurrency-range 100 -i gRPC --input-data=offline_input.json -u localhost:8001
+```
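For reference, here is a minimal offline gRPC client sketch as an alternative to perf_analyzer. The WAV/WAV_LENS input names mirror generate_perf_input.py below; the TRANSCRIPTS output name and the exact dtypes are assumptions about this model repo, not confirmed by the diff.

```
import numpy as np
import soundfile as sf
import tritonclient.grpc as grpcclient

# Read the test wav shipped with this commit.
waveform, sample_rate = sf.read("mid.wav")
samples = np.array([waveform], dtype=np.float32)       # [1, num_samples]
lengths = np.array([[len(waveform)]], dtype=np.int32)  # [1, 1]

client = grpcclient.InferenceServerClient(url="localhost:8001")
inputs = [
    grpcclient.InferInput("WAV", list(samples.shape), "FP32"),
    grpcclient.InferInput("WAV_LENS", list(lengths.shape), "INT32"),
]
inputs[0].set_data_from_numpy(samples)
inputs[1].set_data_from_numpy(lengths)

# "TRANSCRIPTS" is the assumed output name of the attention_rescoring ensemble.
result = client.infer("attention_rescoring", inputs,
                      outputs=[grpcclient.InferRequestedOutput("TRANSCRIPTS")])
print(result.as_numpy("TRANSCRIPTS")[0].decode("utf-8"))
```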
generate_perf_input.py ADDED
@@ -0,0 +1,134 @@
+import os
+import json
+import soundfile as sf
+import numpy as np
+import argparse
+import math
+
+
+def generate_offline_input(args):
+    wav_file = args.audio_file
+    print("Reading {}".format(wav_file))
+    waveform, sample_rate = sf.read(wav_file)
+    batch_size = 1
+    mat = np.array([waveform] * batch_size, dtype=np.float32)
+
+    out_dict = {
+        "data": [
+            {
+                "WAV_LENS": [len(waveform)],
+                "WAV": {
+                    "shape": [len(waveform)],
+                    "content": mat.flatten().tolist(),
+                },
+            }
+        ]
+    }
+    json.dump(out_dict, open("offline_input.json", "w"))
+
+
+def generate_online_input(args):
+    wav_file = args.audio_file
+    waveform, sample_rate = sf.read(wav_file)
+    chunk_size, subsampling = args.chunk_size, args.subsampling
+    context = args.context
+    first_chunk_length = (chunk_size - 1) * subsampling + context
+    frame_length_ms, frame_shift_ms = args.frame_length_ms, args.frame_shift_ms
+    # for the first chunk,
+    # we need additional frames to generate the exact first chunk length frames
+    add_frames = math.ceil((frame_length_ms - frame_shift_ms) / frame_shift_ms)
+    first_chunk_ms = (first_chunk_length + add_frames) * frame_shift_ms
+    other_chunk_ms = chunk_size * subsampling * frame_shift_ms
+    first_chunk_s = first_chunk_ms / 1000
+    other_chunk_s = other_chunk_ms / 1000
+
+    wav_segs = []
+    i = 0
+    while i < len(waveform):
+        if i == 0:
+            stride = int(first_chunk_s * sample_rate)
+            wav_segs.append(waveform[i : i + stride])
+        else:
+            stride = int(other_chunk_s * sample_rate)
+            wav_segs.append(waveform[i : i + stride])
+        i += len(wav_segs[-1])
+
+    data = {"data": [[]]}
+
+    for idx, seg in enumerate(wav_segs):  # 0, num_frames + 5, 64
+        chunk_len = len(seg)
+        if idx == 0:
+            length = int(first_chunk_s * sample_rate)
+            expect_input = np.zeros((1, length), dtype=np.float32)
+        else:
+            length = int(other_chunk_s * sample_rate)
+            expect_input = np.zeros((1, length), dtype=np.float32)
+
+        expect_input[0][0:chunk_len] = seg
+
+        flat_chunk = expect_input.flatten().astype(np.float32).tolist()
+        seq = {
+            "WAV": {"content": flat_chunk, "shape": expect_input[0].shape},
+            "WAV_LENS": [chunk_len],
+        }
+        data["data"][0].append(seq)
+
+    json.dump(data, open("online_input.json", "w"))
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--audio_file", type=str, default=None, help="single wav file"
+    )
+    # below is only for streaming input
+    parser.add_argument("--streaming", action="store_true", required=False)
+    parser.add_argument(
+        "--sample_rate",
+        type=int,
+        required=False,
+        default=16000,
+        help="sample rate used in training",
+    )
+    parser.add_argument(
+        "--frame_length_ms",
+        type=int,
+        required=False,
+        default=25,
+        help="frame length used in training",
+    )
+    parser.add_argument(
+        "--frame_shift_ms",
+        type=int,
+        required=False,
+        default=10,
+        help="frame shift length used in training",
+    )
+    parser.add_argument(
+        "--chunk_size",
+        type=int,
+        required=False,
+        default=16,
+        help="chunk size default is 16",
+    )
+    parser.add_argument(
+        "--context",
+        type=int,
+        required=False,
+        default=7,
+        help="conformer context default is 7",
+    )
+    parser.add_argument(
+        "--subsampling",
+        type=int,
+        required=False,
+        default=4,
+        help="subsampling rate default is 4",
+    )
+
+    args = parser.parse_args()
+
+    if args.streaming and os.path.exists(args.audio_file):
+        generate_online_input(args)
+    else:
+        generate_offline_input(args)
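For orientation, the offline_input.json written by generate_offline_input above follows perf_analyzer's real-input-data layout; with a toy 3-sample waveform (illustrative values, not taken from mid.wav) it would look like:

```
{"data": [{"WAV_LENS": [3], "WAV": {"shape": [3], "content": [0.0, 0.1, -0.1]}}]}
```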
mid.wav ADDED
Binary file (160 kB).
model_repo_cuda_decoder/attention_rescoring/1/.gitkeep ADDED
File without changes
model_repo_cuda_decoder/scoring/1/decoder.py CHANGED
@@ -3,6 +3,7 @@ import torch
 from typing import List
 from riva.asrlib.decoder.python_decoder import (BatchedMappedDecoderCuda,
                                                 BatchedMappedDecoderCudaConfig)
+from frame_reducer import FrameReducer
 
 def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor:
     """Make mask tensor containing indices of padded part.
@@ -81,52 +82,53 @@ class RivaWFSTDecoder:
 
         config.online_opts.lattice_postprocessor_opts.nbest = beam_size
 
+        # config.online_opts.decoder_opts.blank_penalty = -5.0
+
         self.decoder = BatchedMappedDecoderCuda(
             config, os.path.join(tlg_dir, "TLG.fst"),
             os.path.join(tlg_dir, "words.txt"), vocab_size
         )
         self.word_id_to_word_str = load_word_symbols(os.path.join(tlg_dir, "words.txt"))
         self.nbest = beam_size
+        self.vocab_size = vocab_size
+        self.frame_reducer = FrameReducer(0.98)
 
     def decode_nbest(self, logits, length):
+        logits, length = self.frame_reducer(logits, length.cuda(), logits)
         logits = logits.to(torch.float32).contiguous()
         sequence_lengths_tensor = length.to(torch.long).to('cpu').contiguous()
-        before = logits.shape
-        if logits.shape[0] == 1:
-            logits = logits.repeat(2,1,1)
-            sequence_lengths_tensor = sequence_lengths_tensor.repeat(2)
-        print(before, logits.shape)
         results = self.decoder.decode_nbest(logits, sequence_lengths_tensor)
-        if logits.shape[0] == 1:
-            results = results[0:1]
         total_hyps, total_hyps_id = [], []
+        max_hyp_len = 3
         for nbest_sentences in results:
-            nbest_list, nbest_id_list = []
+            nbest_list, nbest_id_list = [], []
             for sent in nbest_sentences:
-                # subtract 1 to get the label id, since fst decoder adds 1 to the label id
+                # subtract 1 to get the label id,
+                # since fst decoder adds 1 to the label id
                 hyp_ids = [label - 1 for label in sent.ilabels]
-                new_hyp = remove_duplicates_and_blank(hyp_ids, eos=self.vocab_size-1, blank_id=0)
+                # padding for hyps_pad_sos_eos
+                new_hyp = [self.vocab_size - 1] + remove_duplicates_and_blank(hyp_ids, eos=self.vocab_size - 1, blank_id=0) + [self.vocab_size - 1]  # noqa
+                max_hyp_len = max(max_hyp_len, len(new_hyp))
                 nbest_id_list.append(new_hyp)
 
-                hyp = "".join(self.word_id_to_word_str[word] for word in sent.words if word != 0)
+                hyp = "".join(self.word_id_to_word_str[word]
+                              for word in sent.words if word != 0)
                 nbest_list.append(hyp)
-
+            nbest_list += [""] * (self.nbest - len(nbest_list))
             total_hyps.append(nbest_list)
+            nbest_id_list += [[self.vocab_size - 1, 0, self.vocab_size - 1]] * (self.nbest - len(nbest_id_list))  # noqa
            total_hyps_id.append(nbest_id_list)
-        return total_hyps, total_hyps_id
+        return total_hyps, total_hyps_id, max_hyp_len
 
     def decode_mbr(self, logits, length):
+        logits, length = self.frame_reducer(logits, length.cuda(), logits)
+        # logits[:,:,0] -= 2.0
         logits = logits.to(torch.float32).contiguous()
         sequence_lengths_tensor = length.to(torch.long).to('cpu').contiguous()
-        if logits.shape[0] == 1:
-            logits = logits.repeat(2,1,1)
-            sequence_lengths_tensor = sequence_lengths_tensor.repeat(2)
         results = self.decoder.decode_mbr(logits, sequence_lengths_tensor)
-        if logits.shape[0] == 1:
-            results = results[0:1]
         total_hyps = []
         for sent in results:
            hyp = [word[0] for word in sent]
            hyp_zh = "".join(hyp)
            total_hyps.append(hyp_zh)
-        return total_hyps
+        return total_hyps
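Note: remove_duplicates_and_blank is defined elsewhere in this repo and not shown in this diff. As a rough sketch only, a standard CTC collapse matching the signature used above might look like this (an assumption about its behavior, not the repo's actual implementation):

```
from typing import List

def remove_duplicates_and_blank(hyp: List[int], eos: int,
                                blank_id: int = 0) -> List[int]:
    # Standard CTC post-processing: merge consecutive repeats,
    # then drop blank and eos labels.
    out, prev = [], None
    for label in hyp:
        if label != prev and label not in (blank_id, eos):
            out.append(label)
        prev = label
    return out
```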
model_repo_cuda_decoder/scoring/1/frame_reducer.py ADDED
@@ -0,0 +1,193 @@
+#!/usr/bin/env python3
+#
+# Copyright 2022 Xiaomi Corp. (authors: Yifan Yang,
+#                                       Zengwei Yao,
+#                                       Wei Kang)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from typing import Optional, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor:
+    """
+    Args:
+      lengths:
+        A 1-D tensor containing sentence lengths.
+      max_len:
+        The length of masks.
+    Returns:
+      Return a 2-D bool tensor, where masked positions
+      are filled with `True` and non-masked positions are
+      filled with `False`.
+
+    >>> lengths = torch.tensor([1, 3, 2, 5])
+    >>> make_pad_mask(lengths)
+    tensor([[False,  True,  True,  True,  True],
+            [False, False, False,  True,  True],
+            [False, False,  True,  True,  True],
+            [False, False, False, False, False]])
+    """
+    assert lengths.ndim == 1, lengths.ndim
+    max_len = max(max_len, lengths.max())
+    n = lengths.size(0)
+    seq_range = torch.arange(0, max_len, device=lengths.device)
+    expanded_lengths = seq_range.unsqueeze(0).expand(n, max_len)
+
+    return expanded_lengths >= lengths.unsqueeze(-1)
+
+
+class FrameReducer(nn.Module):
+    """The encoder output is first used to calculate
+    the CTC posterior probability; then for each output frame,
+    if its blank posterior is bigger than some threshold,
+    it will be simply discarded from the encoder output.
+    """
+
+    def __init__(
+        self,
+        blank_threshold: float = 0.95,
+    ):
+        super().__init__()
+        self.blank_threshold = blank_threshold
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        x_lens: torch.Tensor,
+        ctc_output: torch.Tensor,
+        y_lens: Optional[torch.Tensor] = None,
+        blank_id: int = 0,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Args:
+          x:
+            The shared encoder output with shape [N, T, C].
+          x_lens:
+            A tensor of shape (batch_size,) containing the number of frames in
+            `x` before padding.
+          ctc_output:
+            The CTC output with shape [N, T, vocab_size].
+          y_lens:
+            A tensor of shape (batch_size,) containing the number of frames in
+            `y` before padding.
+          blank_id:
+            The blank id of ctc_output.
+        Returns:
+          out:
+            The frame reduced encoder output with shape [N, T', C].
+          out_lens:
+            A tensor of shape (batch_size,) containing the number of frames in
+            `out` before padding.
+        """
+        N, T, C = x.size()
+
+        padding_mask = make_pad_mask(x_lens, x.size(1))
+        non_blank_mask = (ctc_output[:, :, blank_id] < math.log(self.blank_threshold)) * (~padding_mask)  # noqa
+
+        if y_lens is not None:
+            # Limit the maximum number of reduced frames
+            limit_lens = T - y_lens
+            max_limit_len = limit_lens.max().int()
+            fake_limit_indexes = torch.topk(
+                ctc_output[:, :, blank_id], max_limit_len
+            ).indices
+            T = (
+                torch.arange(max_limit_len)
+                .expand_as(
+                    fake_limit_indexes,
+                )
+                .to(device=x.device)
+            )
+            T = torch.remainder(T, limit_lens.unsqueeze(1))
+            limit_indexes = torch.gather(fake_limit_indexes, 1, T)
+            limit_mask = torch.full_like(
+                non_blank_mask,
+                False,
+                device=x.device,
+            ).scatter_(1, limit_indexes, True)
+
+            non_blank_mask = non_blank_mask | ~limit_mask
+
+        out_lens = non_blank_mask.sum(dim=1)
+        max_len = out_lens.max()
+        pad_lens_list = (
+            torch.full_like(
+                out_lens,
+                max_len.item(),
+                device=x.device,
+            )
+            - out_lens
+        )
+        max_pad_len = pad_lens_list.max()
+
+        out = F.pad(x, (0, 0, 0, max_pad_len))
+
+        valid_pad_mask = ~make_pad_mask(pad_lens_list)
+        total_valid_mask = torch.concat([non_blank_mask, valid_pad_mask], dim=1)
+
+        out = out[total_valid_mask].reshape(N, -1, C)
+
+        return out, out_lens
+
+
+if __name__ == "__main__":
+    import time
+
+    test_times = 10000
+    device = "cuda:0"
+    frame_reducer = FrameReducer()
+
+    # non zero case
+    x = torch.ones(15, 498, 384, dtype=torch.float32, device=device)
+    x_lens = torch.tensor([498] * 15, dtype=torch.int64, device=device)
+    y_lens = torch.tensor([150] * 15, dtype=torch.int64, device=device)
+    ctc_output = torch.log(
+        torch.randn(15, 498, 500, dtype=torch.float32, device=device),
+    )
+
+    avg_time = 0
+    for i in range(test_times):
+        torch.cuda.synchronize(device=x.device)
+        delta_time = time.time()
+        x_fr, x_lens_fr = frame_reducer(x, x_lens, ctc_output, y_lens)
+        torch.cuda.synchronize(device=x.device)
+        delta_time = time.time() - delta_time
+        avg_time += delta_time
+    print(x_fr.shape)
+    print(x_lens_fr)
+    print(avg_time / test_times)
+
+    # all zero case
+    x = torch.zeros(15, 498, 384, dtype=torch.float32, device=device)
+    x_lens = torch.tensor([498] * 15, dtype=torch.int64, device=device)
+    y_lens = torch.tensor([150] * 15, dtype=torch.int64, device=device)
+    ctc_output = torch.zeros(15, 498, 500, dtype=torch.float32, device=device)
+
+    avg_time = 0
+    for i in range(test_times):
+        torch.cuda.synchronize(device=x.device)
+        delta_time = time.time()
+        x_fr, x_lens_fr = frame_reducer(x, x_lens, ctc_output, y_lens)
+        torch.cuda.synchronize(device=x.device)
+        delta_time = time.time() - delta_time
+        avg_time += delta_time
+    print(x_fr.shape)
+    print(x_lens_fr)
+    print(avg_time / test_times)
model_repo_cuda_decoder/scoring/1/model.py CHANGED
@@ -16,7 +16,7 @@ import triton_python_backend_utils as pb_utils
 import numpy as np
 
 import torch
-from torch.utils.dlpack import from_dlpack
+from torch.utils.dlpack import from_dlpack, to_dlpack
 import json
 import os
 import yaml
@@ -123,7 +123,7 @@ class TritonPythonModel:
         self.eos = eos
         self.ignore_id = ignore_id
 
-        if self.decoding_method == "tlg":
+        if "tlg" in self.decoding_method:
             self.decoder = RivaWFSTDecoder(len(self.vocabulary),
                                            self.tlg_dir,
                                            self.tlg_decoding_config,
@@ -175,12 +175,57 @@ class TritonPythonModel:
         encoder_out_len = torch.cat(encoder_out_lens_list, dim=0)
         return encoder_out, encoder_out_len, logits, batch_count_list
 
-    def rescore_hyps(self, total_hyps, total_tokens, encoder_out, encoder_out_len):
+    def rescore_hyps(self, total_tokens, max_hyp_len, encoder_out, encoder_out_len):
         """
         Rescore the hypotheses with attention rescoring
         """
-        # TODO: add attention rescoring
-        return total_hyps
+        input1 = pb_utils.Tensor.from_dlpack("encoder_out", to_dlpack(encoder_out))
+        input2 = pb_utils.Tensor.from_dlpack("encoder_out_lens",
+                                             to_dlpack(encoder_out_len.unsqueeze(-1)))
+        hyps_pad_sos_eos = np.zeros([len(total_tokens),
+                                     self.beam_size, max_hyp_len], dtype=np.int64)
+        hyps_lens_sos = np.zeros([len(total_tokens), self.beam_size], dtype=np.int32)
+        ctc_scores = np.zeros([len(total_tokens),
+                               self.beam_size], dtype=np.float16)  # TODO: zero here
+
+        for i, hyps in enumerate(total_tokens):
+            for j, hyp in enumerate(hyps):
+                hyps_pad_sos_eos[i][j][:len(hyp)] = hyp
+                hyps_lens_sos[i][j] = len(hyp) - 1
+        input3 = pb_utils.Tensor("hyps_pad_sos_eos", hyps_pad_sos_eos)
+        input4 = pb_utils.Tensor("hyps_lens_sos", hyps_lens_sos)
+        input5 = pb_utils.Tensor("ctc_score", ctc_scores)
+        input_tensors = [input1, input2, input3, input4, input5]
+
+        if self.bidecoder:
+            r_hyps_pad_sos_eos = np.zeros([len(total_tokens),
+                                           self.beam_size, max_hyp_len], dtype=np.int64)
+            for i, hyps in enumerate(total_tokens):
+                for j, hyp in enumerate(hyps):
+                    r_hyps_pad_sos_eos[i][j][:len(hyp)] = hyp[::-1]
+            input6 = pb_utils.Tensor("r_hyps_pad_sos_eos", r_hyps_pad_sos_eos)
+            input_tensors.insert(-1, input6)
+
+        inference_request = pb_utils.InferenceRequest(
+            model_name='decoder',
+            requested_output_names=['best_index'],
+            inputs=input_tensors)
+
+        inference_response = inference_request.exec()
+        if inference_response.has_error():
+            raise pb_utils.TritonModelException(inference_response.error().message())
+        else:
+            # Extract the output tensors from the inference response.
+            best_index = pb_utils.get_output_tensor_by_name(inference_response,
+                                                            'best_index')
+            if best_index.is_cpu():
+                best_index = best_index.as_numpy()
+            else:
+                best_index = from_dlpack(best_index.to_dlpack())
+                best_index = best_index.cpu().numpy()
+            best_index = np.squeeze(best_index, -1).tolist()
+            return best_index
 
     def prepare_response(self, hyps, batch_count_list):
         """
@@ -223,17 +268,17 @@ class TritonPythonModel:
         ctc_log_probs = ctc_log_probs.cuda()
         if self.decoding_method == "tlg_mbr":
             total_hyps = self.decoder.decode_mbr(ctc_log_probs, encoder_out_len)
-            # list(str), list((float), list(int)) # TODO: add token_ids, time stamps
         elif self.decoding_method == "ctc_greedy_search":
             total_hyps = ctc_greedy_search(ctc_log_probs, encoder_out_len,
                                            self.vocabulary, self.blank_id, self.eos)
         elif self.decoding_method == "tlg":
-            nbest_hyps, nbest_ids = self.decoder.decode_nbest(encoder_out, encoder_out_len)
+            nbest_hyps, nbest_ids, max_hyp_len = self.decoder.decode_nbest(ctc_log_probs, encoder_out_len)  # noqa
            total_hyps = [nbest[0] for nbest in nbest_hyps]
 
        if self.decoding_method == "tlg" and self.rescore:
            assert self.beam_size > 1, "Beam size must be greater than 1 for rescoring"
            selected_ids = self.rescore_hyps(nbest_ids,
+                                             max_hyp_len,
                                             encoder_out,
                                             encoder_out_len)
            total_hyps = [nbest[i] for nbest, i in zip(nbest_hyps, selected_ids)]
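For intuition, here is a toy trace (hypothetical token ids and tiny sizes) of the hyps_pad_sos_eos / hyps_lens_sos layout that rescore_hyps builds from decode_nbest's output, where sos and eos are both vocab_size - 1:

```
import numpy as np

vocab_size, beam_size, max_hyp_len = 10, 2, 6
# decode_nbest wraps every hypothesis in vocab_size - 1 (here 9) as sos/eos.
total_tokens = [[[9, 3, 5, 9], [9, 3, 9]]]  # one utterance, two n-best hyps

hyps_pad_sos_eos = np.zeros([1, beam_size, max_hyp_len], dtype=np.int64)
hyps_lens_sos = np.zeros([1, beam_size], dtype=np.int32)
for i, hyps in enumerate(total_tokens):
    for j, hyp in enumerate(hyps):
        hyps_pad_sos_eos[i][j][:len(hyp)] = hyp
        hyps_lens_sos[i][j] = len(hyp) - 1  # sos + tokens, eos excluded

print(hyps_pad_sos_eos)  # [[[9 3 5 9 0 0], [9 3 9 0 0 0]]]
print(hyps_lens_sos)     # [[3 2]]
```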
model_repo_cuda_decoder/scoring/1/wfst_decoding_config.yaml CHANGED
@@ -3,8 +3,8 @@ n_input_per_chunk: 50
 default_beam: 17.0
 max_active: 7000
 determinize_lattice: True
-max_batch_size: 800
-num_channels: 800
+max_batch_size: 200
+num_channels: 400
 frame_shift_seconds: 0.04
 lm_scale: 5.0
 word_ins_penalty: 0.0
model_repo_cuda_decoder/scoring/config.pbtxt CHANGED
@@ -35,11 +35,11 @@ parameters [
   },
   {
     key: "decoding_method",
-    value: { string_value: "tlg"} # tlg, ctc_greedy_search, cpu_ctc_beam_search, cuda_ctc_beam_search
+    value: { string_value: "tlg_mbr"} # tlg, tlg_mbr, ctc_greedy_search, cpu_ctc_beam_search, cuda_ctc_beam_search
   },
   {
     key: "attention_rescoring",
-    value: { string_value: "0"}
+    value: { string_value: "1"}
   },
   {
     key: "bidecoder",
model_repo_cuda_decoder/scoring/config.pbtxt.template CHANGED
@@ -39,7 +39,7 @@ parameters [
   },
   {
     key: "attention_rescoring",
-    value: { string_value: "0"}
+    value: { string_value: "1"}
   },
   {
     key: "bidecoder",
run.sh CHANGED
@@ -3,7 +3,4 @@ export CUDA_VISIBLE_DEVICES="1"
 model_repo_path=model_repo_cuda_decoder
 tritonserver --model-repository $model_repo_path \
              --pinned-memory-pool-byte-size=512000000 \
-             --cuda-memory-pool-byte-size=0:1024000000 \
-             --http-port=18000 \
-             --metrics-port=18001 \
-             --grpc-port=18002
+             --cuda-memory-pool-byte-size=0:1024000000