File size: 3,131 Bytes
cd09ca8
553c308
 
 
 
 
cd09ca8
 
 
553c308
d49628e
553c308
 
 
 
cd09ca8
553c308
 
 
 
eb90129
 
 
 
 
 
 
 
 
553c308
 
cd09ca8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
553c308
 
 
 
 
eb90129
 
beabc59
 
 
 
 
eb90129
 
 
 
553c308
 
 
 
 
 
 
 
 
 
 
 
 
cd09ca8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
from gtts import gTTS
from io import BytesIO
import base64
from PIL import Image
import cv2
import numpy as np
import subprocess
from speech_recognition import AudioFile, Recognizer


def tts(text: str, language="ja", encode=False) -> object:
    """Synthesizes speech for a text with gTTS.
    Args:
        text (str): generated answer of bot
        language (str): language code passed to gTTS
        encode (bool): if True, return the audio as a base64 string
            instead of writing it to disk
    Returns:
        str: base64-encoded audio data when ``encode`` is True,
            otherwise the path of the written file ("temp.mp3")
    """
    speech = gTTS(text=text, lang=language, slow=False)
    if not encode:
        # File mode: always writes to the same path in the working directory.
        output_path = "temp.mp3"
        speech.save(output_path)
        return output_path
    # In-memory mode: collect the audio bytes and base64-encode them.
    buffer = BytesIO()
    speech.write_to_fp(buffer)
    return base64.b64encode(buffer.getvalue()).decode()


def stt(audio: object, language='ja') -> str:
    """Transcribes recorded speech to text.
    Args:
        audio: record of user speech, in a form accepted by ``AudioFile``
        language (str): language code passed to the recognizer
    Returns:
        text (str): recognized speech of user
    """
    recognizer = Recognizer()
    with AudioFile(audio) as source:
        # Load the whole recording into memory, then send it to
        # Google's speech-to-text API for transcription.
        recording = recognizer.record(source)
        return recognizer.recognize_google(recording, language=language)


def read_image_file(file) -> Image.Image:
    """Opens raw image bytes as a PIL image.
    Args:
        file: encoded image content as a bytes-like object
    Returns:
        Image.Image: image opened from an in-memory buffer
    """
    buffer = BytesIO(file)
    return Image.open(buffer)


def pil_to_base64(img, format="jpeg", encode=False):
    """Serializes an image to a base64 string or to a temp file.
    Args:
        img: image object exposing a PIL-style ``save`` method
        format (str): image format; also used as the temp file extension
        encode (bool): if True, return base64 text instead of a file path
    Returns:
        str: base64 (ASCII) encoding of the saved image when ``encode``
            is True, otherwise the path ``temp.<format>`` that was written
    """
    if not encode:
        # File mode: the format is not passed to save(), only the
        # file name carries it via the extension.
        out_path = f"temp.{format}"
        img.save(out_path)
        return out_path
    # In-memory mode: save into a buffer and base64-encode its contents.
    buffer = BytesIO()
    img.save(buffer, format)
    encoded = base64.b64encode(buffer.getvalue())
    return encoded.decode("ascii")


def base64_to_pil(img_str):
    """Decodes a base64 string into a PIL image.
    Args:
        img_str (str): base64 text, optionally prefixed with a
            ``...base64,`` header (as in a data URL)
    Returns:
        image opened from the decoded bytes
    """
    # When a "base64," header is present, keep only the part after the
    # first comma (and before any second comma).
    payload = img_str.split(",")[1] if "base64," in img_str else img_str
    decoded = base64.b64decode(payload)
    return Image.open(BytesIO(decoded))


def get_hist(image):
    """Computes a flattened, normalized 3-channel histogram of an image.
    Args:
        image: image convertible with ``np.array``; channels 0, 1 and 2
            are used, so a 3-channel image is presumably expected
    Returns:
        flattened histogram with 8 bins per channel (8 * 8 * 8 values),
            normalized with the default settings of ``cv2.normalize``
    """
    pixels = np.array(image)
    channels = [0, 1, 2]
    bins_per_channel = [8, 8, 8]
    value_ranges = [0, 256, 0, 256, 0, 256]
    histogram = cv2.calcHist([pixels], channels, None, bins_per_channel, value_ranges)
    # Normalize in place (the histogram is both source and destination).
    normalized = cv2.normalize(histogram, histogram)
    return normalized.flatten()


def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.ndarray:
    """Decodes an in-memory audio file to a mono float32 waveform via ffmpeg.

    The payload is piped to an ``ffmpeg`` subprocess, which mixes it down
    to one channel, resamples it to ``sampling_rate`` and writes raw
    little-endian 32-bit float samples to its stdout.
    Args:
        bpayload (bytes): content of the encoded audio file
        sampling_rate (int): target sampling rate passed to ffmpeg (``-ar``)
    Returns:
        np.ndarray: 1-D float32 array with the decoded samples
    Raises:
        ValueError: if the ``ffmpeg`` executable cannot be found, or if
            ffmpeg produced no samples (e.g. malformed or empty payload)
    """
    ffmpeg_command = [
        "ffmpeg",
        "-i",
        "pipe:0",  # read the input file from stdin
        "-ac",
        "1",  # single (mono) channel
        "-ar",
        f"{sampling_rate}",
        "-f",
        "f32le",  # raw little-endian float32 samples
        "-hide_banner",
        "-loglevel",
        "quiet",
        "pipe:1",  # write the decoded samples to stdout
    ]

    try:
        ffmpeg_process = subprocess.Popen(ffmpeg_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    except FileNotFoundError as err:
        raise ValueError("ffmpeg was not found but is required to load audio files from filename") from err

    # The context manager guarantees the pipes are closed and the process
    # is waited on even if communicate() raises.
    with ffmpeg_process:
        out_bytes, _ = ffmpeg_process.communicate(bpayload)

    audio = np.frombuffer(out_bytes, np.float32)
    if audio.size == 0:
        # ffmpeg runs with "-loglevel quiet", so a decoding failure is only
        # visible as empty output; fail loudly instead of returning an
        # empty waveform.
        raise ValueError("ffmpeg produced no audio data; the payload is empty, malformed or in an unsupported format")
    return audio