FaYo committed
Commit d8d694f · 1 Parent(s): 758f348
This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. utils/__init__.py +30 -0
  2. utils/__pycache__/__init__.cpython-310.pyc +0 -0
  3. utils/__pycache__/model_loader.cpython-310.pyc +0 -0
  4. utils/__pycache__/tools.cpython-310.pyc +0 -0
  5. utils/__pycache__/web_configs.cpython-310.pyc +0 -0
  6. utils/agent/__init__.py +0 -0
  7. utils/agent/delivery_time_query.py +301 -0
  8. utils/asr/__init__.py +0 -0
  9. utils/asr/__pycache__/__init__.cpython-310.pyc +0 -0
  10. utils/asr/__pycache__/asr_worker.cpython-310.pyc +0 -0
  11. utils/asr/asr_worker.py +54 -0
  12. utils/digital_human/__init__.py +6 -0
  13. utils/digital_human/__pycache__/__init__.cpython-310.pyc +0 -0
  14. utils/digital_human/__pycache__/digital_human_worker.cpython-310.pyc +0 -0
  15. utils/digital_human/__pycache__/realtime_inference.cpython-310.pyc +0 -0
  16. utils/digital_human/digital_human_worker.py +36 -0
  17. utils/digital_human/musetalk/models/__pycache__/unet.cpython-310.pyc +0 -0
  18. utils/digital_human/musetalk/models/__pycache__/vae.cpython-310.pyc +0 -0
  19. utils/digital_human/musetalk/models/unet.py +47 -0
  20. utils/digital_human/musetalk/models/vae.py +149 -0
  21. utils/digital_human/musetalk/utils/__init__.py +5 -0
  22. utils/digital_human/musetalk/utils/__pycache__/__init__.cpython-310.pyc +0 -0
  23. utils/digital_human/musetalk/utils/__pycache__/blending.cpython-310.pyc +0 -0
  24. utils/digital_human/musetalk/utils/__pycache__/preprocessing.cpython-310.pyc +0 -0
  25. utils/digital_human/musetalk/utils/__pycache__/utils.cpython-310.pyc +0 -0
  26. utils/digital_human/musetalk/utils/blending.py +110 -0
  27. utils/digital_human/musetalk/utils/dwpose/default_runtime.py +54 -0
  28. utils/digital_human/musetalk/utils/dwpose/rtmpose-l_8xb32-270e_coco-ubody-wholebody-384x288.py +257 -0
  29. utils/digital_human/musetalk/utils/face_detection/README.md +1 -0
  30. utils/digital_human/musetalk/utils/face_detection/__init__.py +7 -0
  31. utils/digital_human/musetalk/utils/face_detection/__pycache__/__init__.cpython-310.pyc +0 -0
  32. utils/digital_human/musetalk/utils/face_detection/__pycache__/api.cpython-310.pyc +0 -0
  33. utils/digital_human/musetalk/utils/face_detection/__pycache__/models.cpython-310.pyc +0 -0
  34. utils/digital_human/musetalk/utils/face_detection/__pycache__/utils.cpython-310.pyc +0 -0
  35. utils/digital_human/musetalk/utils/face_detection/api.py +240 -0
  36. utils/digital_human/musetalk/utils/face_detection/detection/__init__.py +1 -0
  37. utils/digital_human/musetalk/utils/face_detection/detection/__pycache__/__init__.cpython-310.pyc +0 -0
  38. utils/digital_human/musetalk/utils/face_detection/detection/__pycache__/core.cpython-310.pyc +0 -0
  39. utils/digital_human/musetalk/utils/face_detection/detection/core.py +130 -0
  40. utils/digital_human/musetalk/utils/face_detection/detection/sfd/__init__.py +1 -0
  41. utils/digital_human/musetalk/utils/face_detection/detection/sfd/__pycache__/__init__.cpython-310.pyc +0 -0
  42. utils/digital_human/musetalk/utils/face_detection/detection/sfd/__pycache__/bbox.cpython-310.pyc +0 -0
  43. utils/digital_human/musetalk/utils/face_detection/detection/sfd/__pycache__/detect.cpython-310.pyc +0 -0
  44. utils/digital_human/musetalk/utils/face_detection/detection/sfd/__pycache__/net_s3fd.cpython-310.pyc +0 -0
  45. utils/digital_human/musetalk/utils/face_detection/detection/sfd/__pycache__/sfd_detector.cpython-310.pyc +0 -0
  46. utils/digital_human/musetalk/utils/face_detection/detection/sfd/bbox.py +129 -0
  47. utils/digital_human/musetalk/utils/face_detection/detection/sfd/detect.py +114 -0
  48. utils/digital_human/musetalk/utils/face_detection/detection/sfd/net_s3fd.py +129 -0
  49. utils/digital_human/musetalk/utils/face_detection/detection/sfd/sfd_detector.py +59 -0
  50. utils/digital_human/musetalk/utils/face_detection/models.py +261 -0
utils/__init__.py ADDED
@@ -0,0 +1,30 @@
+ class HParams:  # Fix for gpt-sovits torch.load reporting a missing module
+     def __init__(self, **kwargs):
+         for k, v in kwargs.items():
+             if type(v) == dict:
+                 v = HParams(**v)
+             self[k] = v
+
+     def keys(self):
+         return self.__dict__.keys()
+
+     def items(self):
+         return self.__dict__.items()
+
+     def values(self):
+         return self.__dict__.values()
+
+     def __len__(self):
+         return len(self.__dict__)
+
+     def __getitem__(self, key):
+         return getattr(self, key)
+
+     def __setitem__(self, key, value):
+         return setattr(self, key, value)
+
+     def __contains__(self, key):
+         return key in self.__dict__
+
+     def __repr__(self):
+         return self.__dict__.__repr__()
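Note (not part of the commit): a minimal usage sketch of HParams; the values below are made up. Nested dicts become nested HParams objects, which is what GPT-SoVITS checkpoints expect to find when they are unpickled with torch.load:

    hps = HParams(train={"batch_size": 8}, data={"sampling_rate": 32000})
    print(hps.train.batch_size)       # 8, nested dicts become HParams
    print("data" in hps, len(hps))    # True 2
    print(hps["data"].sampling_rate)  # 32000, item access falls back to getattr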
utils/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (1.53 kB).
utils/__pycache__/model_loader.cpython-310.pyc ADDED
Binary file (997 Bytes).
utils/__pycache__/tools.cpython-310.pyc ADDED
Binary file (686 Bytes).
utils/__pycache__/web_configs.cpython-310.pyc ADDED
Binary file (2.7 kB).
utils/agent/__init__.py ADDED
File without changes
utils/agent/delivery_time_query.py ADDED
@@ -0,0 +1,301 @@
+ import base64
+ from datetime import datetime
+ import hashlib
+ import json
+ import os
+ from typing import Optional, Type
+
+ import jionlp as jio
+ import requests
+ from lagent.actions.base_action import BaseAction, tool_api
+ from lagent.actions.parser import BaseParser, JsonParser
+ from lagent.schema import ActionReturn, ActionStatusCode
+
+ from utils.web_configs import WEB_CONFIGS
+
+ '''
+ class DeliveryTimeQueryAction(BaseAction):
+     """快递时效查询插件,用于根据用户提出的收货地址查询到达期限"""
+
+     def __init__(
+         self,
+         # departure_place: str,
+         # delivery_company_name: str,
+         description: Optional[dict] = None,
+         parser: Type[BaseParser] = JsonParser,
+         enable: bool = True,
+     ) -> None:
+         super().__init__(description, parser, enable)
+         self.departure_place = departure_place  # departure (shipping) place
+
+         # weather query handlers
+         self.weather_query_handler = WeatherQuery(departure_place, WEB_CONFIGS.AGENT_WEATHER_API_KEY)
+         self.delivery_time_handler = DeliveryTimeQuery(delivery_company_name, WEB_CONFIGS.AGENT_DELIVERY_TIME_API_KEY)
+
+     @tool_api
+     def run(self, query: str) -> ActionReturn:
+         """一个到货时间查询API。可以根据城市名查询到货时间信息。
+
+         Args:
+             query (:class:`str`): 需要查询的城市名。
+         """
+
+         # Extract the destination city from the text; the departure place is configured on the backend
+         # Re-parse the city as a fallback in case the LLM mis-identified it
+         city_info = jio.parse_location(query, town_village=True)
+         city_name = city_info["city"]
+
+         # destination city code -> weather
+         destination_weather = self.weather_query_handler(city_name)
+
+         # departure city code -> weather
+         departure_weather = self.weather_query_handler(self.departure_place)
+
+         # estimated arrival time
+         delivery_time = self.delivery_time_handler(self.departure_place, city_name)
+
+         final_str = (
+             f"今天日期:{datetime.now().strftime('%m月%d日')}\n"
+             f"收货地天气:{destination_weather.result[0]['content']}\n"
+             f"发货地天气:{departure_weather.result[0]['content']}\n"
+             f"物流信息:{delivery_time.result[0]['content']}\n"
+             "回答突出“预计送达时间”和“收货地天气”,如果收货地或者发货地遇到暴雨暴雪等极端天气,须告知用户快递到达时间会有所增加。"
+         )
+
+         tool_return = ActionReturn(type=self.name)
+         tool_return.result = [dict(type="text", content=final_str)]
+         return tool_return
+ '''
+ '''
+ class WeatherQuery:
+     """快递时效查询插件,用于根据用户提出的收货地址查询到达期限"""
+
+     def __init__(
+         self,
+         departure_place: str,
+         api_key: Optional[str] = None,
+     ) -> None:
+         self.departure_place = departure_place  # departure (shipping) place
+
+         # weather query
+         # api_key = os.environ.get("WEATHER_API_KEY", key)
+         if api_key is None:
+             raise ValueError("Please set Weather API key either in the environment as WEATHER_API_KEY")
+         self.api_key = api_key
+         self.location_query_url = "https://geoapi.qweather.com/v2/city/lookup"
+         self.weather_query_url = "https://devapi.qweather.com/v7/weather/now"
+
+     def parse_results(self, city_name: str, results: dict) -> str:
+         """解析 API 返回的信息
+
+         Args:
+             results (dict): JSON 格式的 API 报文。
+
+         Returns:
+             str: 解析后的结果。
+         """
+         now = results["now"]
+         data = (
+             # f'数据观测时间: {now["obsTime"]};'
+             f"城市名: {city_name};"
+             f'温度: {now["temp"]}°C;'
+             f'体感温度: {now["feelsLike"]}°C;'
+             f'天气: {now["text"]};'
+             # f'风向: {now["windDir"]},角度为 {now["wind360"]}°;'
+             f'风力等级: {now["windScale"]},风速为 {now["windSpeed"]} km/h;'
+             f'相对湿度: {now["humidity"]};'
+             f'当前小时累计降水量: {now["precip"]} mm;'
+             # f'大气压强: {now["pressure"]} 百帕;'
+             f'能见度: {now["vis"]} km。'
+         )
+         return data
+
+     def __call__(self, query):
+         tool_return = ActionReturn()
+         status_code, response = self.search_weather_with_city(query)
+         if status_code == -1:
+             tool_return.errmsg = response
+             tool_return.state = ActionStatusCode.HTTP_ERROR
+         elif status_code == 200:
+             parsed_res = self.parse_results(query, response)
+             tool_return.result = [dict(type="text", content=str(parsed_res))]
+             tool_return.state = ActionStatusCode.SUCCESS
+         else:
+             tool_return.errmsg = str(status_code)
+             tool_return.state = ActionStatusCode.API_ERROR
+         return tool_return
+
+     def search_weather_with_city(self, query: str):
+         """根据城市名获取城市代号,然后进行天气查询
+
+         Args:
+             query (str): 城市名
+
+         Returns:
+             int: 天气接口调用状态码
+             dict: 天气接口返回信息
+         """
+
+         # get the city code
+         try:
+             city_code_response = requests.get(self.location_query_url, params={"key": self.api_key, "location": query})
+         except Exception as e:
+             return -1, str(e)
+
+         if city_code_response.status_code != 200:
+             return city_code_response.status_code, city_code_response.json()
+         city_code_response = city_code_response.json()
+         if len(city_code_response["location"]) == 0:
+             return -1, "未查询到城市"
+         city_code = city_code_response["location"][0]["id"]
+
+         # get the weather
+         try:
+             weather_response = requests.get(self.weather_query_url, params={"key": self.api_key, "location": city_code})
+         except Exception as e:
+             return -1, str(e)
+         return weather_response.status_code, weather_response.json()
+ '''
+
+ class DeliveryTimeQuery:
+     def __init__(
+         self,
+         delivery_company_name: Optional[str] = "中通",
+         api_key: Optional[str] = None,
+     ) -> None:
+
+         # delivery-time query
+         # api_key = os.environ.get("DELIVERY_TIME_API_KEY", key)
+         if api_key is None or "," not in api_key:
+             raise ValueError(
+                 'Please set Delivery time API key either in the environment as DELIVERY_TIME_API_KEY="${e_business_id},${api_key}"'
+             )
+         self.e_business_id = api_key.split(",")[0]
+         self.api_key = api_key.split(",")[1]
+         self.api_url = "http://api.kdniao.com/api/dist"  # KDNiao (快递鸟)
+         self.china_location = jio.china_location_loader()
+         # courier-company codes used by KDNiao
+         DELIVERY_COMPANY_MAP = {
+             "德邦": "DBL",
+             "邮政": "EMS",
+             "京东": "JD",
+             "极兔速递": "JTSD",
+             "顺丰": "SF",
+             "申通": "STO",
+             "韵达": "YD",
+             "圆通": "YTO",
+             "中通": "ZTO",
+         }
+         self.delivery_company_name = delivery_company_name
+         self.delivery_company_id = DELIVERY_COMPANY_MAP[delivery_company_name]
+
+     @staticmethod
+     def data_md5(n):
+         # MD5 hash
+         md5 = hashlib.md5()
+         md5.update(str(n).encode("utf-8"))
+         return md5.hexdigest()
+
+     def get_data_sign(self, n):
+         # request signature
+         md5Data = self.data_md5(json.dumps(n) + self.api_key)
+         res = str(base64.b64encode(md5Data.encode("utf-8")), "utf-8")
+         return res
+
+     def get_city_detail(self, name):
+         # For a city name, use the name of its first district
+         city_info = jio.parse_location(name, town_village=True)
+         # china_location = jio.china_location_loader()
+
+         county_name = ""
+         for i in self.china_location[city_info["province"]][city_info["city"]].keys():
+             if "区" == i[-1]:
+                 county_name = i
+                 break
+
+         return {
+             "province": city_info["province"],
+             "city": city_info["city"],
+             "county": county_name,
+         }
+
+     def get_params(self, send_city, receive_city):
+
+         # Look up the province and district names from the city names
+         send_city_info = self.get_city_detail(send_city)
+         receive_city_info = self.get_city_detail(receive_city)
+
+         # Estimated-delivery-time API docs: https://www.yuque.com/kdnjishuzhichi/dfcrg1/ynkmts0e5owsnpvu
+         # API request command
+         RequestType = "6004"
+         # Assemble application-level parameters
+         RequestData = {
+             "ShipperCode": self.delivery_company_id,
+             "ReceiveArea": receive_city_info["county"],
+             "ReceiveCity": receive_city_info["city"],
+             "ReceiveProvince": receive_city_info["province"],
+             "SendArea": send_city_info["county"],
+             "SendCity": send_city_info["city"],
+             "SendProvince": send_city_info["province"],
+         }
+         # Assemble system-level parameters
+         data = {
+             "RequestData": json.dumps(RequestData),
+             "RequestType": RequestType,
+             "EBusinessID": self.e_business_id,
+             "DataSign": self.get_data_sign(RequestData),
+             "DataType": 2,
+         }
+         return data
+
+     def parse_results(self, response):
+
+         # Example response:
+         # {
+         #     "EBusinessID" : "1000000",
+         #     "Data" : {
+         #         "DeliveryTime" : "06月15日下午可达",
+         #         "SendAddress" : null,
+         #         "ReceiveArea" : "芙蓉区",
+         #         "SendProvince" : "广东省",
+         #         "ReceiveProvince" : "湖南省",
+         #         "ShipperCode" : "DBL",
+         #         "Hour" : "52h",
+         #         "SendArea" : "白云区",
+         #         "ReceiveAddress" : null,
+         #         "SendCity" : "广州市",
+         #         "ReceiveCity" : "长沙市"
+         #     },
+         #     "ResultCode" : "100",
+         #     "Success" : true
+         # }
+
+         response = response["Data"]
+         data = (
+             f'发货地点: {response["SendProvince"]} {response["SendCity"]};'
+             f'收货地点: {response["ReceiveProvince"]} {response["ReceiveCity"]};'
+             f'预计送达时间: {response["DeliveryTime"]};'
+             f"快递公司: {self.delivery_company_name};"
+             f'预计时效: {response["Hour"]}。'
+         )
+         return data
+
+     def __call__(self, send_city, receive_city):
+         tool_return = ActionReturn()
+         try:
+             res = requests.post(self.api_url, self.get_params(send_city, receive_city))
+             status_code = res.status_code
+             response = res.json()
+         except Exception as e:
+             tool_return.errmsg = str(e)
+             tool_return.state = ActionStatusCode.API_ERROR
+             return tool_return
+
+         if status_code == 200:
+             parsed_res = self.parse_results(response)
+             tool_return.result = [dict(type="text", content=str(parsed_res))]
+             tool_return.state = ActionStatusCode.SUCCESS
+         else:
+             tool_return.errmsg = str(status_code)
+             tool_return.state = ActionStatusCode.API_ERROR
+         return tool_return
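Note (not part of the commit): a sketch of how DeliveryTimeQuery is intended to be called; the key and cities below are placeholders, and the KDNiao key must have the form "<EBusinessID>,<ApiKey>":

    querier = DeliveryTimeQuery(delivery_company_name="中通", api_key="1000000,your-kdniao-key")
    ret = querier("广州", "长沙")            # returns a lagent ActionReturn
    if ret.state == ActionStatusCode.SUCCESS:
        print(ret.result[0]["content"])      # 发货地点/收货地点/预计送达时间/快递公司/预计时效
    else:
        print(ret.errmsg)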
utils/asr/__init__.py ADDED
File without changes
utils/asr/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (142 Bytes).
utils/asr/__pycache__/asr_worker.cpython-310.pyc ADDED
Binary file (1.63 kB).
utils/asr/asr_worker.py ADDED
@@ -0,0 +1,54 @@
+ import datetime
+ from funasr import AutoModel
+ import streamlit as st
+ from utils.web_configs import WEB_CONFIGS
+ from modelscope import snapshot_download
+ from modelscope.utils.constant import Invoke, ThirdParty
+ from funasr.download.name_maps_from_hub import name_maps_ms as NAME_MAPS_MS
+
+
+ @st.cache_resource
+ def load_asr_model():
+
+     # Download the models
+     model_path_info = dict()
+     for model_name in ["paraformer-zh", "fsmn-vad", "ct-punc"]:
+         print(f"downloading asr model : {NAME_MAPS_MS[model_name]}")
+         mode_dir = snapshot_download(
+             NAME_MAPS_MS[model_name],
+             revision="master",
+             user_agent={Invoke.KEY: Invoke.PIPELINE, ThirdParty.KEY: "funasr"},
+             cache_dir=WEB_CONFIGS.ASR_MODEL_DIR,
+         )
+         model_path_info[model_name] = mode_dir
+         NAME_MAPS_MS[model_name] = mode_dir  # update the map to the local path
+
+     print(f"ASR model path info = {model_path_info}")
+     # paraformer-zh is a multi-functional asr model
+     # use vad, punc, spk or not as you need
+     model = AutoModel(
+         model="paraformer-zh",  # speech recognition with timestamps, offline
+         vad_model="fsmn-vad",  # voice activity detection, streaming
+         punc_model="ct-punc",  # punctuation restoration
+         # spk_model="cam++"  # speaker verification / diarization
+         model_path=model_path_info["paraformer-zh"],
+         vad_kwargs={"model_path": model_path_info["fsmn-vad"]},
+         punc_kwargs={"model_path": model_path_info["ct-punc"]},
+     )
+     return model
+
+
+ def process_asr(model: AutoModel, wav_path):
+     # https://github.com/modelscope/FunASR/blob/main/README_zh.md#%E5%AE%9E%E6%97%B6%E8%AF%AD%E9%9F%B3%E8%AF%86%E5%88%AB
+     f_start_time = datetime.datetime.now()
+     res = model.generate(input=wav_path, batch_size_s=50, hotword="魔搭")
+     delta_time = datetime.datetime.now() - f_start_time
+
+     try:
+         print(f"ASR using time {delta_time}s, text: ", res[0]["text"])
+         res_str = res[0]["text"]
+     except Exception as e:
+         print("ASR 解析失败,无法获取到文字")
+         return ""
+
+     return res_str
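Note (not part of the commit): the expected call pattern for the two helpers above, assuming WEB_CONFIGS.ASR_MODEL_DIR is writable and the WAV path below (hypothetical) exists:

    asr_model = load_asr_model()                          # downloads paraformer-zh / fsmn-vad / ct-punc on first use
    text = process_asr(asr_model, "work_dirs/query.wav")  # hypothetical recording path
    print(text)                                           # empty string if recognition failed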
utils/digital_human/__init__.py ADDED
@@ -0,0 +1,6 @@
+ from torch import hub
+ from utils.web_configs import WEB_CONFIGS
+ from pathlib import Path
+
+ # Some models are fetched via torch hub download, so the cache directory has to be set
+ hub.set_dir(str(Path(WEB_CONFIGS.DIGITAL_HUMAN_MODEL_DIR).joinpath("face-alignment")))
utils/digital_human/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (367 Bytes).
utils/digital_human/__pycache__/digital_human_worker.cpython-310.pyc ADDED
Binary file (1.57 kB).
utils/digital_human/__pycache__/realtime_inference.cpython-310.pyc ADDED
Binary file (13.5 kB).
utils/digital_human/digital_human_worker.py ADDED
@@ -0,0 +1,36 @@
+ from pathlib import Path
+ import streamlit as st
+ from utils.digital_human.realtime_inference import gen_digital_human_video
+ from utils.model_loader import DIGITAL_HUMAN_HANDLER
+ from utils.web_configs import WEB_CONFIGS
+
+ def show_video(video_path, autoplay=True, loop=False, muted=False):
+     # The video needs to be fp25 (25 fps) to display correctly
+     with open(video_path, "rb") as f_wav:
+         video_bytes = f_wav.read()
+
+     print(f"Show video: {video_path}")
+     st.video(video_bytes, format="video/mp4", autoplay=autoplay, loop=loop, muted=muted)
+
+
+ def gen_digital_human_video_in_spinner(audio_path):
+     save_path = None
+     if st.session_state.gen_digital_human_checkbox and DIGITAL_HUMAN_HANDLER is not None:
+         with st.spinner(
+             "正在生成数字人,请稍等... 如果觉得生成时间太久,可以将侧边栏的【生成数字人】按钮取消选中,下次则不会生成"
+         ):
+             # save_tag = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + ".wav"
+
+             st.session_state.digital_human_video_path = gen_digital_human_video(
+                 DIGITAL_HUMAN_HANDLER,
+                 audio_path,
+                 work_dir=str(Path(WEB_CONFIGS.DIGITAL_HUMAN_GEN_PATH).absolute()),
+                 video_path=st.session_state.digital_human_video_path,
+                 fps=DIGITAL_HUMAN_HANDLER.model_handler.fps,
+             )
+
+             st.session_state.video_placeholder.empty()  # clear the placeholder
+             with st.session_state.video_placeholder.container():
+                 show_video(st.session_state.digital_human_video_path)
+             st.toast("生成数字人视频成功!")
+     return save_path
utils/digital_human/musetalk/models/__pycache__/unet.cpython-310.pyc ADDED
Binary file (2.01 kB).
utils/digital_human/musetalk/models/__pycache__/vae.cpython-310.pyc ADDED
Binary file (4.84 kB).
utils/digital_human/musetalk/models/unet.py ADDED
@@ -0,0 +1,47 @@
+ import torch
+ import torch.nn as nn
+ import math
+ import json
+
+ from diffusers import UNet2DConditionModel
+ import sys
+ import time
+ import numpy as np
+ import os
+
+ class PositionalEncoding(nn.Module):
+     def __init__(self, d_model=384, max_len=5000):
+         super(PositionalEncoding, self).__init__()
+         pe = torch.zeros(max_len, d_model)
+         position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
+         div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
+         pe[:, 0::2] = torch.sin(position * div_term)
+         pe[:, 1::2] = torch.cos(position * div_term)
+         pe = pe.unsqueeze(0)
+         self.register_buffer('pe', pe)
+
+     def forward(self, x):
+         b, seq_len, d_model = x.size()
+         pe = self.pe[:, :seq_len, :]
+         x = x + pe.to(x.device)
+         return x
+
+ class UNet():
+     def __init__(self,
+                  unet_config,
+                  model_path,
+                  use_float16=False,
+                  ):
+         with open(unet_config, 'r') as f:
+             unet_config = json.load(f)
+         self.model = UNet2DConditionModel(**unet_config)
+         self.pe = PositionalEncoding(d_model=384)
+         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         weights = torch.load(model_path) if torch.cuda.is_available() else torch.load(model_path, map_location=self.device)
+         self.model.load_state_dict(weights)
+         if use_float16:
+             self.model = self.model.half()
+         self.model.to(self.device)
+
+ if __name__ == "__main__":
+     unet = UNet()
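Note (not part of the commit): PositionalEncoding precomputes the standard sinusoidal table (sin on even feature indices, cos on odd ones) and adds it to the per-frame audio features; a minimal shape check:

    pe = PositionalEncoding(d_model=384, max_len=5000)
    feats = torch.randn(2, 50, 384)   # (batch, sequence, feature)
    out = pe(feats)
    print(out.shape)                  # torch.Size([2, 50, 384]); values shifted by the fixed encoding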
utils/digital_human/musetalk/models/vae.py ADDED
@@ -0,0 +1,149 @@
+ import os
+
+ import cv2
+ import numpy as np
+ import torch
+ import torch.nn.functional as F
+ import torchvision.transforms as transforms
+ from diffusers import AutoencoderKL
+
+
+ class VAE():
+     """
+     VAE (Variational Autoencoder) class for image processing.
+     """
+
+     def __init__(self, model_path="./models/sd-vae-ft-mse/", resized_img=256, use_float16=False):
+         """
+         Initialize the VAE instance.
+
+         :param model_path: Path to the trained model.
+         :param resized_img: The size to which images are resized.
+         :param use_float16: Whether to use float16 precision.
+         """
+         self.model_path = model_path
+         self.vae = AutoencoderKL.from_pretrained(self.model_path)
+
+         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         self.vae.to(self.device)
+
+         if use_float16:
+             self.vae = self.vae.half()
+             self._use_float16 = True
+         else:
+             self._use_float16 = False
+
+         self.scaling_factor = self.vae.config.scaling_factor
+         self.transform = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
+         self._resized_img = resized_img
+         self._mask_tensor = self.get_mask_tensor()
+
+     def get_mask_tensor(self):
+         """
+         Creates a mask tensor for image processing.
+         :return: A mask tensor.
+         """
+         mask_tensor = torch.zeros((self._resized_img, self._resized_img))
+         mask_tensor[:self._resized_img//2, :] = 1
+         mask_tensor[mask_tensor < 0.5] = 0
+         mask_tensor[mask_tensor >= 0.5] = 1
+         return mask_tensor
+
+     def preprocess_img(self, img_name, half_mask=False):
+         """
+         Preprocess an image for the VAE.
+
+         :param img_name: The image file path or a list of image file paths.
+         :param half_mask: Whether to apply a half mask to the image.
+         :return: A preprocessed image tensor.
+         """
+         window = []
+         if isinstance(img_name, str):
+             window_fnames = [img_name]
+             for fname in window_fnames:
+                 img = cv2.imread(fname)
+                 img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+                 img = cv2.resize(img, (self._resized_img, self._resized_img),
+                                  interpolation=cv2.INTER_LANCZOS4)
+                 window.append(img)
+         else:
+             img = cv2.cvtColor(img_name, cv2.COLOR_BGR2RGB)
+             window.append(img)
+
+         x = np.asarray(window) / 255.
+         x = np.transpose(x, (3, 0, 1, 2))
+         x = torch.squeeze(torch.FloatTensor(x))
+         if half_mask:
+             x = x * (self._mask_tensor > 0.5)
+         x = self.transform(x)
+
+         x = x.unsqueeze(0)  # [1, 3, 256, 256] torch tensor
+         x = x.to(self.vae.device)
+
+         return x
+
+     def encode_latents(self, image):
+         """
+         Encode an image into latent variables.
+
+         :param image: The image tensor to encode.
+         :return: The encoded latent variables.
+         """
+         with torch.no_grad():
+             init_latent_dist = self.vae.encode(image.to(self.vae.dtype)).latent_dist
+         init_latents = self.scaling_factor * init_latent_dist.sample()
+         return init_latents
+
+     def decode_latents(self, latents):
+         """
+         Decode latent variables back into an image.
+         :param latents: The latent variables to decode.
+         :return: A NumPy array representing the decoded image.
+         """
+         latents = (1 / self.scaling_factor) * latents
+         image = self.vae.decode(latents.to(self.vae.dtype)).sample
+         image = (image / 2 + 0.5).clamp(0, 1)
+         image = image.detach().cpu().permute(0, 2, 3, 1).float().numpy()
+         image = (image * 255).round().astype("uint8")
+         image = image[..., ::-1]  # RGB to BGR
+         return image
+
+     def get_latents_for_unet(self, img):
+         """
+         Prepare latent variables for a U-Net model.
+         :param img: The image to process.
+         :return: A concatenated tensor of latents for U-Net input.
+         """
+
+         ref_image = self.preprocess_img(img, half_mask=True)  # [1, 3, 256, 256] RGB, torch tensor
+         masked_latents = self.encode_latents(ref_image)  # [1, 4, 32, 32], torch tensor
+         ref_image = self.preprocess_img(img, half_mask=False)  # [1, 3, 256, 256] RGB, torch tensor
+         ref_latents = self.encode_latents(ref_image)  # [1, 4, 32, 32], torch tensor
+         latent_model_input = torch.cat([masked_latents, ref_latents], dim=1)
+         return latent_model_input
+
+ if __name__ == "__main__":
+     vae_mode_path = "./models/sd-vae-ft-mse/"
+     vae = VAE(model_path=vae_mode_path, use_float16=False)
+     img_path = "./results/sun001_crop/00000.png"
+
+     crop_imgs_path = "./results/sun001_crop/"
+     latents_out_path = "./results/latents/"
+     if not os.path.exists(latents_out_path):
+         os.mkdir(latents_out_path)
+
+     files = os.listdir(crop_imgs_path)
+     files.sort()
+     files = [file for file in files if file.split(".")[-1] == "png"]
+
+     for file in files:
+         index = file.split(".")[0]
+         img_path = crop_imgs_path + file
+         latents = vae.get_latents_for_unet(img_path)
+         print(img_path, "latents", latents.size())
+         # torch.save(latents, os.path.join(latents_out_path, index + ".pt"))
+         # reload_tensor = torch.load('tensor.pt')
+         # print(reload_tensor.size())
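Note (not part of the commit): get_latents_for_unet encodes the half-masked crop and the unmasked reference crop and concatenates them along the channel axis, which gives the 8-channel latent input the MuseTalk U-Net consumes; a sketch assuming the sd-vae-ft-mse weights and a 256x256 face crop are available locally:

    vae = VAE(model_path="./models/sd-vae-ft-mse/", use_float16=False)
    latents = vae.get_latents_for_unet("./results/sun001_crop/00000.png")
    print(latents.shape)   # torch.Size([1, 8, 32, 32]) = masked latents + reference latents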
utils/digital_human/musetalk/utils/__init__.py ADDED
@@ -0,0 +1,5 @@
+ import sys
+ from os.path import abspath, dirname
+ current_dir = dirname(abspath(__file__))
+ parent_dir = dirname(current_dir)
+ sys.path.append(parent_dir+'/utils')
utils/digital_human/musetalk/utils/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (338 Bytes).
utils/digital_human/musetalk/utils/__pycache__/blending.cpython-310.pyc ADDED
Binary file (3.12 kB).
utils/digital_human/musetalk/utils/__pycache__/preprocessing.cpython-310.pyc ADDED
Binary file (4.48 kB).
utils/digital_human/musetalk/utils/__pycache__/utils.cpython-310.pyc ADDED
Binary file (1.84 kB).
utils/digital_human/musetalk/utils/blending.py ADDED
@@ -0,0 +1,110 @@
+ from PIL import Image
+ import numpy as np
+ import cv2
+ from face_parsing import FaceParsing
+
+
+ def init_face_parsing_model(
+     resnet_path="./models/face-parse-bisent/resnet18-5c106cde.pth", face_model_pth="./models/face-parse-bisent/79999_iter.pth"
+ ):
+     fp_model = FaceParsing(resnet_path, face_model_pth)
+     return fp_model
+
+
+ def get_crop_box(box, expand):
+     x, y, x1, y1 = box
+     x_c, y_c = (x + x1) // 2, (y + y1) // 2
+     w, h = x1 - x, y1 - y
+     s = int(max(w, h) // 2 * expand)
+     crop_box = [x_c - s, y_c - s, x_c + s, y_c + s]
+     return crop_box, s
+
+
+ def face_seg(image, fp_model):
+     seg_image = fp_model(image)
+     if seg_image is None:
+         print("error, no person_segment")
+         return None
+
+     seg_image = seg_image.resize(image.size)
+     return seg_image
+
+
+ def get_image(image, face, face_box, fp_model, upper_boundary_ratio=0.5, expand=1.2):
+     # print(image.shape)
+     # print(face.shape)
+
+     body = Image.fromarray(image[:, :, ::-1])
+     face = Image.fromarray(face[:, :, ::-1])
+
+     x, y, x1, y1 = face_box
+     # print(x1-x,y1-y)
+     crop_box, s = get_crop_box(face_box, expand)
+     x_s, y_s, x_e, y_e = crop_box
+     face_position = (x, y)
+
+     face_large = body.crop(crop_box)
+     ori_shape = face_large.size
+
+     mask_image = face_seg(face_large, fp_model)
+     mask_small = mask_image.crop((x - x_s, y - y_s, x1 - x_s, y1 - y_s))
+     mask_image = Image.new("L", ori_shape, 0)
+     mask_image.paste(mask_small, (x - x_s, y - y_s, x1 - x_s, y1 - y_s))
+
+     # keep upper_boundary_ratio of talking area
+     width, height = mask_image.size
+     top_boundary = int(height * upper_boundary_ratio)
+     modified_mask_image = Image.new("L", ori_shape, 0)
+     modified_mask_image.paste(mask_image.crop((0, top_boundary, width, height)), (0, top_boundary))
+
+     blur_kernel_size = int(0.1 * ori_shape[0] // 2 * 2) + 1
+     mask_array = cv2.GaussianBlur(np.array(modified_mask_image), (blur_kernel_size, blur_kernel_size), 0)
+     mask_image = Image.fromarray(mask_array)
+
+     face_large.paste(face, (x - x_s, y - y_s, x1 - x_s, y1 - y_s))
+     body.paste(face_large, crop_box[:2], mask_image)
+     body = np.array(body)
+     return body[:, :, ::-1]
+
+
+ def get_image_prepare_material(image, face_box, fp_model, upper_boundary_ratio=0.5, expand=1.2):
+     body = Image.fromarray(image[:, :, ::-1])
+
+     x, y, x1, y1 = face_box
+     # print(x1-x,y1-y)
+     crop_box, s = get_crop_box(face_box, expand)
+     x_s, y_s, x_e, y_e = crop_box
+
+     face_large = body.crop(crop_box)
+     ori_shape = face_large.size
+
+     mask_image = face_seg(face_large, fp_model)
+     mask_small = mask_image.crop((x - x_s, y - y_s, x1 - x_s, y1 - y_s))
+     mask_image = Image.new("L", ori_shape, 0)
+     mask_image.paste(mask_small, (x - x_s, y - y_s, x1 - x_s, y1 - y_s))
+
+     # keep upper_boundary_ratio of talking area
+     width, height = mask_image.size
+     top_boundary = int(height * upper_boundary_ratio)
+     modified_mask_image = Image.new("L", ori_shape, 0)
+     modified_mask_image.paste(mask_image.crop((0, top_boundary, width, height)), (0, top_boundary))
+
+     blur_kernel_size = int(0.1 * ori_shape[0] // 2 * 2) + 1
+     mask_array = cv2.GaussianBlur(np.array(modified_mask_image), (blur_kernel_size, blur_kernel_size), 0)
+     return mask_array, crop_box
+
+
+ def get_image_blending(image, face, face_box, mask_array, crop_box):
+     body = Image.fromarray(image[:, :, ::-1])
+     face = Image.fromarray(face[:, :, ::-1])
+
+     x, y, x1, y1 = face_box
+     x_s, y_s, x_e, y_e = crop_box
+     face_large = body.crop(crop_box)
+
+     mask_image = Image.fromarray(mask_array)
+     mask_image = mask_image.convert("L")
+     face_large.paste(face, (x - x_s, y - y_s, x1 - x_s, y1 - y_s))
+     body.paste(face_large, crop_box[:2], mask_image)
+     body = np.array(body)
+     return body[:, :, ::-1]
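Note (not part of the commit): get_image_prepare_material computes the blurred lower-face mask once per frame, and get_image_blending reuses it for every generated mouth region; a sketch with hypothetical inputs:

    fp_model = init_face_parsing_model()        # assumes the face-parse-bisent weights exist locally
    frame = cv2.imread("frame_0000.png")        # hypothetical full BGR frame
    face_box = (220, 180, 420, 400)             # x, y, x1, y1 from the landmark stage
    mask, crop_box = get_image_prepare_material(frame, face_box, fp_model)
    # gen_face is the generated mouth crop, already resized to (x1 - x, y1 - y)
    blended = get_image_blending(frame, gen_face, face_box, mask, crop_box)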
utils/digital_human/musetalk/utils/dwpose/default_runtime.py ADDED
@@ -0,0 +1,54 @@
+ default_scope = 'mmpose'
+
+ # hooks
+ default_hooks = dict(
+     timer=dict(type='IterTimerHook'),
+     logger=dict(type='LoggerHook', interval=50),
+     param_scheduler=dict(type='ParamSchedulerHook'),
+     checkpoint=dict(type='CheckpointHook', interval=10),
+     sampler_seed=dict(type='DistSamplerSeedHook'),
+     visualization=dict(type='PoseVisualizationHook', enable=False),
+     badcase=dict(
+         type='BadCaseAnalysisHook',
+         enable=False,
+         out_dir='badcase',
+         metric_type='loss',
+         badcase_thr=5))
+
+ # custom hooks
+ custom_hooks = [
+     # Synchronize model buffers such as running_mean and running_var in BN
+     # at the end of each epoch
+     dict(type='SyncBuffersHook')
+ ]
+
+ # multi-processing backend
+ env_cfg = dict(
+     cudnn_benchmark=False,
+     mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
+     dist_cfg=dict(backend='nccl'),
+ )
+
+ # visualizer
+ vis_backends = [
+     dict(type='LocalVisBackend'),
+     # dict(type='TensorboardVisBackend'),
+     # dict(type='WandbVisBackend'),
+ ]
+ visualizer = dict(
+     type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+
+ # logger
+ log_processor = dict(
+     type='LogProcessor', window_size=50, by_epoch=True, num_digits=6)
+ log_level = 'INFO'
+ load_from = None
+ resume = False
+
+ # file I/O backend
+ backend_args = dict(backend='local')
+
+ # training/validation/testing progress
+ train_cfg = dict(by_epoch=True)
+ val_cfg = dict()
+ test_cfg = dict()
utils/digital_human/musetalk/utils/dwpose/rtmpose-l_8xb32-270e_coco-ubody-wholebody-384x288.py ADDED
@@ -0,0 +1,257 @@
+ #_base_ = ['../../../_base_/default_runtime.py']
+ _base_ = ['default_runtime.py']
+
+ # runtime
+ max_epochs = 270
+ stage2_num_epochs = 30
+ base_lr = 4e-3
+ train_batch_size = 8
+ val_batch_size = 8
+
+ train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+ randomness = dict(seed=21)
+
+ # optimizer
+ optim_wrapper = dict(
+     type='OptimWrapper',
+     optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
+     paramwise_cfg=dict(
+         norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+ # learning rate
+ param_scheduler = [
+     dict(
+         type='LinearLR',
+         start_factor=1.0e-5,
+         by_epoch=False,
+         begin=0,
+         end=1000),
+     dict(
+         # use cosine lr from 150 to 300 epoch
+         type='CosineAnnealingLR',
+         eta_min=base_lr * 0.05,
+         begin=max_epochs // 2,
+         end=max_epochs,
+         T_max=max_epochs // 2,
+         by_epoch=True,
+         convert_to_iter_based=True),
+ ]
+
+ # automatically scaling LR based on the actual training batch size
+ auto_scale_lr = dict(base_batch_size=512)
+
+ # codec settings
+ codec = dict(
+     type='SimCCLabel',
+     input_size=(288, 384),
+     sigma=(6., 6.93),
+     simcc_split_ratio=2.0,
+     normalize=False,
+     use_dark=False)
+
+ # model settings
+ model = dict(
+     type='TopdownPoseEstimator',
+     data_preprocessor=dict(
+         type='PoseDataPreprocessor',
+         mean=[123.675, 116.28, 103.53],
+         std=[58.395, 57.12, 57.375],
+         bgr_to_rgb=True),
+     backbone=dict(
+         _scope_='mmdet',
+         type='CSPNeXt',
+         arch='P5',
+         expand_ratio=0.5,
+         deepen_factor=1.,
+         widen_factor=1.,
+         out_indices=(4, ),
+         channel_attention=True,
+         norm_cfg=dict(type='SyncBN'),
+         act_cfg=dict(type='SiLU'),
+         init_cfg=dict(
+             type='Pretrained',
+             prefix='backbone.',
+             checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
+             'rtmpose/cspnext-l_udp-aic-coco_210e-256x192-273b7631_20230130.pth'  # noqa: E501
+         )),
+     head=dict(
+         type='RTMCCHead',
+         in_channels=1024,
+         out_channels=133,
+         input_size=codec['input_size'],
+         in_featuremap_size=(9, 12),
+         simcc_split_ratio=codec['simcc_split_ratio'],
+         final_layer_kernel_size=7,
+         gau_cfg=dict(
+             hidden_dims=256,
+             s=128,
+             expansion_factor=2,
+             dropout_rate=0.,
+             drop_path=0.,
+             act_fn='SiLU',
+             use_rel_bias=False,
+             pos_enc=False),
+         loss=dict(
+             type='KLDiscretLoss',
+             use_target_weight=True,
+             beta=10.,
+             label_softmax=True),
+         decoder=codec),
+     test_cfg=dict(flip_test=True, ))
+
+ # base dataset settings
+ dataset_type = 'UBody2dDataset'
+ data_mode = 'topdown'
+ data_root = 'data/UBody/'
+
+ backend_args = dict(backend='local')
+
+ scenes = [
+     'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow',
+     'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing',
+     'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference'
+ ]
+
+ train_datasets = [
+     dict(
+         type='CocoWholeBodyDataset',
+         data_root='data/coco/',
+         data_mode=data_mode,
+         ann_file='annotations/coco_wholebody_train_v1.0.json',
+         data_prefix=dict(img='train2017/'),
+         pipeline=[])
+ ]
+
+ for scene in scenes:
+     train_dataset = dict(
+         type=dataset_type,
+         data_root=data_root,
+         data_mode=data_mode,
+         ann_file=f'annotations/{scene}/train_annotations.json',
+         data_prefix=dict(img='images/'),
+         pipeline=[],
+         sample_interval=10)
+     train_datasets.append(train_dataset)
+
+ # pipelines
+ train_pipeline = [
+     dict(type='LoadImage', backend_args=backend_args),
+     dict(type='GetBBoxCenterScale'),
+     dict(type='RandomFlip', direction='horizontal'),
+     dict(type='RandomHalfBody'),
+     dict(
+         type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90),
+     dict(type='TopdownAffine', input_size=codec['input_size']),
+     dict(type='mmdet.YOLOXHSVRandomAug'),
+     dict(
+         type='Albumentation',
+         transforms=[
+             dict(type='Blur', p=0.1),
+             dict(type='MedianBlur', p=0.1),
+             dict(
+                 type='CoarseDropout',
+                 max_holes=1,
+                 max_height=0.4,
+                 max_width=0.4,
+                 min_holes=1,
+                 min_height=0.2,
+                 min_width=0.2,
+                 p=1.0),
+         ]),
+     dict(type='GenerateTarget', encoder=codec),
+     dict(type='PackPoseInputs')
+ ]
+ val_pipeline = [
+     dict(type='LoadImage', backend_args=backend_args),
+     dict(type='GetBBoxCenterScale'),
+     dict(type='TopdownAffine', input_size=codec['input_size']),
+     dict(type='PackPoseInputs')
+ ]
+
+ train_pipeline_stage2 = [
+     dict(type='LoadImage', backend_args=backend_args),
+     dict(type='GetBBoxCenterScale'),
+     dict(type='RandomFlip', direction='horizontal'),
+     dict(type='RandomHalfBody'),
+     dict(
+         type='RandomBBoxTransform',
+         shift_factor=0.,
+         scale_factor=[0.5, 1.5],
+         rotate_factor=90),
+     dict(type='TopdownAffine', input_size=codec['input_size']),
+     dict(type='mmdet.YOLOXHSVRandomAug'),
+     dict(
+         type='Albumentation',
+         transforms=[
+             dict(type='Blur', p=0.1),
+             dict(type='MedianBlur', p=0.1),
+             dict(
+                 type='CoarseDropout',
+                 max_holes=1,
+                 max_height=0.4,
+                 max_width=0.4,
+                 min_holes=1,
+                 min_height=0.2,
+                 min_width=0.2,
+                 p=0.5),
+         ]),
+     dict(type='GenerateTarget', encoder=codec),
+     dict(type='PackPoseInputs')
+ ]
+
+ # data loaders
+ train_dataloader = dict(
+     batch_size=train_batch_size,
+     num_workers=10,
+     persistent_workers=True,
+     sampler=dict(type='DefaultSampler', shuffle=True),
+     dataset=dict(
+         type='CombinedDataset',
+         metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
+         datasets=train_datasets,
+         pipeline=train_pipeline,
+         test_mode=False,
+     ))
+
+ val_dataloader = dict(
+     batch_size=val_batch_size,
+     num_workers=10,
+     persistent_workers=True,
+     drop_last=False,
+     sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+     dataset=dict(
+         type='CocoWholeBodyDataset',
+         data_root=data_root,
+         data_mode=data_mode,
+         ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json',
+         bbox_file='data/coco/person_detection_results/'
+         'COCO_val2017_detections_AP_H_56_person.json',
+         data_prefix=dict(img='coco/val2017/'),
+         test_mode=True,
+         pipeline=val_pipeline,
+     ))
+ test_dataloader = val_dataloader
+
+ # hooks
+ default_hooks = dict(
+     checkpoint=dict(
+         save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1))
+
+ custom_hooks = [
+     dict(
+         type='EMAHook',
+         ema_type='ExpMomentumEMA',
+         momentum=0.0002,
+         update_buffers=True,
+         priority=49),
+     dict(
+         type='mmdet.PipelineSwitchHook',
+         switch_epoch=max_epochs - stage2_num_epochs,
+         switch_pipeline=train_pipeline_stage2)
+ ]
+
+ # evaluators
+ val_evaluator = dict(
+     type='CocoWholeBodyMetric',
+     ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json')
+ test_evaluator = val_evaluator
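Note (not part of the commit): this config is the one the DWPose-style preprocessing loads through mmpose; a sketch, assuming mmpose 1.x is installed and a matching RTMPose-l whole-body checkpoint has been downloaded to the (hypothetical) path below:

    from mmpose.apis import init_model, inference_topdown
    cfg = "utils/digital_human/musetalk/utils/dwpose/rtmpose-l_8xb32-270e_coco-ubody-wholebody-384x288.py"
    ckpt = "models/dwpose/dw-ll_ucoco_384.pth"   # hypothetical local checkpoint path
    pose_model = init_model(cfg, ckpt, device="cuda")
    results = inference_topdown(pose_model, "frame_0000.png")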
utils/digital_human/musetalk/utils/face_detection/README.md ADDED
@@ -0,0 +1 @@
+ The code for Face Detection in this folder has been taken from the wonderful [face_alignment](https://github.com/1adrianb/face-alignment) repository. This has been modified to take batches of faces at a time.
utils/digital_human/musetalk/utils/face_detection/__init__.py ADDED
@@ -0,0 +1,7 @@
+ # -*- coding: utf-8 -*-
+
+ __author__ = """Adrian Bulat"""
+ __email__ = '[email protected]'
+ __version__ = '1.0.1'
+
+ from .api import FaceAlignment, LandmarksType, NetworkSize, YOLOv8_face
utils/digital_human/musetalk/utils/face_detection/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (397 Bytes).
utils/digital_human/musetalk/utils/face_detection/__pycache__/api.cpython-310.pyc ADDED
Binary file (8.29 kB).
utils/digital_human/musetalk/utils/face_detection/__pycache__/models.cpython-310.pyc ADDED
Binary file (7.17 kB).
utils/digital_human/musetalk/utils/face_detection/__pycache__/utils.cpython-310.pyc ADDED
Binary file (9.93 kB).
utils/digital_human/musetalk/utils/face_detection/api.py ADDED
@@ -0,0 +1,240 @@
+ from __future__ import print_function
+ import os
+ import torch
+ from torch.utils.model_zoo import load_url
+ from enum import Enum
+ import numpy as np
+ import cv2
+ try:
+     import urllib.request as request_file
+ except BaseException:
+     import urllib as request_file
+
+ from .models import FAN, ResNetDepth
+ from .utils import *
+
+
+ class LandmarksType(Enum):
+     """Enum class defining the type of landmarks to detect.
+
+     ``_2D`` - the detected points ``(x,y)`` are detected in a 2D space and follow the visible contour of the face
+     ``_2halfD`` - this points represent the projection of the 3D points into 3D
+     ``_3D`` - detect the points ``(x,y,z)``` in a 3D space
+
+     """
+     _2D = 1
+     _2halfD = 2
+     _3D = 3
+
+
+ class NetworkSize(Enum):
+     # TINY = 1
+     # SMALL = 2
+     # MEDIUM = 3
+     LARGE = 4
+
+     def __new__(cls, value):
+         member = object.__new__(cls)
+         member._value_ = value
+         return member
+
+     def __int__(self):
+         return self.value
+
+
+
+ class FaceAlignment:
+     def __init__(self, landmarks_type, network_size=NetworkSize.LARGE,
+                  device='cuda', flip_input=False, face_detector='sfd', verbose=False):
+         self.device = device
+         self.flip_input = flip_input
+         self.landmarks_type = landmarks_type
+         self.verbose = verbose
+
+         network_size = int(network_size)
+
+         if 'cuda' in device:
+             torch.backends.cudnn.benchmark = True
+             # torch.backends.cuda.matmul.allow_tf32 = False
+             # torch.backends.cudnn.benchmark = True
+             # torch.backends.cudnn.deterministic = False
+             # torch.backends.cudnn.allow_tf32 = True
+             # print('cuda start')
+
+
+         # Get the face detector
+         face_detector_module = __import__('face_detection.detection.' + face_detector,
+                                           globals(), locals(), [face_detector], 0)
+
+         self.face_detector = face_detector_module.FaceDetector(device=device, verbose=verbose)
+
+     def get_detections_for_batch(self, images):
+         images = images[..., ::-1]
+         detected_faces = self.face_detector.detect_from_batch(images.copy())
+         results = []
+
+         for i, d in enumerate(detected_faces):
+             if len(d) == 0:
+                 results.append(None)
+                 continue
+             d = d[0]
+             d = np.clip(d, 0, None)
+
+             x1, y1, x2, y2 = map(int, d[:-1])
+             results.append((x1, y1, x2, y2))
+
+         return results
+
+
+ class YOLOv8_face:
+     def __init__(self, path = 'face_detection/weights/yolov8n-face.onnx', conf_thres=0.2, iou_thres=0.5):
+         self.conf_threshold = conf_thres
+         self.iou_threshold = iou_thres
+         self.class_names = ['face']
+         self.num_classes = len(self.class_names)
+         # Initialize model
+         self.net = cv2.dnn.readNet(path)
+         self.input_height = 640
+         self.input_width = 640
+         self.reg_max = 16
+
+         self.project = np.arange(self.reg_max)
+         self.strides = (8, 16, 32)
+         self.feats_hw = [(math.ceil(self.input_height / self.strides[i]), math.ceil(self.input_width / self.strides[i])) for i in range(len(self.strides))]
+         self.anchors = self.make_anchors(self.feats_hw)
+
+     def make_anchors(self, feats_hw, grid_cell_offset=0.5):
+         """Generate anchors from features."""
+         anchor_points = {}
+         for i, stride in enumerate(self.strides):
+             h, w = feats_hw[i]
+             x = np.arange(0, w) + grid_cell_offset  # shift x
+             y = np.arange(0, h) + grid_cell_offset  # shift y
+             sx, sy = np.meshgrid(x, y)
+             # sy, sx = np.meshgrid(y, x)
+             anchor_points[stride] = np.stack((sx, sy), axis=-1).reshape(-1, 2)
+         return anchor_points
+
+     def softmax(self, x, axis=1):
+         x_exp = np.exp(x)
+         # use axis=0 for column vectors
+         x_sum = np.sum(x_exp, axis=axis, keepdims=True)
+         s = x_exp / x_sum
+         return s
+
+     def resize_image(self, srcimg, keep_ratio=True):
+         top, left, newh, neww = 0, 0, self.input_width, self.input_height
+         if keep_ratio and srcimg.shape[0] != srcimg.shape[1]:
+             hw_scale = srcimg.shape[0] / srcimg.shape[1]
+             if hw_scale > 1:
+                 newh, neww = self.input_height, int(self.input_width / hw_scale)
+                 img = cv2.resize(srcimg, (neww, newh), interpolation=cv2.INTER_AREA)
+                 left = int((self.input_width - neww) * 0.5)
+                 img = cv2.copyMakeBorder(img, 0, 0, left, self.input_width - neww - left, cv2.BORDER_CONSTANT,
+                                          value=(0, 0, 0))  # add border
+             else:
+                 newh, neww = int(self.input_height * hw_scale), self.input_width
+                 img = cv2.resize(srcimg, (neww, newh), interpolation=cv2.INTER_AREA)
+                 top = int((self.input_height - newh) * 0.5)
+                 img = cv2.copyMakeBorder(img, top, self.input_height - newh - top, 0, 0, cv2.BORDER_CONSTANT,
+                                          value=(0, 0, 0))
+         else:
+             img = cv2.resize(srcimg, (self.input_width, self.input_height), interpolation=cv2.INTER_AREA)
+         return img, newh, neww, top, left
+
+     def detect(self, srcimg):
+         input_img, newh, neww, padh, padw = self.resize_image(cv2.cvtColor(srcimg, cv2.COLOR_BGR2RGB))
+         scale_h, scale_w = srcimg.shape[0]/newh, srcimg.shape[1]/neww
+         input_img = input_img.astype(np.float32) / 255.0
+
+         blob = cv2.dnn.blobFromImage(input_img)
+         self.net.setInput(blob)
+         outputs = self.net.forward(self.net.getUnconnectedOutLayersNames())
+         # if isinstance(outputs, tuple):
+         #     outputs = list(outputs)
+         # if float(cv2.__version__[:3])>=4.7:
+         #     outputs = [outputs[2], outputs[0], outputs[1]]  ### needed for OpenCV 4.7, not for 4.5
+         # Perform inference on the image
+         det_bboxes, det_conf, det_classid, landmarks = self.post_process(outputs, scale_h, scale_w, padh, padw)
+         return det_bboxes, det_conf, det_classid, landmarks
+
+     def post_process(self, preds, scale_h, scale_w, padh, padw):
+         bboxes, scores, landmarks = [], [], []
+         for i, pred in enumerate(preds):
+             stride = int(self.input_height/pred.shape[2])
+             pred = pred.transpose((0, 2, 3, 1))
+
+             box = pred[..., :self.reg_max * 4]
+             cls = 1 / (1 + np.exp(-pred[..., self.reg_max * 4:-15])).reshape((-1,1))
+             kpts = pred[..., -15:].reshape((-1,15))  ### x1,y1,score1, ..., x5,y5,score5
+
+             # tmp = box.reshape(self.feats_hw[i][0], self.feats_hw[i][1], 4, self.reg_max)
+             tmp = box.reshape(-1, 4, self.reg_max)
+             bbox_pred = self.softmax(tmp, axis=-1)
+             bbox_pred = np.dot(bbox_pred, self.project).reshape((-1,4))
+
+             bbox = self.distance2bbox(self.anchors[stride], bbox_pred, max_shape=(self.input_height, self.input_width)) * stride
+             kpts[:, 0::3] = (kpts[:, 0::3] * 2.0 + (self.anchors[stride][:, 0].reshape((-1,1)) - 0.5)) * stride
+             kpts[:, 1::3] = (kpts[:, 1::3] * 2.0 + (self.anchors[stride][:, 1].reshape((-1,1)) - 0.5)) * stride
+             kpts[:, 2::3] = 1 / (1+np.exp(-kpts[:, 2::3]))
+
+             bbox -= np.array([[padw, padh, padw, padh]])  ### undo letterbox padding (broadcasting)
+             bbox *= np.array([[scale_w, scale_h, scale_w, scale_h]])
+             kpts -= np.tile(np.array([padw, padh, 0]), 5).reshape((1,15))
+             kpts *= np.tile(np.array([scale_w, scale_h, 1]), 5).reshape((1,15))
+
+             bboxes.append(bbox)
+             scores.append(cls)
+             landmarks.append(kpts)
+
+         bboxes = np.concatenate(bboxes, axis=0)
+         scores = np.concatenate(scores, axis=0)
+         landmarks = np.concatenate(landmarks, axis=0)
+
+         bboxes_wh = bboxes.copy()
+         bboxes_wh[:, 2:4] = bboxes[:, 2:4] - bboxes[:, 0:2]  #### xywh
+         classIds = np.argmax(scores, axis=1)
+         confidences = np.max(scores, axis=1)  #### max_class_confidence
+
+         mask = confidences>self.conf_threshold
+         bboxes_wh = bboxes_wh[mask]  ### filter by confidence threshold (broadcasting)
+         confidences = confidences[mask]
+         classIds = classIds[mask]
+         landmarks = landmarks[mask]
+
+         indices = cv2.dnn.NMSBoxes(bboxes_wh.tolist(), confidences.tolist(), self.conf_threshold,
+                                    self.iou_threshold).flatten()
+         if len(indices) > 0:
+             mlvl_bboxes = bboxes_wh[indices]
+             confidences = confidences[indices]
+             classIds = classIds[indices]
+             landmarks = landmarks[indices]
+             return mlvl_bboxes, confidences, classIds, landmarks
+         else:
+             print('nothing detect')
+             return np.array([]), np.array([]), np.array([]), np.array([])
+
+     def distance2bbox(self, points, distance, max_shape=None):
+         x1 = points[:, 0] - distance[:, 0]
+         y1 = points[:, 1] - distance[:, 1]
+         x2 = points[:, 0] + distance[:, 2]
+         y2 = points[:, 1] + distance[:, 3]
+         if max_shape is not None:
+             x1 = np.clip(x1, 0, max_shape[1])
+             y1 = np.clip(y1, 0, max_shape[0])
+             x2 = np.clip(x2, 0, max_shape[1])
+             y2 = np.clip(y2, 0, max_shape[0])
+         return np.stack([x1, y1, x2, y2], axis=-1)
+
+     def draw_detections(self, image, boxes, scores, kpts):
+         for box, score, kp in zip(boxes, scores, kpts):
+             x, y, w, h = box.astype(int)
+             # Draw rectangle
+             cv2.rectangle(image, (x, y), (x + w, y + h), (0, 0, 255), thickness=3)
+             cv2.putText(image, "face:"+str(round(score,2)), (x, y - 5), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), thickness=2)
+             for i in range(5):
+                 cv2.circle(image, (int(kp[i * 3]), int(kp[i * 3 + 1])), 4, (0, 255, 0), thickness=-1)
+                 # cv2.putText(image, str(i), (int(kp[i * 3]), int(kp[i * 3 + 1]) - 10), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), thickness=1)
+         return image
+
+ ROOT = os.path.dirname(os.path.abspath(__file__))
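Note (not part of the commit): a sketch of batched detection with the S3FD path, assuming the folder is importable as face_detection (the sys.path tweak in musetalk/utils/__init__.py makes it so) and frames were read with cv2.imread:

    import numpy as np
    import face_detection
    fa = face_detection.FaceAlignment(face_detection.LandmarksType._2D, flip_input=False, device="cuda")
    boxes = fa.get_detections_for_batch(np.asarray(frames))   # frames: list of HxWx3 uint8 images
    # each entry is an (x1, y1, x2, y2) tuple, or None when no face was found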
utils/digital_human/musetalk/utils/face_detection/detection/__init__.py ADDED
@@ -0,0 +1 @@
+ from .core import FaceDetector
utils/digital_human/musetalk/utils/face_detection/detection/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (233 Bytes).
utils/digital_human/musetalk/utils/face_detection/detection/__pycache__/core.cpython-310.pyc ADDED
Binary file (4.88 kB).
utils/digital_human/musetalk/utils/face_detection/detection/core.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+import logging
+import glob
+from tqdm import tqdm
+import numpy as np
+import torch
+import cv2
+
+
+class FaceDetector(object):
+    """An abstract class representing a face detector.
+
+    Any other face detection implementation must subclass it. All subclasses
+    must implement ``detect_from_image``, which returns a list of detected
+    bounding boxes. Optionally, detecting directly from a path can be
+    implemented for speed.
+    """
+
+    def __init__(self, device, verbose):
+        self.device = device
+        self.verbose = verbose
+        # Create the logger up front so it exists in every branch below
+        # (previously it was only created inside the CPU branch, which raised
+        # a NameError when an unknown device was passed with verbose=True).
+        logger = logging.getLogger(__name__)
+
+        if verbose:
+            if 'cpu' in device:
+                logger.warning("Detection running on CPU, this may be potentially slow.")
+
+        if 'cpu' not in device and 'cuda' not in device:
+            if verbose:
+                logger.error("Expected values for device are: {cpu, cuda} but got: %s", device)
+            raise ValueError
+
+    def detect_from_image(self, tensor_or_path):
+        """Detects faces in a given image.
+
+        This function detects the faces present in a provided BGR (usually)
+        image. The input can be either the image itself or the path to it.
+
+        Arguments:
+            tensor_or_path {numpy.ndarray, torch.tensor or string} -- the path
+            to an image or the image itself.
+
+        Example::
+
+            >>> path_to_image = 'data/image_01.jpg'
+            ... detected_faces = detect_from_image(path_to_image)
+            [A list of bounding boxes (x1, y1, x2, y2)]
+            >>> image = cv2.imread(path_to_image)
+            ... detected_faces = detect_from_image(image)
+            [A list of bounding boxes (x1, y1, x2, y2)]
+
+        """
+        raise NotImplementedError
+
+    def detect_from_directory(self, path, extensions=['.jpg', '.png'], recursive=False, show_progress_bar=True):
+        """Detects faces from all the images present in a given directory.
+
+        Arguments:
+            path {string} -- a string containing a path that points to the folder containing the images
+
+        Keyword Arguments:
+            extensions {list} -- list of strings containing the extensions to be
+                considered, in the following format: ``.extension_name`` (default: {['.jpg', '.png']})
+            recursive {bool} -- whether to scan the folder recursively (default: {False})
+            show_progress_bar {bool} -- display a progress bar (default: {True})
+
+        Example:
+            >>> directory = 'data'
+            ... detected_faces = detect_from_directory(directory)
+            {A dictionary of [lists containing bounding boxes(x1, y1, x2, y2)]}
+
+        """
+        if self.verbose:
+            logger = logging.getLogger(__name__)
+
+        if len(extensions) == 0:
+            if self.verbose:
+                logger.error("Expected at least one extension, but none was received.")
+            raise ValueError
+
+        if self.verbose:
+            logger.info("Constructing the list of images.")
+        additional_pattern = '/**/*' if recursive else '/*'
+        files = []
+        for extension in extensions:
+            files.extend(glob.glob(path + additional_pattern + extension, recursive=recursive))
+
+        if self.verbose:
+            logger.info("Finished searching for images. %s images found", len(files))
+            logger.info("Preparing to run the detection.")
+
+        predictions = {}
+        for image_path in tqdm(files, disable=not show_progress_bar):
+            if self.verbose:
+                logger.info("Running the face detector on image: %s", image_path)
+            predictions[image_path] = self.detect_from_image(image_path)
+
+        if self.verbose:
+            logger.info("The detector was successfully run on all %s images", len(files))
+
+        return predictions
+
+    @property
+    def reference_scale(self):
+        raise NotImplementedError
+
+    @property
+    def reference_x_shift(self):
+        raise NotImplementedError
+
+    @property
+    def reference_y_shift(self):
+        raise NotImplementedError
+
+    @staticmethod
+    def tensor_or_path_to_ndarray(tensor_or_path, rgb=True):
+        """Convert a path (represented as a string) or torch.tensor to a numpy.ndarray.
+
+        Arguments:
+            tensor_or_path {numpy.ndarray, torch.tensor or string} -- path to the image, or the image itself
+        """
+        if isinstance(tensor_or_path, str):
+            return cv2.imread(tensor_or_path) if not rgb else cv2.imread(tensor_or_path)[..., ::-1]
+        elif torch.is_tensor(tensor_or_path):
+            # Call .cpu() in case the tensor is coming from CUDA
+            return tensor_or_path.cpu().numpy()[..., ::-1].copy() if not rgb else tensor_or_path.cpu().numpy()
+        elif isinstance(tensor_or_path, np.ndarray):
+            return tensor_or_path[..., ::-1].copy() if not rgb else tensor_or_path
+        else:
+            raise TypeError
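A minimal usage sketch of the abstract interface above (not part of the commit): DummyDetector, its constant return value and the example frame are made up for illustration. Only detect_from_image and the three reference properties need to be overridden; detect_from_directory and tensor_or_path_to_ndarray are inherited from the base class.

import numpy as np

class DummyDetector(FaceDetector):
    """Toy subclass: treats the whole frame as one 'face' with confidence 1.0."""

    def detect_from_image(self, tensor_or_path):
        image = self.tensor_or_path_to_ndarray(tensor_or_path)
        h, w = image.shape[:2]
        return [[0, 0, w, h, 1.0]]

    @property
    def reference_scale(self):
        return 195  # placeholder value

    @property
    def reference_x_shift(self):
        return 0

    @property
    def reference_y_shift(self):
        return 0

detector = DummyDetector(device='cpu', verbose=True)
print(detector.detect_from_image(np.zeros((480, 640, 3), dtype=np.uint8)))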
utils/digital_human/musetalk/utils/face_detection/detection/sfd/__init__.py ADDED
@@ -0,0 +1 @@
+from .sfd_detector import SFDDetector as FaceDetector
utils/digital_human/musetalk/utils/face_detection/detection/sfd/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (258 Bytes).
 
utils/digital_human/musetalk/utils/face_detection/detection/sfd/__pycache__/bbox.cpython-310.pyc ADDED
Binary file (4.26 kB).
 
utils/digital_human/musetalk/utils/face_detection/detection/sfd/__pycache__/detect.cpython-310.pyc ADDED
Binary file (3.8 kB).
 
utils/digital_human/musetalk/utils/face_detection/detection/sfd/__pycache__/net_s3fd.cpython-310.pyc ADDED
Binary file (3.91 kB).
 
utils/digital_human/musetalk/utils/face_detection/detection/sfd/__pycache__/sfd_detector.cpython-310.pyc ADDED
Binary file (2.96 kB).
 
utils/digital_human/musetalk/utils/face_detection/detection/sfd/bbox.py ADDED
@@ -0,0 +1,129 @@
+from __future__ import print_function
+import os
+import sys
+import cv2
+import random
+import datetime
+import time
+import math
+import argparse
+import numpy as np
+import torch
+
+try:
+    from iou import IOU
+except BaseException:
+    # IOU cython speedup 10x
+    def IOU(ax1, ay1, ax2, ay2, bx1, by1, bx2, by2):
+        sa = abs((ax2 - ax1) * (ay2 - ay1))
+        sb = abs((bx2 - bx1) * (by2 - by1))
+        x1, y1 = max(ax1, bx1), max(ay1, by1)
+        x2, y2 = min(ax2, bx2), min(ay2, by2)
+        w = x2 - x1
+        h = y2 - y1
+        if w < 0 or h < 0:
+            return 0.0
+        else:
+            return 1.0 * w * h / (sa + sb - w * h)
+
+
+def bboxlog(x1, y1, x2, y2, axc, ayc, aww, ahh):
+    xc, yc, ww, hh = (x2 + x1) / 2, (y2 + y1) / 2, x2 - x1, y2 - y1
+    dx, dy = (xc - axc) / aww, (yc - ayc) / ahh
+    dw, dh = math.log(ww / aww), math.log(hh / ahh)
+    return dx, dy, dw, dh
+
+
+def bboxloginv(dx, dy, dw, dh, axc, ayc, aww, ahh):
+    xc, yc = dx * aww + axc, dy * ahh + ayc
+    ww, hh = math.exp(dw) * aww, math.exp(dh) * ahh
+    x1, x2, y1, y2 = xc - ww / 2, xc + ww / 2, yc - hh / 2, yc + hh / 2
+    return x1, y1, x2, y2
+
+
+def nms(dets, thresh):
+    if 0 == len(dets):
+        return []
+    x1, y1, x2, y2, scores = dets[:, 0], dets[:, 1], dets[:, 2], dets[:, 3], dets[:, 4]
+    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
+    order = scores.argsort()[::-1]
+
+    keep = []
+    while order.size > 0:
+        i = order[0]
+        keep.append(i)
+        xx1, yy1 = np.maximum(x1[i], x1[order[1:]]), np.maximum(y1[i], y1[order[1:]])
+        xx2, yy2 = np.minimum(x2[i], x2[order[1:]]), np.minimum(y2[i], y2[order[1:]])
+
+        w, h = np.maximum(0.0, xx2 - xx1 + 1), np.maximum(0.0, yy2 - yy1 + 1)
+        ovr = w * h / (areas[i] + areas[order[1:]] - w * h)
+
+        inds = np.where(ovr <= thresh)[0]
+        order = order[inds + 1]
+
+    return keep
+
+
+def encode(matched, priors, variances):
+    """Encode the variances from the priorbox layers into the ground truth boxes
+    we have matched (based on jaccard overlap) with the prior boxes.
+    Args:
+        matched: (tensor) Coords of ground truth for each prior in point-form
+            Shape: [num_priors, 4].
+        priors: (tensor) Prior boxes in center-offset form
+            Shape: [num_priors,4].
+        variances: (list[float]) Variances of priorboxes
+    Return:
+        encoded boxes (tensor), Shape: [num_priors, 4]
+    """
+
+    # dist b/t match center and prior's center
+    g_cxcy = (matched[:, :2] + matched[:, 2:]) / 2 - priors[:, :2]
+    # encode variance
+    g_cxcy /= (variances[0] * priors[:, 2:])
+    # match wh / prior wh
+    g_wh = (matched[:, 2:] - matched[:, :2]) / priors[:, 2:]
+    g_wh = torch.log(g_wh) / variances[1]
+    # return target for smooth_l1_loss
+    return torch.cat([g_cxcy, g_wh], 1)  # [num_priors,4]
+
+
+def decode(loc, priors, variances):
+    """Decode locations from predictions using priors to undo
+    the encoding we did for offset regression at train time.
+    Args:
+        loc (tensor): location predictions for loc layers,
+            Shape: [num_priors,4]
+        priors (tensor): Prior boxes in center-offset form.
+            Shape: [num_priors,4].
+        variances: (list[float]) Variances of priorboxes
+    Return:
+        decoded bounding box predictions
+    """
+
+    boxes = torch.cat((
+        priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:],
+        priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1)
+    boxes[:, :2] -= boxes[:, 2:] / 2
+    boxes[:, 2:] += boxes[:, :2]
+    return boxes
+
+
+def batch_decode(loc, priors, variances):
+    """Decode locations from predictions using priors to undo
+    the encoding we did for offset regression at train time.
+    Args:
+        loc (tensor): location predictions for loc layers,
+            Shape: [num_priors,4]
+        priors (tensor): Prior boxes in center-offset form.
+            Shape: [num_priors,4].
+        variances: (list[float]) Variances of priorboxes
+    Return:
+        decoded bounding box predictions
+    """
+
+    boxes = torch.cat((
+        priors[:, :, :2] + loc[:, :, :2] * variances[0] * priors[:, :, 2:],
+        priors[:, :, 2:] * torch.exp(loc[:, :, 2:] * variances[1])), 2)
+    boxes[:, :, :2] -= boxes[:, :, 2:] / 2
+    boxes[:, :, 2:] += boxes[:, :, :2]
+    return boxes
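A quick, illustrative check of the nms() helper above (not part of the commit; the boxes, scores and the 0.3 threshold are made-up values, and the snippet assumes it runs with this module's names in scope):

import numpy as np

dets = np.array([
    [10, 10, 50, 50, 0.90],      # highest score, kept
    [12, 12, 48, 48, 0.80],      # overlaps the first box heavily, suppressed at thresh=0.3
    [100, 100, 140, 140, 0.75],  # disjoint, kept
])
keep = nms(dets, 0.3)
print(keep)        # [0, 2]
print(dets[keep])  # the surviving rows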
utils/digital_human/musetalk/utils/face_detection/detection/sfd/detect.py ADDED
@@ -0,0 +1,114 @@
+import torch
+import torch.nn.functional as F
+
+import os
+import sys
+import cv2
+import random
+import datetime
+import math
+import argparse
+import numpy as np
+
+import scipy.io as sio
+import zipfile
+from .net_s3fd import s3fd
+from .bbox import *
+
+
+def detect(net, img, device):
+    img = img - np.array([104, 117, 123])
+    img = img.transpose(2, 0, 1)
+    img = img.reshape((1,) + img.shape)
+
+    if 'cuda' in device:
+        torch.backends.cudnn.benchmark = True
+
+    img = torch.from_numpy(img).float().to(device)
+    BB, CC, HH, WW = img.size()
+    with torch.no_grad():
+        olist = net(img)
+
+    bboxlist = []
+    for i in range(len(olist) // 2):
+        olist[i * 2] = F.softmax(olist[i * 2], dim=1)
+    olist = [oelem.data.cpu() for oelem in olist]
+    for i in range(len(olist) // 2):
+        ocls, oreg = olist[i * 2], olist[i * 2 + 1]
+        FB, FC, FH, FW = ocls.size()  # feature map size
+        stride = 2**(i + 2)  # 4,8,16,32,64,128
+        anchor = stride * 4
+        poss = zip(*np.where(ocls[:, 1, :, :] > 0.05))
+        for Iindex, hindex, windex in poss:
+            axc, ayc = stride / 2 + windex * stride, stride / 2 + hindex * stride
+            score = ocls[0, 1, hindex, windex]
+            loc = oreg[0, :, hindex, windex].contiguous().view(1, 4)
+            priors = torch.Tensor([[axc / 1.0, ayc / 1.0, stride * 4 / 1.0, stride * 4 / 1.0]])
+            variances = [0.1, 0.2]
+            box = decode(loc, priors, variances)
+            x1, y1, x2, y2 = box[0] * 1.0
+            # cv2.rectangle(imgshow,(int(x1),int(y1)),(int(x2),int(y2)),(0,0,255),1)
+            bboxlist.append([x1, y1, x2, y2, score])
+    bboxlist = np.array(bboxlist)
+    if 0 == len(bboxlist):
+        bboxlist = np.zeros((1, 5))
+
+    return bboxlist
+
+
+def batch_detect(net, imgs, device):
+    imgs = imgs - np.array([104, 117, 123])
+    imgs = imgs.transpose(0, 3, 1, 2)
+
+    if 'cuda' in device:
+        torch.backends.cudnn.benchmark = True
+
+    imgs = torch.from_numpy(imgs).float().to(device)
+    BB, CC, HH, WW = imgs.size()
+    with torch.no_grad():
+        olist = net(imgs)
+    # print(olist)
+
+    bboxlist = []
+    for i in range(len(olist) // 2):
+        olist[i * 2] = F.softmax(olist[i * 2], dim=1)
+
+    olist = [oelem.cpu() for oelem in olist]
+    for i in range(len(olist) // 2):
+        ocls, oreg = olist[i * 2], olist[i * 2 + 1]
+        FB, FC, FH, FW = ocls.size()  # feature map size
+        stride = 2**(i + 2)  # 4,8,16,32,64,128
+        anchor = stride * 4
+        poss = zip(*np.where(ocls[:, 1, :, :] > 0.05))
+        for Iindex, hindex, windex in poss:
+            axc, ayc = stride / 2 + windex * stride, stride / 2 + hindex * stride
+            score = ocls[:, 1, hindex, windex]
+            loc = oreg[:, :, hindex, windex].contiguous().view(BB, 1, 4)
+            priors = torch.Tensor([[axc / 1.0, ayc / 1.0, stride * 4 / 1.0, stride * 4 / 1.0]]).view(1, 1, 4)
+            variances = [0.1, 0.2]
+            box = batch_decode(loc, priors, variances)
+            box = box[:, 0] * 1.0
+            # cv2.rectangle(imgshow,(int(x1),int(y1)),(int(x2),int(y2)),(0,0,255),1)
+            bboxlist.append(torch.cat([box, score.unsqueeze(1)], 1).cpu().numpy())
+    bboxlist = np.array(bboxlist)
+    if 0 == len(bboxlist):
+        bboxlist = np.zeros((1, BB, 5))
+
+    return bboxlist
+
+
+def flip_detect(net, img, device):
+    img = cv2.flip(img, 1)
+    b = detect(net, img, device)
+
+    bboxlist = np.zeros(b.shape)
+    bboxlist[:, 0] = img.shape[1] - b[:, 2]
+    bboxlist[:, 1] = b[:, 1]
+    bboxlist[:, 2] = img.shape[1] - b[:, 0]
+    bboxlist[:, 3] = b[:, 3]
+    bboxlist[:, 4] = b[:, 4]
+    return bboxlist
+
+
+def pts_to_bb(pts):
+    min_x, min_y = np.min(pts, axis=0)
+    max_x, max_y = np.max(pts, axis=0)
+    return np.array([min_x, min_y, max_x, max_y])
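For reference, a worked micro-example of the anchor/decode step used in detect() above (illustrative values only; the stride, grid position and fake regression output are assumptions, and decode() comes from bbox.py via the wildcard import, so the snippet assumes this module's names are in scope):

import torch

stride = 8
hindex, windex = 10, 20
axc, ayc = stride / 2 + windex * stride, stride / 2 + hindex * stride   # anchor centre (164.0, 84.0)
priors = torch.Tensor([[axc, ayc, stride * 4, stride * 4]])             # centre-size form, side 32
loc = torch.Tensor([[0.1, -0.2, 0.0, 0.0]])                             # fake network output
print(decode(loc, priors, [0.1, 0.2]))                                  # -> corner-form box near the anchor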
utils/digital_human/musetalk/utils/face_detection/detection/sfd/net_s3fd.py ADDED
@@ -0,0 +1,129 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class L2Norm(nn.Module):
+    def __init__(self, n_channels, scale=1.0):
+        super(L2Norm, self).__init__()
+        self.n_channels = n_channels
+        self.scale = scale
+        self.eps = 1e-10
+        self.weight = nn.Parameter(torch.Tensor(self.n_channels))
+        self.weight.data *= 0.0
+        self.weight.data += self.scale
+
+    def forward(self, x):
+        norm = x.pow(2).sum(dim=1, keepdim=True).sqrt() + self.eps
+        x = x / norm * self.weight.view(1, -1, 1, 1)
+        return x
+
+
+class s3fd(nn.Module):
+    def __init__(self):
+        super(s3fd, self).__init__()
+        self.conv1_1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1)
+        self.conv1_2 = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1)
+
+        self.conv2_1 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1)
+        self.conv2_2 = nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1)
+
+        self.conv3_1 = nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1)
+        self.conv3_2 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)
+        self.conv3_3 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)
+
+        self.conv4_1 = nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1)
+        self.conv4_2 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1)
+        self.conv4_3 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1)
+
+        self.conv5_1 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1)
+        self.conv5_2 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1)
+        self.conv5_3 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1)
+
+        self.fc6 = nn.Conv2d(512, 1024, kernel_size=3, stride=1, padding=3)
+        self.fc7 = nn.Conv2d(1024, 1024, kernel_size=1, stride=1, padding=0)
+
+        self.conv6_1 = nn.Conv2d(1024, 256, kernel_size=1, stride=1, padding=0)
+        self.conv6_2 = nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1)
+
+        self.conv7_1 = nn.Conv2d(512, 128, kernel_size=1, stride=1, padding=0)
+        self.conv7_2 = nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)
+
+        self.conv3_3_norm = L2Norm(256, scale=10)
+        self.conv4_3_norm = L2Norm(512, scale=8)
+        self.conv5_3_norm = L2Norm(512, scale=5)
+
+        self.conv3_3_norm_mbox_conf = nn.Conv2d(256, 4, kernel_size=3, stride=1, padding=1)
+        self.conv3_3_norm_mbox_loc = nn.Conv2d(256, 4, kernel_size=3, stride=1, padding=1)
+        self.conv4_3_norm_mbox_conf = nn.Conv2d(512, 2, kernel_size=3, stride=1, padding=1)
+        self.conv4_3_norm_mbox_loc = nn.Conv2d(512, 4, kernel_size=3, stride=1, padding=1)
+        self.conv5_3_norm_mbox_conf = nn.Conv2d(512, 2, kernel_size=3, stride=1, padding=1)
+        self.conv5_3_norm_mbox_loc = nn.Conv2d(512, 4, kernel_size=3, stride=1, padding=1)
+
+        self.fc7_mbox_conf = nn.Conv2d(1024, 2, kernel_size=3, stride=1, padding=1)
+        self.fc7_mbox_loc = nn.Conv2d(1024, 4, kernel_size=3, stride=1, padding=1)
+        self.conv6_2_mbox_conf = nn.Conv2d(512, 2, kernel_size=3, stride=1, padding=1)
+        self.conv6_2_mbox_loc = nn.Conv2d(512, 4, kernel_size=3, stride=1, padding=1)
+        self.conv7_2_mbox_conf = nn.Conv2d(256, 2, kernel_size=3, stride=1, padding=1)
+        self.conv7_2_mbox_loc = nn.Conv2d(256, 4, kernel_size=3, stride=1, padding=1)
+
+    def forward(self, x):
+        h = F.relu(self.conv1_1(x))
+        h = F.relu(self.conv1_2(h))
+        h = F.max_pool2d(h, 2, 2)
+
+        h = F.relu(self.conv2_1(h))
+        h = F.relu(self.conv2_2(h))
+        h = F.max_pool2d(h, 2, 2)
+
+        h = F.relu(self.conv3_1(h))
+        h = F.relu(self.conv3_2(h))
+        h = F.relu(self.conv3_3(h))
+        f3_3 = h
+        h = F.max_pool2d(h, 2, 2)
+
+        h = F.relu(self.conv4_1(h))
+        h = F.relu(self.conv4_2(h))
+        h = F.relu(self.conv4_3(h))
+        f4_3 = h
+        h = F.max_pool2d(h, 2, 2)
+
+        h = F.relu(self.conv5_1(h))
+        h = F.relu(self.conv5_2(h))
+        h = F.relu(self.conv5_3(h))
+        f5_3 = h
+        h = F.max_pool2d(h, 2, 2)
+
+        h = F.relu(self.fc6(h))
+        h = F.relu(self.fc7(h))
+        ffc7 = h
+        h = F.relu(self.conv6_1(h))
+        h = F.relu(self.conv6_2(h))
+        f6_2 = h
+        h = F.relu(self.conv7_1(h))
+        h = F.relu(self.conv7_2(h))
+        f7_2 = h
+
+        f3_3 = self.conv3_3_norm(f3_3)
+        f4_3 = self.conv4_3_norm(f4_3)
+        f5_3 = self.conv5_3_norm(f5_3)
+
+        cls1 = self.conv3_3_norm_mbox_conf(f3_3)
+        reg1 = self.conv3_3_norm_mbox_loc(f3_3)
+        cls2 = self.conv4_3_norm_mbox_conf(f4_3)
+        reg2 = self.conv4_3_norm_mbox_loc(f4_3)
+        cls3 = self.conv5_3_norm_mbox_conf(f5_3)
+        reg3 = self.conv5_3_norm_mbox_loc(f5_3)
+        cls4 = self.fc7_mbox_conf(ffc7)
+        reg4 = self.fc7_mbox_loc(ffc7)
+        cls5 = self.conv6_2_mbox_conf(f6_2)
+        reg5 = self.conv6_2_mbox_loc(f6_2)
+        cls6 = self.conv7_2_mbox_conf(f7_2)
+        reg6 = self.conv7_2_mbox_loc(f7_2)
+
+        # max-out background label
+        chunk = torch.chunk(cls1, 4, 1)
+        bmax = torch.max(torch.max(chunk[0], chunk[1]), chunk[2])
+        cls1 = torch.cat([bmax, chunk[3]], dim=1)
+
+        return [cls1, reg1, cls2, reg2, cls3, reg3, cls4, reg4, cls5, reg5, cls6, reg6]
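A rough shape check of the network above (an illustrative sketch with random weights, not part of the commit): s3fd returns six (cls, reg) pairs, one per detection scale, which detect() in detect.py interprets with strides 4, 8, 16, 32, 64 and 128.

import torch

net = s3fd().eval()
with torch.no_grad():
    outs = net(torch.zeros(1, 3, 256, 256))
for i in range(0, len(outs), 2):
    print(i // 2, outs[i].shape, outs[i + 1].shape)
# scale 0 for a 256x256 input: cls (1, 2, 64, 64), reg (1, 4, 64, 64)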
utils/digital_human/musetalk/utils/face_detection/detection/sfd/sfd_detector.py ADDED
@@ -0,0 +1,59 @@
+import os
+import cv2
+import torch  # used directly below for torch.load
+from torch.utils.model_zoo import load_url
+
+from ..core import FaceDetector
+
+from .net_s3fd import s3fd
+from .bbox import *
+from .detect import *
+
+models_urls = {
+    's3fd': 'https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth',
+}
+
+
+class SFDDetector(FaceDetector):
+    def __init__(self, device, path_to_detector=os.path.join(os.path.dirname(os.path.abspath(__file__)), 's3fd.pth'), verbose=False):
+        super(SFDDetector, self).__init__(device, verbose)
+
+        # Initialise the face detector: download the s3fd weights if they are
+        # not already present next to this file.
+        if not os.path.isfile(path_to_detector):
+            model_weights = load_url(models_urls['s3fd'])
+        else:
+            model_weights = torch.load(path_to_detector)
+
+        self.face_detector = s3fd()
+        self.face_detector.load_state_dict(model_weights)
+        self.face_detector.to(device)
+        self.face_detector.eval()
+
+    def detect_from_image(self, tensor_or_path):
+        image = self.tensor_or_path_to_ndarray(tensor_or_path)
+
+        bboxlist = detect(self.face_detector, image, device=self.device)
+        keep = nms(bboxlist, 0.3)
+        bboxlist = bboxlist[keep, :]
+        bboxlist = [x for x in bboxlist if x[-1] > 0.5]
+
+        return bboxlist
+
+    def detect_from_batch(self, images):
+        bboxlists = batch_detect(self.face_detector, images, device=self.device)
+        keeps = [nms(bboxlists[:, i, :], 0.3) for i in range(bboxlists.shape[1])]
+        bboxlists = [bboxlists[keep, i, :] for i, keep in enumerate(keeps)]
+        bboxlists = [[x for x in bboxlist if x[-1] > 0.5] for bboxlist in bboxlists]
+
+        return bboxlists
+
+    @property
+    def reference_scale(self):
+        return 195
+
+    @property
+    def reference_x_shift(self):
+        return 0
+
+    @property
+    def reference_y_shift(self):
+        return 0
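A usage sketch for SFDDetector (assumptions: the repository root is importable so the package path below resolves, the s3fd weights are either cached next to sfd_detector.py or downloadable from the URL above, and 'face.jpg' is an illustrative input):

import cv2
import numpy as np
from utils.digital_human.musetalk.utils.face_detection.detection.sfd import FaceDetector

detector = FaceDetector(device='cuda')   # 'cpu' also works, just slower

# Single image: each detection is [x1, y1, x2, y2, score] with score > 0.5 after NMS.
print(detector.detect_from_image('face.jpg'))

# Batch of frames: stack H x W x 3 images into an (N, H, W, 3) array first.
frames = np.stack([cv2.imread('face.jpg')] * 4)
print([len(f) for f in detector.detect_from_batch(frames)])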
utils/digital_human/musetalk/utils/face_detection/models.py ADDED
@@ -0,0 +1,261 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import math
+
+
+def conv3x3(in_planes, out_planes, strd=1, padding=1, bias=False):
+    "3x3 convolution with padding"
+    return nn.Conv2d(in_planes, out_planes, kernel_size=3,
+                     stride=strd, padding=padding, bias=bias)
+
+
+class ConvBlock(nn.Module):
+    def __init__(self, in_planes, out_planes):
+        super(ConvBlock, self).__init__()
+        self.bn1 = nn.BatchNorm2d(in_planes)
+        self.conv1 = conv3x3(in_planes, int(out_planes / 2))
+        self.bn2 = nn.BatchNorm2d(int(out_planes / 2))
+        self.conv2 = conv3x3(int(out_planes / 2), int(out_planes / 4))
+        self.bn3 = nn.BatchNorm2d(int(out_planes / 4))
+        self.conv3 = conv3x3(int(out_planes / 4), int(out_planes / 4))
+
+        if in_planes != out_planes:
+            self.downsample = nn.Sequential(
+                nn.BatchNorm2d(in_planes),
+                nn.ReLU(True),
+                nn.Conv2d(in_planes, out_planes,
+                          kernel_size=1, stride=1, bias=False),
+            )
+        else:
+            self.downsample = None
+
+    def forward(self, x):
+        residual = x
+
+        out1 = self.bn1(x)
+        out1 = F.relu(out1, True)
+        out1 = self.conv1(out1)
+
+        out2 = self.bn2(out1)
+        out2 = F.relu(out2, True)
+        out2 = self.conv2(out2)
+
+        out3 = self.bn3(out2)
+        out3 = F.relu(out3, True)
+        out3 = self.conv3(out3)
+
+        out3 = torch.cat((out1, out2, out3), 1)
+
+        if self.downsample is not None:
+            residual = self.downsample(residual)
+
+        out3 += residual
+
+        return out3
+
+
+class Bottleneck(nn.Module):
+
+    expansion = 4
+
+    def __init__(self, inplanes, planes, stride=1, downsample=None):
+        super(Bottleneck, self).__init__()
+        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(planes)
+        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
+                               padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(planes)
+        self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
+        self.bn3 = nn.BatchNorm2d(planes * 4)
+        self.relu = nn.ReLU(inplace=True)
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x):
+        residual = x
+
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+
+        out = self.conv2(out)
+        out = self.bn2(out)
+        out = self.relu(out)
+
+        out = self.conv3(out)
+        out = self.bn3(out)
+
+        if self.downsample is not None:
+            residual = self.downsample(x)
+
+        out += residual
+        out = self.relu(out)
+
+        return out
+
+
+class HourGlass(nn.Module):
+    def __init__(self, num_modules, depth, num_features):
+        super(HourGlass, self).__init__()
+        self.num_modules = num_modules
+        self.depth = depth
+        self.features = num_features
+
+        self._generate_network(self.depth)
+
+    def _generate_network(self, level):
+        self.add_module('b1_' + str(level), ConvBlock(self.features, self.features))
+
+        self.add_module('b2_' + str(level), ConvBlock(self.features, self.features))
+
+        if level > 1:
+            self._generate_network(level - 1)
+        else:
+            self.add_module('b2_plus_' + str(level), ConvBlock(self.features, self.features))
+
+        self.add_module('b3_' + str(level), ConvBlock(self.features, self.features))
+
+    def _forward(self, level, inp):
+        # Upper branch
+        up1 = inp
+        up1 = self._modules['b1_' + str(level)](up1)
+
+        # Lower branch
+        low1 = F.avg_pool2d(inp, 2, stride=2)
+        low1 = self._modules['b2_' + str(level)](low1)
+
+        if level > 1:
+            low2 = self._forward(level - 1, low1)
+        else:
+            low2 = low1
+            low2 = self._modules['b2_plus_' + str(level)](low2)
+
+        low3 = low2
+        low3 = self._modules['b3_' + str(level)](low3)
+
+        up2 = F.interpolate(low3, scale_factor=2, mode='nearest')
+
+        return up1 + up2
+
+    def forward(self, x):
+        return self._forward(self.depth, x)
+
+
+class FAN(nn.Module):
+
+    def __init__(self, num_modules=1):
+        super(FAN, self).__init__()
+        self.num_modules = num_modules
+
+        # Base part
+        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3)
+        self.bn1 = nn.BatchNorm2d(64)
+        self.conv2 = ConvBlock(64, 128)
+        self.conv3 = ConvBlock(128, 128)
+        self.conv4 = ConvBlock(128, 256)
+
+        # Stacking part
+        for hg_module in range(self.num_modules):
+            self.add_module('m' + str(hg_module), HourGlass(1, 4, 256))
+            self.add_module('top_m_' + str(hg_module), ConvBlock(256, 256))
+            self.add_module('conv_last' + str(hg_module),
+                            nn.Conv2d(256, 256, kernel_size=1, stride=1, padding=0))
+            self.add_module('bn_end' + str(hg_module), nn.BatchNorm2d(256))
+            self.add_module('l' + str(hg_module), nn.Conv2d(256,
+                                                            68, kernel_size=1, stride=1, padding=0))
+
+            if hg_module < self.num_modules - 1:
+                self.add_module(
+                    'bl' + str(hg_module), nn.Conv2d(256, 256, kernel_size=1, stride=1, padding=0))
+                self.add_module('al' + str(hg_module), nn.Conv2d(68,
+                                                                 256, kernel_size=1, stride=1, padding=0))
+
+    def forward(self, x):
+        x = F.relu(self.bn1(self.conv1(x)), True)
+        x = F.avg_pool2d(self.conv2(x), 2, stride=2)
+        x = self.conv3(x)
+        x = self.conv4(x)
+
+        previous = x
+
+        outputs = []
+        for i in range(self.num_modules):
+            hg = self._modules['m' + str(i)](previous)
+
+            ll = hg
+            ll = self._modules['top_m_' + str(i)](ll)
+
+            ll = F.relu(self._modules['bn_end' + str(i)]
+                        (self._modules['conv_last' + str(i)](ll)), True)
+
+            # Predict heatmaps
+            tmp_out = self._modules['l' + str(i)](ll)
+            outputs.append(tmp_out)
+
+            if i < self.num_modules - 1:
+                ll = self._modules['bl' + str(i)](ll)
+                tmp_out_ = self._modules['al' + str(i)](tmp_out)
+                previous = previous + ll + tmp_out_
+
+        return outputs
+
+
+class ResNetDepth(nn.Module):
+
+    def __init__(self, block=Bottleneck, layers=[3, 8, 36, 3], num_classes=68):
+        self.inplanes = 64
+        super(ResNetDepth, self).__init__()
+        self.conv1 = nn.Conv2d(3 + 68, 64, kernel_size=7, stride=2, padding=3,
+                               bias=False)
+        self.bn1 = nn.BatchNorm2d(64)
+        self.relu = nn.ReLU(inplace=True)
+        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+        self.layer1 = self._make_layer(block, 64, layers[0])
+        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
+        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
+        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
+        self.avgpool = nn.AvgPool2d(7)
+        self.fc = nn.Linear(512 * block.expansion, num_classes)
+
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+                m.weight.data.normal_(0, math.sqrt(2. / n))
+            elif isinstance(m, nn.BatchNorm2d):
+                m.weight.data.fill_(1)
+                m.bias.data.zero_()
+
+    def _make_layer(self, block, planes, blocks, stride=1):
+        downsample = None
+        if stride != 1 or self.inplanes != planes * block.expansion:
+            downsample = nn.Sequential(
+                nn.Conv2d(self.inplanes, planes * block.expansion,
+                          kernel_size=1, stride=stride, bias=False),
+                nn.BatchNorm2d(planes * block.expansion),
+            )
+
+        layers = []
+        layers.append(block(self.inplanes, planes, stride, downsample))
+        self.inplanes = planes * block.expansion
+        for i in range(1, blocks):
+            layers.append(block(self.inplanes, planes))
+
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+        x = self.conv1(x)
+        x = self.bn1(x)
+        x = self.relu(x)
+        x = self.maxpool(x)
+
+        x = self.layer1(x)
+        x = self.layer2(x)
+        x = self.layer3(x)
+        x = self.layer4(x)
+
+        x = self.avgpool(x)
+        x = x.view(x.size(0), -1)
+        x = self.fc(x)
+
+        return x
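Finally, a shape sketch for the FAN landmark network above (illustrative, randomly initialised): each stacked hourglass module emits a 68-channel heatmap tensor, one channel per facial landmark, at a quarter of the input resolution.

import torch

fan = FAN(num_modules=1).eval()
with torch.no_grad():
    heatmaps = fan(torch.zeros(1, 3, 256, 256))
print(len(heatmaps), heatmaps[0].shape)   # -> 1 torch.Size([1, 68, 64, 64])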