Kimata committed on
Commit
8871fa0
·
1 Parent(s): 10efb9c

update changes

Browse files
__pycache__/inference.cpython-39.pyc ADDED
Binary file (6.2 kB). View file
 
app.py CHANGED
@@ -6,7 +6,7 @@ title="Multimodal deepfake detector"
6
  description="Deepfake detection for videos, images and audio modalities."
7
 
8
 
9
- video_interface = gr.Interface(pipeline.deepfakes_video_predict,
10
  gr.Video(),
11
  "text",
12
  examples = ["videos/celeb_synthesis.mp4", "videos/real-1.mp4"],
@@ -14,14 +14,14 @@ video_interface = gr.Interface(pipeline.deepfakes_video_predict,
14
  )
15
 
16
 
17
- image_interface = gr.Interface(pipeline.deepfakes_image_predict,
18
  gr.Image(),
19
  "text",
20
  examples = ["images/lady.jpg", "images/fake_image.jpg"],
21
  cache_examples=False
22
  )
23
 
24
- audio_interface = gr.Interface(pipeline.deepfakes_audio_predict,
25
  gr.Audio(),
26
  "text",
27
  examples = ["audios/DF_E_2000027.flac", "audios/DF_E_2000031.flac"],
 
6
  description="Deepfake detection for videos, images and audio modalities."
7
 
8
 
9
+ video_interface = gr.Interface(inference.deepfakes_video_predict,
10
  gr.Video(),
11
  "text",
12
  examples = ["videos/celeb_synthesis.mp4", "videos/real-1.mp4"],
 
14
  )
15
 
16
 
17
+ image_interface = gr.Interface(inference.deepfakes_image_predict,
18
  gr.Image(),
19
  "text",
20
  examples = ["images/lady.jpg", "images/fake_image.jpg"],
21
  cache_examples=False
22
  )
23
 
24
+ audio_interface = gr.Interface(inference.deepfakes_spec_predict,
25
  gr.Audio(),
26
  "text",
27
  examples = ["audios/DF_E_2000027.flac", "audios/DF_E_2000031.flac"],
inference.py CHANGED
@@ -5,8 +5,6 @@ import argparse
5
  import numpy as np
6
  import torch.nn as nn
7
  from models.TMC import ETMC
8
-
9
- from torchsummary import summary
10
  from models import image
11
 
12
  #Set random seed for reproducibility.
@@ -90,66 +88,70 @@ def load_spec_modality_model(args):
90
  spec_encoder.eval()
91
  return spec_encoder
92
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  def preprocess_img(face):
94
  face = face / 255
95
  face = cv2.resize(face, (256, 256))
96
- face = face.permute(2, 0, 1) #(W, H, C) -> (C, W, H)
97
- face = torch.unsqueeze(face, dim = 0)
98
- face_pt = torch.Tensor(face)
99
  return face_pt
100
 
101
  def preprocess_audio(audio_file):
102
- audio = torch.unsqueeze(audio_file, dim = 0)
103
- audio_pt = torch.Tensor(audio)
104
  return audio_pt
105
 
106
  def deepfakes_spec_predict(input_audio):
107
- audio = preprocess_audio(input_audio)
108
-
109
- #Load audio and multimodal model.
110
- multimodal = load_multimodal_model()
111
- spec_model = load_spec_modality_model()
112
-
113
  spec_grads = spec_model.forward(audio)
114
  multimodal_grads = multimodal.spec_depth[0].forward(spec_grads)
115
 
116
  out = nn.Softmax()(multimodal_grads)
117
  max = torch.argmax(out, dim = -1) #Index of the max value in the tensor.
118
  max_value = out[max] #Actual value of the tensor.
 
119
 
120
  if max_value > 0.5:
121
  preds = round(100 - (max_value*100), 3)
122
- text2 = f"The audio is REAL. \n Deepfakes Confidence: {preds}%"
123
 
124
  else:
125
  preds = round(max_value*100, 3)
126
- text2 = "The audio is FAKE. \n Deepfakes Confidence: {preds}%"
127
 
128
- return max, max_value, text2
129
 
130
  def deepfakes_image_predict(input_image):
131
  face = preprocess_img(input_image)
132
 
133
- #Load image and multimodal model.
134
- multimodal = load_multimodal_model()
135
- img_model = load_img_modality_model()
136
-
137
  img_grads = img_model.forward(face)
138
  multimodal_grads = multimodal.clf_rgb[0].forward(img_grads)
139
 
140
  out = nn.Softmax()(multimodal_grads)
141
- max = torch.argmax(out, dim = -1) #Index of the max value in the tensor.
142
  max_value = out[max] #Actual value of the tensor.
 
143
 
144
  if max_value > 0.5:
145
  preds = round(100 - (max_value*100), 3)
146
- text2 = f"The image is REAL. \n Deepfakes Confidence: {preds}%"
147
 
148
  else:
149
  preds = round(max_value*100, 3)
150
- text2 = "The image is FAKE. \n Deepfakes Confidence: {preds}%"
151
 
152
- return max, max_value, text2
153
 
154
 
155
  def preprocess_video(input_video, n_frames = 5):
@@ -181,9 +183,7 @@ def preprocess_video(input_video, n_frames = 5):
181
  def deepfakes_video_predict(input_video):
182
  '''Perform inference on a video.'''
183
  video_frames = preprocess_video(input_video)
184
- #Load image and multimodal model.
185
- multimodal = load_multimodal_model()
186
- img_model = load_img_modality_model()
187
  real_grads = []
188
  fake_grads = []
189
 
@@ -192,38 +192,19 @@ def deepfakes_video_predict(input_video):
192
  multimodal_grads = multimodal.clf_rgb[0].forward(img_grads)
193
 
194
  out = nn.Softmax()(multimodal_grads)
195
- real_grads.append(out[0].detach().numpy())
196
- fake_grads.append(out[1].detch().numpy())
197
- # max = torch.argmax(out, dim = -1) #Index of the max value in the tensor.
198
- # max_value = out[max] #Actual value of the tensor.
199
 
200
  real_grads_mean = np.mean(real_grads)
201
  fake_grads_mean = np.mean(fake_grads)
202
 
203
  if real_grads_mean > fake_grads_mean:
204
  res = round(real_grads_mean * 100, 3)
205
- text = f"The video is REAL. \n Deepfakes Confidence: {res}%"
206
  else:
207
  res = round(100 - (real_grads_mean * 100), 3)
208
- text = f"The video is FAKE. \n Deepfakes Confidence: {res}%"
209
  return text
210
 
211
-
212
-
213
-
214
-
215
- def cli_main():
216
- parser = argparse.ArgumentParser(description="Train Models")
217
- get_args(parser)
218
- args, remaining_args = parser.parse_known_args()
219
- assert remaining_args == [], remaining_args
220
- # image_multimodal_inference(args)
221
- # spec_multimodal_inference(args)
222
- model_summary(args)
223
-
224
- if __name__ == "__main__":
225
- import warnings
226
- warnings.filterwarnings("ignore")
227
- cli_main()
228
-
229
-
 
5
  import numpy as np
6
  import torch.nn as nn
7
  from models.TMC import ETMC
 
 
8
  from models import image
9
 
10
  #Set random seed for reproducibility.
 
88
  spec_encoder.eval()
89
  return spec_encoder
90
 
91
+
92
+ #Load models.
93
+ parser = argparse.ArgumentParser(description="Train Models")
94
+ get_args(parser)
95
+ args, remaining_args = parser.parse_known_args()
96
+ assert remaining_args == [], remaining_args
97
+
98
+ multimodal = load_multimodal_model(args)
99
+ spec_model = load_spec_modality_model(args)
100
+ img_model = load_img_modality_model(args)
101
+
102
+
103
  def preprocess_img(face):
104
  face = face / 255
105
  face = cv2.resize(face, (256, 256))
106
+ face = face.transpose(2, 0, 1) #(W, H, C) -> (C, W, H)
107
+ face_pt = torch.unsqueeze(torch.Tensor(face), dim = 0)
 
108
  return face_pt
109
 
110
  def preprocess_audio(audio_file):
111
+ audio_pt = torch.unsqueeze(torch.Tensor(audio_file), dim = 0)
 
112
  return audio_pt
113
 
114
  def deepfakes_spec_predict(input_audio):
115
+ x, _ = input_audio
116
+ audio = preprocess_audio(x)
 
 
 
 
117
  spec_grads = spec_model.forward(audio)
118
  multimodal_grads = multimodal.spec_depth[0].forward(spec_grads)
119
 
120
  out = nn.Softmax()(multimodal_grads)
121
  max = torch.argmax(out, dim = -1) #Index of the max value in the tensor.
122
  max_value = out[max] #Actual value of the tensor.
123
+ max_value = np.argmax(out[max].detach().numpy())
124
 
125
  if max_value > 0.5:
126
  preds = round(100 - (max_value*100), 3)
127
+ text2 = f"The audio is REAL."
128
 
129
  else:
130
  preds = round(max_value*100, 3)
131
+ text2 = f"The audio is FAKE."
132
 
133
+ return text2
134
 
135
  def deepfakes_image_predict(input_image):
136
  face = preprocess_img(input_image)
137
 
 
 
 
 
138
  img_grads = img_model.forward(face)
139
  multimodal_grads = multimodal.clf_rgb[0].forward(img_grads)
140
 
141
  out = nn.Softmax()(multimodal_grads)
142
+ max = torch.argmax(out, dim=-1) #Index of the max value in the tensor.
143
  max_value = out[max] #Actual value of the tensor.
144
+ max_value = np.argmax(out[max].detach().numpy())
145
 
146
  if max_value > 0.5:
147
  preds = round(100 - (max_value*100), 3)
148
+ text2 = f"The image is REAL."
149
 
150
  else:
151
  preds = round(max_value*100, 3)
152
+ text2 = f"The image is FAKE."
153
 
154
+ return text2
155
 
156
 
157
  def preprocess_video(input_video, n_frames = 5):
 
183
  def deepfakes_video_predict(input_video):
184
  '''Perform inference on a video.'''
185
  video_frames = preprocess_video(input_video)
186
+
 
 
187
  real_grads = []
188
  fake_grads = []
189
 
 
192
  multimodal_grads = multimodal.clf_rgb[0].forward(img_grads)
193
 
194
  out = nn.Softmax()(multimodal_grads)
195
+ real_grads.append(out.cpu().detach().numpy()[0])
196
+ print(f"Video out tensor shape is: {out.shape}, {out}")
197
+
198
+ fake_grads.append(out.cpu().detach().numpy()[0])
199
 
200
  real_grads_mean = np.mean(real_grads)
201
  fake_grads_mean = np.mean(fake_grads)
202
 
203
  if real_grads_mean > fake_grads_mean:
204
  res = round(real_grads_mean * 100, 3)
205
+ text = f"The video is REAL."
206
  else:
207
  res = round(100 - (real_grads_mean * 100), 3)
208
+ text = f"The video is FAKE."
209
  return text
210
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -6,4 +6,5 @@ moviepy
6
  librosa
7
  ffmpeg
8
  albumentations
9
- opencv-python
 
 
6
  librosa
7
  ffmpeg
8
  albumentations
9
+ opencv-python
10
+ torchsummary