Denys Rozumnyi committed
Commit · ca70147
Parent(s): 5ee3f67

update

Browse files:
- geom_solver.py +59 -42
- testing.ipynb +44 -24
geom_solver.py
CHANGED
@@ -8,6 +8,7 @@ import itertools
 import torch
 from pytorch3d.renderer import PerspectiveCameras
 from hoho.color_mappings import gestalt_color_mapping
+from PIL import Image
 
 def my_empty_solution():
     return np.zeros((18,3)), [(0, 0)]
@@ -56,7 +57,8 @@ class GeomSolver(object):
 
             in_this_image = np.array([cki in p.image_ids for p in self.points3D.values()])
             uv = torch.round(self.pyt_cameras[ki].transform_points(self.verts)[:, :2]).cpu().numpy().astype(int)
-            uv_inl = (uv[:, 0] >= 0) * (uv[:, 1] >= 0) * (uv[:, 0] < self.width) * (uv[:, 1] < self.height) * in_this_image
+            height, width = dist.shape
+            uv_inl = (uv[:, 0] >= 0) * (uv[:, 1] >= 0) * (uv[:, 0] < width) * (uv[:, 1] < height) * in_this_image
             proj_uv.append((uv, uv_inl))
             uv = uv[uv_inl]
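The replacement bounds check above derives the valid pixel range from the per-image distance map rather than a single global width/height. A minimal self-contained sketch of that masking pattern, assuming made-up data (`dist` and `uv` mirror the names in the diff; the values here are illustrative only):

import numpy as np

# Illustrative stand-ins: a 4x6 per-image map and a few projected (x, y) pixel coords.
dist = np.zeros((4, 6))                           # per-image map, shape (H, W)
uv = np.array([[1, 2], [5, 3], [6, 3], [-1, 0]])  # projected points as (x, y)

height, width = dist.shape                        # per-image size, as in the new code
uv_inl = (uv[:, 0] >= 0) & (uv[:, 1] >= 0) & (uv[:, 0] < width) & (uv[:, 1] < height)
print(uv[uv_inl])                                 # keeps [1, 2] and [5, 3]; drops out-of-bounds points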
@@ -113,32 +115,45 @@ class GeomSolver(object):
     def process_vertices(self):
         human_entry = self.human_entry
 
-        col_cams = [hoho.Rt_to_eye_target(human_entry['…
-        eye, target, up, fov = col_cams[0]
+        col_cams = [hoho.Rt_to_eye_target(Image.new('RGB', (human_entry['cameras'][colmap_img.camera_id].width, human_entry['cameras'][colmap_img.camera_id].height)), to_K(*human_entry['cameras'][colmap_img.camera_id].params), quaternion_to_rotation_matrix(colmap_img.qvec), colmap_img.tvec) for colmap_img in human_entry['images'].values()]
+        # eye, target, up, fov = col_cams[0]
 
         cameras, images, self.points3D = human_entry['cameras'], human_entry['images'], human_entry['points3d']
         colmap_cameras_tf = list(human_entry['images'].keys())
         self.xyz = np.stack([p.xyz for p in self.points3D.values()])
         color = np.stack([p.rgb for p in self.points3D.values()])
         self.gests = [np.array(gest0) for gest0 in human_entry['gestalt']]
-        for ki in range(1, len(self.gests)):
-            if self.gests[ki].shape != self.gests[0].shape:
-                self.gests[ki] = self.gests[ki].transpose(1,0,2)
+        # for ki in range(1, len(self.gests)):
+        #     if self.gests[ki].shape != self.gests[0].shape:
+        #         self.gests[ki] = self.gests[ki].transpose(1,0,2)
+
+        to_camera_ids = np.array([colmap_img.camera_id for colmap_img in human_entry['images'].values()])
 
         gestalt_camcet = np.stack([eye for eye, target, up, fov in itertools.starmap(hoho.Rt_to_eye_target, zip(*[human_entry[k] for k in 'ade20k K R t'.split()]))])
         col_camcet = np.stack([eye for eye, target, up, fov in col_cams])
         self.gestalt_to_colmap_cams = [colmap_cameras_tf[np.argmin(((gcam - col_camcet)**2).sum(1)**0.5)] for gcam in gestalt_camcet]
         self.broken_cams = np.array([np.min(((gcam - col_camcet)**2).sum(1)**0.5) for gcam in gestalt_camcet]) > 300
 
-        self.height, self.width = cameras[1].height, cameras[1].width
         N = len(self.gestalt_to_colmap_cams)
-        K = to_K(*human_entry['cameras'][1].params)[None].repeat(N, 0)
         R = np.stack([quaternion_to_rotation_matrix(human_entry['images'][self.gestalt_to_colmap_cams[ind]].qvec) for ind in range(N)])
         T = np.stack([human_entry['images'][self.gestalt_to_colmap_cams[ind]].tvec for ind in range(N)])
 
         R = np.linalg.inv(R)
-        image_size = torch.Tensor([self.height, self.width]).repeat(N, 1)
+        image_size = []
+        K = []
+        for ind in range(N):
+            cid = to_camera_ids[np.array(colmap_cameras_tf) == self.gestalt_to_colmap_cams[ind]][0]
+            sz = np.array([cameras[cid].height, cameras[cid].width])
+            image_size.append(sz)
+            K.append(to_K(*human_entry['cameras'][cid].params))
+        image_size = np.stack(image_size)
+        K = np.stack(K)
+        # K = to_K(*human_entry['cameras'][1].params)[None].repeat(N, 0)
+        # self.height, self.width = cameras[1].height, cameras[1].width
+        # image_size = torch.Tensor([self.height, self.width]).repeat(N, 1)
         self.pyt_cameras = PerspectiveCameras(device=self.device, R=R, T=T, in_ndc=False, focal_length=K[:, 0, :1], principal_point=K[:, :2, 2], image_size=image_size)
+
+
         self.verts = torch.from_numpy(self.xyz.astype(np.float32)).to(self.device)
 
         centers_apex, assigned_apex = self.cluster_points(['apex'])
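The matching above pairs each gestalt view with the COLMAP camera whose center is nearest in Euclidean distance, and flags a view as broken when no center lies within 300 units. A toy sketch of that lookup, assuming invented coordinates and hypothetical `colmap_ids`:

import numpy as np

gestalt_camcet = np.array([[0.0, 0.0, 10.0], [5.0, 0.0, 10.0]])                 # gestalt camera centers
col_camcet = np.array([[0.1, 0.0, 10.0], [5.2, 0.1, 9.9], [400.0, 0.0, 0.0]])  # COLMAP camera centers
colmap_ids = [7, 12, 31]                                                        # hypothetical COLMAP image ids

dists = np.linalg.norm(gestalt_camcet[:, None] - col_camcet[None], axis=2)      # (n_gestalt, n_colmap)
gestalt_to_colmap = [colmap_ids[j] for j in dists.argmin(1)]                    # nearest COLMAP camera per view
broken = dists.min(1) > 300                                                     # same threshold as the solver
print(gestalt_to_colmap, broken)                                                # [7, 12] [False False]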
@@ -211,9 +226,10 @@ class GeomSolver(object):
             dist = cv2.distanceTransform(1-edge_mask, cv2.DIST_L2, 3)
             per_type_dists[etype] = dist
         edge_dists.append(per_type_dists)
-
+        height, width, _ = gest.shape
+
         uv = torch.round(self.pyt_cameras[ki].transform_points(pyt_centers)[:, :2]).cpu().numpy().astype(int)
-        uv_inl = (uv[:, 0] >= 0) * (uv[:, 1] >= 0) * (uv[:, 0] < self.width) * (uv[:, 1] < self.height)
+        uv_inl = (uv[:, 0] >= 0) * (uv[:, 1] >= 0) * (uv[:, 0] < width) * (uv[:, 1] < height)
        uv = uv[uv_inl]
        uvs.append(uv)
 
@@ -221,37 +237,38 @@ class GeomSolver(object):
         thresholds_min_mean = {0 : [5, 7], 1 : [9, 25], 2: [30, 1000]}
         # thresholds_min_mean = {0 : [1, 7], 1 : [1, 25], 2: [1, 1000]}
         for i in range(pyt_centers.shape[0]):
-            … (old pairwise scoring loop, 31 lines truncated in the diff view)
+            for j in range(i+1, pyt_centers.shape[0]):
+                etype = (self.is_apex[i] + self.is_apex[j])
+
+                points_inter = pyt_centers[i][None] + torch.linspace(0, 1, 20)[:, None].to(self.device) * (pyt_centers[j][None] - pyt_centers[i][None])
+                min_mean_dist = 1000
+                all_dists = []
+                best_ki = -1
+                best_uvi = -1
+                for ki in range(N):
+                    cki = self.gestalt_to_colmap_cams[ki]
+
+                    if not ( (cki in center_visibility[i]) or (cki in center_visibility[j]) ):
+                        continue
+                    if self.broken_cams[ki]:
+                        continue
+
+                    height, width, _ = self.gests[ki].shape
+                    uvi = torch.round(self.pyt_cameras[ki].transform_points(points_inter)[:, :2]).cpu().numpy().astype(int)
+                    if (uvi <= 0).any() or (uvi[:,0] >= width).any() or (uvi[:,1] >= height).any():
+                        continue
+                    mean_dist = edge_dists[ki][etype][uvi[:,1], uvi[:,0]].mean()
+                    all_dists.append(mean_dist)
+                    if mean_dist < min_mean_dist:
+                        min_mean_dist = mean_dist
+                        best_ki = ki
+                        best_uvi = uvi
+
+                if best_ki == -1:
+                    continue
+                ths = thresholds_min_mean[etype]
+                if min_mean_dist < ths[0] and np.mean(all_dists) < ths[1]:
+                    edges.append((i,j))
         if len(edges) == 0:
             edges.append((0, 0))
         return edges
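The rewritten loop above scores a candidate edge by projecting 20 points along the 3D segment into each usable view and averaging a distance transform of the predicted edge mask at those pixels; the edge is kept only if the best view's mean and the overall mean clear per-type thresholds. A compact sketch of that scoring idea on a toy mask (the image size and segment are illustrative, not the solver's):

import numpy as np
import cv2

# Toy edge mask: a horizontal run of edge pixels in a 50x50 image.
edge_mask = np.zeros((50, 50), np.uint8)
edge_mask[25, 10:40] = 1

# Distance to the nearest edge pixel, as in the diff.
dist = cv2.distanceTransform(1 - edge_mask, cv2.DIST_L2, 3)

# Sample 20 points along a candidate segment (y fixed at 25, x from 10 to 39).
xs = np.linspace(10, 39, 20).round().astype(int)
ys = np.full_like(xs, 25)
mean_dist = dist[ys, xs].mean()  # ~0 here: the segment hugs the mask, so it would pass
print(mean_dist)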
testing.ipynb
CHANGED
@@ -176,7 +176,7 @@
    },
    {
     "cell_type": "code",
-    "execution_count": …,
+    "execution_count": 195,
     "id": "88f4fc8f-efa9-404b-9073-c7d4a73f9075",
     "metadata": {},
     "outputs": [
@@ -184,21 +184,38 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "2.… (old metric printout, 15 lines truncated in the diff view)
+      "2.4183324229440974 2.038011551581092\n",
+      "1.9395643500589714 2.0791329825966307\n",
+      "2.815541573372287 2.5366888251094535\n",
+      "2.3254810143936755 1.648047653136293\n",
+      "2.3286533191380765 1.9072141098956248\n",
+      "2.342868026940067 1.9669375395419237\n",
+      "2.04873204164575 1.8261703137595002\n",
+      "2.167980973024546 1.5160824709801253\n",
+      "2.571328055702413 2.020824999345514\n",
+      "2.0127597511603774 2.0591274017651693\n",
+      "1.961080548873729 1.8200081675273143\n",
+      "2.1617889927813057 1.6349180819215263\n",
+      "2.0817899095268078 1.5980454240474442\n",
+      "2.484359575186481 1.9029737257539197\n",
+      "2.2997637652539 1.794924573842486\n",
+      "2.440276048704617 2.5473207175534065\n",
+      "2.135748984421359 1.6773471477819646\n",
+      "2.3955711940075584 2.5549005605155117\n",
+      "2.516982526006783 2.2057496708814113\n",
+      "2.521576698581939 1.615795718071218\n",
+      "1.8495344082304994 1.57835603284867\n",
+      "2.932654420428436 1.8880643234492105\n",
+      "1.9387187366052139 1.6981983307675732\n",
+      "2.135296338517323 1.8054484663488366\n",
+      "2.527322316920579 1.7985373132549314\n",
+      "2.681606928809334 2.2261780476238493\n",
+      "2.4613439446210306 1.6635009702924486\n",
+      "2.3497006297365304 1.815488520909667\n",
+      "2.2946897561795074 1.6075314409425536\n",
+      "2.2292662656029454 2.059612285543195\n",
+      "Averages\n",
+      "2.312343783912538 1.9030379122529486\n"
      ]
     }
    ],
@@ -209,7 +226,7 @@
     "torch.manual_seed(0)\n",
     "# One shard of the dataset\n",
     "dataset = wds.WebDataset(hf_hub_download(repo_id='usm3d/hoho-train-set',\n",
-    "                         filename='data/train/…
+    "                         filename='data/train/hoho_v3_001-of-032.tar.gz',\n",
     "                         repo_type=\"dataset\"))\n",
     "\n",
     "dataset = dataset.decode()\n",
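For context, a minimal sketch of streaming one shard the way the cell above does; the repo id and filename are taken from the notebook itself, and the printed keys are just a peek at the entry structure:

import webdataset as wds
from huggingface_hub import hf_hub_download

# Download (or reuse a cached copy of) one training shard from the Hub.
path = hf_hub_download(repo_id='usm3d/hoho-train-set',
                       filename='data/train/hoho_v3_001-of-032.tar.gz',
                       repo_type='dataset')
dataset = wds.WebDataset(path).decode()
for ki, entry in enumerate(dataset):
    print(ki, sorted(entry.keys())[:5])  # each entry is a dict of decoded fields
    break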
@@ -217,10 +234,10 @@
     "sc0 = []\n",
     "sc = []\n",
     "for ki, entry in enumerate(dataset):\n",
-    "    # if ki < 153: wrong camera ids\n",
-    "…
-    "    # if ki < 162: different cameras and different image sizes\n",
-    "…
+    "    # if ki < 153: # wrong camera ids\n",
+    "    #     continue\n",
+    "    # if ki < 162: # different cameras and different image sizes\n",
+    "    #     continue\n",
     "    solver = GeomSolver()\n",
     "    vertices, edges = solver.solve(entry)\n",
     "    \n",
@@ -258,22 +275,25 @@
    },
    {
     "cell_type": "code",
-    "execution_count": …,
+    "execution_count": 185,
     "id": "a34d0a2b-9986-47cc-8a3f-c15397370c4d",
     "metadata": {},
     "outputs": [
      {
       "data": {
        "text/plain": [
-        "…
+        "0"
        ]
       },
-      "execution_count": …,
+      "execution_count": 185,
       "metadata": {},
       "output_type": "execute_result"
      }
     ],
     "source": [
+     "# human_entry['images'][1]\n",
+     "# # human_entry['cameras'][1].width\n",
+     "# Image.new('RGB', (human_entry['cameras'][1].width, human_entry['cameras'][1].height)).height\n",
      "ki"
     ]
    },
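The stdout block above records two scores per entry and a closing "Averages" line, consistent with the `sc0`/`sc` lists accumulated in the loop. A hedged sketch of that reporting pattern; the per-entry numbers here are placeholders, since the notebook's actual scoring call is not shown in this diff:

import numpy as np

sc0, sc = [], []
for s0, s in [(2.42, 2.04), (1.94, 2.08), (2.82, 2.54)]:  # stand-in per-entry scores
    sc0.append(s0)
    sc.append(s)
    print(s0, s)

print('Averages')
print(np.mean(sc0), np.mean(sc))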