alnaba1 committed on
Commit
7e90fdf
·
1 Parent(s): 361a274

Add gridpartitioning algorithm, documentation.

Browse files
DiverseSelector/dissimilarity_based.py CHANGED
@@ -26,6 +26,8 @@
26
  from DiverseSelector.base import SelectionBase
27
  from DiverseSelector.metric import ComputeDistanceMatrix
28
  import numpy as np
 
 
29
 
30
  __all__ = [
31
  "DissimilaritySelection",
@@ -46,16 +48,41 @@ class DissimilaritySelection(SelectionBase):
46
  arr_dist=None,
47
  method="maxmin",
48
  r=1,
49
- k=2,
 
 
 
50
  **kwargs,
51
  ):
52
- """Base class for dissimilarity based subset selection."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  super().__init__(metric, random_seed, feature_type, mol_file, feature_file, num_selected)
54
  self.initialization = initialization
55
  self.arr_dist = arr_dist
56
  self.method = method
57
  self.r = r
58
  self.k = k
 
 
 
59
  # super(DissimilaritySelection, self).__init__(**kwargs)
60
  self.__dict__.update(kwargs)
61
 
@@ -94,11 +121,29 @@ class DissimilaritySelection(SelectionBase):
94
  pass
95
 
96
  def select(self, dissimilarity_function='brutestrength'):
97
- """Select the subset molecules with optimal diversity.
 
 
 
 
 
 
 
98
 
99
- Algorithm is adapted from https://doi.org/10.1016/S1093-3263(98)80008-9
100
  """
101
  def brutestrength(selected=None, n_selected=self.num_selected, method=self.method):
 
 
 
 
 
 
 
 
 
 
 
 
102
  if selected is None:
103
  selected = [self.starting_idx]
104
  return brutestrength(selected, n_selected, method)
@@ -132,10 +177,131 @@ class DissimilaritySelection(SelectionBase):
132
  else:
133
  raise ValueError(f"Method {method} not supported, choose maxmin or maxsum.")
134
 
135
- def sphereexclusion(selected=None, n_selected=self.num_selected, r=self.r, order=None):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
  if selected is None:
137
  selected = []
138
- return sphereexclusion(selected, n_selected, r, order)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
 
140
  if order is None:
141
  ref = [self.starting_idx]
@@ -146,11 +312,11 @@ class DissimilaritySelection(SelectionBase):
146
  data_point = self.features[idx]
147
  distance_sq = 0
148
  for i, point in enumerate(ref_point):
149
- distance_sq += (ref_point[i] - point) ** 2
150
  distances.append((distance_sq, idx))
151
  distances.sort()
152
  order = [idx for dist, idx in distances]
153
- return sphereexclusion(selected, n_selected, r, order)
154
 
155
  for idx in order:
156
  if len(selected) == 0:
@@ -161,11 +327,11 @@ class DissimilaritySelection(SelectionBase):
161
  data_point = self.features[idx]
162
  selected_point = self.features[selected_idx]
163
  distance_sq = 0
164
- for i, point in enumerate(data_point):
165
- distance_sq += (selected_point[i] - point) ** 2
166
  distances.append(np.sqrt(distance_sq))
167
  min_dist = min(distances)
168
- if min_dist > r:
169
  selected.append(idx)
170
  if len(selected) == n_selected:
171
  return selected
@@ -174,6 +340,20 @@ class DissimilaritySelection(SelectionBase):
174
 
175
  def optisim(selected=None, n_selected=self.num_selected, k=self.k,
176
  r=self.r, recycling=None):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
  if selected is None:
178
  selected = [self.starting_idx]
179
  return optisim(selected, n_selected, k, r, recycling)
@@ -192,7 +372,9 @@ class DissimilaritySelection(SelectionBase):
192
  selected.append(max(zip(subsample.values(), subsample.keys()))[1])
193
  return optisim(selected, n_selected, k, r, recycling)
194
  return selected
195
- index_new = candidates[np.random.randint(0, len(candidates))]
 
 
196
  distances = []
197
  for selected_idx in selected:
198
  data_point = self.features[index_new]
@@ -213,6 +395,7 @@ class DissimilaritySelection(SelectionBase):
213
  return optisim(selected, n_selected, k, r, recycling)
214
 
215
  algorithms = {'brutestrength': brutestrength,
 
216
  'sphereexclusion': sphereexclusion,
217
  'optisim': optisim}
218
  return algorithms[dissimilarity_function]()
 
26
  from DiverseSelector.base import SelectionBase
27
  from DiverseSelector.metric import ComputeDistanceMatrix
28
  import numpy as np
29
+ from sklearn.decomposition import PCA
30
+ from sklearn.preprocessing import StandardScaler
31
 
32
  __all__ = [
33
  "DissimilaritySelection",
 
48
  arr_dist=None,
49
  method="maxmin",
50
  r=1,
51
+ k=10,
52
+ cells=5,
53
+ max_dim=2,
54
+ grid_method="equisized_independent",
55
  **kwargs,
56
  ):
57
+ """DissimilaritySelection Class initialization.
58
+
59
+ Parameters
60
+ ----------
61
+ initialization
62
+ metric
63
+ random_seed
64
+ feature_type
65
+ mol_file
66
+ feature_file
67
+ num_selected
68
+ arr_dist
69
+ method
70
+ r
71
+ k
72
+ cells
73
+ max_dim
74
+ grid_method
75
+ kwargs
76
+ """
77
  super().__init__(metric, random_seed, feature_type, mol_file, feature_file, num_selected)
78
  self.initialization = initialization
79
  self.arr_dist = arr_dist
80
  self.method = method
81
  self.r = r
82
  self.k = k
83
+ self.cells = cells
84
+ self.max_dim = max_dim
85
+ self.grid_method = grid_method
86
  # super(DissimilaritySelection, self).__init__(**kwargs)
87
  self.__dict__.update(kwargs)
88
 
 
121
  pass
122
 
123
  def select(self, dissimilarity_function='brutestrength'):
124
+ """Select method containing all dissimilarity algorithms.
125
+
126
+ Parameters
127
+ ----------
128
+ dissimilarity_function
129
+
130
+ Returns
131
+ -------
132
 
 
133
  """
134
  def brutestrength(selected=None, n_selected=self.num_selected, method=self.method):
135
+ """Brute Strength dissimilarity algorithm with maxmin and maxsum methods.
136
+
137
+ Parameters
138
+ ----------
139
+ selected
140
+ n_selected
141
+ method
142
+
143
+ Returns
144
+ -------
145
+
146
+ """
147
  if selected is None:
148
  selected = [self.starting_idx]
149
  return brutestrength(selected, n_selected, method)
 
177
  else:
178
  raise ValueError(f"Method {method} not supported, choose maxmin or maxsum.")
179
 
180
def gridpartitioning(selected=None, n_selected=self.num_selected, cells=self.cells, max_dim=self.max_dim,
                     array=self.features, grid_method=self.grid_method):
    """Select points by binning them on a grid and sampling every occupied cell.

    Each axis is split into ``cells`` equally sized intervals. Points that fall
    into the same combination of intervals share a bin, and the bins are then
    visited round-robin, drawing one random member from each non-empty bin
    per pass.

    Parameters
    ----------
    selected : list or None
        Indices that are already selected. The list is extended in place;
        ``None`` starts from an empty list.
    n_selected : int
        Target number of selected indices. The target is only checked between
        complete passes over the bins, so the result can contain more than
        ``n_selected`` entries.
    cells : int
        Number of intervals per axis.
    max_dim : int
        Maximum number of dimensions that are gridded directly. Data with
        more columns are standardized and projected onto ``max_dim``
        principal components first.
    array : array-like
        Two-dimensional feature matrix, one row per point.
    grid_method : str
        ``"equisized_independent"`` grids every axis over the full data range;
        ``"equisized_dependent"`` grids each axis separately inside the bins
        produced by the previous axes.

    Returns
    -------
    list
        The selected row indices (the same object as ``selected`` when given).

    Raises
    ------
    NotImplementedError
        For the ``"equifrequent_independent"`` and ``"equifrequent_dependent"``
        methods.
    ValueError
        For any other unknown ``grid_method``.
    """
    def _axis_info(values):
        """Return (minimum, maximum, cell length) for the values on one axis."""
        axis_min, axis_max = min(values), max(values)
        return axis_min, axis_max, (axis_max - axis_min) / cells

    def _bin_index(value, axis_min, axis_max, cell_length):
        """Return the 0-based cell index of ``value`` on one axis."""
        if value == axis_min:
            return 0
        if value == axis_max:
            return cells - 1
        # Clamp so that float rounding can never yield an index equal to ``cells``.
        return min(int((value - axis_min) // cell_length), cells - 1)

    if selected is None:
        selected = []

    # Column slicing below needs an ndarray, so accept plain nested lists too.
    array = np.asarray(array)
    data_dim = len(array[0])
    if data_dim > max_dim:
        norm_data = StandardScaler().fit_transform(array)
        array = PCA(n_components=max_dim).fit_transform(norm_data)
        data_dim = len(array[0])

    if grid_method == "equisized_independent":
        axis_info = [_axis_info(array[:, i]) for i in range(data_dim)]
        bins = {}
        for index, point in enumerate(array):
            point_bin = tuple(_bin_index(value, *axis_info[dim]) for dim, value in enumerate(point))
            bins.setdefault(point_bin, []).append(index)

    elif grid_method == "equisized_dependent":
        # Start from one bin holding every point and refine it axis by axis;
        # each existing bin gets its own min/max and cell length on the next axis.
        bins = {(): list(range(len(array)))}
        for i in range(data_dim):
            new_bins = {}
            for bin_idx, members in bins.items():
                info = _axis_info(array[members, i])
                for point_idx in members:
                    point_bin = bin_idx + (_bin_index(array[point_idx][i], *info),)
                    new_bins.setdefault(point_bin, []).append(point_idx)
            bins = new_bins

    elif grid_method in ("equifrequent_independent", "equifrequent_dependent"):
        # NotImplemented is a comparison sentinel and is not callable;
        # NotImplementedError is the exception type.
        raise NotImplementedError(f"{grid_method} not implemented.")
    else:
        raise ValueError(f"{grid_method} not a valid method")

    # Seed from the selector's own random_seed, as optisim does.
    rng = np.random.default_rng(seed=self.random_seed)
    while len(selected) < n_selected:
        picked_any = False
        for members in bins.values():
            if members:
                position = rng.integers(low=0, high=len(members), size=1)[0]
                selected.append(members.pop(position))
                picked_any = True
        # Every bin is exhausted: stop even if the target was not reached.
        if not picked_any:
            break
    return selected
287
+
288
+ def sphereexclusion(selected=None, n_selected=12, s_max=1, order=None):
289
+ """Directed sphere exclusion dissimilarity algorithm.
290
+
291
+ Parameters
292
+ ----------
293
+ selected
294
+ n_selected
295
+ s_max
296
+ order
297
+
298
+ Returns
299
+ -------
300
+
301
+ """
302
+ if selected is None:
303
+ selected = []
304
+ return sphereexclusion(selected, n_selected, s_max, order)
305
 
306
  if order is None:
307
  ref = [self.starting_idx]
 
312
  data_point = self.features[idx]
313
  distance_sq = 0
314
  for i, point in enumerate(ref_point):
315
+ distance_sq += (point - data_point[i]) ** 2
316
  distances.append((distance_sq, idx))
317
  distances.sort()
318
  order = [idx for dist, idx in distances]
319
+ return sphereexclusion(selected, n_selected, s_max, order)
320
 
321
  for idx in order:
322
  if len(selected) == 0:
 
327
  data_point = self.features[idx]
328
  selected_point = self.features[selected_idx]
329
  distance_sq = 0
330
+ for i in range(len(data_point)):
331
+ distance_sq += (selected_point[i] - data_point[i]) ** 2
332
  distances.append(np.sqrt(distance_sq))
333
  min_dist = min(distances)
334
+ if min_dist > s_max:
335
  selected.append(idx)
336
  if len(selected) == n_selected:
337
  return selected
 
340
 
341
  def optisim(selected=None, n_selected=self.num_selected, k=self.k,
342
  r=self.r, recycling=None):
343
+ """Optisim dissimilarity algorithm.
344
+
345
+ Parameters
346
+ ----------
347
+ selected
348
+ n_selected
349
+ k
350
+ r
351
+ recycling
352
+
353
+ Returns
354
+ -------
355
+
356
+ """
357
  if selected is None:
358
  selected = [self.starting_idx]
359
  return optisim(selected, n_selected, k, r, recycling)
 
372
  selected.append(max(zip(subsample.values(), subsample.keys()))[1])
373
  return optisim(selected, n_selected, k, r, recycling)
374
  return selected
375
+ rng = np.random.default_rng(seed=self.random_seed)
376
+ random_int = rng.integers(low=0, high=len(candidates), size=1)[0]
377
+ index_new = candidates[random_int]
378
  distances = []
379
  for selected_idx in selected:
380
  data_point = self.features[index_new]
 
395
  return optisim(selected, n_selected, k, r, recycling)
396
 
397
  algorithms = {'brutestrength': brutestrength,
398
+ 'gridpartitioning': gridpartitioning,
399
  'sphereexclusion': sphereexclusion,
400
  'optisim': optisim}
401
  return algorithms[dissimilarity_function]()
DiverseSelector/test/test_DissimilarityBased.py CHANGED
@@ -29,29 +29,84 @@ from DiverseSelector import DissimilaritySelection
29
  from DiverseSelector.test.common import generate_synthetic_data
30
  from numpy.testing import assert_equal
31
 
 
 
 
 
 
 
32
 
33
- def test_minmax_selector_3_100():
34
- """Testing the MinMax selection algorithm with predefined starting point."""
35
- # in the function name:
36
- # 3 means that the number of clusters is 3
37
- # 100 means that the total number of data points is 100
38
- _, class_labels, arr_dist = generate_synthetic_data(n_samples=100,
39
- n_features=2,
40
- n_clusters=3,
41
- pairwise_dist=True,
42
- metric="euclidean",
43
- random_state=42)
44
- model = DissimilaritySelection(num_selected=12,
45
  arr_dist=arr_dist,
46
  random_seed=42)
47
- model.starting_idx = 0
48
- selected = model.select()
 
49
 
50
  # make sure all the selected indices are the same with expectation
51
- assert_equal([0, 94, 3, 50, 64, 85, 93, 83, 34, 59, 49, 72], selected)
52
 
53
- # make sure the number of selected molecules is correct in each cluster
54
- selected_labels_count = Counter(class_labels[selected])
55
- assert_equal(selected_labels_count[0], 4)
56
- assert_equal(selected_labels_count[1], 4)
57
- assert_equal(selected_labels_count[2], 4)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  from DiverseSelector.test.common import generate_synthetic_data
30
  from numpy.testing import assert_equal
31
 
32
# Synthetic data set shared by every test in this module: 100 samples with
# 2 features in a single cluster, plus the pairwise Euclidean distances.
coords, class_labels, arr_dist = generate_synthetic_data(
    n_samples=100,
    n_features=2,
    n_clusters=1,
    pairwise_dist=True,
    metric="euclidean",
    random_state=42,
)
38
 
39
def test_brutestrength_maxmin():
    """Check the indices picked by brutestrength with the default maxmin method."""
    expected_ids = [0, 57, 95, 41, 67, 26, 3, 16, 12, 6, 62, 48]

    model = DissimilaritySelection(num_selected=12, arr_dist=arr_dist, random_seed=42)
    model.starting_idx = 0
    model.features = coords

    # select() without an argument runs the 'brutestrength' algorithm
    assert_equal(expected_ids, model.select())
50
 
51
def test_brutestrength_maxsum():
    """Check the indices picked by brutestrength with the maxsum method."""
    expected_ids = [0, 57, 25, 41, 95, 9, 8, 21, 13, 68, 37, 54]

    model = DissimilaritySelection(
        num_selected=12,
        arr_dist=arr_dist,
        random_seed=42,
        method="maxsum",
    )
    model.starting_idx = 0
    model.features = coords

    # select() without an argument runs the 'brutestrength' algorithm
    assert_equal(expected_ids, model.select())
63
+
64
def test_gridpartitioning_equisized_independent():
    """Check the indices picked by gridpartitioning with the default equisized independent grid."""
    # The expected list holds 20 indices although num_selected is 12.
    expected_ids = [15, 87, 70, 66, 49, 68, 8, 22, 10, 13, 19, 44, 76, 72, 25, 84, 73, 57, 65, 86]

    model = DissimilaritySelection(num_selected=12, arr_dist=arr_dist, random_seed=42)
    model.starting_idx = 0
    model.features = coords

    assert_equal(expected_ids, model.select("gridpartitioning"))
75
+
76
def test_gridpartitioning_equisized_dependent():
    """Check the indices picked by gridpartitioning with the equisized dependent grid."""
    # The expected list holds 23 indices although num_selected is 12.
    expected_ids = [
        0, 87, 68, 59, 50, 79, 4, 41, 30, 33, 71, 98,
        73, 80, 65, 19, 10, 25, 55, 54, 37, 57, 86,
    ]

    model = DissimilaritySelection(
        num_selected=12,
        arr_dist=arr_dist,
        random_seed=42,
        grid_method="equisized_dependent",
    )
    model.starting_idx = 0
    model.features = coords

    assert_equal(expected_ids, model.select("gridpartitioning"))
89
+
90
def test_sphereexclusion():
    """Check the indices picked by the sphereexclusion algorithm."""
    # The expected list holds 11 indices although num_selected is 12.
    expected_ids = [17, 31, 90, 6, 12, 76, 26, 81, 2, 14, 57]

    model = DissimilaritySelection(num_selected=12, arr_dist=arr_dist, random_seed=42)
    model.starting_idx = 0
    model.features = coords

    assert_equal(expected_ids, model.select("sphereexclusion"))
101
+
102
def test_optisim():
    """Check the indices picked by the optisim algorithm."""
    # The expected list holds 10 indices although num_selected is 12.
    expected_ids = [0, 13, 21, 9, 8, 18, 57, 39, 65, 25]

    model = DissimilaritySelection(num_selected=12, arr_dist=arr_dist, random_seed=42)
    model.starting_idx = 0
    model.features = coords

    assert_equal(expected_ids, model.select("optisim"))