feiyang-cai committed
Commit a7655fc · 1 Parent(s): 62cd725

upload utils

Files changed (1)
  1. utils.py (+149, -146)
utils.py CHANGED
@@ -15,28 +15,54 @@ import pickle
 from sklearn import preprocessing
 import json
 import spaces
+import time
+
+class calculateDuration:
+    def __init__(self, activity_name=""):
+        self.activity_name = activity_name
+
+    def __enter__(self):
+        self.start_time = time.time()
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.end_time = time.time()
+        self.elapsed_time = self.end_time - self.start_time
+        if self.activity_name:
+            print(f"Elapsed time for {self.activity_name}: {self.elapsed_time:.6f} seconds")
+        else:
+            print(f"Elapsed time: {self.elapsed_time:.6f} seconds")
+
 
 from rdkit import RDLogger, Chem
 # Suppress RDKit INFO messages
 RDLogger.DisableLog('rdApp.*')
 
 # we have a dictionary to store the task types of the models
-task_types = {
-    "admet_ppbr_az": "regression",
-    "admet_half_life_obach": "regression",
-}
+#task_types = {
+#    "admet_bioavailability_ma": "classification",
+#    "admet_ppbr_az": "regression",
+#    "admet_half_life_obach": "regression",
+#}
 
 # read the dataset descriptions
 with open("dataset_descriptions.json", "r") as f:
     dataset_description_temp = json.load(f)
 
 dataset_descriptions = dict()
+dataset_property_names = dict()
+dataset_task_types = dict()
+dataset_property_names_to_dataset = dict()
 
 for dataset in dataset_description_temp:
     dataset_name = dataset.lower()
     dataset_descriptions[dataset_name] = \
-        f"{dataset_name} is a {dataset_description_temp[dataset]['task_type']} task, " + \
-        f"where the goal is to {dataset_description_temp[dataset]['description']}."
+        f"{dataset_description_temp[dataset]['task_name']} is a {dataset_description_temp[dataset]['task_type']} task, " + \
+        f"where the goal is to {dataset_description_temp[dataset]['description']}. \n" + \
+        f"More information can be found at {dataset_description_temp[dataset]['url']}."
+    dataset_property_names[dataset_name] = dataset_description_temp[dataset]['task_name']
+    dataset_property_names_to_dataset[dataset_description_temp[dataset]['task_name']] = dataset_name
+    dataset_task_types[dataset_name] = dataset_description_temp[dataset]['task_type']
 
 class Scaler:
     def __init__(self, log=False):
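Note for reviewers: the `calculateDuration` helper added above is a plain context manager, so any block in this file can be timed by wrapping it in a `with` statement. A minimal self-contained sketch of the behavior, condensed to the named branch (the toy workload is invented for illustration):

```python
import time

class calculateDuration:
    # condensed sketch of the helper added in this commit (named branch only)
    def __init__(self, activity_name=""):
        self.activity_name = activity_name

    def __enter__(self):
        self.start_time = time.time()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.elapsed_time = time.time() - self.start_time
        print(f"Elapsed time for {self.activity_name}: {self.elapsed_time:.6f} seconds")

# toy workload, invented for illustration
with calculateDuration("toy workload"):
    time.sleep(0.1)
# prints something like: Elapsed time for toy workload: 0.100123 seconds
```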
@@ -114,33 +140,32 @@ class DataCollator(object):
         return self.sme.augment([molecule])[0]
 
     def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
-
-        sources = []
-        targets = []
+        with calculateDuration("DataCollator"):
+            sources = []
 
-        for example in instances:
-            smiles = example['smiles'].strip()
-            smiles = Chem.MolToSmiles(Chem.MolFromSmiles(smiles))
-
-            # get the properties except the smiles and mol_id cols
-            #props = [example[col] if example[col] is not None else np.nan for col in sorted(example.keys()) if col not in ['smiles', 'is_aug']]
-            source = f"{self.molecule_start_str}{smiles}{self.end_str}"
-            sources.append(source)
+            for example in instances:
+                smiles = example['smiles'].strip()
+                smiles = Chem.MolToSmiles(Chem.MolFromSmiles(smiles))
+
+                # get the properties except the smiles and mol_id cols
+                #props = [example[col] if example[col] is not None else np.nan for col in sorted(example.keys()) if col not in ['smiles', 'is_aug']]
+                source = f"{self.molecule_start_str}{smiles}{self.end_str}"
+                sources.append(source)
 
-        # Tokenize
-        tokenized_sources_with_prompt = self.tokenizer(
-            sources,
-            max_length=self.source_max_len,
-            truncation=True,
-            add_special_tokens=False,
-        )
-        input_ids = [torch.tensor(tokenized_source) for tokenized_source in tokenized_sources_with_prompt['input_ids']]
-        input_ids = pad_sequence(input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id)
-
-        data_dict = {
-            'input_ids': input_ids,
-            'attention_mask': input_ids.ne(self.tokenizer.pad_token_id),
-        }
+            # Tokenize
+            tokenized_sources_with_prompt = self.tokenizer(
+                sources,
+                max_length=self.source_max_len,
+                truncation=True,
+                add_special_tokens=False,
+            )
+            input_ids = [torch.tensor(tokenized_source) for tokenized_source in tokenized_sources_with_prompt['input_ids']]
+            input_ids = pad_sequence(input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id)
+
+            data_dict = {
+                'input_ids': input_ids,
+                'attention_mask': input_ids.ne(self.tokenizer.pad_token_id),
+            }
 
         return data_dict
 
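The collator hunk mostly re-indents the existing logic under the timer (and drops the unused `targets` list), so its output is unchanged: variable-length token lists are right-padded, and the attention mask is recovered by comparing against the pad id. A standalone sketch of that padding step, with a pad id of 0 standing in for `self.tokenizer.pad_token_id`:

```python
import torch
from torch.nn.utils.rnn import pad_sequence

pad_token_id = 0  # stand-in for self.tokenizer.pad_token_id
input_ids = [torch.tensor([5, 6, 7]), torch.tensor([8, 9])]

# right-pad every sequence to the longest one in the batch
batch = pad_sequence(input_ids, batch_first=True, padding_value=pad_token_id)
# real tokens are True, padding positions are False
attention_mask = batch.ne(pad_token_id)

print(batch)           # tensor([[5, 6, 7], [8, 9, 0]])
print(attention_mask)  # tensor([[True, True, True], [True, True, False]])
```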
 
@@ -198,7 +223,11 @@ class MolecularPropertyPredictionModel():
         adapter_id = candidate_models[adapter_name]
         print(f"loading {adapter_name} from {adapter_id}...")
         self.base_model.load_adapter(adapter_id, adapter_name=adapter_name, token = os.environ.get("TOKEN"))
-        self.apapter_scaler_path[adapter_name] = hf_hub_download(adapter_id, filename="scaler.pkl", token = os.environ.get("TOKEN"))
+        try:
+            self.apapter_scaler_path[adapter_name] = hf_hub_download(adapter_id, filename="scaler.pkl", token = os.environ.get("TOKEN"))
+        except Exception:
+            self.apapter_scaler_path[adapter_name] = None
+            assert dataset_task_types[adapter_name] == "classification", f"{adapter_name} has no scaler but is not a classification task."
 
         #self.base_model.to("cuda")
         #print(self.base_model)
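The new fallback treats a missing `scaler.pkl` as the marker of a classification adapter (regression adapters ship one for un-scaling predictions). A sketch of the same idea with a narrower exception type; `EntryNotFoundError` is, to the best of my knowledge, what `hf_hub_download` raises for a file absent from a repo, and the helper name is invented:

```python
from huggingface_hub import hf_hub_download
from huggingface_hub.utils import EntryNotFoundError

def fetch_scaler_path(adapter_id, token=None):
    # returns the local path of scaler.pkl, or None if the adapter ships none
    try:
        return hf_hub_download(adapter_id, filename="scaler.pkl", token=token)
    except EntryNotFoundError:
        # regression adapters ship a scaler; classification adapters do not
        return None
```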
@@ -209,131 +238,105 @@
         # switched: adapter is switched successfully
         # error: adapter is not found
 
-        if adapter_name == self.adapter_name:
-            return "keep"
-        # switch adapter
-        #try:
-        #self.adapter_name = adapter_name
-        #print(self.adapter_name, adapter_id)
-        #self.lora_model = PeftModel.from_pretrained(self.base_model, adapter_id, token = os.environ.get("TOKEN"))
-        #self.lora_model.to("cuda")
-        #print(self.lora_model)
-
-        self.base_model.set_adapter(adapter_name)
-        self.base_model.eval()
-        print(f"switch to {adapter_name} successfully")
-
-        #if adapter_name not in self.apapter_scaler_path:
-        #    self.apapter_scaler_path[adapter_name] = hf_hub_download(adapter_id, filename="scaler.pkl", token = os.environ.get("TOKEN"))
-        if os.path.exists(self.apapter_scaler_path[adapter_name]):
-            self.scaler = pickle.load(open(self.apapter_scaler_path[adapter_name], "rb"))
-        else:
-            self.scaler = None
-
-        self.adapter_name = adapter_name
-
-        return "switched"
-        #except Exception as e:
-        #    print(e)
-        #    # handle error
-        #    return "error"
+        with calculateDuration("switching adapter"):
+            if adapter_name == self.adapter_name:
+                return "keep"
+            # switch adapter
+            try:
+                #self.adapter_name = adapter_name
+                #print(self.adapter_name, adapter_id)
+                #self.lora_model = PeftModel.from_pretrained(self.base_model, adapter_id, token = os.environ.get("TOKEN"))
+                #self.lora_model.to("cuda")
+                #print(self.lora_model)
+
+                self.base_model.set_adapter(adapter_name)
+                self.base_model.eval()
+
+                #if adapter_name not in self.apapter_scaler_path:
+                #    self.apapter_scaler_path[adapter_name] = hf_hub_download(adapter_id, filename="scaler.pkl", token = os.environ.get("TOKEN"))
+                if self.apapter_scaler_path[adapter_name] and os.path.exists(self.apapter_scaler_path[adapter_name]):
+                    self.scaler = pickle.load(open(self.apapter_scaler_path[adapter_name], "rb"))
+                else:
+                    self.scaler = None
+
+                self.adapter_name = adapter_name
+
+                return "switched"
+            except Exception as e:
+                # handle error
+                return "error"
 
-    @spaces.GPU(duration=10)
+    @spaces.GPU(duration=20)
     def predict(self, valid_df, task_type):
-        test_dataset = Dataset.from_pandas(valid_df)
-        # construct the dataloader
-        test_loader = torch.utils.data.DataLoader(
-            test_dataset,
-            batch_size=32,
-            collate_fn=self.data_collator,
-        )
 
-        # predict
-        y_pred = []
-        for i, batch in tqdm(enumerate(test_loader), total=len(test_loader), desc="Evaluating"):
-            with torch.no_grad():
-                batch = {k: v.to(self.base_model.device) for k, v in batch.items()}
-                outputs = self.base_model(**batch)
-                if task_type == "regression": # TODO: check if the model is regression or classification
-                    y_pred.append(outputs.logits.cpu().detach().numpy())
-                else:
-                    y_pred.append((torch.sigmoid(outputs.logits) > 0.5).cpu().detach().numpy())
+        with calculateDuration("predicting"):
+            with calculateDuration("construct dataloader"):
+                test_dataset = Dataset.from_pandas(valid_df)
+                # construct the dataloader
+                test_loader = torch.utils.data.DataLoader(
+                    test_dataset,
+                    batch_size=16,
+                    collate_fn=self.data_collator,
+                )
+
+            # predict
+            y_pred = []
+            for i, batch in tqdm(enumerate(test_loader), total=len(test_loader), desc="Evaluating"):
+                with torch.no_grad():
+                    batch = {k: v.to(self.base_model.device) for k, v in batch.items()}
+                    outputs = self.base_model(**batch)
+                    if task_type == "regression": # TODO: check if the model is regression or classification
+                        y_pred.append(outputs.logits.cpu().detach().numpy())
+                    else:
+                        y_pred.append((torch.sigmoid(outputs.logits)).cpu().detach().numpy())
 
-        y_pred = np.concatenate(y_pred, axis=0)
-        if task_type=="regression" and self.scaler is not None:
-            y_pred = self.scaler.inverse_transform(y_pred)
-
-
-        return y_pred
-
-    @spaces.GPU(duration=60)
-    def predict_long(self, valid_df, task_type):
-        test_dataset = Dataset.from_pandas(valid_df)
-        # construct the dataloader
-        test_loader = torch.utils.data.DataLoader(
-            test_dataset,
-            batch_size=32,
-            collate_fn=self.data_collator,
-        )
-
-        # predict
-        y_pred = []
-        for i, batch in tqdm(enumerate(test_loader), total=len(test_loader), desc="Evaluating"):
-            with torch.no_grad():
-                batch = {k: v.to(self.base_model.device) for k, v in batch.items()}
-                outputs = self.base_model(**batch)
-                if task_type == "regression": # TODO: check if the model is regression or classification
-                    y_pred.append(outputs.logits.cpu().detach().numpy())
-                else:
-                    y_pred.append((torch.sigmoid(outputs.logits) > 0.5).cpu().detach().numpy())
-
-        y_pred = np.concatenate(y_pred, axis=0)
-        if task_type=="regression" and self.scaler is not None:
-            y_pred = self.scaler.inverse_transform(y_pred)
+            y_pred = np.concatenate(y_pred, axis=0)
+            if task_type=="regression" and self.scaler is not None:
+                y_pred = self.scaler.inverse_transform(y_pred)
 
 
         return y_pred
 
     def predict_single_smiles(self, smiles, task_type):
-        assert task_type in ["regression", "classification"]
+        with calculateDuration("predicting a single SMILES"):
+            assert task_type in ["regression", "classification"]
 
-        # check the SMILES string is valid
-        if not Chem.MolFromSmiles(smiles):
-            return None
+            # check the SMILES string is valid
+            if not Chem.MolFromSmiles(smiles):
+                return None
 
-        valid_df = pd.DataFrame([smiles], columns=['smiles'])
-        results = self.predict(valid_df, task_type)
-        # predict
+            valid_df = pd.DataFrame([smiles], columns=['smiles'])
+            results = self.predict(valid_df, task_type)
+            # predict
         return results.item()
 
     def predict_file(self, df, task_type):
-        # we should add the index first
-        df = df.reset_index()
-        # we need to check the SMILES strings are valid, the invalid ones will be moved to the last
-        valid_idx = []
-        invalid_idx = []
-        for idx, smiles in enumerate(df['smiles']):
-            if Chem.MolFromSmiles(smiles):
-                valid_idx.append(idx)
-            else:
-                invalid_idx.append(idx)
-        valid_df = df.loc[valid_idx]
-        # get the smiles list
-        valid_df_smiles = valid_df['smiles'].tolist()
-
-        input_df = pd.DataFrame(valid_df_smiles, columns=['smiles'])
-        results = self.predict_long(input_df, task_type)
-
-        # add the results to the dataframe
-        df.loc[valid_idx, 'prediction'] = results
-        df.loc[invalid_idx, 'prediction'] = np.nan
-
-        # drop the index column
-        df = df.drop(columns=['index'])
-
-        # phrase file
-        return df
-
-
-
-
+        with calculateDuration("predicting a file"):
+            # we should add the index first
+            df = df.reset_index()
+
+            with calculateDuration("pre-checking SMILES"):
+                # check that the SMILES strings are valid; invalid ones get NaN predictions
+                valid_idx = []
+                invalid_idx = []
+                for idx, smiles in enumerate(df['smiles']):
+                    if Chem.MolFromSmiles(smiles):
+                        valid_idx.append(idx)
+                    else:
+                        invalid_idx.append(idx)
+                valid_df = df.loc[valid_idx]
+                # get the smiles list
+                valid_df_smiles = valid_df['smiles'].tolist()
+
+            input_df = pd.DataFrame(valid_df_smiles, columns=['smiles'])
+            results = self.predict(input_df, task_type)
+
+            # add the results to the dataframe
+            df.loc[valid_idx, 'prediction'] = results
+            df.loc[invalid_idx, 'prediction'] = np.nan
+
+            # drop the index column
+            df = df.drop(columns=['index'])
+
+            # parse file
+            return df
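One behavioral change worth flagging in `predict`: classification outputs are now raw sigmoid probabilities rather than thresholded labels (the `> 0.5` comparison was dropped). A minimal illustration of the difference:

```python
import torch

logits = torch.tensor([[-1.2], [0.3], [2.0]])

probs = torch.sigmoid(logits)         # new behavior: probabilities in (0, 1)
labels = torch.sigmoid(logits) > 0.5  # old behavior: hard True/False labels

print(probs.squeeze())   # tensor([0.2315, 0.5744, 0.8808])
print(labels.squeeze())  # tensor([False,  True,  True])
```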
 
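Finally, the lookup tables built in the first hunk assume every entry in dataset_descriptions.json carries `task_name`, `task_type`, `description`, and `url` fields. A toy entry showing the mappings they produce (field values invented for illustration):

```python
# a single invented entry, mirroring the expected dataset_descriptions.json schema
dataset_description_temp = {
    "ADMET_PPBR_AZ": {
        "task_name": "PPBR",
        "task_type": "regression",
        "description": "predict the plasma protein binding rate",
        "url": "https://example.org/ppbr",
    }
}

dataset_property_names = dict()
dataset_task_types = dict()
dataset_property_names_to_dataset = dict()

for dataset in dataset_description_temp:
    dataset_name = dataset.lower()
    dataset_property_names[dataset_name] = dataset_description_temp[dataset]["task_name"]
    dataset_property_names_to_dataset[dataset_description_temp[dataset]["task_name"]] = dataset_name
    dataset_task_types[dataset_name] = dataset_description_temp[dataset]["task_type"]

print(dataset_property_names)             # {'admet_ppbr_az': 'PPBR'}
print(dataset_property_names_to_dataset)  # {'PPBR': 'admet_ppbr_az'}
print(dataset_task_types)                 # {'admet_ppbr_az': 'regression'}
```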