feiyang-cai committed · a7655fc
1 Parent(s): 62cd725
upload utils

utils.py CHANGED
@@ -15,28 +15,54 @@ import pickle
 from sklearn import preprocessing
 import json
 import spaces
+import time
+
+class calculateDuration:
+    def __init__(self, activity_name=""):
+        self.activity_name = activity_name
+
+    def __enter__(self):
+        self.start_time = time.time()
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.end_time = time.time()
+        self.elapsed_time = self.end_time - self.start_time
+        if self.activity_name:
+            print(f"Elapsed time for {self.activity_name}: {self.elapsed_time:.6f} seconds")
+        else:
+            print(f"Elapsed time: {self.elapsed_time:.6f} seconds")
+
 
 from rdkit import RDLogger, Chem
 # Suppress RDKit INFO messages
 RDLogger.DisableLog('rdApp.*')
 
 # we have a dictionary to store the task types of the models
-task_types = {
-    "admet_bioavailability_ma": "classification",
-    "admet_ppbr_az": "regression",
-}
+#task_types = {
+#    "admet_bioavailability_ma": "classification",
+#    "admet_ppbr_az": "regression",
+#    "admet_half_life_obach": "regression",
+#}
 
 # read the dataset descriptions
 with open("dataset_descriptions.json", "r") as f:
     dataset_description_temp = json.load(f)
 
 dataset_descriptions = dict()
+dataset_property_names = dict()
+dataset_task_types = dict()
+dataset_property_names_to_dataset = dict()
 
 for dataset in dataset_description_temp:
     dataset_name = dataset.lower()
     dataset_descriptions[dataset_name] = \
-        f"{dataset_description_temp[dataset]['task_name']} is a {dataset_description_temp[dataset]['task_type']} task, " + \
-        f"where the goal is to {dataset_description_temp[dataset]['description']}."
+        f"{dataset_description_temp[dataset]['task_name']} is a {dataset_description_temp[dataset]['task_type']} task, " + \
+        f"where the goal is to {dataset_description_temp[dataset]['description']}. \n" + \
+        f"More information can be found at {dataset_description_temp[dataset]['url']}."
+    dataset_property_names[dataset_name] = dataset_description_temp[dataset]['task_name']
+    dataset_property_names_to_dataset[dataset_description_temp[dataset]['task_name']] = dataset_name
+    dataset_task_types[dataset_name] = dataset_description_temp[dataset]['task_type']
 
 class Scaler:
     def __init__(self, log=False):
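Reviewer note: calculateDuration is a plain context manager, so any block in the Space can be timed by wrapping it in a with statement. A minimal usage sketch, assuming utils.py is importable as a module; the sleep is a stand-in for real work:

import time
from utils import calculateDuration

with calculateDuration("toy workload"):
    time.sleep(0.25)  # stand-in for tokenization or a forward pass
# prints: Elapsed time for toy workload: 0.25... seconds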
@@ -114,33 +140,32 @@ class DataCollator(object):
         return self.sme.augment([molecule])[0]
 
     def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
-
-        targets = []
-
+        with calculateDuration("DataCollator"):
+            sources = []
+
+            for example in instances:
+                smiles = example['smiles'].strip()
+                smiles = Chem.MolToSmiles(Chem.MolFromSmiles(smiles))
+
+                # get the properties except the smiles and mol_id cols
+                #props = [example[col] if example[col] is not None else np.nan for col in sorted(example.keys()) if col not in ['smiles', 'is_aug']]
+                source = f"{self.molecule_start_str}{smiles}{self.end_str}"
+                sources.append(source)
+
+            # Tokenize
+            tokenized_sources_with_prompt = self.tokenizer(
+                sources,
+                max_length=self.source_max_len,
+                truncation=True,
+                add_special_tokens=False,
+            )
+            input_ids = [torch.tensor(tokenized_source) for tokenized_source in tokenized_sources_with_prompt['input_ids']]
+            input_ids = pad_sequence(input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id)
+
+            data_dict = {
+                'input_ids': input_ids,
+                'attention_mask': input_ids.ne(self.tokenizer.pad_token_id),
+            }
 
         return data_dict
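The rewritten __call__ canonicalizes each SMILES via Chem.MolToSmiles(Chem.MolFromSmiles(...)), wraps it in the molecule prompt, tokenizes without special tokens, right-pads the batch, and derives the attention mask from the pad id. A self-contained sketch of the padding step with toy values (pad_token_id = 0 is an assumption; the real value comes from self.tokenizer):

import torch
from torch.nn.utils.rnn import pad_sequence

pad_token_id = 0  # assumed toy value
batch = [torch.tensor([5, 6, 7]), torch.tensor([8, 9])]
input_ids = pad_sequence(batch, batch_first=True, padding_value=pad_token_id)
attention_mask = input_ids.ne(pad_token_id)
print(input_ids)       # tensor([[5, 6, 7], [8, 9, 0]])
print(attention_mask)  # tensor([[ True,  True,  True], [ True,  True, False]])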
@@ -198,7 +223,11 @@ class MolecularPropertyPredictionModel():
             adapter_id = candidate_models[adapter_name]
             print(f"loading {adapter_name} from {adapter_id}...")
             self.base_model.load_adapter(adapter_id, adapter_name=adapter_name, token = os.environ.get("TOKEN"))
-
+            try:
+                self.apapter_scaler_path[adapter_name] = hf_hub_download(adapter_id, filename="scaler.pkl", token = os.environ.get("TOKEN"))
+            except:
+                self.apapter_scaler_path[adapter_name] = None
+                assert dataset_task_types[adapter_name] == "classification", f"{adapter_name} is not a classification task."
 
         #self.base_model.to("cuda")
         #print(self.base_model)
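The new try/except encodes a convention rather than an error: regression adapters ship a scaler.pkl alongside their weights, classification adapters do not, so a failed download simply means "no scaler". The same pattern in isolation, with a hypothetical repo id:

import os
import pickle
from huggingface_hub import hf_hub_download

try:
    # "user/example-adapter" is a placeholder, not a real repo
    path = hf_hub_download("user/example-adapter", filename="scaler.pkl",
                           token=os.environ.get("TOKEN"))
    scaler = pickle.load(open(path, "rb"))
except Exception:
    scaler = None  # no scaler.pkl, so treat the task as classification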
@@ -209,131 +238,105 @@ class MolecularPropertyPredictionModel():
         # switched: adapter is switched successfully
         # error: adapter is not found
 
-
-        # return "error"
+        with calculateDuration("switching adapter"):
+            if adapter_name == self.adapter_name:
+                return "keep"
+            # switch adapter
+            try:
+                #self.adapter_name = adapter_name
+                #print(self.adapter_name, adapter_id)
+                #self.lora_model = PeftModel.from_pretrained(self.base_model, adapter_id, token = os.environ.get("TOKEN"))
+                #self.lora_model.to("cuda")
+                #print(self.lora_model)
+
+                self.base_model.set_adapter(adapter_name)
+                self.base_model.eval()
+
+                #if adapter_name not in self.apapter_scaler_path:
+                #    self.apapter_scaler_path[adapter_name] = hf_hub_download(adapter_id, filename="scaler.pkl", token = os.environ.get("TOKEN"))
+                if self.apapter_scaler_path[adapter_name] and os.path.exists(self.apapter_scaler_path[adapter_name]):
+                    self.scaler = pickle.load(open(self.apapter_scaler_path[adapter_name], "rb"))
+                else:
+                    self.scaler = None
+
+                self.adapter_name = adapter_name
+
+                return "switched"
+            except Exception as e:
+                # handle error
+                return "error"
 
-    @spaces.GPU(duration=
+    @spaces.GPU(duration=20)
     def predict(self, valid_df, task_type):
-        test_dataset = Dataset.from_pandas(valid_df)
-        # construct the dataloader
-        test_loader = torch.utils.data.DataLoader(
-            test_dataset,
-            batch_size=32,
-            collate_fn=self.data_collator,
-        )
-
-        return y_pred
-
-    @spaces.GPU(duration=60)
-    def predict_long(self, valid_df, task_type):
-        test_dataset = Dataset.from_pandas(valid_df)
-        # construct the dataloader
-        test_loader = torch.utils.data.DataLoader(
-            test_dataset,
-            batch_size=32,
-            collate_fn=self.data_collator,
-        )
-
-        # predict
-        y_pred = []
-        for i, batch in tqdm(enumerate(test_loader), total=len(test_loader), desc="Evaluating"):
-            with torch.no_grad():
-                batch = {k: v.to(self.base_model.device) for k, v in batch.items()}
-                outputs = self.base_model(**batch)
-                if task_type == "regression": # TODO: check if the model is regression or classification
-                    y_pred.append(outputs.logits.cpu().detach().numpy())
-                else:
-                    y_pred.append((torch.sigmoid(outputs.logits) > 0.5).cpu().detach().numpy())
-
-        y_pred = np.concatenate(y_pred, axis=0)
-        if task_type=="regression" and self.scaler is not None:
-            y_pred = self.scaler.inverse_transform(y_pred)
+        with calculateDuration("predicting"):
+            with calculateDuration("construct dataloader"):
+                test_dataset = Dataset.from_pandas(valid_df)
+                # construct the dataloader
+                test_loader = torch.utils.data.DataLoader(
+                    test_dataset,
+                    batch_size=16,
+                    collate_fn=self.data_collator,
+                )
+
+            # predict
+            y_pred = []
+            for i, batch in tqdm(enumerate(test_loader), total=len(test_loader), desc="Evaluating"):
+                with torch.no_grad():
+                    batch = {k: v.to(self.base_model.device) for k, v in batch.items()}
+                    outputs = self.base_model(**batch)
+                    if task_type == "regression": # TODO: check if the model is regression or classification
+                        y_pred.append(outputs.logits.cpu().detach().numpy())
+                    else:
+                        y_pred.append((torch.sigmoid(outputs.logits)).cpu().detach().numpy())
+
+            y_pred = np.concatenate(y_pred, axis=0)
+            if task_type=="regression" and self.scaler is not None:
+                y_pred = self.scaler.inverse_transform(y_pred)
 
 
         return y_pred
 
     def predict_single_smiles(self, smiles, task_type):
-
+        with calculateDuration("predicting a single SMILES"):
+            assert task_type in ["regression", "classification"]
 
+            # check the SMILES string is valid
+            if not Chem.MolFromSmiles(smiles):
+                return None
 
+            valid_df = pd.DataFrame([smiles], columns=['smiles'])
+            results = self.predict(valid_df, task_type)
+            # predict
         return results.item()
 
     def predict_file(self, df, task_type):
-
+        with calculateDuration("predicting a file"):
+            # we should add the index first
+            df = df.reset_index()
+
+            with calculateDuration("pre-checking SMILES"):
+                # we need to check that the SMILES strings are valid; the invalid ones are moved to the end
+                valid_idx = []
+                invalid_idx = []
+                for idx, smiles in enumerate(df['smiles']):
+                    if Chem.MolFromSmiles(smiles):
+                        valid_idx.append(idx)
+                    else:
+                        invalid_idx.append(idx)
+            valid_df = df.loc[valid_idx]
+            # get the smiles list
+            valid_df_smiles = valid_df['smiles'].tolist()
+
+            input_df = pd.DataFrame(valid_df_smiles, columns=['smiles'])
+            results = self.predict(input_df, task_type)
+
+            # add the results to the dataframe
+            df.loc[valid_idx, 'prediction'] = results
+            df.loc[invalid_idx, 'prediction'] = np.nan
+
+            # drop the index column
+            df = df.drop(columns=['index'])
+
+            # parse file
+            return df
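predict_file keeps rows with invalid SMILES instead of dropping them: every row is pre-screened with RDKit, only valid rows go through predict, and invalid rows come back with prediction = NaN. A standalone sketch of that bookkeeping, with a constant standing in for the model call:

import numpy as np
import pandas as pd
from rdkit import Chem

df = pd.DataFrame({"smiles": ["CCO", "not_a_smiles", "c1ccccc1"]}).reset_index()
valid_idx = [i for i, s in enumerate(df["smiles"]) if Chem.MolFromSmiles(s)]
invalid_idx = [i for i in range(len(df)) if i not in valid_idx]
df.loc[valid_idx, "prediction"] = 0.5  # stand-in for self.predict(...)
df.loc[invalid_idx, "prediction"] = np.nan
df = df.drop(columns=["index"])
print(df)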
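One behavioral change worth flagging: for classification, the new predict appends sigmoid probabilities, while the deleted predict_long appended hard 0/1 labels via a 0.5 threshold. Callers that need labels must now threshold themselves, for example:

import torch

logits = torch.tensor([[-1.2], [0.3]])
probs = torch.sigmoid(logits)   # what the new predict returns
labels = (probs > 0.5).float()  # what the old predict_long returned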