yjwtheonly committed on
Commit
f402b50
·
1 Parent(s): ab18435

modifications

Browse files
DiseaseAgnostic/edge_to_abstract.py CHANGED
@@ -162,51 +162,12 @@ if args.mode == 'sentence':
162
 
163
  with open(f'generate_abstract/{args.init_mode}{args.reasonable_rate}_sentence.json', 'w') as fl:
164
  json.dump(single_sentence, fl, indent=4)
165
- # with open('generate_abstract/test.txt', 'w') as fl:
166
- # fl.write('\n'.join(test_text))
167
- # with open('generate_abstract/dp.txt', 'w') as fl:
168
- # fl.write('\n'.join(test_dp))
169
  with open (f'generate_abstract/path/{args.init_mode}{args.reasonable_rate}_path.json', 'w') as fl:
170
  fl.write('\n'.join(test_dp))
171
  with open (f'generate_abstract/path/{args.init_mode}{args.reasonable_rate}_temp.json', 'w') as fl:
172
  fl.write('\n'.join(test_text))
173
 
174
- elif args.mode == 'biogpt':
175
- pass
176
- # from biogpt_generate import GPT_eval
177
- # import spacy
178
-
179
- # model = GPT_eval(args.seed)
180
-
181
- # nlp = spacy.load("en_core_web_sm")
182
- # with open(f'generate_abstract/{args.target_split}_{args.reasonable_rate}_sentence.json', 'r') as fl:
183
- # data = json.load(fl)
184
-
185
- # KK = []
186
- # input = []
187
- # for i,(k, v) in enumerate(data.items()):
188
- # KK.append(k)
189
- # input.append(v)
190
- # output = model.eval(input)
191
-
192
- # ret = {}
193
- # for i, o in enumerate(output):
194
-
195
- # o = o.replace('<|abstract|>', '')
196
- # doc = nlp(o)
197
- # sen_list = []
198
- # sen_set = set()
199
- # for sen in doc.sents:
200
- # txt = sen.text
201
- # if not (txt.lower() in sen_set):
202
- # sen_set.add(txt.lower())
203
- # sen_list.append(txt)
204
- # O = ' '.join(sen_list)
205
- # ret[KK[i]] = {'in' : input[i], 'out' : O}
206
-
207
- # with open(f'generate_abstract/{args.target_split}_{args.reasonable_rate}_biogpt.json', 'w') as fl:
208
- # json.dump(ret, fl, indent=4)
209
-
210
  elif args.mode == 'finetune':
211
 
212
  import spacy
@@ -260,34 +221,6 @@ elif args.mode == 'finetune':
260
  vec[i] = True
261
  return vec, span
262
 
263
- # def mask_func(tokenized_sen, position):
264
-
265
- # if len(tokenized_sen) == 0:
266
- # return []
267
- # token_list = []
268
- # # for sen in tokenized_sen:
269
- # # for token in sen:
270
- # # token_list.append(token)
271
- # for sen in tokenized_sen:
272
- # token_list += sen.text.split(' ')
273
- # l_p = 0
274
- # r_p = 1
275
- # assert position == 'front' or position == 'back'
276
- # if position == 'back':
277
- # l_p, r_p = r_p, l_p
278
- # P = np.linspace(start = l_p, stop = r_p, num = len(token_list))
279
- # P = (P ** 3) * 0.4
280
-
281
- # ret_list = []
282
- # for t, p in zip(token_list, list(P)):
283
- # if '.' in t or '(' in t or ')' in t or '[' in t or ']' in t:
284
- # ret_list.append(t)
285
- # else:
286
- # if np.random.rand() < p:
287
- # ret_list.append('<mask>')
288
- # else:
289
- # ret_list.append(t)
290
- # return [' '.join(ret_list)]
291
  def mask_func(tokenized_sen):
292
 
293
  if len(tokenized_sen) == 0:
@@ -441,11 +374,7 @@ elif args.mode == 'finetune':
441
  ret = {}
442
  case_study = {}
443
  p_ret = {}
444
- add = 0
445
  dpath_i = 0
446
- inner_better = 0
447
- outter_better = 0
448
- better_than_gpt = 0
449
  for i,(k, v) in enumerate(tqdm(draft.items())):
450
 
451
  span = ret_candidates[str(i)]['span']
@@ -573,80 +502,26 @@ elif args.mode == 'finetune':
573
  log_Loss = log_Loss[:old_L]
574
  # sen_list = sen_list[:old_L]
575
 
576
- # mini_span should be preserved
577
- # for j in range(len(log_Loss)):
578
- # doc = nlp(sen_list[j])
579
- # sens = [sen.text for sen in doc.sents]
580
- # Len = len(sen_list)
581
- # check_text = ' '.join(sens[j : max(0,len(sens) - Len) + j + 1])
582
- # if span not in check_text:
583
- # log_Loss[j] += 1
584
-
585
  p = np.argmin(log_Loss)
586
- if p < old_L // 2:
587
- inner_better += 1
588
- else:
589
- outter_better += 1
590
  content = []
591
  for i in range(len(real_log_Loss)):
592
  content.append([sen_list[i], str(real_log_Loss[i])])
593
  scored[k] = {'path':path_text, 'prompt': prompt, 'in':input, 's':text_s, 'o':text_o, 'out': content, 'bound': boundary}
594
  p_p = p
595
- # print('Old_L:', old_L)
596
 
597
  if real_log_Loss[p] > real_log_Loss[p+1+old_L]:
598
  p_p = p+1+old_L
599
- if real_log_Loss[p] > real_log_Loss[p+1+old_L]:
600
- add += 1
601
 
602
- if real_log_Loss[p] < real_log_Loss[old_L]:
603
- better_than_gpt += 1
604
- else:
605
  if real_log_Loss[p] > real_log_Loss[p+1+old_L]:
606
  p = p+1+old_L
607
  # case_study[k] = {'path':path_text, 'entity_0': text_s, 'entity_1': text_o, 'GPT_in': input, 'Prompt': prompt, 'GPT_out': {'text': output, 'perplexity': str(np.exp(real_log_Loss[old_L]))}, 'BART_in': BART_in[p], 'BART_out': {'text': sen_list[p], 'perplexity': str(np.exp(real_log_Loss[p]))}, 'Assist': {'text': Assist[p], 'perplexity': str(np.exp(real_log_Loss[p+1+old_L]))}}
608
  ret[k] = {'prompt': prompt, 'in':input, 'out': sen_list[p]}
609
- p_ret[k] = {'prompt': prompt, 'in':input, 'out': sen_list[p_p]}
610
- print(add)
611
- print('inner_better:', inner_better)
612
- print('outter_better:', outter_better)
613
- print('better_than_gpt:', better_than_gpt)
614
- print('better_than_replace', add)
615
  with open(f'generate_abstract/{args.init_mode}{args.reasonable_rate}{args.ratio}_bioBART_finetune.json', 'w') as fl:
616
  json.dump(ret, fl, indent=4)
617
- # with open(f'generate_abstract/bioBART/case_{args.target_split}_{args.reasonable_rate}_bioBART_finetune.json', 'w') as fl:
618
- # json.dump(case_study, fl, indent=4)
619
  with open(f'generate_abstract/bioBART/{args.init_mode}{args.reasonable_rate}{args.ratio}_scored.json', 'w') as fl:
620
  json.dump(scored, fl, indent=4)
621
- with open(f'generate_abstract/bioBART/{args.init_mode}{args.reasonable_rate}{args.ratio}_perplexity.json', 'w') as fl:
622
- json.dump(p_ret, fl, indent=4)
623
-
624
- # with open(Parameters.GNBRfile+'original_entity_raw_name', 'rb') as fl:
625
- # full_entity_raw_name = pkl.load(fl)
626
- # for k, v in entity_raw_name.items():
627
- # assert v in full_entity_raw_name[k]
628
-
629
- # nlp = spacy.load("en_core_web_sm")
630
- # type_set = set()
631
- # for aa in range(36):
632
- # dependency_sen_dict = retieve_sentence_through_edgetype[aa]['manual']
633
- # tmp_dict = retieve_sentence_through_edgetype[aa]['auto']
634
- # dependencys = list(dependency_sen_dict.keys()) + list(tmp_dict.keys())
635
- # for dependency in dependencys:
636
- # dep_list = dependency.split(' ')
637
- # for sub_dep in dep_list:
638
- # sub_dep_list = sub_dep.split('|')
639
- # assert(len(sub_dep_list) == 3)
640
- # type_set.add(sub_dep_list[1])
641
-
642
- # fine_dict = {}
643
- # for k, v_dict in draft.items():
644
-
645
- # input = v_dict['in']
646
- # output = v_dict['out']
647
- # fine_dict[k] = {'in':input, 'out': input + ' ' + output}
648
-
649
- # with open(f'generate_abstract/{args.target_split}_{args.reasonable_rate}_sentence_finetune.json', 'w') as fl:
650
- # json.dump(fine_dict, fl, indent=4)
651
  else:
652
  raise Exception('Wrong mode !!')
 
162
 
163
  with open(f'generate_abstract/{args.init_mode}{args.reasonable_rate}_sentence.json', 'w') as fl:
164
  json.dump(single_sentence, fl, indent=4)
165
+
 
 
 
166
  with open (f'generate_abstract/path/{args.init_mode}{args.reasonable_rate}_path.json', 'w') as fl:
167
  fl.write('\n'.join(test_dp))
168
  with open (f'generate_abstract/path/{args.init_mode}{args.reasonable_rate}_temp.json', 'w') as fl:
169
  fl.write('\n'.join(test_text))
170
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
  elif args.mode == 'finetune':
172
 
173
  import spacy
 
221
  vec[i] = True
222
  return vec, span
223
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
224
  def mask_func(tokenized_sen):
225
 
226
  if len(tokenized_sen) == 0:
 
374
  ret = {}
375
  case_study = {}
376
  p_ret = {}
 
377
  dpath_i = 0
 
 
 
378
  for i,(k, v) in enumerate(tqdm(draft.items())):
379
 
380
  span = ret_candidates[str(i)]['span']
 
502
  log_Loss = log_Loss[:old_L]
503
  # sen_list = sen_list[:old_L]
504
 
 
 
 
 
 
 
 
 
 
505
  p = np.argmin(log_Loss)
 
 
 
 
506
  content = []
507
  for i in range(len(real_log_Loss)):
508
  content.append([sen_list[i], str(real_log_Loss[i])])
509
  scored[k] = {'path':path_text, 'prompt': prompt, 'in':input, 's':text_s, 'o':text_o, 'out': content, 'bound': boundary}
510
  p_p = p
 
511
 
512
  if real_log_Loss[p] > real_log_Loss[p+1+old_L]:
513
  p_p = p+1+old_L
 
 
514
 
515
+ if real_log_Loss[p] > real_log_Loss[old_L]:
 
 
516
  if real_log_Loss[p] > real_log_Loss[p+1+old_L]:
517
  p = p+1+old_L
518
  # case_study[k] = {'path':path_text, 'entity_0': text_s, 'entity_1': text_o, 'GPT_in': input, 'Prompt': prompt, 'GPT_out': {'text': output, 'perplexity': str(np.exp(real_log_Loss[old_L]))}, 'BART_in': BART_in[p], 'BART_out': {'text': sen_list[p], 'perplexity': str(np.exp(real_log_Loss[p]))}, 'Assist': {'text': Assist[p], 'perplexity': str(np.exp(real_log_Loss[p+1+old_L]))}}
519
  ret[k] = {'prompt': prompt, 'in':input, 'out': sen_list[p]}
520
+
 
 
 
 
 
521
  with open(f'generate_abstract/{args.init_mode}{args.reasonable_rate}{args.ratio}_bioBART_finetune.json', 'w') as fl:
522
  json.dump(ret, fl, indent=4)
 
 
523
  with open(f'generate_abstract/bioBART/{args.init_mode}{args.reasonable_rate}{args.ratio}_scored.json', 'w') as fl:
524
  json.dump(scored, fl, indent=4)
525
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
526
  else:
527
  raise Exception('Wrong mode !!')
DiseaseAgnostic/generate_abstract/random0.7_bioBART_finetune.json ADDED
The diff for this file is too large to render. See raw diff
 
DiseaseAgnostic/processed_data/attack_edge_distmult_0.7random.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b0ccfbd4d67a60aeef746e45f3e322612f92d2f4ee28f4fe645a84f8284a226
3
+ size 4014
DiseaseAgnostic/processed_data/target_0.7random.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ceda69a4136eb899e6ef21a7ff56ac00d75eac71b056181e7d816532f041634
3
+ size 1214