|
|
|
|
|
|
|
|
|
import sys |
|
import io |
|
import json |
|
|
|
|
|
def linearize_tripleset(tripleset):
    """Flatten the triples of one example into a single source string.

    Args:
        tripleset: Iterable of 3-item sequences ``(subject, relation,
            object)``. The relation must be a string, because it is
            lower-cased; subject and object are formatted as-is.

    Returns:
        The triples rendered as ``subj : rela : obj`` and joined with
        ``' | '``. An empty tripleset yields ``''``.

    Raises:
        ValueError: If a triple does not contain exactly three items.
    """
    return ' | '.join(
        '{} : {} : {}'.format(subj, rela.lower(), obj)
        for subj, rela, obj in tripleset
    )


def build_records(examples):
    """Expand the parsed examples into one record per annotation.

    Args:
        examples: Iterable of mappings, each with a ``'tripleset'`` entry
            (see ``linearize_tripleset``) and an ``'annotations'`` entry
            that is an iterable of mappings carrying a ``'text'`` value.

    Returns:
        A ``(records, unique_src)`` tuple. ``records`` is a list of
        ``{'context': ..., 'completion': ...}`` dicts in input order, where
        every annotation of an example shares that example's linearized
        triples as its context. ``unique_src`` is the number of examples
        seen, including examples that have no annotations.
    """
    records = []
    unique_src = 0
    for example in examples:
        context = linearize_tripleset(example['tripleset'])
        unique_src += 1
        for sent in example['annotations']:
            # Insertion order (context, completion) fixes the key order of
            # the serialized JSON line.
            records.append({'context': context, 'completion': sent['text']})
    return records, unique_src


def main(argv=None):
    """Convert the JSON file at ``argv[1]`` into JSON lines at ``argv[2]``.

    Args:
        argv: Argument vector laid out like ``sys.argv`` (program name
            first). Defaults to ``sys.argv``. Arguments past the second
            path are ignored.

    Prints ``unique source is <count>`` to stdout, then writes one JSON
    object per line to the output file. Exits with a usage message when
    fewer than two paths are supplied.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 3:
        prog = argv[0] if argv else 'script'
        sys.exit('usage: {} INPUT_JSON OUTPUT_JSONL'.format(prog))

    with open(argv[1], 'r', encoding='utf8') as reader:
        examples = json.load(reader)

    records, unique_src = build_records(examples)
    print('unique source is', unique_src)

    # The output is opened only after the input has been read and converted,
    # so a malformed input does not leave a truncated output file behind.
    with open(argv[2], 'w', encoding='utf8') as writer:
        writer.writelines(json.dumps(record) + '\n' for record in records)


if __name__ == '__main__':
    main()