fatmacankara committed d955409
Parent(s): 26f6c21
Update code/pdb_featureVector.py

code/pdb_featureVector.py  +0 -18  CHANGED
@@ -60,7 +60,6 @@ def pdb(input_set, mode, impute):
     data = clean_data(input_set)
     path_to_input_files, path_to_output_files, path_to_domains, fisher_path, path_to_interfaces, buffer = manage_files(mode)
     out_path = path_to_output_files / 'log.txt'
-    st.write(out_path)
     sys.stdout = open(out_path, 'w')
     print('Creating directories...')

@@ -226,24 +225,18 @@ def pdb(input_set, mode, impute):
     existing_pdb = [str(i) for i in existing_pdb]
     existing_pdb = [i.split('/')[-1].split('.')[0].lower() for i in existing_pdb]
     cnt = 0
-    st.write('existing_pdb', existing_pdb)
     for search in pdbs:
-        st.write('PDBS', search)

         try:
             if search.lower() not in existing_pdb:
-                st.write(Path(path_to_output_files / 'pdb_structures'))
                 file = pdbl.retrieve_pdb_file(search, pdir=Path(path_to_output_files / 'pdb_structures'), file_format="pdb")
-                st.write(file)
             else:
                 print('PDB structure file exists..')
             for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")):
-                st.write('filename', filename)
                 filename_replace_ext = filename.with_suffix(".pdb")
                 filename.rename(filename_replace_ext)

             file = Path(path_to_output_files / 'pdb_structures' / f'{search}.pdb')
-            st.write('file', file)

             base = os.path.splitext(str(file))[0]
             base = '/'.join(base.split('/')[0:-1]) + '/pdb' + base.split('/')[-1]
@@ -253,7 +246,6 @@ def pdb(input_set, mode, impute):
             resolution_method = parser.get_structure(search, file)
             for record in SeqIO.parse(file, "pdb-seqres"):
                 if record.dbxrefs[0].split(':')[0] == 'UNP':
-                    st.write('RECORD', record)
                     pdb_fasta.at[index, 'pdbID'] = record.id.split(':')[0]
                     pdb_fasta.at[index, 'chain'] = record.id.split(':')[1]
                     pdb_fasta.at[index, 'pdbSequence'] = str(record.seq)
@@ -263,7 +255,6 @@ def pdb(input_set, mode, impute):
                     pdb_info.at[index, 'resolution'] = resolution_method.header['resolution']
                     index += 1
         except:
-            st.write('ERROR INDEX')
             IndexError
             pdb_info.at[index, 'uniprotID'] = 'nan'
             pdb_info.at[index, 'pdbID'] = 'nan'
@@ -288,13 +279,10 @@ def pdb(input_set, mode, impute):
             FileNotFoundError

     uniprot_matched = pd.merge(uniprot_matched, pdb_info, on='uniprotID', how='left')
-    st.write('pdb_info', pdb_info)
     uniprot_matched = uniprot_matched.astype(str)
     uniprot_matched = uniprot_matched.drop_duplicates()
-    st.write('pdb_fasta', pdb_fasta)
     uniprot_matched = uniprot_matched.merge(pdb_fasta, on=['pdbID', 'chain'], how='left')
     uniprot_matched = uniprot_matched.astype(str)
-    st.write('uniprot_matched', uniprot_matched)

     with_pdb = uniprot_matched[(uniprot_matched.pdbID != 'nan') & (
             (uniprot_matched.resolution != 'nan') & (uniprot_matched.resolution != 'OT') & (
@@ -304,12 +292,10 @@ def pdb(input_set, mode, impute):
             uniprot_matched.resolution == 'None'))]
     no_pdb = no_pdb[~no_pdb.datapoint.isin(with_pdb.datapoint.to_list())]
     no_pdb.drop(columns=['chain', 'pdbID', 'pdbSequence', 'resolution'], inplace=True)
-    st.write('with_pdb', with_pdb)
     print(
         'PDB Information successfully added...\nPDB structures are found for %d of %d.\n%d of %d failed to match with PDB structure.\n'
         % (len(with_pdb.drop_duplicates(['datapoint'])), len(uniprot_matched.drop_duplicates(['datapoint'])),
            len(no_pdb.drop_duplicates(['datapoint'])), len(uniprot_matched.drop_duplicates(['datapoint']))))
-    st.write('with_pdb1', with_pdb)

     with_pdb = with_pdb.sort_values(['uniprotID', 'resolution'], axis=0, ascending=True)
     with_pdb = with_pdb.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'pdbSequence'], keep='first')
@@ -388,7 +374,6 @@ def pdb(input_set, mode, impute):
     # Isoform matches, i.e. labelled as i, isoform sequences will be aligned with PDB sequences.
     with_pdb['uniprotSequence'] = with_pdb['uniprotSequence'].str.replace('U', 'C')
     with_pdb['pdbSequence'] = with_pdb['pdbSequence'].str.replace('U', 'C')
-    st.write('with_pdb2', with_pdb)

     dfM = with_pdb[with_pdb.wt_sequence_match == 'm']
     dfM = dfM.sort_values(['uniprotID', 'resolution'], axis=0, ascending=True)
@@ -415,13 +400,11 @@ def pdb(input_set, mode, impute):
     existing_pdb = None
     with_pdb_size = len(with_pdb.drop_duplicates(['datapoint']))
     with_pdb = None
-    st.write('dfM', dfM)


     print('Aligning sequences...\n')
     aligned_m = final_stage(dfM, annotation_list, Path(path_to_output_files / 'alignment_files'))
     aligned_nm = final_stage(dfNM, annotation_list, Path(path_to_output_files / 'alignment_files'))
-    st.write('aligned_m', aligned_m)
     # When PDB sequence is nan, it is wrongly aligned to the UniProt sequence. Fix them.
     for i in aligned_m.index:
         if aligned_m.at[i, 'pdbSequence'] == 'nan':
@@ -463,7 +446,6 @@ def pdb(input_set, mode, impute):
     yes_pdb_no_match = after_up_pdb_alignment[
         (after_up_pdb_alignment.pdbID != 'nan') & (after_up_pdb_alignment.mutationPositionOnPDB == 'nan')]
     no_pdb = no_pdb.copy()
-    st.write('pdb_aligned', pdb_aligned)


     print('PDB matching is completed...\n')
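All 18 deleted lines are Streamlit st.write(...) debug calls. As a minimal, hypothetical sketch (not part of this commit; the output directory and the existing_pdb value below are illustrative stand-ins for the script's own variables), such diagnostics could instead be routed through Python's standard logging module so they end up in a log file rather than in the Streamlit page:

import logging
from pathlib import Path

# Illustrative output directory, standing in for path_to_output_files in the script.
path_to_output_files = Path('out')
path_to_output_files.mkdir(parents=True, exist_ok=True)

# Write DEBUG-level messages to a log file instead of the Streamlit UI.
logging.basicConfig(filename=path_to_output_files / 'log.txt', level=logging.DEBUG)
logger = logging.getLogger('pdb_featureVector')

# Illustrative value standing in for the existing_pdb list built in the diff above.
existing_pdb = ['1abc', '2xyz']

# Instead of st.write('existing_pdb', existing_pdb):
logger.debug('existing_pdb: %s', existing_pdb)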