Spaces:
Sleeping
Sleeping
Commit
·
9e94583
1
Parent(s):
42b9925
Update code/pdb_featureVector.py
Browse files- code/pdb_featureVector.py +200 -206
code/pdb_featureVector.py
CHANGED
@@ -1,3 +1,4 @@
|
|
|
|
1 |
# IMPORT NECESSARY MODULES AND LIBRARIES
|
2 |
from timeit import default_timer as timer
|
3 |
import xml.etree.ElementTree as ET
|
@@ -25,13 +26,13 @@ from Bio.PDB import PDBList
|
|
25 |
from Bio import Align
|
26 |
from Bio import SeqIO
|
27 |
from Bio.PDB import *
|
28 |
-
|
29 |
warnings.filterwarnings("ignore")
|
30 |
start = timer()
|
31 |
import streamlit as st
|
32 |
# FUNCTIONS
|
33 |
|
34 |
|
|
|
35 |
# FUNCTIONS
|
36 |
from calc_pc_property import *
|
37 |
from add_domains import *
|
@@ -57,16 +58,14 @@ def pdb(input_set, mode, impute):
|
|
57 |
Add datapoint identifier and remove non-standard input.
|
58 |
"""
|
59 |
data = clean_data(input_set)
|
60 |
-
path_to_input_files, path_to_output_files, path_to_domains, fisher_path, path_to_interfaces, buffer =
|
61 |
-
mode)
|
62 |
out_path = path_to_output_files / 'log.txt'
|
63 |
sys.stdout = open(out_path, 'w')
|
64 |
print('Creating directories...')
|
65 |
|
66 |
annotation_list = ['disulfide', 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
|
67 |
'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis', 'strand',
|
68 |
-
'helix', 'turn', 'metalBinding', 'repeat', 'topologicalDomain', 'caBinding', 'bindingSite',
|
69 |
-
'region',
|
70 |
'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', 'peptide',
|
71 |
'transitPeptide', 'glycosylation', 'propeptide']
|
72 |
|
@@ -141,14 +140,12 @@ def pdb(input_set, mode, impute):
|
|
141 |
if wt == can:
|
142 |
data.at[i, 'wt_sequence_match'] = 'm'
|
143 |
elif wt != can:
|
144 |
-
isoList = isoform_fasta[
|
145 |
-
isoform_fasta['uniprotID'] == data.at[i, 'uniprotID']].isoformSequence.to_list()
|
146 |
for k in isoList:
|
147 |
if len(k) >= int(data.at[i, 'pos']):
|
148 |
resInIso = k[int(int(data.at[i, 'pos']) - 1)]
|
149 |
if wt == resInIso:
|
150 |
-
whichIsoform = isoform_fasta[isoform_fasta.isoformSequence == k].whichIsoform.to_list()[
|
151 |
-
0]
|
152 |
data.at[i, 'wt_sequence_match'] = 'i'
|
153 |
data.at[i, 'whichIsoform'] = whichIsoform
|
154 |
break
|
@@ -193,16 +190,24 @@ def pdb(input_set, mode, impute):
|
|
193 |
for prot in protein:
|
194 |
pdbs.append(get_pdb_ids(prot))
|
195 |
print('PDBs', pdbs)
|
196 |
-
if len(pdbs)
|
197 |
print('pdbs not empty')
|
198 |
pdbs = [item for sublist in pdbs for item in sublist]
|
199 |
print('NEW', pdbs)
|
200 |
else:
|
201 |
print('pdbs empty')
|
202 |
-
pdbs =
|
203 |
print('Processing PDB structures...\n')
|
204 |
if pdbs == []:
|
205 |
print('No PDB structure found for the query. ')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
206 |
print('Starting PDB structures download...\n')
|
207 |
pdbs = list(filter(None, pdbs))
|
208 |
pdbs = (set(pdbs))
|
@@ -214,69 +219,59 @@ def pdb(input_set, mode, impute):
|
|
214 |
try:
|
215 |
shutil.rmtree('obsolete')
|
216 |
except OSError as e:
|
217 |
-
pass
|
218 |
-
|
|
|
|
|
|
|
|
|
219 |
cnt = 0
|
220 |
st.write('this is the pdbs', pdbs)
|
221 |
-
def fetch_uniprot_ids(pdb_code):
|
222 |
-
try:
|
223 |
-
response = requests.get(f"https://www.ebi.ac.uk/pdbe/api/mappings/uniprot/{pdb_code}")
|
224 |
-
response.raise_for_status() # Check for a successful response
|
225 |
-
data = response.json()
|
226 |
-
st.write(list(list(list(data.values())[0].values())[0].keys()))
|
227 |
-
return list(list(list(data.values())[0].values())[0].keys())
|
228 |
-
except :
|
229 |
-
return []
|
230 |
for search in pdbs:
|
231 |
-
|
232 |
-
pdb_url = f"https://files.rcsb.org/download/{search}.pdb"
|
233 |
-
st.write(pdb_url)
|
234 |
try:
|
235 |
-
|
236 |
-
|
237 |
-
|
238 |
-
|
239 |
-
|
240 |
-
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
-
|
252 |
-
|
253 |
-
|
254 |
-
|
255 |
-
|
256 |
-
|
257 |
-
|
258 |
-
|
259 |
-
|
260 |
-
|
261 |
-
|
262 |
-
|
263 |
-
|
264 |
-
|
265 |
-
|
266 |
-
|
267 |
-
|
268 |
-
|
269 |
-
|
270 |
-
|
271 |
-
|
272 |
-
|
273 |
-
|
274 |
-
|
275 |
-
|
276 |
-
|
277 |
print()
|
278 |
-
st.write()
|
279 |
-
st.write(pdb_info)
|
280 |
print('PDB file processing finished..')
|
281 |
for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")):
|
282 |
try:
|
@@ -329,11 +324,13 @@ def pdb(input_set, mode, impute):
|
|
329 |
TypeError
|
330 |
with_pdb.at[i, 'pdbInfo'] = 'nan'
|
331 |
|
332 |
-
with_pdb = with_pdb[['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume',
|
333 |
'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', 'pdbSequence',
|
334 |
'wt_sequence_match',
|
335 |
'whichIsoform', 'pdbID', 'resolution', 'chain', 'pdbInfo', 'datapoint']]
|
336 |
|
|
|
|
|
337 |
# If the query data points are found in no_match_in_uniprot data frame, it will not give any results.
|
338 |
# If the query data points are found in no_pdb data frame, it will be searched in the modbase and swiss_model steps.
|
339 |
# If the query data points are found in with_pdb data frame, it will be searched in the following steps.
|
@@ -347,8 +344,7 @@ def pdb(input_set, mode, impute):
|
|
347 |
if len(with_pdb) > 0:
|
348 |
with_pdb = add_annotations(with_pdb)
|
349 |
else:
|
350 |
-
new_cols = with_pdb.columns.to_list() + ['disulfide', 'intMet', 'intramembrane', 'naturalVariant',
|
351 |
-
'dnaBinding',
|
352 |
'activeSite',
|
353 |
'nucleotideBinding', 'lipidation', 'site', 'transmembrane',
|
354 |
'crosslink', 'mutagenesis', 'strand',
|
@@ -367,7 +363,7 @@ def pdb(input_set, mode, impute):
|
|
367 |
'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
|
368 |
'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
|
369 |
'glycosylationBinary', 'propeptideBinary']
|
370 |
-
with_pdb = pd.DataFrame(columns=new_cols)
|
371 |
try:
|
372 |
with_pdb.whichIsoform = with_pdb.whichIsoform.astype('str')
|
373 |
except:
|
@@ -379,7 +375,7 @@ def pdb(input_set, mode, impute):
|
|
379 |
with_pdb.replace({'[]': 'nan'}, inplace=True)
|
380 |
with_pdb.replace({'nan-nan': 'nan'}, inplace=True)
|
381 |
with_pdb.replace({'': 'nan'}, inplace=True)
|
382 |
-
|
383 |
"""
|
384 |
STEP 7
|
385 |
Do alignment for PDB
|
@@ -411,11 +407,11 @@ def pdb(input_set, mode, impute):
|
|
411 |
pdb_fasta = None
|
412 |
pdb_info = None
|
413 |
pdbs = None
|
414 |
-
|
415 |
-
g_pdb = None
|
416 |
with_pdb_size = len(with_pdb.drop_duplicates(['datapoint']))
|
417 |
with_pdb = None
|
418 |
-
|
|
|
419 |
print('Aligning sequences...\n')
|
420 |
aligned_m = final_stage(dfM, annotation_list, Path(path_to_output_files / 'alignment_files'))
|
421 |
aligned_nm = final_stage(dfNM, annotation_list, Path(path_to_output_files / 'alignment_files'))
|
@@ -438,6 +434,7 @@ def pdb(input_set, mode, impute):
|
|
438 |
aligned_m = aligned_m.astype(str)
|
439 |
aligned_nm = aligned_nm.astype(str)
|
440 |
|
|
|
441 |
frames = [aligned_m, aligned_nm]
|
442 |
after_up_pdb_alignment = pd.concat(frames, sort=False)
|
443 |
if len(after_up_pdb_alignment) == 0:
|
@@ -460,6 +457,7 @@ def pdb(input_set, mode, impute):
|
|
460 |
(after_up_pdb_alignment.pdbID != 'nan') & (after_up_pdb_alignment.mutationPositionOnPDB == 'nan')]
|
461 |
no_pdb = no_pdb.copy()
|
462 |
|
|
|
463 |
print('PDB matching is completed...\n')
|
464 |
print('SUMMARY')
|
465 |
print('-------')
|
@@ -474,6 +472,7 @@ def pdb(input_set, mode, impute):
|
|
474 |
print('--%d will be searched in Swiss-Model database.\n' % (
|
475 |
len(yes_pdb_no_match.drop_duplicates(['datapoint'])) + len(no_pdb.drop_duplicates(['datapoint']))))
|
476 |
|
|
|
477 |
dfM = None
|
478 |
dfNM = None
|
479 |
aligned_nm = None
|
@@ -529,8 +528,7 @@ def pdb(input_set, mode, impute):
|
|
529 |
swiss_model = pd.read_csv(Path(path_to_input_files / 'swissmodel_structures.txt'), sep='\t',
|
530 |
dtype=str, header=None, skiprows=1,
|
531 |
names=['UniProtKB_ac', 'iso_id', 'uniprot_seq_length', 'uniprot_seq_md5',
|
532 |
-
'coordinate_id', 'provider', 'from', 'to', 'template', 'qmean',
|
533 |
-
'qmean_norm', 'seqid', 'url'])
|
534 |
|
535 |
else:
|
536 |
swiss_model = pd.DataFrame(
|
@@ -550,13 +548,13 @@ def pdb(input_set, mode, impute):
|
|
550 |
swiss_model.at[ind, 'whichIsoform'] = swiss_model.at[ind, 'iso_id'].split('-')[1]
|
551 |
else:
|
552 |
swiss_model.at[ind, 'whichIsoform'] = 'nan'
|
553 |
-
|
554 |
swiss_model = swiss_model[swiss_model.provider == 'SWISSMODEL']
|
555 |
print('Index File Processed...\n')
|
556 |
|
|
|
557 |
# Get relevant columns
|
558 |
-
swiss_model = swiss_model[
|
559 |
-
['UniProtKB_ac', 'from', 'to', 'template', 'qmean_norm', 'seqid', 'url', 'whichIsoform']]
|
560 |
# Sort models on qmean score and identity. Some proteins have more than one model; we will pick one.
|
561 |
swiss_model = swiss_model.sort_values(by=['UniProtKB_ac', 'qmean_norm', 'seqid'], ascending=False)
|
562 |
swiss_model.reset_index(inplace=True)
|
@@ -713,6 +711,7 @@ def pdb(input_set, mode, impute):
|
|
713 |
ascending=[True, False])
|
714 |
swiss_models_with_data1 = swiss_models_with_data1.drop_duplicates(['datapoint', 'template'])
|
715 |
|
|
|
716 |
swiss_models_with_data1_dp = list(set(swiss_models_with_data1.datapoint.to_list()))
|
717 |
swiss_models_with_data.reset_index(inplace=True)
|
718 |
swiss_models_with_data.drop(['index'], axis=1, inplace=True)
|
@@ -729,6 +728,7 @@ def pdb(input_set, mode, impute):
|
|
729 |
|
730 |
swiss_models_with_data = swiss_models_with_data1.copy()
|
731 |
|
|
|
732 |
swiss_models_with_data.qmean_norm = swiss_models_with_data.qmean_norm.astype('float')
|
733 |
swiss_models_with_data = swiss_models_with_data.sort_values(['uniprotID', 'wt', 'mut', 'qmean_norm'],
|
734 |
axis=0, ascending=[True, True, True, False])
|
@@ -738,8 +738,7 @@ def pdb(input_set, mode, impute):
|
|
738 |
keep='first')
|
739 |
swiss_models_with_data.uniprotSequence = swiss_models_with_data.uniprotSequence.astype('str')
|
740 |
swiss_models_with_data.pos = swiss_models_with_data.pos.astype('int')
|
741 |
-
len(swiss_models_with_data.drop_duplicates(['datapoint'])) + len(
|
742 |
-
broken_swiss.drop_duplicates(['datapoint'])) + len(
|
743 |
no_swiss_models_2.drop_duplicates(['datapoint'])) == len(to_swiss.drop_duplicates(['datapoint']))
|
744 |
# This printed data here includes all possible models with different qualities,
|
745 |
# because we may get a hit in either of them.
|
@@ -766,10 +765,10 @@ def pdb(input_set, mode, impute):
|
|
766 |
|
767 |
swiss_models_with_data['uniprotSequence'] = swiss_models_with_data['uniprotSequence'].str.replace('U', 'C')
|
768 |
swiss_models_with_data['pdbSequence'] = swiss_models_with_data['pdbSequence'].str.replace('U', 'C')
|
769 |
-
swiss_model_aligned = alignment(swiss_models_with_data, annotation_list,
|
770 |
-
path_to_output_files / 'alignment_files')
|
771 |
swiss_models_with_data = None
|
772 |
|
|
|
773 |
if len(swiss_model_aligned) == 0:
|
774 |
swiss_model_aligned = pd.DataFrame(columns=pdb_aligned.columns)
|
775 |
swiss_model_aligned['qmean_norm'] = 'nan'
|
@@ -862,7 +861,7 @@ def pdb(input_set, mode, impute):
|
|
862 |
url = 'https://salilab.org/modbase/retrieve/modbase/?databaseID=' + protein
|
863 |
print(url)
|
864 |
req = requests.get(url)
|
865 |
-
name = path_to_output_files / 'modbase_structures' /
|
866 |
with open(name, 'wb') as f:
|
867 |
f.write(req.content)
|
868 |
else:
|
@@ -879,7 +878,7 @@ def pdb(input_set, mode, impute):
|
|
879 |
individual.write(str('UniProt ID: ' + protein))
|
880 |
individual.write('\n')
|
881 |
individual.write(str(pdb.contents[3])[10:-11].strip())
|
882 |
-
with open(path_to_output_files / 'modbase_structures_individual'
|
883 |
encoding="utf8") as f:
|
884 |
fasta = ''
|
885 |
chain = ''
|
@@ -962,6 +961,7 @@ def pdb(input_set, mode, impute):
|
|
962 |
existing_modbase_models = None
|
963 |
existing_modbase_models_ind = None
|
964 |
|
|
|
965 |
model_info_added = model_info_added.drop(['UniprotID'], axis=1)
|
966 |
model_info_added = model_info_added.rename(columns={'TargetBeg': 'from', 'TargetEnd': 'to',
|
967 |
'PDBCode': 'template', 'PDBChain': 'chain',
|
@@ -1014,8 +1014,7 @@ def pdb(input_set, mode, impute):
|
|
1014 |
with_modbase_info = with_modbase_info.sort_values(['uniprotID', 'wt', 'mut', 'pos', 'score', 'from', 'to'],
|
1015 |
axis=0,
|
1016 |
ascending=[True, True, True, True, False, True, False])
|
1017 |
-
with_modbase_info = with_modbase_info.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'fasta'],
|
1018 |
-
keep='first')
|
1019 |
|
1020 |
with_modbase_info = with_modbase_info.replace({'[\'?\']': 'nan'})
|
1021 |
with_modbase_info = with_modbase_info.replace({'[]': 'nan'})
|
@@ -1029,6 +1028,7 @@ def pdb(input_set, mode, impute):
|
|
1029 |
with_modbase_info.reset_index(inplace=True)
|
1030 |
with_modbase_info.drop('index', axis=1, inplace=True)
|
1031 |
|
|
|
1032 |
align = with_modbase_info[
|
1033 |
with_modbase_info.fasta != 'nan']
|
1034 |
yes_pdb_no_match = with_modbase_info[
|
@@ -1047,6 +1047,7 @@ def pdb(input_set, mode, impute):
|
|
1047 |
modbase_aligned = modbase_aligned.astype(str)
|
1048 |
modbase_aligned = modbase_aligned.replace({'NaN': 'nan'})
|
1049 |
|
|
|
1050 |
# Get the ones whose models couldn't be found. Add to no_modbase (i.e., nothing has matched at this point).
|
1051 |
if len(with_modbase_info) != 0:
|
1052 |
not_in_aligned = pd.concat([modbase_aligned.drop_duplicates(['datapoint']),
|
@@ -1054,30 +1055,29 @@ def pdb(input_set, mode, impute):
|
|
1054 |
['datapoint'],
|
1055 |
keep=False)
|
1056 |
else:
|
1057 |
-
not_in_aligned = pd.DataFrame(
|
1058 |
-
|
1059 |
-
|
1060 |
-
|
1061 |
-
|
1062 |
-
|
1063 |
-
|
1064 |
-
|
1065 |
-
|
1066 |
-
|
1067 |
-
|
1068 |
-
|
1069 |
-
|
1070 |
-
|
1071 |
-
|
1072 |
-
|
1073 |
-
|
1074 |
-
|
1075 |
-
|
1076 |
-
|
1077 |
-
|
1078 |
-
|
1079 |
-
|
1080 |
-
'to', 'template', 'chain', 'score', 'pdbID', 'pdbSequence', 'fasta'])
|
1081 |
with_modbase_info = None
|
1082 |
if len(not_in_aligned) != 0:
|
1083 |
not_models = pd.concat([yes_pdb_no_match.drop_duplicates(['datapoint']),
|
@@ -1094,8 +1094,7 @@ def pdb(input_set, mode, impute):
|
|
1094 |
nan = modbase_aligned[modbase_aligned.mutationPositionOnPDB == 'nan']
|
1095 |
not_nan = modbase_aligned[modbase_aligned.mutationPositionOnPDB != 'nan']
|
1096 |
not_nan.score = not_nan.score.astype(float)
|
1097 |
-
not_nan.sort_values(['datapoint', 'pdb_alignStatus', 'score'], ascending=[True, True, False],
|
1098 |
-
inplace=True)
|
1099 |
|
1100 |
not_nan = not_nan.sort_values(['datapoint', 'mutationPositionOnPDB', 'score'],
|
1101 |
ascending=[True, True, False])
|
@@ -1107,7 +1106,7 @@ def pdb(input_set, mode, impute):
|
|
1107 |
which_ones_are_match = pd.concat([not_nan, nan]).drop_duplicates(['datapoint'], keep='first')
|
1108 |
if len(which_ones_are_match) == 0:
|
1109 |
which_ones_are_match = pd.DataFrame(
|
1110 |
-
columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume',
|
1111 |
'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
|
1112 |
'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet',
|
1113 |
'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
|
@@ -1143,6 +1142,7 @@ def pdb(input_set, mode, impute):
|
|
1143 |
not_nan = None
|
1144 |
nan = None
|
1145 |
|
|
|
1146 |
# merge not_in_align and modbase_not_match as they were both excluded from modbase match.
|
1147 |
|
1148 |
# No model
|
@@ -1171,10 +1171,9 @@ def pdb(input_set, mode, impute):
|
|
1171 |
elif len(not_in_aligned) == 0 and len(modbase_not_match) == 0 and len(no_info) != 0:
|
1172 |
rest = no_info
|
1173 |
else:
|
1174 |
-
rest = pd.DataFrame(
|
1175 |
-
|
1176 |
-
|
1177 |
-
'wt_sequence_match', 'whichIsoform', 'datapoint'])
|
1178 |
|
1179 |
rest = rest[to_swiss_columns]
|
1180 |
rest = rest.drop_duplicates()
|
@@ -1186,53 +1185,49 @@ def pdb(input_set, mode, impute):
|
|
1186 |
|
1187 |
else:
|
1188 |
|
1189 |
-
modbase_match = pd.DataFrame(
|
1190 |
-
|
1191 |
-
|
1192 |
-
|
1193 |
-
|
1194 |
-
|
1195 |
-
|
1196 |
-
|
1197 |
-
|
1198 |
-
|
1199 |
-
|
1200 |
-
|
1201 |
-
|
1202 |
-
|
1203 |
-
|
1204 |
-
|
1205 |
-
|
1206 |
-
|
1207 |
-
|
1208 |
-
|
1209 |
-
|
1210 |
-
|
1211 |
-
|
1212 |
-
|
1213 |
-
|
1214 |
-
|
1215 |
-
|
1216 |
-
|
1217 |
-
|
1218 |
-
|
1219 |
-
|
1220 |
-
|
1221 |
-
|
1222 |
-
|
1223 |
-
|
1224 |
-
|
1225 |
-
|
1226 |
-
|
1227 |
-
|
1228 |
-
|
1229 |
-
|
1230 |
-
|
1231 |
-
|
1232 |
-
rest = pd.DataFrame(
|
1233 |
-
columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
|
1234 |
-
'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
|
1235 |
-
'wt_sequence_match', 'whichIsoform', 'datapoint'])
|
1236 |
|
1237 |
rest = rest[to_swiss_columns]
|
1238 |
rest = rest.drop_duplicates()
|
@@ -1268,6 +1263,7 @@ def pdb(input_set, mode, impute):
|
|
1268 |
not_models = None
|
1269 |
modbase_not_match = None
|
1270 |
|
|
|
1271 |
# Final corrections
|
1272 |
|
1273 |
# Now 3D alignment.
|
@@ -1289,6 +1285,7 @@ def pdb(input_set, mode, impute):
|
|
1289 |
|
1290 |
# Fix the axes and merge all data.
|
1291 |
|
|
|
1292 |
pdb.drop(['pdbInfo'], axis=1, inplace=True)
|
1293 |
pdb.rename(columns={'resolution': 'score'}, inplace=True)
|
1294 |
swiss.rename(columns={'qmean_norm': 'score'}, inplace=True)
|
@@ -1301,6 +1298,7 @@ def pdb(input_set, mode, impute):
|
|
1301 |
modbase['source'] = 'MODBASE'
|
1302 |
data = pd.concat([swiss, modbase, pdb])
|
1303 |
|
|
|
1304 |
data.reset_index(inplace=True)
|
1305 |
data.drop(['index'], axis=1, inplace=True)
|
1306 |
data = data.astype('str')
|
@@ -1324,10 +1322,10 @@ def pdb(input_set, mode, impute):
|
|
1324 |
for pdbID in pdb_only.pdbID.to_list():
|
1325 |
if pdbID not in existing_free_sasa:
|
1326 |
(run_freesasa(Path(path_to_output_files / 'pdb_structures' / f'{pdbID.lower()}.pdb'),
|
1327 |
-
Path(path_to_output_files / 'freesasa_files' / f'{pdbID.lower()}.txt'),
|
1328 |
-
include_hetatms=True,
|
1329 |
outdir=None, force_rerun=False, file_type='pdb'))
|
1330 |
|
|
|
1331 |
print('Calculation RSA for SwissModel Files...\n')
|
1332 |
swiss_only = data[data.source == 'SWISSMODEL']
|
1333 |
swiss_dp = []
|
@@ -1345,8 +1343,7 @@ def pdb(input_set, mode, impute):
|
|
1345 |
for pdbID in modbase_only.pdbID.to_list():
|
1346 |
if pdbID not in existing_free_sasa:
|
1347 |
(run_freesasa(Path(path_to_output_files / 'modbase_structures_individual' / f'{pdbID.lower()}.txt'),
|
1348 |
-
Path(path_to_output_files / 'freesasa_files' / f'{pdbID.lower()}.txt'),
|
1349 |
-
include_hetatms=True,
|
1350 |
outdir=None, force_rerun=False, file_type='pdb'))
|
1351 |
|
1352 |
# This annotation list is different from the previous one; keep it.
|
@@ -1384,18 +1381,16 @@ def pdb(input_set, mode, impute):
|
|
1384 |
chain = data.at[i, 'chain']
|
1385 |
uniprotID = data.at[i, 'uniprotID']
|
1386 |
pdbID = data.at[i, 'pdbID']
|
1387 |
-
alignments = get_alignments_3D(uniprotID, 'nan', pdb_path, pdbSequence, source, chain, pdbID, mode,
|
1388 |
-
Path(path_to_output_files / '3D_alignment'), file_format='gzip')
|
1389 |
mutPos = data.at[i, 'mutationPositionOnPDB']
|
1390 |
try:
|
1391 |
-
coordMut = get_coords(mutPos, alignments, 'nan', 'nan', mode)[0]
|
1392 |
except:
|
1393 |
ValueError
|
1394 |
coordMut = 'nan'
|
1395 |
try:
|
1396 |
sasa_pos = get_coords(mutPos, alignments, 'nan', 'nan', mode)[2]
|
1397 |
-
data.at[i, 'sasa'] = sasa(data.at[i, 'source'], data.at[i, 'pdbID'], data.at[i, 'uniprotID'], sasa_pos,
|
1398 |
-
data.at[i, 'wt'], mode, path_to_output_files, file_type='pdb')
|
1399 |
except:
|
1400 |
ValueError
|
1401 |
data.at[i, 'sasa'] = 'nan' # mutation position is nan
|
@@ -1443,9 +1438,11 @@ def pdb(input_set, mode, impute):
|
|
1443 |
data.at[i, 'domaindistance3D'] = min(float(data.at[i, 'domainStartonPDB']),
|
1444 |
float(data.at[i, 'domainEndonPDB']))
|
1445 |
|
|
|
1446 |
data = data.astype(str)
|
1447 |
data.replace({'NaN': 'nan'}, inplace=True)
|
1448 |
|
|
|
1449 |
# Now unify all 3 separate data. We have with_pdb. The ones that have pdb structures, swiss, modbase, the ones that didn't match with any and the ones that didn't have wt seq match.
|
1450 |
|
1451 |
# Get interface positions from ECLAIR. Download HQ human
|
@@ -1466,29 +1463,28 @@ def pdb(input_set, mode, impute):
|
|
1466 |
interface_dataframe.columns = ['uniprotID', 'positions']
|
1467 |
|
1468 |
if len(data) == 0:
|
1469 |
-
data = pd.DataFrame(
|
1470 |
-
|
1471 |
-
|
1472 |
-
|
1473 |
-
|
1474 |
-
|
1475 |
-
|
1476 |
-
|
1477 |
-
|
1478 |
-
|
1479 |
-
|
1480 |
-
|
1481 |
-
|
1482 |
-
|
1483 |
-
|
1484 |
-
|
1485 |
-
|
1486 |
-
|
1487 |
-
|
1488 |
-
|
1489 |
-
|
1490 |
-
|
1491 |
-
'source', 'sasa', 'domaindistance3D', 'threeState_trsh4_HQ', 'domain_fisher'])
|
1492 |
else:
|
1493 |
data.sasa = data.sasa.astype('str')
|
1494 |
|
@@ -1527,6 +1523,7 @@ def pdb(input_set, mode, impute):
|
|
1527 |
|
1528 |
data.drop(['positions'], axis=1, inplace=True)
|
1529 |
|
|
|
1530 |
# OPTIONAL
|
1531 |
# DOMAIN SELECTION
|
1532 |
# Next step: Delete all other domains with 'NULL.' R is capable of handling 53 categories. We will keep 52 most
|
@@ -1545,8 +1542,7 @@ def pdb(input_set, mode, impute):
|
|
1545 |
# nan--> 0, 0 -->1 and 1 -->2
|
1546 |
|
1547 |
print('Final adjustments are being done...\n')
|
1548 |
-
binaryCols = ['disulfideBinary', 'intMetBinary', 'intramembraneBinary', 'naturalVariantBinary',
|
1549 |
-
'dnaBindingBinary',
|
1550 |
'activeSiteBinary', 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
|
1551 |
'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
|
1552 |
'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
|
@@ -1648,8 +1644,7 @@ def pdb(input_set, mode, impute):
|
|
1648 |
ready = data.copy()
|
1649 |
# Imputation
|
1650 |
if (impute == 'True') or (impute == 'true') or (impute == True):
|
1651 |
-
filler = [17.84, 30.8, 24.96, 13.12, 23.62, 18.97, 20.87, 29.59, 20.7, 12.7, 22.85, 17.21, 9.8, 9, 15.99,
|
1652 |
-
16.82,
|
1653 |
20.46, 24.58, 9.99, 17.43, 20.08, 30.91, 20.86, 22.14, 21.91, 28.45, 17.81, 25.12, 20.33, 22.36]
|
1654 |
col_index = 0
|
1655 |
for col_ in ready.columns[-30:]:
|
@@ -1664,8 +1659,7 @@ def pdb(input_set, mode, impute):
|
|
1664 |
ready = ready.replace({'nan': np.NaN})
|
1665 |
ready.to_csv(path_to_output_files / 'featurevector_pdb.txt', sep='\t', index=False)
|
1666 |
if len(ready) == 0:
|
1667 |
-
print(
|
1668 |
-
'No feature vector could be produced for input data. Please check the presence of a structure for the input proteins.')
|
1669 |
print(ready)
|
1670 |
print('Feature vector successfully created...')
|
1671 |
return ready
|
|
|
1 |
+
|
2 |
# IMPORT NECESSARY MODULES AND LIBRARIES
|
3 |
from timeit import default_timer as timer
|
4 |
import xml.etree.ElementTree as ET
|
|
|
26 |
from Bio import Align
|
27 |
from Bio import SeqIO
|
28 |
from Bio.PDB import *
|
|
|
29 |
warnings.filterwarnings("ignore")
|
30 |
start = timer()
|
31 |
import streamlit as st
|
32 |
# FUNCTIONS
|
33 |
|
34 |
|
35 |
+
|
36 |
# FUNCTIONS
|
37 |
from calc_pc_property import *
|
38 |
from add_domains import *
|
|
|
58 |
Add datapoint identifier and remove non-standard input.
|
59 |
"""
|
60 |
data = clean_data(input_set)
|
61 |
+
path_to_input_files, path_to_output_files, path_to_domains, fisher_path, path_to_interfaces, buffer = manage_files(mode)
|
|
|
62 |
out_path = path_to_output_files / 'log.txt'
|
63 |
sys.stdout = open(out_path, 'w')
|
64 |
print('Creating directories...')
|
65 |
|
66 |
annotation_list = ['disulfide', 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
|
67 |
'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis', 'strand',
|
68 |
+
'helix', 'turn', 'metalBinding', 'repeat', 'topologicalDomain', 'caBinding', 'bindingSite', 'region',
|
|
|
69 |
'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', 'peptide',
|
70 |
'transitPeptide', 'glycosylation', 'propeptide']
|
71 |
|
|
|
140 |
if wt == can:
|
141 |
data.at[i, 'wt_sequence_match'] = 'm'
|
142 |
elif wt != can:
|
143 |
+
isoList = isoform_fasta[isoform_fasta['uniprotID'] == data.at[i, 'uniprotID']].isoformSequence.to_list()
|
|
|
144 |
for k in isoList:
|
145 |
if len(k) >= int(data.at[i, 'pos']):
|
146 |
resInIso = k[int(int(data.at[i, 'pos']) - 1)]
|
147 |
if wt == resInIso:
|
148 |
+
whichIsoform = isoform_fasta[isoform_fasta.isoformSequence == k].whichIsoform.to_list()[0]
|
|
|
149 |
data.at[i, 'wt_sequence_match'] = 'i'
|
150 |
data.at[i, 'whichIsoform'] = whichIsoform
|
151 |
break
|
|
|
190 |
for prot in protein:
|
191 |
pdbs.append(get_pdb_ids(prot))
|
192 |
print('PDBs', pdbs)
|
193 |
+
if len(pdbs)>=1:
|
194 |
print('pdbs not empty')
|
195 |
pdbs = [item for sublist in pdbs for item in sublist]
|
196 |
print('NEW', pdbs)
|
197 |
else:
|
198 |
print('pdbs empty')
|
199 |
+
pdbs =[]
|
200 |
print('Processing PDB structures...\n')
|
201 |
if pdbs == []:
|
202 |
print('No PDB structure found for the query. ')
|
203 |
+
"""
|
204 |
+
try:
|
205 |
+
pdbs = [j.strip('[').strip(']').strip().strip('\'').strip('\"') for j in
|
206 |
+
((',').join([str(item) for item in pdbs])).split(',')]
|
207 |
+
except IndexError:
|
208 |
+
pdbs = []
|
209 |
+
print('No PDB structure found for the query. ')
|
210 |
+
"""
|
211 |
print('Starting PDB structures download...\n')
|
212 |
pdbs = list(filter(None, pdbs))
|
213 |
pdbs = (set(pdbs))
|
|
|
219 |
try:
|
220 |
shutil.rmtree('obsolete')
|
221 |
except OSError as e:
|
222 |
+
pass
|
223 |
+
existing_pdb = list(Path(path_to_output_files/'pdb_structures').glob("*"))
|
224 |
+
st.write('existing_pdb')
|
225 |
+
st.write(existing_pdb)
|
226 |
+
existing_pdb = [str(i) for i in existing_pdb]
|
227 |
+
existing_pdb = [i.split('/')[-1].split('.')[0].lower() for i in existing_pdb]
|
228 |
cnt = 0
|
229 |
st.write('this is the pdbs', pdbs)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
230 |
for search in pdbs:
|
231 |
+
st.write('searching for pdb:', search)
|
|
|
|
|
232 |
try:
|
233 |
+
if search.lower() not in existing_pdb:
|
234 |
+
path_pdb = 'out_files/pdb/pdb_structures'
|
235 |
+
st.write('path for pdb: ',path_pdb)
|
236 |
+
file = pdbl.retrieve_pdb_file(search, pdir=path_pdb, file_format="pdb")
|
237 |
+
st.write('file: ',file)
|
238 |
+
existing_pdb = list(Path(path_to_output_files/'pdb_structures').glob("*"))
|
239 |
+
st.write('after download:', existing_pdb)
|
240 |
+
st.write(Path(path_to_output_files/'pdb_structures') == path_pdb)
|
241 |
+
existing_pdb = list(path_pdb.glob("*"))
|
242 |
+
st.write('after download:', existing_pdb)
|
243 |
+
else:
|
244 |
+
print('PDB structure file exists..')
|
245 |
+
for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")):
|
246 |
+
filename_replace_ext = filename.with_suffix(".pdb")
|
247 |
+
filename.rename(filename_replace_ext)
|
248 |
+
|
249 |
+
file = Path(path_to_output_files / 'pdb_structures' / f'{search}.pdb')
|
250 |
+
|
251 |
+
base = os.path.splitext(str(file))[0]
|
252 |
+
base = '/'.join(base.split('/')[0:-1]) + '/pdb' + base.split('/')[-1]
|
253 |
+
os.rename(file, base + ".ent")
|
254 |
+
file = base + '.ent'
|
255 |
+
|
256 |
+
resolution_method = parser.get_structure(search, file)
|
257 |
+
for record in SeqIO.parse(file, "pdb-seqres"):
|
258 |
+
if record.dbxrefs[0].split(':')[0] == 'UNP':
|
259 |
+
pdb_fasta.at[index, 'pdbID'] = record.id.split(':')[0]
|
260 |
+
pdb_fasta.at[index, 'chain'] = record.id.split(':')[1]
|
261 |
+
pdb_fasta.at[index, 'pdbSequence'] = str(record.seq)
|
262 |
+
pdb_info.at[index, 'uniprotID'] = record.dbxrefs[0].split(':')[1]
|
263 |
+
pdb_info.at[index, 'pdbID'] = record.id.split(':')[0]
|
264 |
+
pdb_info.at[index, 'chain'] = record.annotations["chain"]
|
265 |
+
pdb_info.at[index, 'resolution'] = resolution_method.header['resolution']
|
266 |
+
index += 1
|
267 |
+
except IndexError as a:
|
268 |
+
st.write(a)
|
269 |
+
pdb_info.at[index, 'uniprotID'] = 'nan'
|
270 |
+
pdb_info.at[index, 'pdbID'] = 'nan'
|
271 |
+
pdb_info.at[index, 'chain'] = 'nan'
|
272 |
+
pdb_info.at[index, 'resolution'] = 'nan'
|
273 |
+
cnt +=1
|
|
|
274 |
print()
|
|
|
|
|
275 |
print('PDB file processing finished..')
|
276 |
for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")):
|
277 |
try:
|
|
|
324 |
TypeError
|
325 |
with_pdb.at[i, 'pdbInfo'] = 'nan'
|
326 |
|
327 |
+
with_pdb = with_pdb[['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore',
|
328 |
'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', 'pdbSequence',
|
329 |
'wt_sequence_match',
|
330 |
'whichIsoform', 'pdbID', 'resolution', 'chain', 'pdbInfo', 'datapoint']]
|
331 |
|
332 |
+
|
333 |
+
|
334 |
# If the query data points are found in no_match_in_uniprot data frame, it will not give any results.
|
335 |
# If the query data points are found in no_pdb data frame, it will be searched in the modbase and swiss_model steps.
|
336 |
# If the query data points are found in with_pdb data frame, it will be searched in the following steps.
|
|
|
344 |
if len(with_pdb) > 0:
|
345 |
with_pdb = add_annotations(with_pdb)
|
346 |
else:
|
347 |
+
new_cols = with_pdb.columns.to_list() + ['disulfide', 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding',
|
|
|
348 |
'activeSite',
|
349 |
'nucleotideBinding', 'lipidation', 'site', 'transmembrane',
|
350 |
'crosslink', 'mutagenesis', 'strand',
|
|
|
363 |
'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
|
364 |
'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
|
365 |
'glycosylationBinary', 'propeptideBinary']
|
366 |
+
with_pdb = pd.DataFrame(columns = new_cols)
|
367 |
try:
|
368 |
with_pdb.whichIsoform = with_pdb.whichIsoform.astype('str')
|
369 |
except:
|
|
|
375 |
with_pdb.replace({'[]': 'nan'}, inplace=True)
|
376 |
with_pdb.replace({'nan-nan': 'nan'}, inplace=True)
|
377 |
with_pdb.replace({'': 'nan'}, inplace=True)
|
378 |
+
|
379 |
"""
|
380 |
STEP 7
|
381 |
Do alignment for PDB
|
|
|
407 |
pdb_fasta = None
|
408 |
pdb_info = None
|
409 |
pdbs = None
|
410 |
+
existing_pdb = None
|
|
|
411 |
with_pdb_size = len(with_pdb.drop_duplicates(['datapoint']))
|
412 |
with_pdb = None
|
413 |
+
|
414 |
+
|
415 |
print('Aligning sequences...\n')
|
416 |
aligned_m = final_stage(dfM, annotation_list, Path(path_to_output_files / 'alignment_files'))
|
417 |
aligned_nm = final_stage(dfNM, annotation_list, Path(path_to_output_files / 'alignment_files'))
|
|
|
434 |
aligned_m = aligned_m.astype(str)
|
435 |
aligned_nm = aligned_nm.astype(str)
|
436 |
|
437 |
+
|
438 |
frames = [aligned_m, aligned_nm]
|
439 |
after_up_pdb_alignment = pd.concat(frames, sort=False)
|
440 |
if len(after_up_pdb_alignment) == 0:
|
|
|
457 |
(after_up_pdb_alignment.pdbID != 'nan') & (after_up_pdb_alignment.mutationPositionOnPDB == 'nan')]
|
458 |
no_pdb = no_pdb.copy()
|
459 |
|
460 |
+
|
461 |
print('PDB matching is completed...\n')
|
462 |
print('SUMMARY')
|
463 |
print('-------')
|
|
|
472 |
print('--%d will be searched in Swiss-Model database.\n' % (
|
473 |
len(yes_pdb_no_match.drop_duplicates(['datapoint'])) + len(no_pdb.drop_duplicates(['datapoint']))))
|
474 |
|
475 |
+
|
476 |
dfM = None
|
477 |
dfNM = None
|
478 |
aligned_nm = None
|
|
|
528 |
swiss_model = pd.read_csv(Path(path_to_input_files / 'swissmodel_structures.txt'), sep='\t',
|
529 |
dtype=str, header=None, skiprows=1,
|
530 |
names=['UniProtKB_ac', 'iso_id', 'uniprot_seq_length', 'uniprot_seq_md5',
|
531 |
+
'coordinate_id', 'provider', 'from', 'to', 'template', 'qmean', 'qmean_norm','seqid', 'url'])
|
|
|
532 |
|
533 |
else:
|
534 |
swiss_model = pd.DataFrame(
|
|
|
548 |
swiss_model.at[ind, 'whichIsoform'] = swiss_model.at[ind, 'iso_id'].split('-')[1]
|
549 |
else:
|
550 |
swiss_model.at[ind, 'whichIsoform'] = 'nan'
|
551 |
+
# swiss_model.drop(['input'], axis=1, inplace=True)
|
552 |
swiss_model = swiss_model[swiss_model.provider == 'SWISSMODEL']
|
553 |
print('Index File Processed...\n')
|
554 |
|
555 |
+
|
556 |
# Get relevant columns
|
557 |
+
swiss_model = swiss_model[['UniProtKB_ac', 'from', 'to', 'template', 'qmean_norm', 'seqid', 'url', 'whichIsoform']]
|
|
|
558 |
# Sort models on qmean score and identity. Some proteins have more than one model; we will pick one.
|
559 |
swiss_model = swiss_model.sort_values(by=['UniProtKB_ac', 'qmean_norm', 'seqid'], ascending=False)
|
560 |
swiss_model.reset_index(inplace=True)
|
|
|
711 |
ascending=[True, False])
|
712 |
swiss_models_with_data1 = swiss_models_with_data1.drop_duplicates(['datapoint', 'template'])
|
713 |
|
714 |
+
|
715 |
swiss_models_with_data1_dp = list(set(swiss_models_with_data1.datapoint.to_list()))
|
716 |
swiss_models_with_data.reset_index(inplace=True)
|
717 |
swiss_models_with_data.drop(['index'], axis=1, inplace=True)
|
|
|
728 |
|
729 |
swiss_models_with_data = swiss_models_with_data1.copy()
|
730 |
|
731 |
+
|
732 |
swiss_models_with_data.qmean_norm = swiss_models_with_data.qmean_norm.astype('float')
|
733 |
swiss_models_with_data = swiss_models_with_data.sort_values(['uniprotID', 'wt', 'mut', 'qmean_norm'],
|
734 |
axis=0, ascending=[True, True, True, False])
|
|
|
738 |
keep='first')
|
739 |
swiss_models_with_data.uniprotSequence = swiss_models_with_data.uniprotSequence.astype('str')
|
740 |
swiss_models_with_data.pos = swiss_models_with_data.pos.astype('int')
|
741 |
+
len(swiss_models_with_data.drop_duplicates(['datapoint'])) + len(broken_swiss.drop_duplicates(['datapoint'])) + len(
|
|
|
742 |
no_swiss_models_2.drop_duplicates(['datapoint'])) == len(to_swiss.drop_duplicates(['datapoint']))
|
743 |
# This printed data here includes all possible models with different qualities,
|
744 |
# because we may get a hit in either of them.
|
|
|
765 |
|
766 |
swiss_models_with_data['uniprotSequence'] = swiss_models_with_data['uniprotSequence'].str.replace('U', 'C')
|
767 |
swiss_models_with_data['pdbSequence'] = swiss_models_with_data['pdbSequence'].str.replace('U', 'C')
|
768 |
+
swiss_model_aligned = alignment(swiss_models_with_data, annotation_list, path_to_output_files / 'alignment_files')
|
|
|
769 |
swiss_models_with_data = None
|
770 |
|
771 |
+
|
772 |
if len(swiss_model_aligned) == 0:
|
773 |
swiss_model_aligned = pd.DataFrame(columns=pdb_aligned.columns)
|
774 |
swiss_model_aligned['qmean_norm'] = 'nan'
|
|
|
861 |
url = 'https://salilab.org/modbase/retrieve/modbase/?databaseID=' + protein
|
862 |
print(url)
|
863 |
req = requests.get(url)
|
864 |
+
name = path_to_output_files / 'modbase_structures' / f'{protein}.txt'
|
865 |
with open(name, 'wb') as f:
|
866 |
f.write(req.content)
|
867 |
else:
|
|
|
878 |
individual.write(str('UniProt ID: ' + protein))
|
879 |
individual.write('\n')
|
880 |
individual.write(str(pdb.contents[3])[10:-11].strip())
|
881 |
+
with open(path_to_output_files / 'modbase_structures_individual'/ f'{model_id}.txt',
|
882 |
encoding="utf8") as f:
|
883 |
fasta = ''
|
884 |
chain = ''
|
|
|
961 |
existing_modbase_models = None
|
962 |
existing_modbase_models_ind = None
|
963 |
|
964 |
+
|
965 |
model_info_added = model_info_added.drop(['UniprotID'], axis=1)
|
966 |
model_info_added = model_info_added.rename(columns={'TargetBeg': 'from', 'TargetEnd': 'to',
|
967 |
'PDBCode': 'template', 'PDBChain': 'chain',
|
|
|
1014 |
with_modbase_info = with_modbase_info.sort_values(['uniprotID', 'wt', 'mut', 'pos', 'score', 'from', 'to'],
|
1015 |
axis=0,
|
1016 |
ascending=[True, True, True, True, False, True, False])
|
1017 |
+
with_modbase_info = with_modbase_info.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'fasta'], keep='first')
|
|
|
1018 |
|
1019 |
with_modbase_info = with_modbase_info.replace({'[\'?\']': 'nan'})
|
1020 |
with_modbase_info = with_modbase_info.replace({'[]': 'nan'})
|
|
|
1028 |
with_modbase_info.reset_index(inplace=True)
|
1029 |
with_modbase_info.drop('index', axis=1, inplace=True)
|
1030 |
|
1031 |
+
|
1032 |
align = with_modbase_info[
|
1033 |
with_modbase_info.fasta != 'nan']
|
1034 |
yes_pdb_no_match = with_modbase_info[
|
|
|
1047 |
modbase_aligned = modbase_aligned.astype(str)
|
1048 |
modbase_aligned = modbase_aligned.replace({'NaN': 'nan'})
|
1049 |
|
1050 |
+
|
1051 |
# Get the ones whose models couldn't be found. Add to no_modbase (i.e., nothing has matched at this point).
|
1052 |
if len(with_modbase_info) != 0:
|
1053 |
not_in_aligned = pd.concat([modbase_aligned.drop_duplicates(['datapoint']),
|
|
|
1055 |
['datapoint'],
|
1056 |
keep=False)
|
1057 |
else:
|
1058 |
+
not_in_aligned = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore',
|
1059 |
+
'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
|
1060 |
+
'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide',
|
1061 |
+
'intMet',
|
1062 |
+
'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
|
1063 |
+
'nucleotideBinding', 'lipidation', 'site', 'transmembrane',
|
1064 |
+
'crosslink',
|
1065 |
+
'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
|
1066 |
+
'topologicalDomain', 'caBinding', 'bindingSite', 'region',
|
1067 |
+
'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif',
|
1068 |
+
'coiledCoil',
|
1069 |
+
'peptide', 'transitPeptide', 'glycosylation', 'propeptide',
|
1070 |
+
'disulfide',
|
1071 |
+
'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding',
|
1072 |
+
'activeSite',
|
1073 |
+
'nucleotideBinding', 'lipidation', 'site', 'transmembrane',
|
1074 |
+
'crosslink',
|
1075 |
+
'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
|
1076 |
+
'topologicalDomain', 'caBinding', 'bindingSite', 'region',
|
1077 |
+
'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif',
|
1078 |
+
'coiledCoil',
|
1079 |
+
'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'from',
|
1080 |
+
'to', 'template', 'chain', 'score', 'pdbID', 'pdbSequence', 'fasta'])
|
|
|
1081 |
with_modbase_info = None
|
1082 |
if len(not_in_aligned) != 0:
|
1083 |
not_models = pd.concat([yes_pdb_no_match.drop_duplicates(['datapoint']),
|
|
|
1094 |
nan = modbase_aligned[modbase_aligned.mutationPositionOnPDB == 'nan']
|
1095 |
not_nan = modbase_aligned[modbase_aligned.mutationPositionOnPDB != 'nan']
|
1096 |
not_nan.score = not_nan.score.astype(float)
|
1097 |
+
not_nan.sort_values(['datapoint', 'pdb_alignStatus', 'score'], ascending=[True, True, False], inplace=True)
|
|
|
1098 |
|
1099 |
not_nan = not_nan.sort_values(['datapoint', 'mutationPositionOnPDB', 'score'],
|
1100 |
ascending=[True, True, False])
|
|
|
1106 |
which_ones_are_match = pd.concat([not_nan, nan]).drop_duplicates(['datapoint'], keep='first')
|
1107 |
if len(which_ones_are_match) == 0:
|
1108 |
which_ones_are_match = pd.DataFrame(
|
1109 |
+
columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore',
|
1110 |
'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
|
1111 |
'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet',
|
1112 |
'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
|
|
|
1142 |
not_nan = None
|
1143 |
nan = None
|
1144 |
|
1145 |
+
|
1146 |
# Merge not_in_aligned and modbase_not_match, as they were both excluded from the modbase match.
|
1147 |
|
1148 |
# No model
|
|
|
1171 |
elif len(not_in_aligned) == 0 and len(modbase_not_match) == 0 and len(no_info) != 0:
|
1172 |
rest = no_info
|
1173 |
else:
|
1174 |
+
rest = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore',
|
1175 |
+
'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
|
1176 |
+
'wt_sequence_match', 'whichIsoform', 'datapoint'])
|
|
|
1177 |
|
1178 |
rest = rest[to_swiss_columns]
|
1179 |
rest = rest.drop_duplicates()
|
|
|
1185 |
|
1186 |
else:
|
1187 |
|
1188 |
+
modbase_match = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore',
|
1189 |
+
'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
|
1190 |
+
'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet',
|
1191 |
+
'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
|
1192 |
+
'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink',
|
1193 |
+
'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
|
1194 |
+
'topologicalDomain', 'caBinding', 'bindingSite', 'region',
|
1195 |
+
'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
|
1196 |
+
'peptide', 'transitPeptide', 'glycosylation', 'propeptide',
|
1197 |
+
'disulfideBinary', 'intMetBinary', 'intramembraneBinary',
|
1198 |
+
'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary',
|
1199 |
+
'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
|
1200 |
+
'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
|
1201 |
+
'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
|
1202 |
+
'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary',
|
1203 |
+
'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary',
|
1204 |
+
'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
|
1205 |
+
'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
|
1206 |
+
'glycosylationBinary', 'propeptideBinary', 'from', 'to', 'template',
|
1207 |
+
'chain', 'score', 'pdbID', 'pdbSequence', 'pdb_alignStatus',
|
1208 |
+
'mutationPositionOnPDB', 'domainStartonPDB', 'domainEndonPDB'])
|
1209 |
+
not_in_aligned = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
|
1210 |
+
'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
|
1211 |
+
'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet',
|
1212 |
+
'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
|
1213 |
+
'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink',
|
1214 |
+
'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
|
1215 |
+
'topologicalDomain', 'caBinding', 'bindingSite', 'region',
|
1216 |
+
'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
|
1217 |
+
'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'disulfide',
|
1218 |
+
'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
|
1219 |
+
'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink',
|
1220 |
+
'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
|
1221 |
+
'topologicalDomain', 'caBinding', 'bindingSite', 'region',
|
1222 |
+
'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
|
1223 |
+
'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'from',
|
1224 |
+
'to', 'template', 'chain', 'score', 'pdbID', 'pdbSequence', 'fasta'])
|
1225 |
+
no_info = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore',
|
1226 |
+
'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
|
1227 |
+
'wt_sequence_match', 'whichIsoform', 'datapoint'])
|
1228 |
+
rest = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
|
1229 |
+
'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
|
1230 |
+
'wt_sequence_match', 'whichIsoform', 'datapoint'])
|
|
|
|
|
|
|
|
|
1231 |
|
1232 |
rest = rest[to_swiss_columns]
|
1233 |
rest = rest.drop_duplicates()
|
|
|
1263 |
not_models = None
|
1264 |
modbase_not_match = None
|
1265 |
|
1266 |
+
|
1267 |
# Final corrections
|
1268 |
|
1269 |
# Now 3D alignment.
|
|
|
1285 |
|
1286 |
# Fix the axes and merge all data.
|
1287 |
|
1288 |
+
|
1289 |
pdb.drop(['pdbInfo'], axis=1, inplace=True)
|
1290 |
pdb.rename(columns={'resolution': 'score'}, inplace=True)
|
1291 |
swiss.rename(columns={'qmean_norm': 'score'}, inplace=True)
|
|
|
1298 |
modbase['source'] = 'MODBASE'
|
1299 |
data = pd.concat([swiss, modbase, pdb])
|
1300 |
|
1301 |
+
|
1302 |
data.reset_index(inplace=True)
|
1303 |
data.drop(['index'], axis=1, inplace=True)
|
1304 |
data = data.astype('str')
|
|
|
1322 |
for pdbID in pdb_only.pdbID.to_list():
|
1323 |
if pdbID not in existing_free_sasa:
|
1324 |
(run_freesasa(Path(path_to_output_files / 'pdb_structures' / f'{pdbID.lower()}.pdb'),
|
1325 |
+
Path(path_to_output_files / 'freesasa_files' / f'{pdbID.lower()}.txt'), include_hetatms=True,
|
|
|
1326 |
outdir=None, force_rerun=False, file_type='pdb'))
|
1327 |
|
1328 |
+
|
1329 |
print('Calculation RSA for SwissModel Files...\n')
|
1330 |
swiss_only = data[data.source == 'SWISSMODEL']
|
1331 |
swiss_dp = []
|
|
|
1343 |
for pdbID in modbase_only.pdbID.to_list():
|
1344 |
if pdbID not in existing_free_sasa:
|
1345 |
(run_freesasa(Path(path_to_output_files / 'modbase_structures_individual' / f'{pdbID.lower()}.txt'),
|
1346 |
+
Path(path_to_output_files / 'freesasa_files' / f'{pdbID.lower()}.txt'), include_hetatms=True,
|
|
|
1347 |
outdir=None, force_rerun=False, file_type='pdb'))
|
1348 |
|
1349 |
# This annotation list is different than the prev one, keep it.
|
|
|
1381 |
chain = data.at[i, 'chain']
|
1382 |
uniprotID = data.at[i, 'uniprotID']
|
1383 |
pdbID = data.at[i, 'pdbID']
|
1384 |
+
alignments = get_alignments_3D(uniprotID, 'nan', pdb_path, pdbSequence, source, chain, pdbID, mode, Path(path_to_output_files / '3D_alignment'), file_format = 'gzip')
|
|
|
1385 |
mutPos = data.at[i, 'mutationPositionOnPDB']
|
1386 |
try:
|
1387 |
+
coordMut = get_coords(mutPos, alignments , 'nan', 'nan', mode)[0]
|
1388 |
except:
|
1389 |
ValueError
|
1390 |
coordMut = 'nan'
|
1391 |
try:
|
1392 |
sasa_pos = get_coords(mutPos, alignments, 'nan', 'nan', mode)[2]
|
1393 |
+
data.at[i, 'sasa'] = sasa(data.at[i, 'source'], data.at[i, 'pdbID'], data.at[i, 'uniprotID'], sasa_pos, data.at[i, 'wt'], mode, path_to_output_files,file_type = 'pdb')
|
|
|
1394 |
except:
|
1395 |
ValueError
|
1396 |
data.at[i, 'sasa'] = 'nan' # mutation position is nan
|
|
|
1438 |
data.at[i, 'domaindistance3D'] = min(float(data.at[i, 'domainStartonPDB']),
|
1439 |
float(data.at[i, 'domainEndonPDB']))
|
1440 |
|
1441 |
+
|
1442 |
data = data.astype(str)
|
1443 |
data.replace({'NaN': 'nan'}, inplace=True)
|
1444 |
|
1445 |
+
|
1446 |
# Now unify all 3 separate data. We have with_pdb. The ones that have pdb structures, swiss, modbase, the ones that didn't match with any and the ones that didn't have wt seq match.
|
1447 |
|
1448 |
# Get interface positions from ECLAIR. Download HQ human
|
|
|
1463 |
interface_dataframe.columns = ['uniprotID', 'positions']
|
1464 |
|
1465 |
if len(data) == 0:
|
1466 |
+
data = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore',
|
1467 |
+
'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
|
1468 |
+
'pdbSequence', 'wt_sequence_match', 'whichIsoform', 'pdbID', 'score',
|
1469 |
+
'chain', 'datapoint', 'disulfide', 'intMet', 'intramembrane',
|
1470 |
+
'naturalVariant', 'dnaBinding', 'activeSite', 'nucleotideBinding',
|
1471 |
+
'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis',
|
1472 |
+
'strand', 'helix', 'turn', 'metalBinding', 'repeat',
|
1473 |
+
'topologicalDomain', 'caBinding', 'bindingSite', 'region',
|
1474 |
+
'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
|
1475 |
+
'peptide', 'transitPeptide', 'glycosylation', 'propeptide',
|
1476 |
+
'disulfideBinary', 'intMetBinary', 'intramembraneBinary',
|
1477 |
+
'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary',
|
1478 |
+
'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
|
1479 |
+
'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
|
1480 |
+
'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
|
1481 |
+
'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary',
|
1482 |
+
'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary',
|
1483 |
+
'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
|
1484 |
+
'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
|
1485 |
+
'glycosylationBinary', 'propeptideBinary', 'pdb_alignStatus',
|
1486 |
+
'mutationPositionOnPDB', 'domainStartonPDB', 'domainEndonPDB',
|
1487 |
+
'source', 'sasa', 'domaindistance3D', 'threeState_trsh4_HQ', 'domain_fisher'])
|
|
|
1488 |
else:
|
1489 |
data.sasa = data.sasa.astype('str')
|
1490 |
|
|
|
1523 |
|
1524 |
data.drop(['positions'], axis=1, inplace=True)
|
1525 |
|
1526 |
+
|
1527 |
# OPTIONAL
|
1528 |
# DOMAIN SELECTION
|
1529 |
# Next step: Delete all other domains with 'NULL.' R is capable of handling 53 categories. We will keep 52 most
|
|
|
1542 |
# nan--> 0, 0 -->1 and 1 -->2
|
1543 |
|
1544 |
print('Final adjustments are being done...\n')
|
1545 |
+
binaryCols = ['disulfideBinary', 'intMetBinary', 'intramembraneBinary', 'naturalVariantBinary', 'dnaBindingBinary',
|
|
|
1546 |
'activeSiteBinary', 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
|
1547 |
'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
|
1548 |
'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
|
|
|
1644 |
ready = data.copy()
|
1645 |
# Imputation
|
1646 |
if (impute == 'True') or (impute == 'true') or (impute == True):
|
1647 |
+
filler = [17.84, 30.8, 24.96, 13.12, 23.62, 18.97, 20.87, 29.59, 20.7, 12.7, 22.85, 17.21, 9.8, 9, 15.99, 16.82,
|
|
|
1648 |
20.46, 24.58, 9.99, 17.43, 20.08, 30.91, 20.86, 22.14, 21.91, 28.45, 17.81, 25.12, 20.33, 22.36]
|
1649 |
col_index = 0
|
1650 |
for col_ in ready.columns[-30:]:
|
|
|
1659 |
ready = ready.replace({'nan': np.NaN})
|
1660 |
ready.to_csv(path_to_output_files / 'featurevector_pdb.txt', sep='\t', index=False)
|
1661 |
if len(ready) == 0:
|
1662 |
+
print('No feature vector could be produced for input data. Please check the presence of a structure for the input proteins.')
|
|
|
1663 |
print(ready)
|
1664 |
print('Feature vector successfully created...')
|
1665 |
return ready
|